├── .circleci └── config.yml ├── .gitignore ├── .nojekyll ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── benchmarks ├── README.md ├── a3c │ ├── README.md │ ├── ant_a3c_16_workers.png │ ├── half_cheetah_a3c_16_workers.png │ ├── hopper_a3c_16_workers.png │ ├── inverted_pendulum_a3c.png │ ├── space_invaders_a3c_16_workers.png │ └── walker2d_a3c_16_workers.png ├── acer │ ├── README.md │ ├── breakout_acer_16_workers.png │ ├── pong_acer_16_workers.png │ └── space_invaders_acer_16_workers.png ├── bootstrapped_dqn │ ├── README.md │ ├── breakout_bootstrapped_dqn.png │ ├── pong_bootstrapped_dqn.png │ └── space_invaders_bootstrapped_dqn.png ├── clipped_ppo │ ├── README.md │ ├── ant_clipped_ppo.png │ ├── half_cheetah_clipped_ppo.png │ ├── hopper_clipped_ppo.png │ ├── humanoid_clipped_ppo.png │ ├── inverted_double_pendulum_clipped_ppo.png │ ├── inverted_pendulum_clipped_ppo.png │ ├── reacher_clipped_ppo.png │ ├── swimmer_clipped_ppo.png │ └── walker2d_clipped_ppo.png ├── ddpg │ ├── README.md │ ├── ant_ddpg.png │ ├── half_cheetah_ddpg.png │ ├── hopper_ddpg.png │ ├── humanoid_ddpg.png │ ├── inverted_double_pendulum_ddpg.png │ ├── inverted_pendulum_ddpg.png │ ├── reacher_ddpg.png │ ├── swimmer_ddpg.png │ └── walker2d_ddpg.png ├── ddpg_her │ ├── README.md │ ├── fetch_ddpg_her_pick_and_place_8_workers.png │ ├── fetch_ddpg_her_push_8_workers.png │ ├── fetch_ddpg_her_reach_1_worker.png │ └── fetch_ddpg_her_slide_8_workers.png ├── dfp │ ├── README.md │ ├── doom_basic_dfp_8_workers.png │ ├── doom_health_dfp_8_workers.png │ └── doom_health_supreme_dfp_8_workers.png ├── dqn │ ├── README.md │ ├── breakout_dqn.png │ ├── pong_dqn.png │ └── space_invaders_dqn.png ├── dueling_ddqn │ ├── README.md │ ├── breakout_dueling_ddqn.png │ ├── pong_dueling_ddqn.png │ └── space_invaders_dueling_ddqn.png ├── dueling_ddqn_with_per │ ├── README.md │ ├── breakout_dueling_ddqn_with_per.png │ ├── pong_dueling_ddqn_with_per.png │ └── space_invaders_dueling_ddqn_with_per.png ├── qr_dqn │ ├── README.md │ ├── breakout_qr_dqn.png │ └── pong_qr_dqn.png ├── sac │ ├── README.md │ ├── half_cheetah_sac.png │ ├── hopper_sac.png │ ├── humanoid_sac.png │ ├── inverted_pendulum_sac.png │ └── walker2d_sac.png └── td3 │ ├── README.md │ ├── ant.png │ ├── half_cheetah.png │ ├── hopper.png │ ├── reacher.png │ └── walker2d.png ├── dist-coach-config.template ├── docker ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.doom_environment ├── Dockerfile.gym_environment ├── Dockerfile.mujoco_environment ├── Dockerfile.starcraft_environment ├── Makefile └── README.md ├── docs ├── .nojekyll ├── _images │ ├── ac.png │ ├── acer.png │ ├── act.png │ ├── algorithms.png │ ├── attention_discretization.png │ ├── bollinger_bands.png │ ├── box_discretization.png │ ├── box_masking.png │ ├── bs_dqn.png │ ├── cil.png │ ├── compare_by_num_episodes.png │ ├── compare_by_time.png │ ├── ddpg.png │ ├── design.png │ ├── dfp.png │ ├── distributed.png │ ├── distributional_dqn.png │ ├── dqn.png │ ├── dueling_dqn.png │ ├── filters.png │ ├── full_discrete_action_space_map.png │ ├── horizontal-scale-out.png │ ├── improve.png │ ├── linear_box_to_box_map.png │ ├── naf.png │ ├── nec.png │ ├── network.png │ ├── observe.png │ ├── partial_discrete_action_space_map.png │ ├── pg.png │ ├── ppo.png │ ├── qr_dqn.png │ ├── rainbow.png │ ├── sac.png │ ├── separate_signals.png │ ├── td3.png │ ├── train.png │ ├── updating_dynamically.gif │ └── wolpertinger.png ├── _modules │ ├── index.html │ └── rl_coach │ │ ├── agents │ │ ├── acer_agent.html │ │ ├── 
actor_critic_agent.html │ │ ├── agent.html │ │ ├── agent_interface.html │ │ ├── bc_agent.html │ │ ├── categorical_dqn_agent.html │ │ ├── cil_agent.html │ │ ├── clipped_ppo_agent.html │ │ ├── ddpg_agent.html │ │ ├── dfp_agent.html │ │ ├── dqn_agent.html │ │ ├── mmc_agent.html │ │ ├── n_step_q_agent.html │ │ ├── naf_agent.html │ │ ├── nec_agent.html │ │ ├── pal_agent.html │ │ ├── policy_gradients_agent.html │ │ ├── ppo_agent.html │ │ ├── qr_dqn_agent.html │ │ ├── rainbow_dqn_agent.html │ │ ├── soft_actor_critic_agent.html │ │ ├── td3_agent.html │ │ ├── value_optimization_agent.html │ │ └── wolpertinger_agent.html │ │ ├── architectures │ │ ├── architecture.html │ │ └── network_wrapper.html │ │ ├── base_parameters.html │ │ ├── core_types.html │ │ ├── data_stores │ │ ├── nfs_data_store.html │ │ └── s3_data_store.html │ │ ├── environments │ │ ├── carla_environment.html │ │ ├── control_suite_environment.html │ │ ├── doom_environment.html │ │ ├── environment.html │ │ ├── gym_environment.html │ │ └── starcraft2_environment.html │ │ ├── exploration_policies │ │ ├── additive_noise.html │ │ ├── boltzmann.html │ │ ├── bootstrapped.html │ │ ├── categorical.html │ │ ├── continuous_entropy.html │ │ ├── e_greedy.html │ │ ├── exploration_policy.html │ │ ├── greedy.html │ │ ├── ou_process.html │ │ ├── parameter_noise.html │ │ ├── truncated_normal.html │ │ └── ucb.html │ │ ├── filters │ │ ├── action │ │ │ ├── attention_discretization.html │ │ │ ├── box_discretization.html │ │ │ ├── box_masking.html │ │ │ ├── full_discrete_action_space_map.html │ │ │ ├── linear_box_to_box_map.html │ │ │ └── partial_discrete_action_space_map.html │ │ ├── observation │ │ │ ├── observation_clipping_filter.html │ │ │ ├── observation_crop_filter.html │ │ │ ├── observation_move_axis_filter.html │ │ │ ├── observation_normalization_filter.html │ │ │ ├── observation_reduction_by_sub_parts_name_filter.html │ │ │ ├── observation_rescale_size_by_factor_filter.html │ │ │ ├── observation_rescale_to_size_filter.html │ │ │ ├── observation_rgb_to_y_filter.html │ │ │ ├── observation_squeeze_filter.html │ │ │ ├── observation_stacking_filter.html │ │ │ └── observation_to_uint8_filter.html │ │ └── reward │ │ │ ├── reward_clipping_filter.html │ │ │ ├── reward_normalization_filter.html │ │ │ └── reward_rescale_filter.html │ │ ├── memories │ │ ├── backend │ │ │ └── redis.html │ │ ├── episodic │ │ │ ├── episodic_experience_replay.html │ │ │ ├── episodic_hindsight_experience_replay.html │ │ │ ├── episodic_hrl_hindsight_experience_replay.html │ │ │ └── single_episode_buffer.html │ │ └── non_episodic │ │ │ ├── balanced_experience_replay.html │ │ │ ├── differentiable_neural_dictionary.html │ │ │ ├── experience_replay.html │ │ │ ├── prioritized_experience_replay.html │ │ │ └── transition_collection.html │ │ ├── orchestrators │ │ └── kubernetes_orchestrator.html │ │ └── spaces.html ├── _sources │ ├── components │ │ ├── additional_parameters.rst.txt │ │ ├── agents │ │ │ ├── imitation │ │ │ │ ├── bc.rst.txt │ │ │ │ └── cil.rst.txt │ │ │ ├── index.rst.txt │ │ │ ├── other │ │ │ │ └── dfp.rst.txt │ │ │ ├── policy_optimization │ │ │ │ ├── ac.rst.txt │ │ │ │ ├── acer.rst.txt │ │ │ │ ├── cppo.rst.txt │ │ │ │ ├── ddpg.rst.txt │ │ │ │ ├── hac.rst.txt │ │ │ │ ├── pg.rst.txt │ │ │ │ ├── ppo.rst.txt │ │ │ │ ├── sac.rst.txt │ │ │ │ ├── td3.rst.txt │ │ │ │ └── wolpertinger.rst.txt │ │ │ └── value_optimization │ │ │ │ ├── bs_dqn.rst.txt │ │ │ │ ├── categorical_dqn.rst.txt │ │ │ │ ├── double_dqn.rst.txt │ │ │ │ ├── dqn.rst.txt │ │ │ │ ├── dueling_dqn.rst.txt │ │ │ │ ├── 
mmc.rst.txt │ │ │ │ ├── n_step.rst.txt │ │ │ │ ├── naf.rst.txt │ │ │ │ ├── nec.rst.txt │ │ │ │ ├── pal.rst.txt │ │ │ │ ├── qr_dqn.rst.txt │ │ │ │ └── rainbow.rst.txt │ │ ├── architectures │ │ │ └── index.rst.txt │ │ ├── core_types.rst.txt │ │ ├── data_stores │ │ │ └── index.rst.txt │ │ ├── environments │ │ │ └── index.rst.txt │ │ ├── exploration_policies │ │ │ └── index.rst.txt │ │ ├── filters │ │ │ ├── index.rst.txt │ │ │ ├── input_filters.rst.txt │ │ │ └── output_filters.rst.txt │ │ ├── memories │ │ │ └── index.rst.txt │ │ ├── memory_backends │ │ │ └── index.rst.txt │ │ ├── orchestrators │ │ │ └── index.rst.txt │ │ └── spaces.rst.txt │ ├── contributing │ │ ├── add_agent.rst.txt │ │ └── add_env.rst.txt │ ├── dashboard.rst.txt │ ├── design │ │ ├── control_flow.rst.txt │ │ ├── horizontal_scaling.rst.txt │ │ └── network.rst.txt │ ├── dist_usage.rst.txt │ ├── features │ │ ├── algorithms.rst.txt │ │ ├── batch_rl.rst.txt │ │ ├── benchmarks.rst.txt │ │ ├── environments.rst.txt │ │ └── index.rst.txt │ ├── index.rst.txt │ ├── selecting_an_algorithm.rst.txt │ ├── test.rst.txt │ └── usage.rst.txt ├── _static │ ├── basic.css │ ├── css │ │ ├── badge_only.css │ │ ├── custom.css │ │ └── theme.css │ ├── dark_logo.png │ ├── doctools.js │ ├── documentation_options.js │ ├── file.png │ ├── fonts │ │ ├── Inconsolata-Bold.ttf │ │ ├── Inconsolata-Regular.ttf │ │ ├── Inconsolata.ttf │ │ ├── Lato-Bold.ttf │ │ ├── Lato-Regular.ttf │ │ ├── Lato │ │ │ ├── lato-bold.eot │ │ │ ├── lato-bold.ttf │ │ │ ├── lato-bold.woff │ │ │ ├── lato-bold.woff2 │ │ │ ├── lato-bolditalic.eot │ │ │ ├── lato-bolditalic.ttf │ │ │ ├── lato-bolditalic.woff │ │ │ ├── lato-bolditalic.woff2 │ │ │ ├── lato-italic.eot │ │ │ ├── lato-italic.ttf │ │ │ ├── lato-italic.woff │ │ │ ├── lato-italic.woff2 │ │ │ ├── lato-regular.eot │ │ │ ├── lato-regular.ttf │ │ │ ├── lato-regular.woff │ │ │ └── lato-regular.woff2 │ │ ├── RobotoSlab-Bold.ttf │ │ ├── RobotoSlab-Regular.ttf │ │ ├── RobotoSlab │ │ │ ├── roboto-slab-v7-bold.eot │ │ │ ├── roboto-slab-v7-bold.ttf │ │ │ ├── roboto-slab-v7-bold.woff │ │ │ ├── roboto-slab-v7-bold.woff2 │ │ │ ├── roboto-slab-v7-regular.eot │ │ │ ├── roboto-slab-v7-regular.ttf │ │ │ ├── roboto-slab-v7-regular.woff │ │ │ └── roboto-slab-v7-regular.woff2 │ │ ├── fontawesome-webfont.eot │ │ ├── fontawesome-webfont.svg │ │ ├── fontawesome-webfont.ttf │ │ ├── fontawesome-webfont.woff │ │ └── fontawesome-webfont.woff2 │ ├── jquery-3.2.1.js │ ├── jquery.js │ ├── js │ │ ├── modernizr.min.js │ │ └── theme.js │ ├── language_data.js │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── searchtools.js │ ├── underscore-1.3.1.js │ └── underscore.js ├── components │ ├── additional_parameters.html │ ├── agents │ │ ├── imitation │ │ │ ├── bc.html │ │ │ └── cil.html │ │ ├── index.html │ │ ├── other │ │ │ └── dfp.html │ │ ├── policy_optimization │ │ │ ├── ac.html │ │ │ ├── acer.html │ │ │ ├── cppo.html │ │ │ ├── ddpg.html │ │ │ ├── hac.html │ │ │ ├── pg.html │ │ │ ├── ppo.html │ │ │ ├── sac.html │ │ │ ├── td3.html │ │ │ └── wolpertinger.html │ │ └── value_optimization │ │ │ ├── bs_dqn.html │ │ │ ├── categorical_dqn.html │ │ │ ├── double_dqn.html │ │ │ ├── dqn.html │ │ │ ├── dueling_dqn.html │ │ │ ├── mmc.html │ │ │ ├── n_step.html │ │ │ ├── naf.html │ │ │ ├── nec.html │ │ │ ├── pal.html │ │ │ ├── qr_dqn.html │ │ │ └── rainbow.html │ ├── architectures │ │ └── index.html │ ├── core_types.html │ ├── data_stores │ │ └── index.html │ ├── environments │ │ └── index.html │ ├── exploration_policies │ │ └── index.html │ ├── filters │ │ ├── index.html │ │ 
├── input_filters.html │ │ └── output_filters.html │ ├── memories │ │ └── index.html │ ├── memory_backends │ │ └── index.html │ ├── orchestrators │ │ └── index.html │ └── spaces.html ├── contributing │ ├── add_agent.html │ └── add_env.html ├── dashboard.html ├── design │ ├── control_flow.html │ ├── horizontal_scaling.html │ └── network.html ├── dist_usage.html ├── features │ ├── algorithms.html │ ├── batch_rl.html │ ├── benchmarks.html │ ├── environments.html │ └── index.html ├── genindex.html ├── index.html ├── objects.inv ├── search.html ├── searchindex.js ├── selecting_an_algorithm.html ├── test.html └── usage.html ├── docs_raw ├── Makefile ├── README.md ├── __init__.py ├── build_docs.sh ├── make.bat └── source │ ├── __init__.py │ ├── _static │ ├── css │ │ └── custom.css │ └── img │ │ ├── act.png │ │ ├── algorithms.png │ │ ├── attention_discretization.png │ │ ├── bollinger_bands.png │ │ ├── box_discretization.png │ │ ├── box_masking.png │ │ ├── compare_by_num_episodes.png │ │ ├── compare_by_time.png │ │ ├── dark_logo.png │ │ ├── design.png │ │ ├── design_imgs │ │ ├── ac.png │ │ ├── acer.png │ │ ├── bs_dqn.png │ │ ├── cil.png │ │ ├── ddpg.png │ │ ├── dfp.png │ │ ├── distributional_dqn.png │ │ ├── dqn.png │ │ ├── dueling_dqn.png │ │ ├── naf.png │ │ ├── nec.png │ │ ├── pg.png │ │ ├── ppo.png │ │ ├── qr_dqn.png │ │ ├── rainbow.png │ │ ├── sac.png │ │ ├── td3.png │ │ └── wolpertinger.png │ │ ├── diagrams.xml │ │ ├── distributed.png │ │ ├── filters.png │ │ ├── full_discrete_action_space_map.png │ │ ├── graph.png │ │ ├── horizontal-scale-out.png │ │ ├── improve.png │ │ ├── level.png │ │ ├── linear_box_to_box_map.png │ │ ├── network.png │ │ ├── observe.png │ │ ├── output_filters.xml │ │ ├── partial_discrete_action_space_map.png │ │ ├── separate_signals.png │ │ ├── train.png │ │ └── updating_dynamically.gif │ ├── _templates │ └── layout.html │ ├── algorithms.xml │ ├── components │ ├── additional_parameters.rst │ ├── agents │ │ ├── imitation │ │ │ ├── bc.rst │ │ │ └── cil.rst │ │ ├── index.rst │ │ ├── other │ │ │ └── dfp.rst │ │ ├── policy_optimization │ │ │ ├── ac.rst │ │ │ ├── acer.rst │ │ │ ├── cppo.rst │ │ │ ├── ddpg.rst │ │ │ ├── hac.rst │ │ │ ├── pg.rst │ │ │ ├── ppo.rst │ │ │ ├── sac.rst │ │ │ ├── td3.rst │ │ │ └── wolpertinger.rst │ │ └── value_optimization │ │ │ ├── bs_dqn.rst │ │ │ ├── categorical_dqn.rst │ │ │ ├── double_dqn.rst │ │ │ ├── dqn.rst │ │ │ ├── dueling_dqn.rst │ │ │ ├── mmc.rst │ │ │ ├── n_step.rst │ │ │ ├── naf.rst │ │ │ ├── nec.rst │ │ │ ├── pal.rst │ │ │ ├── qr_dqn.rst │ │ │ └── rainbow.rst │ ├── architectures │ │ └── index.rst │ ├── core_types.rst │ ├── data_stores │ │ └── index.rst │ ├── environments │ │ └── index.rst │ ├── exploration_policies │ │ └── index.rst │ ├── filters │ │ ├── index.rst │ │ ├── input_filters.rst │ │ └── output_filters.rst │ ├── memories │ │ └── index.rst │ ├── memory_backends │ │ └── index.rst │ ├── orchestrators │ │ └── index.rst │ └── spaces.rst │ ├── conf.py │ ├── contributing │ ├── add_agent.rst │ └── add_env.rst │ ├── dashboard.rst │ ├── design │ ├── control_flow.rst │ ├── horizontal_scaling.rst │ └── network.rst │ ├── diagrams.xml │ ├── dist_usage.rst │ ├── features │ ├── algorithms.rst │ ├── batch_rl.rst │ ├── benchmarks.rst │ ├── environments.rst │ └── index.rst │ ├── index.rst │ ├── selecting_an_algorithm.rst │ ├── test.rst │ └── usage.rst ├── img ├── ant.gif ├── carla.gif ├── coach_logo.png ├── dashboard.gif ├── dashboard.png ├── doom_deathmatch.gif ├── doom_health.gif ├── fetch_slide.gif ├── minitaur.gif ├── montezuma.gif ├── 
pendulum.gif └── starcraft.gif ├── requirements.txt ├── rl_coach ├── __init__.py ├── agents │ ├── __init__.py │ ├── acer_agent.py │ ├── actor_critic_agent.py │ ├── agent.py │ ├── agent_interface.py │ ├── bc_agent.py │ ├── bootstrapped_dqn_agent.py │ ├── categorical_dqn_agent.py │ ├── cil_agent.py │ ├── clipped_ppo_agent.py │ ├── composite_agent.py │ ├── ddpg_agent.py │ ├── ddqn_agent.py │ ├── ddqn_bcq_agent.py │ ├── dfp_agent.py │ ├── dqn_agent.py │ ├── hac_ddpg_agent.py │ ├── human_agent.py │ ├── imitation_agent.py │ ├── mmc_agent.py │ ├── n_step_q_agent.py │ ├── naf_agent.py │ ├── nec_agent.py │ ├── pal_agent.py │ ├── policy_gradients_agent.py │ ├── policy_optimization_agent.py │ ├── ppo_agent.py │ ├── qr_dqn_agent.py │ ├── rainbow_dqn_agent.py │ ├── soft_actor_critic_agent.py │ ├── td3_agent.py │ ├── td3_exp_agent.py │ ├── value_optimization_agent.py │ └── wolpertinger_agent.py ├── architectures │ ├── __init__.py │ ├── architecture.py │ ├── embedder_parameters.py │ ├── head_parameters.py │ ├── layers.py │ ├── middleware_parameters.py │ ├── mxnet_components │ │ ├── __init__.py │ │ ├── architecture.py │ │ ├── embedders │ │ │ ├── __init__.py │ │ │ ├── embedder.py │ │ │ ├── image_embedder.py │ │ │ ├── tensor_embedder.py │ │ │ └── vector_embedder.py │ │ ├── general_network.py │ │ ├── heads │ │ │ ├── __init__.py │ │ │ ├── head.py │ │ │ ├── ppo_head.py │ │ │ ├── ppo_v_head.py │ │ │ ├── q_head.py │ │ │ └── v_head.py │ │ ├── layers.py │ │ ├── middlewares │ │ │ ├── __init__.py │ │ │ ├── fc_middleware.py │ │ │ ├── lstm_middleware.py │ │ │ └── middleware.py │ │ ├── savers.py │ │ └── utils.py │ ├── network_wrapper.py │ └── tensorflow_components │ │ ├── __init__.py │ │ ├── architecture.py │ │ ├── distributed_tf_utils.py │ │ ├── embedders │ │ ├── __init__.py │ │ ├── embedder.py │ │ ├── image_embedder.py │ │ ├── tensor_embedder.py │ │ └── vector_embedder.py │ │ ├── general_network.py │ │ ├── heads │ │ ├── RND_head.py │ │ ├── __init__.py │ │ ├── acer_policy_head.py │ │ ├── categorical_q_head.py │ │ ├── cil_head.py │ │ ├── classification_head.py │ │ ├── ddpg_actor_head.py │ │ ├── ddpg_v_head.py │ │ ├── dnd_q_head.py │ │ ├── dueling_q_head.py │ │ ├── head.py │ │ ├── measurements_prediction_head.py │ │ ├── naf_head.py │ │ ├── policy_head.py │ │ ├── ppo_head.py │ │ ├── ppo_v_head.py │ │ ├── q_head.py │ │ ├── quantile_regression_q_head.py │ │ ├── rainbow_q_head.py │ │ ├── sac_head.py │ │ ├── sac_q_head.py │ │ ├── td3_v_head.py │ │ ├── v_head.py │ │ └── wolpertinger_actor_head.py │ │ ├── layers.py │ │ ├── middlewares │ │ ├── __init__.py │ │ ├── fc_middleware.py │ │ ├── lstm_middleware.py │ │ └── middleware.py │ │ ├── savers.py │ │ ├── shared_variables.py │ │ └── utils.py ├── base_parameters.py ├── checkpoint.py ├── coach.py ├── core_types.py ├── dashboard.py ├── dashboard_components │ ├── __init__.py │ ├── boards.py │ ├── episodic_board.py │ ├── experiment_board.py │ ├── globals.py │ ├── landing_page.py │ ├── signals.py │ ├── signals_file.py │ ├── signals_file_base.py │ ├── signals_files_group.py │ └── spinner.css ├── data_stores │ ├── __init__.py │ ├── checkpoint_data_store.py │ ├── data_store.py │ ├── data_store_impl.py │ ├── nfs_data_store.py │ ├── redis_data_store.py │ └── s3_data_store.py ├── debug_utils.py ├── environments │ ├── CarlaSettings.ini │ ├── README.md │ ├── __init__.py │ ├── carla_environment.py │ ├── control_suite_environment.py │ ├── doom │ │ ├── D2_navigation.cfg │ │ ├── D2_navigation.wad │ │ ├── D3_battle.cfg │ │ └── D3_battle.wad │ ├── doom_environment.py │ ├── environment.py │ ├── 
environment_interface.py │ ├── gym_environment.py │ ├── mujoco │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── materials.xml │ │ │ ├── skybox.xml │ │ │ └── visual.xml │ │ ├── pendulum_with_goals.py │ │ └── pendulum_with_goals.xml │ ├── robosuite │ │ ├── cube_exp.py │ │ └── osc_pose.json │ ├── robosuite_environment.py │ ├── starcraft2_environment.py │ └── toy_problems │ │ ├── __init__.py │ │ ├── bit_flip.py │ │ └── exploration_chain.py ├── exploration_policies │ ├── README.md │ ├── __init__.py │ ├── additive_noise.py │ ├── boltzmann.py │ ├── bootstrapped.py │ ├── categorical.py │ ├── continuous_entropy.py │ ├── e_greedy.py │ ├── exploration_policy.py │ ├── greedy.py │ ├── ou_process.py │ ├── parameter_noise.py │ ├── truncated_normal.py │ └── ucb.py ├── filters │ ├── README.md │ ├── __init__.py │ ├── action │ │ ├── __init__.py │ │ ├── action_filter.py │ │ ├── attention_discretization.py │ │ ├── box_discretization.py │ │ ├── box_masking.py │ │ ├── full_discrete_action_space_map.py │ │ ├── linear_box_to_box_map.py │ │ └── partial_discrete_action_space_map.py │ ├── filter.py │ ├── observation │ │ ├── __init__.py │ │ ├── observation_clipping_filter.py │ │ ├── observation_crop_filter.py │ │ ├── observation_filter.py │ │ ├── observation_move_axis_filter.py │ │ ├── observation_normalization_filter.py │ │ ├── observation_reduction_by_sub_parts_name_filter.py │ │ ├── observation_rescale_size_by_factor_filter.py │ │ ├── observation_rescale_to_size_filter.py │ │ ├── observation_rgb_to_y_filter.py │ │ ├── observation_squeeze_filter.py │ │ ├── observation_stacking_filter.py │ │ └── observation_to_uint8_filter.py │ └── reward │ │ ├── __init__.py │ │ ├── reward_clipping_filter.py │ │ ├── reward_ewma_normalization_filter.py │ │ ├── reward_filter.py │ │ ├── reward_normalization_filter.py │ │ └── reward_rescale_filter.py ├── graph_managers │ ├── README.md │ ├── __init__.py │ ├── basic_rl_graph_manager.py │ ├── batch_rl_graph_manager.py │ ├── graph_manager.py │ ├── hac_graph_manager.py │ └── hrl_graph_manager.py ├── level_manager.py ├── logger.py ├── memories │ ├── __init__.py │ ├── backend │ │ ├── __init__.py │ │ ├── memory.py │ │ ├── memory_impl.py │ │ └── redis.py │ ├── episodic │ │ ├── __init__.py │ │ ├── episodic_experience_replay.py │ │ ├── episodic_hindsight_experience_replay.py │ │ ├── episodic_hrl_hindsight_experience_replay.py │ │ └── single_episode_buffer.py │ ├── memory.py │ └── non_episodic │ │ ├── __init__.py │ │ ├── balanced_experience_replay.py │ │ ├── differentiable_neural_dictionary.py │ │ ├── experience_replay.py │ │ ├── prioritized_experience_replay.py │ │ └── transition_collection.py ├── off_policy_evaluators │ ├── __init__.py │ ├── bandits │ │ ├── __init__.py │ │ └── doubly_robust.py │ ├── ope_manager.py │ └── rl │ │ ├── __init__.py │ │ ├── sequential_doubly_robust.py │ │ └── weighted_importance_sampling.py ├── orchestrators │ ├── __init__.py │ ├── deploy.py │ └── kubernetes_orchestrator.py ├── plot_atari.py ├── presets │ ├── Acrobot_DDQN_BCQ_BatchRL.py │ ├── Atari_A3C.py │ ├── Atari_A3C_LSTM.py │ ├── Atari_ACER.py │ ├── Atari_Bootstrapped_DQN.py │ ├── Atari_C51.py │ ├── Atari_DDQN.py │ ├── Atari_DDQN_with_PER.py │ ├── Atari_DQN.py │ ├── Atari_DQN_with_PER.py │ ├── Atari_Dueling_DDQN.py │ ├── Atari_Dueling_DDQN_with_PER_OpenAI.py │ ├── Atari_NEC.py │ ├── Atari_NStepQ.py │ ├── Atari_QR_DQN.py │ ├── Atari_Rainbow.py │ ├── Atari_UCB_with_Q_Ensembles.py │ ├── BitFlip_DQN.py │ ├── BitFlip_DQN_HER.py │ ├── CARLA_3_Cameras_DDPG.py │ ├── CARLA_CIL.py │ ├── CARLA_DDPG.py │ ├── 
CARLA_Dueling_DDQN.py │ ├── CartPole_A3C.py │ ├── CartPole_ACER.py │ ├── CartPole_ClippedPPO.py │ ├── CartPole_DDQN_BCQ_BatchRL.py │ ├── CartPole_DDQN_BatchRL.py │ ├── CartPole_DFP.py │ ├── CartPole_DQN.py │ ├── CartPole_Dueling_DDQN.py │ ├── CartPole_NEC.py │ ├── CartPole_NStepQ.py │ ├── CartPole_PAL.py │ ├── CartPole_PG.py │ ├── CartPole_QR_DQN.py │ ├── CartPole_Rainbow.py │ ├── ControlSuite_DDPG.py │ ├── Doom_Basic_A3C.py │ ├── Doom_Basic_ACER.py │ ├── Doom_Basic_BC.py │ ├── Doom_Basic_DFP.py │ ├── Doom_Basic_DQN.py │ ├── Doom_Basic_Dueling_DDQN.py │ ├── Doom_Battle_DFP.py │ ├── Doom_Health_DFP.py │ ├── Doom_Health_MMC.py │ ├── Doom_Health_Supreme_DFP.py │ ├── ExplorationChain_Bootstrapped_DQN.py │ ├── ExplorationChain_Dueling_DDQN.py │ ├── ExplorationChain_UCB_Q_ensembles.py │ ├── Fetch_DDPG_HER_baselines.py │ ├── InvertedPendulum_PG.py │ ├── MontezumaRevenge_BC.py │ ├── Mujoco_A3C.py │ ├── Mujoco_A3C_LSTM.py │ ├── Mujoco_ClippedPPO.py │ ├── Mujoco_DDPG.py │ ├── Mujoco_NAF.py │ ├── Mujoco_PPO.py │ ├── Mujoco_SAC.py │ ├── Mujoco_TD3.py │ ├── Mujoco_Wolpertinger.py │ ├── Pendulum_HAC.py │ ├── README.md │ ├── RoboSuite_CubeExp_Random.py │ ├── RoboSuite_CubeExp_TD3_Goal_Based.py │ ├── RoboSuite_CubeExp_TD3_Intrinsic_Reward.py │ ├── Starcraft_CollectMinerals_A3C.py │ ├── Starcraft_CollectMinerals_Dueling_DDQN.py │ └── __init__.py ├── renderer.py ├── rollout_worker.py ├── run_multiple_seeds.py ├── saver.py ├── schedules.py ├── spaces.py ├── tests │ ├── README.md │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ └── test_agent_external_communication.py │ ├── architectures │ │ ├── __init__.py │ │ ├── mxnet_components │ │ │ ├── __init__.py │ │ │ ├── embedders │ │ │ │ ├── __init__.py │ │ │ │ ├── test_image_embedder.py │ │ │ │ └── test_vector_embedder.py │ │ │ ├── heads │ │ │ │ ├── __init__.py │ │ │ │ ├── test_head.py │ │ │ │ ├── test_ppo_head.py │ │ │ │ ├── test_ppo_v_head.py │ │ │ │ ├── test_q_head.py │ │ │ │ └── test_v_head.py │ │ │ ├── middlewares │ │ │ │ ├── __init__.py │ │ │ │ ├── test_fc_middleware.py │ │ │ │ └── test_lstm_middleware.py │ │ │ └── test_utils.py │ │ └── tensorflow_components │ │ │ ├── __init__.py │ │ │ └── embedders │ │ │ ├── __init__.py │ │ │ ├── test_identity_embedder.py │ │ │ ├── test_image_embedder.py │ │ │ └── test_vector_embedder.py │ ├── conftest.py │ ├── environments │ │ ├── __init__.py │ │ └── test_gym_environment.py │ ├── exploration_policies │ │ ├── __init__.py │ │ ├── test_additive_noise.py │ │ ├── test_e_greedy.py │ │ ├── test_greedy.py │ │ └── test_ou_process.py │ ├── filters │ │ ├── __init__.py │ │ ├── action │ │ │ ├── __init__.py │ │ │ ├── test_attention_discretization.py │ │ │ ├── test_box_discretization.py │ │ │ ├── test_box_masking.py │ │ │ └── test_linear_box_to_box_map.py │ │ ├── observation │ │ │ ├── __init__.py │ │ │ ├── test_observation_crop_filter.py │ │ │ ├── test_observation_reduction_by_sub_parts_name_filter.py │ │ │ ├── test_observation_rescale_size_by_factor_filter.py │ │ │ ├── test_observation_rescale_to_size_filter.py │ │ │ ├── test_observation_rgb_to_y_filter.py │ │ │ ├── test_observation_squeeze_filter.py │ │ │ ├── test_observation_stacking_filter.py │ │ │ └── test_observation_to_uint8_filter.py │ │ ├── reward │ │ │ ├── __init__.py │ │ │ ├── test_reward_clipping_filter.py │ │ │ └── test_reward_rescale_filter.py │ │ └── test_filters_stacking.py │ ├── graph_managers │ │ ├── __init__.py │ │ ├── test_basic_rl_graph_manager.py │ │ └── test_graph_manager.py │ ├── memories │ │ ├── __init__.py │ │ ├── test_differential_neural_dictionary.py │ │ 
├── test_hindsight_experience_replay.py │ │ ├── test_prioritized_experience_replay.py │ │ └── test_single_episode_buffer.py │ ├── presets │ │ ├── __init__.py │ │ └── test_presets.py │ ├── pytest.ini │ ├── test_checkpoint.py │ ├── test_coach_args.py │ ├── test_core_types.py │ ├── test_dist_coach.py │ ├── test_eks.py │ ├── test_global_variable_saver.py │ ├── test_golden.py │ ├── test_saver.py │ ├── test_schedules.py │ ├── test_spaces.py │ ├── trace_tests.py │ └── utils │ │ ├── __init__.py │ │ ├── args_utils.py │ │ ├── definitions.py │ │ ├── presets_utils.py │ │ └── test_utils.py ├── training_worker.py ├── utilities │ ├── __init__.py │ ├── carla_dataset_to_replay_buffer.py │ └── shared_running_stats.py └── utils.py ├── setup.py └── tutorials ├── 0. Quick Start Guide.ipynb ├── 1. Implementing an Algorithm.ipynb ├── 2. Adding an Environment.ipynb ├── 3. Implementing a Hierarchical RL Graph.ipynb ├── 4. Batch Reinforcement Learning.ipynb ├── 5. Goal-Based Data Collection.ipynb ├── Resources ├── acrobot_dataset.csv ├── exploration.py └── img │ ├── dr.png │ ├── model_selection.png │ └── wis.png └── python_invocation_example.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | experiments 3 | *.pyc 4 | checkpoints 5 | _vizdoom.ini 6 | *.*~ 7 | MUJOCO_LOG.TXT 8 | test_log.txt 9 | .test 10 | tf_logs 11 | bullet3 12 | roboschool 13 | *.csv 14 | *.doc 15 | *.orig 16 | docs/site 17 | coach_env 18 | venv 19 | build 20 | rl_coach.egg* 21 | rl_coach_slim.egg* 22 | contrib 23 | test_log_* 24 | dist 25 | .DS_Store 26 | datasets 27 | .cache 28 | .pytest_cache 29 | core 30 | trace_test* 31 | *.swp 32 | *.swo 33 | .cache/ 34 | *.pyc 35 | coachenv 36 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/.nojekyll -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include rl_coach/dashboard_components/*.css 2 | include rl_coach/environments/doom/*.cfg 3 | include rl_coach/environments/doom/*.wad 4 | include rl_coach/environments/mujoco/common/*.xml 5 | include rl_coach/environments/mujoco/*.xml 6 | include rl_coach/environments/*.ini 7 | include rl_coach/tests/*.ini 8 | include requirements.txt -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/__init__.py -------------------------------------------------------------------------------- /benchmarks/a3c/ant_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/ant_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/a3c/half_cheetah_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/half_cheetah_a3c_16_workers.png -------------------------------------------------------------------------------- 
/benchmarks/a3c/hopper_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/hopper_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/a3c/inverted_pendulum_a3c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/inverted_pendulum_a3c.png -------------------------------------------------------------------------------- /benchmarks/a3c/space_invaders_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/space_invaders_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/a3c/walker2d_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/walker2d_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/acer/README.md: -------------------------------------------------------------------------------- 1 | # ACER 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for ACER are the same parameters as described in the [original paper](https://arxiv.org/abs/1611.01224), except for the optimizer (changed to ADAM) and learning rate (1e-4) used. 5 | 6 | ### Breakout ACER - 16 workers 7 | 8 | ```bash 9 | coach -p Atari_ACER -lvl breakout -n 16 10 | ``` 11 | 12 | Breakout ACER 13 | 14 | ### Space Invaders ACER - 16 workers 15 | 16 | ```bash 17 | coach -p Atari_ACER -lvl space_invaders -n 16 18 | ``` 19 | 20 | Space Invaders ACER 21 | 22 | ### Pong ACER - 16 workers 23 | 24 | ```bash 25 | coach -p Atari_ACER -lvl pong -n 16 26 | ``` 27 | 28 | Pong ACER 29 | -------------------------------------------------------------------------------- /benchmarks/acer/breakout_acer_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/acer/breakout_acer_16_workers.png -------------------------------------------------------------------------------- /benchmarks/acer/pong_acer_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/acer/pong_acer_16_workers.png -------------------------------------------------------------------------------- /benchmarks/acer/space_invaders_acer_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/acer/space_invaders_acer_16_workers.png -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/README.md: -------------------------------------------------------------------------------- 1 | # Bootstrapped DQN 2 | 3 | Each experiment uses 3 seeds. 
4 | The parameters used for Bootstrapped DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1602.04621.pdf). 5 | 6 | ### Breakout Bootstrapped DQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_Bootstrapped_DQN -lvl breakout 10 | ``` 11 | 12 | Breakout Bootstrapped DQN 13 | 14 | 15 | ### Pong Bootstrapped DQN - single worker 16 | 17 | ```bash 18 | coach -p Atari_Bootstrapped_DQN -lvl pong 19 | ``` 20 | 21 | Pong Bootstrapped DQN 22 | 23 | 24 | ### Space Invaders Bootstrapped DQN - single worker 25 | 26 | ```bash 27 | coach -p Atari_Bootstrapped_DQN -lvl space_invaders 28 | ``` 29 | 30 | Space Invaders Bootstrapped DQN 31 | 32 | -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/ant_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/ant_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/hopper_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/hopper_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/humanoid_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/humanoid_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/reacher_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/reacher_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/swimmer_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/swimmer_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/walker2d_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/walker2d_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/ddpg/ant_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/ant_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/half_cheetah_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/half_cheetah_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/hopper_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/hopper_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/humanoid_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/humanoid_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/inverted_double_pendulum_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/inverted_double_pendulum_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/inverted_pendulum_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/inverted_pendulum_ddpg.png 
-------------------------------------------------------------------------------- /benchmarks/ddpg/reacher_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/reacher_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/swimmer_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/swimmer_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/walker2d_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/walker2d_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/README.md: -------------------------------------------------------------------------------- 1 | # DDPG with Hindsight Experience Replay 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for DDPG HER are the same parameters as described in the [following paper](https://arxiv.org/abs/1802.09464). 5 | 6 | ### Fetch Reach DDPG HER - single worker 7 | 8 | ```bash 9 | coach -p Fetch_DDPG_HER_baselines -lvl reach 10 | ``` 11 | 12 | Fetch DDPG HER Reach 1 Worker 13 | 14 | 15 | ### Fetch Push DDPG HER - 8 workers 16 | 17 | ```bash 18 | coach -p Fetch_DDPG_HER_baselines -lvl push -n 8 19 | ``` 20 | 21 | Fetch DDPG HER Push 8 Worker 22 | 23 | 24 | ### Fetch Slide DDPG HER - 8 workers 25 | 26 | ```bash 27 | coach -p Fetch_DDPG_HER_baselines -lvl slide -n 8 28 | ``` 29 | 30 | Fetch DDPG HER Slide 8 Worker 31 | 32 | 33 | ### Fetch Pick And Place DDPG HER - 8 workers 34 | 35 | ```bash 36 | coach -p Fetch_DDPG_HER -lvl pick_and_place -n 8 37 | ``` 38 | 39 | Fetch DDPG HER Pick And Place 8 Workers 40 | 41 | -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dfp/README.md: -------------------------------------------------------------------------------- 1 | # DFP 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for DFP are the same parameters as described in the [original paper](https://arxiv.org/abs/1611.01779). 5 | 6 | ### Doom Basic DFP - 8 workers 7 | 8 | ```bash 9 | coach -p Doom_Basic_DFP -n 8 10 | ``` 11 | 12 | Doom Basic DFP 8 workers 13 | 14 | 15 | ### Doom Health (D1: Basic) DFP - 8 workers 16 | 17 | ```bash 18 | coach -p Doom_Health_DFP -n 8 19 | ``` 20 | 21 | Doom Health DFP 8 workers 22 | 23 | 24 | 25 | ### Doom Health Supreme (D2: Navigation) DFP - 8 workers 26 | 27 | ```bash 28 | coach -p Doom_Health_Supreme_DFP -n 8 29 | ``` 30 | 31 | Doom Health Supreme DFP 8 workers 32 | -------------------------------------------------------------------------------- /benchmarks/dfp/doom_basic_dfp_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dfp/doom_basic_dfp_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dfp/doom_health_dfp_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dfp/doom_health_dfp_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dfp/doom_health_supreme_dfp_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dfp/doom_health_supreme_dfp_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dqn/README.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for DQN are the same parameters as described in the [original paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), except for the optimizer (changed to ADAM) and learning rate (1e-4) used. 
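In a Coach preset, those two deviations (Adam instead of RMSProp, and the 1e-4 learning rate) would sit on the agent's main network wrapper. A minimal sketch, assuming the field names used by the bundled Atari presets; this is not the Atari_DQN preset itself:

```python
# Minimal sketch of overriding the optimizer and learning rate in a preset,
# following the conventions of the bundled Atari presets (assumed, not verbatim).
from rl_coach.agents.dqn_agent import DQNAgentParameters

agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].optimizer_type = 'Adam'   # paper uses RMSProp
agent_params.network_wrappers['main'].learning_rate = 0.0001    # the 1e-4 noted above
```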
5 | 6 | ### Breakout DQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_DQN -lvl breakout 10 | ``` 11 | 12 | Breakout DQN 13 | 14 | ### Pong DQN - single worker 15 | 16 | ```bash 17 | coach -p Atari_DQN -lvl pong 18 | ``` 19 | 20 | Pong DQN 21 | 22 | ### Space Invaders DQN - single worker 23 | 24 | ```bash 25 | coach -p Atari_DQN -lvl space_invaders 26 | ``` 27 | 28 | Space Invaders DQN 29 | 30 | 31 | -------------------------------------------------------------------------------- /benchmarks/dqn/breakout_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dqn/breakout_dqn.png -------------------------------------------------------------------------------- /benchmarks/dqn/pong_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dqn/pong_dqn.png -------------------------------------------------------------------------------- /benchmarks/dqn/space_invaders_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dqn/space_invaders_dqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/README.md: -------------------------------------------------------------------------------- 1 | # Dueling DDQN 2 | 3 | Each experiment uses 3 seeds and is trained for 10k environment steps. 4 | The parameters used for Dueling DDQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1706.01502). 
5 | 6 | ### Pong Dueling DDQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_Dueling_DDQN -lvl pong 10 | ``` 11 | 12 | Pong Dueling DDQN 13 | 14 | 15 | ### Breakout Dueling DDQN - single worker 16 | 17 | ```bash 18 | coach -p Atari_Dueling_DDQN -lvl breakout 19 | ``` 20 | 21 | Breakout Dueling DDQN 22 | 23 | 24 | ### Space Invaders Dueling DDQN - single worker 25 | 26 | ```bash 27 | coach -p Atari_Dueling_DDQN -lvl space_invaders 28 | ``` 29 | 30 | Space Invaders Dueling DDQN 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/breakout_dueling_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn/breakout_dueling_ddqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/pong_dueling_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn/pong_dueling_ddqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/space_invaders_dueling_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn/space_invaders_dueling_ddqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/README.md: -------------------------------------------------------------------------------- 1 | # Dueling DDQN with Prioritized Experience Replay 2 | 3 | Each experiment uses 3 seeds and is trained for 10k environment steps. 4 | The parameters used for Dueling DDQN with PER are the same parameters as described in the [following paper](https://arxiv.org/abs/1511.05952). 
5 | 6 | ### Breakout Dueling DDQN with PER - single worker 7 | 8 | ```bash 9 | coach -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl breakout 10 | ``` 11 | 12 | Breakout Dueling DDQN with PER 13 | 14 | 15 | ### Pong Dueling DDQN with PER - single worker 16 | 17 | ```bash 18 | coach -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl pong 19 | ``` 20 | 21 | Pong Dueling DDQN with PER 22 | 23 | 24 | ### Space Invaders Dueling DDQN with PER - single worker 25 | 26 | ```bash 27 | coach -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl space_invaders 28 | ``` 29 | 30 | Space Invaders Dueling DDQN with PER 31 | 32 | -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png -------------------------------------------------------------------------------- /benchmarks/qr_dqn/README.md: -------------------------------------------------------------------------------- 1 | # Quantile Regression DQN 2 | 3 | Each experiment uses 3 seeds and is trained for 10k environment steps. 4 | The parameters used for QR-DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1710.10044.pdf). 5 | 6 | ### Breakout QR-DQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_QR_DQN -lvl breakout 10 | ``` 11 | 12 | Breakout QR-DQN 13 | 14 | 15 | ### Pong QR-DQN - single worker 16 | 17 | ```bash 18 | coach -p Atari_QR_DQN -lvl pong 19 | ``` 20 | 21 | Pong QR-DQN 22 | -------------------------------------------------------------------------------- /benchmarks/qr_dqn/breakout_qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/qr_dqn/breakout_qr_dqn.png -------------------------------------------------------------------------------- /benchmarks/qr_dqn/pong_qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/qr_dqn/pong_qr_dqn.png -------------------------------------------------------------------------------- /benchmarks/sac/README.md: -------------------------------------------------------------------------------- 1 | # Soft Actor Critic 2 | 3 | Each experiment uses 3 seeds and is trained for 3M environment steps. 
4 | The parameters used for SAC are the same parameters as described in the [original paper](https://arxiv.org/abs/1801.01290). 5 | 6 | ### Inverted Pendulum SAC - single worker 7 | 8 | ```bash 9 | coach -p Mujoco_SAC -lvl inverted_pendulum 10 | ``` 11 | 12 | Inverted Pendulum SAC 13 | 14 | 15 | ### Hopper Clipped SAC - single worker 16 | 17 | ```bash 18 | coach -p Mujoco_SAC -lvl hopper 19 | ``` 20 | 21 | Hopper SAC 22 | 23 | 24 | ### Half Cheetah Clipped SAC - single worker 25 | 26 | ```bash 27 | coach -p Mujoco_SAC -lvl half_cheetah 28 | ``` 29 | 30 | Half Cheetah SAC 31 | 32 | 33 | ### Walker 2D Clipped SAC - single worker 34 | 35 | ```bash 36 | coach -p Mujoco_SAC -lvl walker2d 37 | ``` 38 | 39 | Walker 2D SAC 40 | 41 | 42 | ### Humanoid Clipped SAC - single worker 43 | 44 | ```bash 45 | coach -p Mujoco_SAC -lvl humanoid 46 | ``` 47 | 48 | Humanoid SAC 49 | -------------------------------------------------------------------------------- /benchmarks/sac/half_cheetah_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/half_cheetah_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/hopper_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/hopper_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/humanoid_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/humanoid_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/inverted_pendulum_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/inverted_pendulum_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/walker2d_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/walker2d_sac.png -------------------------------------------------------------------------------- /benchmarks/td3/README.md: -------------------------------------------------------------------------------- 1 | # Twin Delayed DDPG 2 | 3 | Each experiment uses 5 seeds and is trained for 1M environment steps. 4 | The parameters used for TD3 are the same parameters as described in the [original paper](https://arxiv.org/pdf/1802.09477.pdf), and [repository](https://github.com/sfujim/TD3). 
5 | 6 | ### Ant TD3 - single worker 7 | 8 | ```bash 9 | coach -p Mujoco_TD3 -lvl ant 10 | ``` 11 | 12 | Ant TD3 13 | 14 | 15 | ### Hopper TD3 - single worker 16 | 17 | ```bash 18 | coach -p Mujoco_TD3 -lvl hopper 19 | ``` 20 | 21 | Hopper TD3 22 | 23 | 24 | ### Half Cheetah TD3 - single worker 25 | 26 | ```bash 27 | coach -p Mujoco_TD3 -lvl half_cheetah 28 | ``` 29 | 30 | Half Cheetah TD3 31 | 32 | 33 | ### Reacher TD3 - single worker 34 | 35 | ```bash 36 | coach -p Mujoco_TD3 -lvl reacher 37 | ``` 38 | 39 | Reacher TD3 40 | 41 | 42 | ### Walker2D TD3 - single worker 43 | 44 | ```bash 45 | coach -p Mujoco_TD3 -lvl walker2d 46 | ``` 47 | 48 | Walker2D TD3 49 | -------------------------------------------------------------------------------- /benchmarks/td3/ant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/ant.png -------------------------------------------------------------------------------- /benchmarks/td3/half_cheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/half_cheetah.png -------------------------------------------------------------------------------- /benchmarks/td3/hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/hopper.png -------------------------------------------------------------------------------- /benchmarks/td3/reacher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/reacher.png -------------------------------------------------------------------------------- /benchmarks/td3/walker2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/walker2d.png -------------------------------------------------------------------------------- /dist-coach-config.template: -------------------------------------------------------------------------------- 1 | [coach] 2 | image = 3 | memory_backend = redispubsub 4 | data_store = s3 5 | s3_end_point = s3.amazonaws.com 6 | s3_bucket_name = 7 | s3_creds_file = 8 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep some of the more common environments 4 | # Gym (installed with coach) 5 | RUN pip3 install gym[atari]==0.12.5 box2d 6 | # Mujoco 7 | RUN mkdir -p ~/.mujoco \ 8 | && wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \ 9 | && unzip mujoco.zip -d ~/.mujoco \ 10 | && rm mujoco.zip 11 | ARG MUJOCO_KEY 12 | ENV MUJOCO_KEY=$MUJOCO_KEY 13 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 14 | RUN echo $MUJOCO_KEY | base64 --decode > /root/.mujoco/mjkey.txt 15 | RUN pip3 install mujoco_py==1.50.1.68 16 | # Vizdoom 17 | RUN pip3 install vizdoom==1.1.7 18 | 19 | RUN mkdir /root/src 20 | COPY setup.py /root/src/. 21 | COPY requirements.txt /root/src/. 
22 | RUN pip3 install -r /root/src/requirements.txt 23 | 24 | FROM coach-base:master 25 | WORKDIR /root/src 26 | COPY --from=builder /root/.mujoco /root/.mujoco 27 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 28 | COPY --from=builder /root/.cache /root/.cache 29 | COPY setup.py /root/src/. 30 | COPY requirements.txt /root/src/. 31 | COPY README.md /root/src/. 32 | RUN pip3 install gym[atari]==0.12.5 box2d mujoco_py==1.50.1.68 vizdoom==1.1.7 && pip3 install -e .[all] && rm -rf /root/.cache 33 | COPY . /root/src 34 | -------------------------------------------------------------------------------- /docker/Dockerfile.doom_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep vizdoom and any of its related requirements. 4 | RUN pip3 install vizdoom==1.1.7 5 | 6 | # add coach source starting with files that could trigger 7 | # re-build if dependencies change. 8 | RUN mkdir /root/src 9 | COPY setup.py /root/src/. 10 | COPY requirements.txt /root/src/. 11 | RUN pip3 install -r /root/src/requirements.txt 12 | 13 | FROM coach-base:master 14 | WORKDIR /root/src 15 | COPY --from=builder /root/.cache /root/.cache 16 | COPY setup.py /root/src/. 17 | COPY requirements.txt /root/src/. 18 | COPY README.md /root/src/. 19 | RUN pip3 install vizdoom==1.1.7 && pip3 install -e .[all] && rm -rf /root/.cache 20 | COPY . /root/src 21 | -------------------------------------------------------------------------------- /docker/Dockerfile.gym_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep gym and any of its related requirements. 4 | RUN pip3 install gym[atari,box2d,classic_control]==0.12.5 5 | 6 | # add coach source starting with files that could trigger 7 | # re-build if dependencies change. 8 | RUN mkdir /root/src 9 | COPY setup.py /root/src/. 10 | COPY requirements.txt /root/src/. 11 | RUN pip3 install -r /root/src/requirements.txt 12 | 13 | FROM coach-base:master 14 | WORKDIR /root/src 15 | COPY --from=builder /root/.cache /root/.cache 16 | COPY setup.py /root/src/. 17 | COPY requirements.txt /root/src/. 18 | COPY README.md /root/src/. 19 | RUN pip3 install gym[atari,box2d,classic_control]==0.12.5 && pip3 install -e .[all] && rm -rf /root/.cache 20 | COPY . /root/src 21 | -------------------------------------------------------------------------------- /docker/Dockerfile.mujoco_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep mujoco and any of its related requirements. 4 | # Mujoco 5 | RUN mkdir -p ~/.mujoco \ 6 | && wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \ 7 | && unzip -n mujoco.zip -d ~/.mujoco \ 8 | && rm mujoco.zip 9 | ARG MUJOCO_KEY 10 | ENV MUJOCO_KEY=$MUJOCO_KEY 11 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 12 | RUN echo $MUJOCO_KEY | base64 --decode > /root/.mujoco/mjkey.txt 13 | RUN pip3 install mujoco_py==1.50.1.68 14 | 15 | # add coach source starting with files that could trigger 16 | # re-build if dependencies change. 17 | RUN mkdir /root/src 18 | COPY setup.py /root/src/. 19 | COPY requirements.txt /root/src/. 
20 | RUN pip3 install -r /root/src/requirements.txt 21 | 22 | FROM coach-base:master 23 | WORKDIR /root/src 24 | COPY --from=builder /root/.mujoco /root/.mujoco 25 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 26 | COPY --from=builder /root/.cache /root/.cache 27 | COPY setup.py /root/src/. 28 | COPY requirements.txt /root/src/. 29 | COPY README.md /root/src/. 30 | RUN pip3 install mujoco_py==1.50.1.68 && pip3 install -e .[all] && rm -rf /root/.cache 31 | COPY . /root/src 32 | -------------------------------------------------------------------------------- /docker/Dockerfile.starcraft_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep pysc2 and any of its related requirements. 4 | RUN wget http://blzdistsc2-a.akamaihd.net/Linux/SC2.3.17.zip -O sc2.zip \ 5 | && unzip -P 'iagreetotheeula' -d ~ sc2.zip \ 6 | && rm sc2.zip 7 | RUN wget https://github.com/deepmind/pysc2/releases/download/v1.2/mini_games.zip -O mini_games.zip \ 8 | && unzip -d ~/StarCraftII/Maps mini_games.zip \ 9 | && rm mini_games.zip 10 | RUN pip3 install pysc2 11 | 12 | # add coach source starting with files that could trigger 13 | # re-build if dependencies change. 14 | RUN mkdir /root/src 15 | COPY setup.py /root/src/. 16 | COPY requirements.txt /root/src/. 17 | RUN pip3 install -r /root/src/requirements.txt 18 | 19 | FROM coach-base:master 20 | WORKDIR /root/src 21 | COPY --from=builder /root/StarCraftII /root/StarCraftII 22 | COPY --from=builder /root/.cache /root/.cache 23 | COPY setup.py /root/src/. 24 | COPY requirements.txt /root/src/. 25 | COPY README.md /root/src/. 26 | RUN pip3 install pysc2 && pip3 install -e .[all] && rm -rf /root/.cache 27 | COPY . /root/src 28 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/.nojekyll -------------------------------------------------------------------------------- /docs/_images/ac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/ac.png -------------------------------------------------------------------------------- /docs/_images/acer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/acer.png -------------------------------------------------------------------------------- /docs/_images/act.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/act.png -------------------------------------------------------------------------------- /docs/_images/algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/algorithms.png -------------------------------------------------------------------------------- /docs/_images/attention_discretization.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/attention_discretization.png -------------------------------------------------------------------------------- /docs/_images/bollinger_bands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/bollinger_bands.png -------------------------------------------------------------------------------- /docs/_images/box_discretization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/box_discretization.png -------------------------------------------------------------------------------- /docs/_images/box_masking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/box_masking.png -------------------------------------------------------------------------------- /docs/_images/bs_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/bs_dqn.png -------------------------------------------------------------------------------- /docs/_images/cil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/cil.png -------------------------------------------------------------------------------- /docs/_images/compare_by_num_episodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/compare_by_num_episodes.png -------------------------------------------------------------------------------- /docs/_images/compare_by_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/compare_by_time.png -------------------------------------------------------------------------------- /docs/_images/ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/ddpg.png -------------------------------------------------------------------------------- /docs/_images/design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/design.png -------------------------------------------------------------------------------- /docs/_images/dfp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/dfp.png -------------------------------------------------------------------------------- /docs/_images/distributed.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/distributed.png -------------------------------------------------------------------------------- /docs/_images/distributional_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/distributional_dqn.png -------------------------------------------------------------------------------- /docs/_images/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/dqn.png -------------------------------------------------------------------------------- /docs/_images/dueling_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/dueling_dqn.png -------------------------------------------------------------------------------- /docs/_images/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/filters.png -------------------------------------------------------------------------------- /docs/_images/full_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/full_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs/_images/horizontal-scale-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/horizontal-scale-out.png -------------------------------------------------------------------------------- /docs/_images/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/improve.png -------------------------------------------------------------------------------- /docs/_images/linear_box_to_box_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/linear_box_to_box_map.png -------------------------------------------------------------------------------- /docs/_images/naf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/naf.png -------------------------------------------------------------------------------- /docs/_images/nec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/nec.png -------------------------------------------------------------------------------- /docs/_images/network.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/network.png -------------------------------------------------------------------------------- /docs/_images/observe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/observe.png -------------------------------------------------------------------------------- /docs/_images/partial_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/partial_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs/_images/pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/pg.png -------------------------------------------------------------------------------- /docs/_images/ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/ppo.png -------------------------------------------------------------------------------- /docs/_images/qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/qr_dqn.png -------------------------------------------------------------------------------- /docs/_images/rainbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/rainbow.png -------------------------------------------------------------------------------- /docs/_images/sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/sac.png -------------------------------------------------------------------------------- /docs/_images/separate_signals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/separate_signals.png -------------------------------------------------------------------------------- /docs/_images/td3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/td3.png -------------------------------------------------------------------------------- /docs/_images/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/train.png -------------------------------------------------------------------------------- /docs/_images/updating_dynamically.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/updating_dynamically.gif 
-------------------------------------------------------------------------------- /docs/_images/wolpertinger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/wolpertinger.png -------------------------------------------------------------------------------- /docs/_sources/components/additional_parameters.rst.txt: -------------------------------------------------------------------------------- 1 | Additional Parameters 2 | ===================== 3 | 4 | VisualizationParameters 5 | ----------------------- 6 | .. autoclass:: rl_coach.base_parameters.VisualizationParameters 7 | 8 | PresetValidationParameters 9 | -------------------------- 10 | .. autoclass:: rl_coach.base_parameters.PresetValidationParameters 11 | 12 | TaskParameters 13 | -------------- 14 | .. autoclass:: rl_coach.base_parameters.TaskParameters 15 | 16 | DistributedTaskParameters 17 | ------------------------- 18 | .. autoclass:: rl_coach.base_parameters.DistributedTaskParameters 19 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/imitation/bc.rst.txt: -------------------------------------------------------------------------------- 1 | Behavioral Cloning 2 | ================== 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | Network Structure 7 | ----------------- 8 | 9 | .. image:: /_static/img/design_imgs/pg.png 10 | :align: center 11 | 12 | 13 | Algorithm Description 14 | --------------------- 15 | 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | The replay buffer contains the expert demonstrations for the task. 20 | These demonstrations are given as state, action tuples, and with no reward. 21 | The training goal is to reduce the difference between the actions predicted by the network and the actions taken by 22 | the expert for each state. 23 | 24 | 1. Sample a batch of transitions from the replay buffer. 25 | 2. Use the current states as input to the network, and the expert actions as the targets of the network. 26 | 3. For the network head, we use the policy head, which uses the cross entropy loss function. 27 | 28 | 29 | .. autoclass:: rl_coach.agents.bc_agent.BCAlgorithmParameters -------------------------------------------------------------------------------- /docs/_sources/components/agents/index.rst.txt: -------------------------------------------------------------------------------- 1 | Agents 2 | ====== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms can be found by navigating to each of the algorithm pages. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center 11 | 12 | .. 
toctree:: 13 | :maxdepth: 1 14 | :caption: Agents 15 | 16 | policy_optimization/ac 17 | policy_optimization/acer 18 | imitation/bc 19 | value_optimization/bs_dqn 20 | value_optimization/categorical_dqn 21 | imitation/cil 22 | policy_optimization/cppo 23 | policy_optimization/ddpg 24 | other/dfp 25 | value_optimization/double_dqn 26 | value_optimization/dqn 27 | value_optimization/dueling_dqn 28 | value_optimization/mmc 29 | value_optimization/n_step 30 | value_optimization/naf 31 | value_optimization/nec 32 | value_optimization/pal 33 | policy_optimization/pg 34 | policy_optimization/ppo 35 | value_optimization/rainbow 36 | value_optimization/qr_dqn 37 | policy_optimization/sac 38 | policy_optimization/td3 39 | policy_optimization/wolpertinger 40 | 41 | 42 | 43 | .. autoclass:: rl_coach.base_parameters.AgentParameters 44 | 45 | .. autoclass:: rl_coach.agents.agent.Agent 46 | :members: 47 | :inherited-members: 48 | 49 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/policy_optimization/ac.rst.txt: -------------------------------------------------------------------------------- 1 | Actor-Critic 2 | ============ 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/ac.png 12 | :width: 500px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | 18 | Choosing an action - Discrete actions 19 | +++++++++++++++++++++++++++++++++++++ 20 | 21 | The policy network is used to predict action probabilities. While training, a sample is taken from a categorical 22 | distribution assigned with these probabilities. When testing, the action with the highest probability is used. 23 | 24 | Training the network 25 | ++++++++++++++++++++ 26 | A batch of :math:`T_{max}` transitions is used, and the advantages are calculated over it. 27 | 28 | Advantages can be calculated by either of the following methods (configured by the selected preset) - 29 | 30 | 1. **A_VALUE** - Estimating advantage directly: 31 | :math:`A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)` 32 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 33 | 34 | 2. **GAE** - By following the `Generalized Advantage Estimation `_ paper. 35 | 36 | The advantages are then used in order to accumulate gradients according to 37 | :math:`L = -\mathop{\mathbb{E}} [\log (\pi) \cdot A]` 38 | 39 | 40 | .. autoclass:: rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters -------------------------------------------------------------------------------- /docs/_sources/components/agents/policy_optimization/hac.rst.txt: -------------------------------------------------------------------------------- 1 | Hierarchical Actor Critic 2 | ========================= 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Hierarchical Reinforcement Learning with Hindsight `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/ddpg.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Choosing an action 17 | ++++++++++++++++++ 18 | 19 | Pass the current states through the actor network, and get an action mean vector :math:`\mu`.
20 | While in training phase, use a continuous exploration policy, such as the Ornstein-Uhlenbeck process, 21 | to add exploration noise to the action. When testing, use the mean vector :math:`\mu` as-is. 22 | 23 | Training the network 24 | ++++++++++++++++++++ 25 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/categorical_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Categorical DQN 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `A Distributional Perspective on Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/distributional_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. The Bellman update is projected to the set of atoms representing the :math:`Q` values distribution, such 23 | that the :math:`i-th` component of the projected update is calculated as follows: 24 | 25 | :math:`(\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))` 26 | 27 | where: 28 | * :math:`[ \cdot ]` bounds its argument in the range :math:`[a, b]` 29 | * :math:`\hat{T}_{z_{j}}` is the Bellman update for atom :math:`z_j`: :math:`\hat{T}_{z_{j}} := r+\gamma z_j` 30 | 31 | 32 | 3. Network is trained with the cross entropy loss between the resulting probability distribution and the target 33 | probability distribution. Only the target of the actions that were actually taken is updated. 34 | 35 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 36 | 37 | 38 | 39 | .. autoclass:: rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters 40 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/double_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Double DQN 2 | ========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Deep Reinforcement Learning with Double Q-learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. Using the next states from the sampled batch, run the online network in order to find the :math:`Q` maximizing 23 | action :math:`argmax_a Q(s_{t+1},a)`. For these actions, use the corresponding next states and run the target 24 | network to calculate :math:`Q(s_{t+1},argmax_a Q(s_{t+1},a))`. 25 | 26 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 27 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 28 | Set those values as the targets for the actions that were not actually played. 29 | 30 | 4. For each action that was played, use the following equation for calculating the targets of the network: 31 | :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))` 32 | 33 | 5. 
Finally, train the online network using the current states as inputs, and with the aforementioned targets. 34 | 35 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 36 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Deep Q Networks 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Playing Atari with Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. Using the next states from the sampled batch, run the target network to calculate the :math:`Q` values for each of 23 | the actions :math:`Q(s_{t+1},a)`, and keep only the maximum value for each state. 24 | 25 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 26 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 27 | Set those values as the targets for the actions that were not actually played. 28 | 29 | 4. For each action that was played, use the following equation for calculating the targets of the network: 30 | :math:`y_t=r(s_t,a_t )+\gamma \cdot \max_a Q(s_{t+1},a)` 31 | 32 | 5. Finally, train the online network using the current states as inputs, and with the aforementioned targets. 33 | 34 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.dqn_agent.DQNAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/dueling_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Dueling DQN 2 | =========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Dueling Network Architectures for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dueling_dqn.png 12 | :align: center 13 | 14 | General Description 15 | ------------------- 16 | Dueling DQN presents a change in the network structure compared to DQN. 17 | 18 | Dueling DQN uses a specialized *Dueling Q Head* to separate :math:`Q` into an :math:`A` (advantage) 19 | stream and a :math:`V` (value) stream. Adding this type of structure to the network head allows the network to better differentiate 20 | actions from one another, and significantly improves learning. 21 | 22 | In many states, the values of the different actions are very similar, and it is less important which action to take. 23 | This is especially important in environments where there are many actions to choose from. In DQN, on each training 24 | iteration, for each of the states in the batch, we update the :math:`Q` values only for the specific actions taken in 25 | those states. This results in slower learning, as we do not learn the :math:`Q` values for actions that have not yet been taken.
26 | In the dueling architecture, on the other hand, learning is faster, as we start learning the state value even if only a 27 | single action has been taken in this state. -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/mmc.rst.txt: -------------------------------------------------------------------------------- 1 | Mixed Monte Carlo 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Count-Based Exploration with Neural Density Models `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | In MMC, targets are calculated as a mixture between Double DQN targets and full Monte Carlo samples (total discounted returns). 20 | 21 | The DDQN targets are calculated in the same manner as in the DDQN agent: 22 | 23 | :math:`y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))` 24 | 25 | The Monte Carlo targets are calculated by summing up the discounted rewards across the entire episode: 26 | 27 | :math:`y_t^{MC}=\sum_{j=0}^T\gamma^j r(s_{t+j},a_{t+j} )` 28 | 29 | A mixing ratio :math:`\alpha` is then used to get the final targets: 30 | 31 | :math:`y_t=(1-\alpha)\cdot y_t^{DDQN}+\alpha \cdot y_t^{MC}` 32 | 33 | Finally, the online network is trained using the current states as inputs, and the calculated targets. 34 | Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/n_step.rst.txt: -------------------------------------------------------------------------------- 1 | N-Step Q Learning 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | The :math:`N`-step Q learning algorithm works in a similar manner to DQN except for the following changes: 21 | 22 | 1. No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every 23 | :math:`N` steps using the latest :math:`N` steps played by the agent. 24 | 25 | 2. In order to stabilize the learning, multiple workers work together to update the network. 26 | This creates the same effect as decorrelating the samples used for training. 27 | 28 | 3. Instead of using single-step Q targets for the network, the rewards from :math:`N` consecutive steps are accumulated 29 | to form the :math:`N`-step Q targets, according to the following equation: 30 | :math:`R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})` 31 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 32 | 33 | 34 | 35 | .. 
autoclass:: rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters 36 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/naf.rst.txt: -------------------------------------------------------------------------------- 1 | Normalized Advantage Functions 2 | ============================== 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Continuous Deep Q-Learning with Model-based Acceleration `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/naf.png 12 | :width: 600px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | Choosing an action 18 | ++++++++++++++++++ 19 | The current state is used as an input to the network. The action mean :math:`\mu(s_t )` is extracted from the output head. 20 | It is then passed to the exploration policy which adds noise in order to encourage exploration. 21 | 22 | Training the network 23 | ++++++++++++++++++++ 24 | The network is trained by using the following targets: 25 | :math:`y_t=r(s_t,a_t )+\gamma\cdot V(s_{t+1})` 26 | Use the next states as the inputs to the target network and extract the :math:`V` value, from within the head, 27 | to get :math:`V(s_{t+1} )`. Then, update the online network using the current states and actions as inputs, 28 | and :math:`y_t` as the targets. 29 | After every training step, use a soft update in order to copy the weights from the online network to the target network. 30 | 31 | 32 | 33 | .. autoclass:: rl_coach.agents.naf_agent.NAFAlgorithmParameters 34 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/qr_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Quantile Regression DQN 2 | ======================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Distributional Reinforcement Learning with Quantile Regression `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/qr_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. First, the next state quantiles are predicted. These are used in order to calculate the targets for the network, 23 | by following the Bellman equation. 24 | Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the 25 | quantile midpoints targets. 26 | 27 | 3. The network is trained with the quantile regression loss between the resulting quantile locations and the target 28 | quantile locations. Only the targets of the actions that were actually taken are updated. 29 | 30 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 31 | 32 | 33 | .. autoclass:: rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters -------------------------------------------------------------------------------- /docs/_sources/components/architectures/index.rst.txt: -------------------------------------------------------------------------------- 1 | Architectures 2 | ============= 3 | 4 | Architectures contain all the classes that implement the neural network related stuff for the agent. 
5 | Since Coach is intended to work with multiple neural network frameworks, each framework will implement its 6 | own components under a dedicated directory. For example, the TensorFlow components will contain all the neural network 7 | parts that are implemented using TensorFlow. 8 | 9 | .. autoclass:: rl_coach.base_parameters.NetworkParameters 10 | 11 | Architecture 12 | ------------ 13 | .. autoclass:: rl_coach.architectures.architecture.Architecture 14 | :members: 15 | :inherited-members: 16 | 17 | NetworkWrapper 18 | -------------- 19 | 20 | .. image:: /_static/img/distributed.png 21 | :width: 600px 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.architectures.network_wrapper.NetworkWrapper 25 | :members: 26 | :inherited-members: 27 | 28 | -------------------------------------------------------------------------------- /docs/_sources/components/core_types.rst.txt: -------------------------------------------------------------------------------- 1 | Core Types 2 | ========== 3 | 4 | ActionInfo 5 | ---------- 6 | .. autoclass:: rl_coach.core_types.ActionInfo 7 | :members: 8 | :inherited-members: 9 | 10 | Batch 11 | ----- 12 | .. autoclass:: rl_coach.core_types.Batch 13 | :members: 14 | :inherited-members: 15 | 16 | EnvResponse 17 | ----------- 18 | .. autoclass:: rl_coach.core_types.EnvResponse 19 | :members: 20 | :inherited-members: 21 | 22 | Episode 23 | ------- 24 | .. autoclass:: rl_coach.core_types.Episode 25 | :members: 26 | :inherited-members: 27 | 28 | Transition 29 | ---------- 30 | .. autoclass:: rl_coach.core_types.Transition 31 | :members: 32 | :inherited-members: 33 | 34 | -------------------------------------------------------------------------------- /docs/_sources/components/data_stores/index.rst.txt: -------------------------------------------------------------------------------- 1 | Data Stores 2 | =========== 3 | 4 | S3DataStore 5 | ----------- 6 | .. autoclass:: rl_coach.data_stores.s3_data_store.S3DataStore 7 | 8 | NFSDataStore 9 | ------------ 10 | .. autoclass:: rl_coach.data_stores.nfs_data_store.NFSDataStore 11 | -------------------------------------------------------------------------------- /docs/_sources/components/filters/index.rst.txt: -------------------------------------------------------------------------------- 1 | Filters 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Filters 7 | 8 | input_filters 9 | output_filters 10 | 11 | Filters are a mechanism in Coach that allows pre-processing and post-processing of the internal agent information. 12 | There are two filter categories - 13 | 14 | * **Input filters** - these are filters that process the information passed **into** the agent from the environment. 15 | This information includes the observation and the reward. Input filters therefore allow rescaling observations, 16 | normalizing rewards, stacking observations, etc. 17 | 18 | * **Output filters** - these are filters that process the information going **out** of the agent into the environment. 19 | This information includes the action the agent chooses to take. Output filters therefore allow conversion of 20 | actions from one space into another. For example, the agent can take :math:`N` discrete actions, which will be mapped by 21 | the output filter onto :math:`N` continuous actions. 22 | 23 | Filters can be stacked on top of each other in order to build complex processing flows of the inputs or outputs. 24 | 25 | .. 
image:: /_static/img/filters.png 26 | :width: 350px 27 | :align: center 28 | 29 | -------------------------------------------------------------------------------- /docs/_sources/components/filters/output_filters.rst.txt: -------------------------------------------------------------------------------- 1 | Output Filters 2 | -------------- 3 | 4 | The output filters only process the actions. 5 | 6 | Action Filters 7 | ++++++++++++++ 8 | 9 | .. autoclass:: rl_coach.filters.action.AttentionDiscretization 10 | 11 | .. image:: /_static/img/attention_discretization.png 12 | :align: center 13 | 14 | .. autoclass:: rl_coach.filters.action.BoxDiscretization 15 | 16 | .. image:: /_static/img/box_discretization.png 17 | :align: center 18 | 19 | .. autoclass:: rl_coach.filters.action.BoxMasking 20 | 21 | .. image:: /_static/img/box_masking.png 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.filters.action.PartialDiscreteActionSpaceMap 25 | 26 | .. image:: /_static/img/partial_discrete_action_space_map.png 27 | :align: center 28 | 29 | .. autoclass:: rl_coach.filters.action.FullDiscreteActionSpaceMap 30 | 31 | .. image:: /_static/img/full_discrete_action_space_map.png 32 | :align: center 33 | 34 | .. autoclass:: rl_coach.filters.action.LinearBoxToBoxMap 35 | 36 | .. image:: /_static/img/linear_box_to_box_map.png 37 | :align: center -------------------------------------------------------------------------------- /docs/_sources/components/memories/index.rst.txt: -------------------------------------------------------------------------------- 1 | Memories 2 | ======== 3 | 4 | Episodic Memories 5 | ----------------- 6 | 7 | EpisodicExperienceReplay 8 | ++++++++++++++++++++++++ 9 | .. autoclass:: rl_coach.memories.episodic.EpisodicExperienceReplay 10 | 11 | EpisodicHindsightExperienceReplay 12 | +++++++++++++++++++++++++++++++++ 13 | .. autoclass:: rl_coach.memories.episodic.EpisodicHindsightExperienceReplay 14 | 15 | EpisodicHRLHindsightExperienceReplay 16 | ++++++++++++++++++++++++++++++++++++ 17 | .. autoclass:: rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay 18 | 19 | SingleEpisodeBuffer 20 | +++++++++++++++++++ 21 | .. autoclass:: rl_coach.memories.episodic.SingleEpisodeBuffer 22 | 23 | 24 | Non-Episodic Memories 25 | --------------------- 26 | BalancedExperienceReplay 27 | ++++++++++++++++++++++++ 28 | .. autoclass:: rl_coach.memories.non_episodic.BalancedExperienceReplay 29 | 30 | QDND 31 | ++++ 32 | .. autoclass:: rl_coach.memories.non_episodic.QDND 33 | 34 | ExperienceReplay 35 | ++++++++++++++++ 36 | .. autoclass:: rl_coach.memories.non_episodic.ExperienceReplay 37 | 38 | PrioritizedExperienceReplay 39 | +++++++++++++++++++++++++++ 40 | .. autoclass:: rl_coach.memories.non_episodic.PrioritizedExperienceReplay 41 | 42 | TransitionCollection 43 | ++++++++++++++++++++ 44 | .. autoclass:: rl_coach.memories.non_episodic.TransitionCollection 45 | -------------------------------------------------------------------------------- /docs/_sources/components/memory_backends/index.rst.txt: -------------------------------------------------------------------------------- 1 | Memory Backends 2 | =============== 3 | 4 | RedisPubSubBackend 5 | ------------------ 6 | .. 
autoclass:: rl_coach.memories.backend.redis.RedisPubSubBackend 7 | -------------------------------------------------------------------------------- /docs/_sources/components/orchestrators/index.rst.txt: -------------------------------------------------------------------------------- 1 | Orchestrators 2 | ============= 3 | 4 | 5 | Kubernetes 6 | ---------- 7 | .. autoclass:: rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes 8 | -------------------------------------------------------------------------------- /docs/_sources/components/spaces.rst.txt: -------------------------------------------------------------------------------- 1 | Spaces 2 | ====== 3 | 4 | Space 5 | ----- 6 | .. autoclass:: rl_coach.spaces.Space 7 | :members: 8 | :inherited-members: 9 | 10 | 11 | 12 | Observation Spaces 13 | ------------------ 14 | .. autoclass:: rl_coach.spaces.ObservationSpace 15 | :members: 16 | :inherited-members: 17 | 18 | VectorObservationSpace 19 | ++++++++++++++++++++++ 20 | .. autoclass:: rl_coach.spaces.VectorObservationSpace 21 | 22 | PlanarMapsObservationSpace 23 | ++++++++++++++++++++++++++ 24 | .. autoclass:: rl_coach.spaces.PlanarMapsObservationSpace 25 | 26 | ImageObservationSpace 27 | +++++++++++++++++++++ 28 | .. autoclass:: rl_coach.spaces.ImageObservationSpace 29 | 30 | 31 | 32 | Action Spaces 33 | ------------- 34 | .. autoclass:: rl_coach.spaces.ActionSpace 35 | :members: 36 | :inherited-members: 37 | 38 | AttentionActionSpace 39 | ++++++++++++++++++++ 40 | .. autoclass:: rl_coach.spaces.AttentionActionSpace 41 | 42 | BoxActionSpace 43 | ++++++++++++++ 44 | .. autoclass:: rl_coach.spaces.BoxActionSpace 45 | 46 | DiscreteActionSpace 47 | ++++++++++++++++++++ 48 | .. autoclass:: rl_coach.spaces.DiscreteActionSpace 49 | 50 | MultiSelectActionSpace 51 | ++++++++++++++++++++++ 52 | .. autoclass:: rl_coach.spaces.MultiSelectActionSpace 53 | 54 | CompoundActionSpace 55 | +++++++++++++++++++ 56 | .. autoclass:: rl_coach.spaces.CompoundActionSpace 57 | 58 | 59 | 60 | Goal Spaces 61 | ----------- 62 | .. autoclass:: rl_coach.spaces.GoalsSpace 63 | :members: 64 | :inherited-members: 65 | -------------------------------------------------------------------------------- /docs/_sources/features/algorithms.rst.txt: -------------------------------------------------------------------------------- 1 | Algorithms 2 | ========== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms may be found in the `agents <../components/agents/index.html>`_ section. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center -------------------------------------------------------------------------------- /docs/_sources/features/batch_rl.rst.txt: -------------------------------------------------------------------------------- 1 | Batch Reinforcement Learning 2 | ============================ 3 | 4 | Coach supports Batch Reinforcement Learning, where learning is based solely on a (fixed) batch of data. 5 | In Batch RL, we are given a dataset of experience, which was collected using some (one or more) deployed policies, and we would 6 | like to use it to learn a better policy than what was used to collect the dataset. 7 | There is no simulator to interact with, and so we cannot collect any new data, meaning we often cannot explore the MDP any further. 
8 | To make things even harder, we would also like to use the dataset in order to evaluate the newly learned policy 9 | (using off-policy evaluation), since we do not have a simulator on which we can evaluate the policy. 10 | Batch RL is also often beneficial in cases where we just want to separate the inference (data collection) from the 11 | training process of a new policy. This is often the case where we have a system on which we could quite easily deploy a policy 12 | and collect experience data, but cannot easily use that system's setup to train a new policy online (as is often the 13 | case with more standard RL algorithms). 14 | 15 | Coach supports (almost) all of the integrated off-policy algorithms with Batch RL. 16 | 17 | Many more details and example usage can be found in the 18 | `tutorial `_. -------------------------------------------------------------------------------- /docs/_sources/features/benchmarks.rst.txt: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | ========== 3 | 4 | Reinforcement learning is a developing field, and so far it has been particularly difficult to reproduce some of the 5 | results published in the original papers. Some reasons for this are: 6 | 7 | * Reinforcement learning algorithms are notorious for having an unstable learning process. 8 | The data the neural network trains on is dynamic, and depends on the random seed defined for the environment. 9 | 10 | * Reinforcement learning algorithms have many moving parts. For some environments and agents, there are many 11 | "tricks" which are needed to get the exact behavior the paper authors had seen. Also, there are **a lot** of 12 | hyper-parameters to set. 13 | 14 | In order for a reinforcement learning implementation to be useful for research or for data science, it must be 15 | shown that it achieves the expected behavior. For this reason, we collected a set of benchmark results from most 16 | of the algorithms implemented in Coach. The algorithms were tested on a subset of the same environments that were 17 | used in the original papers, and with multiple seeds for each environment. 18 | Additionally, Coach uses some strict testing mechanisms to try to make sure the results we show for these 19 | benchmarks stay intact as Coach continues to develop. 20 | 21 | To see the benchmark results, please visit the 22 | `following GitHub page `_. -------------------------------------------------------------------------------- /docs/_sources/features/index.rst.txt: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Features 7 | 8 | algorithms 9 | environments 10 | benchmarks 11 | batch_rl -------------------------------------------------------------------------------- /docs/_sources/test.rst.txt: -------------------------------------------------------------------------------- 1 | test 2 | ---- 3 | 4 | .. important:: Its a note! in markdown! 5 | 6 | .. 
autoclass:: rl_coach.agents.dqn_agent.DQNAgent 7 | :members: 8 | :inherited-members: -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Docs background */ 2 | .wy-side-nav-search{ 3 | background-color: #043c74; 4 | } 5 | 6 | /* Mobile version */ 7 | .wy-nav-top{ 8 | background-color: #043c74; 9 | } 10 | 11 | 12 | .green { 13 | color: green; 14 | } 15 | 16 | .red { 17 | color: red; 18 | } 19 | 20 | .blue { 21 | color: blue; 22 | } 23 | 24 | .yellow { 25 | color: yellow; 26 | } 27 | 28 | .badge { 29 | border: 2px; 30 | border-style: solid; 31 | border-color: #6C8EBF; 32 | border-radius: 5px; 33 | padding: 3px 15px 3px 15px; 34 | margin: 5px; 35 | display: inline-block; 36 | font-weight: bold; 37 | font-size: 16px; 38 | background: #DAE8FC; 39 | } 40 | 41 | .badge:hover { 42 | cursor: pointer; 43 | } 44 | 45 | .badge > a { 46 | color: black; 47 | } 48 | 49 | .bordered-container { 50 | border: 0px; 51 | border-style: solid; 52 | border-radius: 8px; 53 | padding: 15px; 54 | margin-bottom: 20px; 55 | background: #f2f2f2; 56 | } 57 | 58 | .questionnaire { 59 | font-size: 1.2em; 60 | line-height: 1.5em; 61 | } -------------------------------------------------------------------------------- /docs/_static/dark_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/dark_logo.png -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '0.12.0', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | FILE_SUFFIX: '.html', 7 | HAS_SOURCE: true, 8 | SOURCELINK_SUFFIX: '.txt', 9 | NAVIGATION_WITH_KEYS: false 10 | }; -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Inconsolata.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Inconsolata.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato-Bold.ttf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato-Bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato-Regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab-Bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab-Regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot 
-------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/minus.png -------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/objects.inv -------------------------------------------------------------------------------- /docs_raw/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs_raw/README.md: -------------------------------------------------------------------------------- 1 | # Coach Documentation 2 | 3 | Coach uses Sphinx with a Read The Docs theme for its documentation website. 4 | The website is hosted on GitHub Pages, and is automatically pulled from the repository through the built docs directory. 5 | 6 | To build automatically, first go to the 'docs_raw' directory; the script below installs all required packages, builds the HTML, 7 | and copies all new docs into 'coach/docs/'. 8 | 9 | Run the following command (make sure it's an executable file): 10 | ``` 11 | ./build_docs.sh 12 | ``` 13 | 14 | To build the documentation website manually and locally, first install the following requirements: 15 | 16 | ``` 17 | pip install Sphinx 18 | pip install recommonmark 19 | pip install sphinx_rtd_theme 20 | pip install sphinx-autobuild 21 | pip install sphinx-argparse 22 | ``` 23 | 24 | Then there are two options to build: 25 | 1. Build using the make file (recommended). Run from within the `docs_raw` directory: 26 | 27 | ``` 28 | make html 29 | cp source/_static/css/custom.css build/html/_static/css/ 30 | rm -rf ../docs/ 31 | mkdir ../docs 32 | touch ../docs/.nojekyll 33 | cp -R build/html/* ../docs/ 34 | ``` 35 | 36 | 2. 
Build automatically after every change while editing the files: 37 | 38 | ``` 39 | sphinx-autobuild source build/html 40 | ``` 41 | -------------------------------------------------------------------------------- /docs_raw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/__init__.py -------------------------------------------------------------------------------- /docs_raw/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "installing requirements..." 4 | 5 | pip3 install Sphinx 6 | pip3 install recommonmark 7 | pip3 install sphinx_rtd_theme 8 | pip3 install sphinx-autobuild 9 | pip3 install sphinx-argparse 10 | 11 | echo "Making docs..." 12 | 13 | make html 14 | 15 | echo "Copying new docs into coach/docs/" 16 | 17 | cp source/_static/css/custom.css build/html/_static/css/ 18 | rm -rf ../docs/ 19 | mkdir ../docs 20 | touch ../docs/.nojekyll 21 | cp -R build/html/* ../docs/ 22 | rm -r build 23 | 24 | echo "Finished!" -------------------------------------------------------------------------------- /docs_raw/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs_raw/source/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/__init__.py -------------------------------------------------------------------------------- /docs_raw/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Docs background */ 2 | .wy-side-nav-search{ 3 | background-color: #043c74; 4 | } 5 | 6 | /* Mobile version */ 7 | .wy-nav-top{ 8 | background-color: #043c74; 9 | } 10 | 11 | 12 | .green { 13 | color: green; 14 | } 15 | 16 | .red { 17 | color: red; 18 | } 19 | 20 | .blue { 21 | color: blue; 22 | } 23 | 24 | .yellow { 25 | color: yellow; 26 | } 27 | 28 | .badge { 29 | border: 2px; 30 | border-style: solid; 31 | border-color: #6C8EBF; 32 | border-radius: 5px; 33 | padding: 3px 15px 3px 15px; 34 | margin: 5px; 35 | display: inline-block; 36 | font-weight: bold; 37 | font-size: 16px; 38 | background: #DAE8FC; 39 | } 40 | 41 | .badge:hover { 42 | cursor: pointer; 43 | } 44 | 45 | .badge > a { 46 | color: black; 47 | } 48 | 49 | .bordered-container { 50 | border: 0px; 51 | border-style: solid; 52 | border-radius: 8px; 53 | padding: 15px; 54 | margin-bottom: 20px; 55 | background: #f2f2f2; 56 | } 57 | 58 | .questionnaire { 59 | font-size: 1.2em; 60 | line-height: 1.5em; 61 | } -------------------------------------------------------------------------------- /docs_raw/source/_static/img/act.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/act.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/algorithms.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/attention_discretization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/attention_discretization.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/bollinger_bands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/bollinger_bands.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/box_discretization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/box_discretization.png 
-------------------------------------------------------------------------------- /docs_raw/source/_static/img/box_masking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/box_masking.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/compare_by_num_episodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/compare_by_num_episodes.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/compare_by_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/compare_by_time.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/dark_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/dark_logo.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/ac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/ac.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/acer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/acer.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/bs_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/bs_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/cil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/cil.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/ddpg.png 
-------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/dfp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/dfp.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/distributional_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/distributional_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/dueling_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/dueling_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/naf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/naf.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/nec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/nec.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/pg.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/ppo.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/qr_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/rainbow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/rainbow.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/sac.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/td3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/td3.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/wolpertinger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/wolpertinger.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/distributed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/distributed.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/filters.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/full_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/full_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/graph.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/horizontal-scale-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/horizontal-scale-out.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/improve.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/level.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/level.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/linear_box_to_box_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/linear_box_to_box_map.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/network.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/observe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/observe.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/partial_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/partial_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/separate_signals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/separate_signals.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/train.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/updating_dynamically.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/updating_dynamically.gif -------------------------------------------------------------------------------- /docs_raw/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block extrahead %} 3 | 4 | {% endblock %} -------------------------------------------------------------------------------- /docs_raw/source/components/additional_parameters.rst: -------------------------------------------------------------------------------- 1 | Additional Parameters 2 | ===================== 3 | 4 | VisualizationParameters 5 | ----------------------- 6 | .. autoclass:: rl_coach.base_parameters.VisualizationParameters 7 | 8 | PresetValidationParameters 9 | -------------------------- 10 | .. autoclass:: rl_coach.base_parameters.PresetValidationParameters 11 | 12 | TaskParameters 13 | -------------- 14 | .. 
autoclass:: rl_coach.base_parameters.TaskParameters 15 | 16 | DistributedTaskParameters 17 | ------------------------- 18 | .. autoclass:: rl_coach.base_parameters.DistributedTaskParameters 19 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/imitation/bc.rst: -------------------------------------------------------------------------------- 1 | Behavioral Cloning 2 | ================== 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | Network Structure 7 | ----------------- 8 | 9 | .. image:: /_static/img/design_imgs/pg.png 10 | :align: center 11 | 12 | 13 | Algorithm Description 14 | --------------------- 15 | 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | The replay buffer contains the expert demonstrations for the task. 20 | These demonstrations are given as state, action tuples, and with no reward. 21 | The training goal is to reduce the difference between the actions predicted by the network and the actions taken by 22 | the expert for each state. 23 | 24 | 1. Sample a batch of transitions from the replay buffer. 25 | 2. Use the current states as input to the network, and the expert actions as the targets of the network. 26 | 3. For the network head, we use the policy head, which uses the cross entropy loss function. 27 | 28 | 29 | .. autoclass:: rl_coach.agents.bc_agent.BCAlgorithmParameters -------------------------------------------------------------------------------- /docs_raw/source/components/agents/index.rst: -------------------------------------------------------------------------------- 1 | Agents 2 | ====== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms can be found by navigating to each of the algorithm pages. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Agents 15 | 16 | policy_optimization/ac 17 | policy_optimization/acer 18 | imitation/bc 19 | value_optimization/bs_dqn 20 | value_optimization/categorical_dqn 21 | imitation/cil 22 | policy_optimization/cppo 23 | policy_optimization/ddpg 24 | other/dfp 25 | value_optimization/double_dqn 26 | value_optimization/dqn 27 | value_optimization/dueling_dqn 28 | value_optimization/mmc 29 | value_optimization/n_step 30 | value_optimization/naf 31 | value_optimization/nec 32 | value_optimization/pal 33 | policy_optimization/pg 34 | policy_optimization/ppo 35 | value_optimization/rainbow 36 | value_optimization/qr_dqn 37 | policy_optimization/sac 38 | policy_optimization/td3 39 | policy_optimization/wolpertinger 40 | 41 | 42 | 43 | .. autoclass:: rl_coach.base_parameters.AgentParameters 44 | 45 | .. autoclass:: rl_coach.agents.agent.Agent 46 | :members: 47 | :inherited-members: 48 | 49 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/policy_optimization/ac.rst: -------------------------------------------------------------------------------- 1 | Actor-Critic 2 | ============ 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. 
image:: /_static/img/design_imgs/ac.png 12 | :width: 500px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | 18 | Choosing an action - Discrete actions 19 | +++++++++++++++++++++++++++++++++++++ 20 | 21 | The policy network is used in order to predict action probabilites. While training, a sample is taken from a categorical 22 | distribution assigned with these probabilities. When testing, the action with the highest probability is used. 23 | 24 | Training the network 25 | ++++++++++++++++++++ 26 | A batch of :math:`T_{max}` transitions is used, and the advantages are calculated upon it. 27 | 28 | Advantages can be calculated by either of the following methods (configured by the selected preset) - 29 | 30 | 1. **A_VALUE** - Estimating advantage directly: 31 | :math:`A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)` 32 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 33 | 34 | 2. **GAE** - By following the `Generalized Advantage Estimation `_ paper. 35 | 36 | The advantages are then used in order to accumulate gradients according to 37 | :math:`L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]` 38 | 39 | 40 | .. autoclass:: rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters -------------------------------------------------------------------------------- /docs_raw/source/components/agents/policy_optimization/hac.rst: -------------------------------------------------------------------------------- 1 | Hierarchical Actor Critic 2 | ========================= 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Hierarchical Reinforcement Learning with Hindsight `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/ddpg.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Choosing an action 17 | ++++++++++++++++++ 18 | 19 | Pass the current states through the actor network, and get an action mean vector :math:`\mu`. 20 | While in training phase, use a continuous exploration policy, such as the Ornstein-Uhlenbeck process, 21 | to add exploration noise to the action. When testing, use the mean vector :math:`\mu` as-is. 22 | 23 | Training the network 24 | ++++++++++++++++++++ 25 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/categorical_dqn.rst: -------------------------------------------------------------------------------- 1 | Categorical DQN 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `A Distributional Perspective on Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/distributional_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. 
The Bellman update is projected to the set of atoms representing the :math:`Q` values distribution, such 23 | that the :math:`i-th` component of the projected update is calculated as follows: 24 | 25 | :math:`(\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))` 26 | 27 | where: 28 | * :math:`[ \cdot ]` bounds its argument in the range :math:`[a, b]` 29 | * :math:`\hat{T}_{z_{j}}` is the Bellman update for atom :math:`z_j`: :math:`\hat{T}_{z_{j}} := r+\gamma z_j` 30 | 31 | 32 | 3. Network is trained with the cross entropy loss between the resulting probability distribution and the target 33 | probability distribution. Only the target of the actions that were actually taken is updated. 34 | 35 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 36 | 37 | 38 | 39 | .. autoclass:: rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters 40 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/double_dqn.rst: -------------------------------------------------------------------------------- 1 | Double DQN 2 | ========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Deep Reinforcement Learning with Double Q-learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. Using the next states from the sampled batch, run the online network in order to find the :math:`Q` maximizing 23 | action :math:`argmax_a Q(s_{t+1},a)`. For these actions, use the corresponding next states and run the target 24 | network to calculate :math:`Q(s_{t+1},argmax_a Q(s_{t+1},a))`. 25 | 26 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 27 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 28 | Set those values as the targets for the actions that were not actually played. 29 | 30 | 4. For each action that was played, use the following equation for calculating the targets of the network: 31 | :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))` 32 | 33 | 5. Finally, train the online network using the current states as inputs, and with the aforementioned targets. 34 | 35 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 36 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/dqn.rst: -------------------------------------------------------------------------------- 1 | Deep Q Networks 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Playing Atari with Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. 
Using the next states from the sampled batch, run the target network to calculate the :math:`Q` values for each of 23 | the actions :math:`Q(s_{t+1},a)`, and keep only the maximum value for each state. 24 | 25 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 26 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 27 | Set those values as the targets for the actions that were not actually played. 28 | 29 | 4. For each action that was played, use the following equation for calculating the targets of the network: 30 | :math:`y_t=r(s_t,a_t )+\gamma \cdot \max_a Q(s_{t+1},a)` 31 | 32 | 5. Finally, train the online network using the current states as inputs, and with the aforementioned targets. 33 | 34 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.dqn_agent.DQNAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/dueling_dqn.rst: -------------------------------------------------------------------------------- 1 | Dueling DQN 2 | =========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Dueling Network Architectures for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dueling_dqn.png 12 | :align: center 13 | 14 | General Description 15 | ------------------- 16 | Dueling DQN presents a change in the network structure compared to DQN. 17 | 18 | Dueling DQN uses a specialized *Dueling Q Head* in order to separate :math:`Q` into an :math:`A` (advantage) 19 | stream and a :math:`V` stream. Adding this type of structure to the network head allows the network to better differentiate 20 | actions from one another, and significantly improves learning. 21 | 22 | In many states, the values of the different actions are very similar, and it is less important which action to take. 23 | This is especially important in environments where there are many actions to choose from. In DQN, on each training 24 | iteration, for each of the states in the batch, we update the :math:`Q` values only for the specific actions taken in 25 | those states. This results in slower learning, as we do not learn the :math:`Q` values for actions that were not taken yet. 26 | With the dueling architecture, on the other hand, learning is faster, as we start learning the state-value even if only a 27 | single action has been taken at this state. -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/mmc.rst: -------------------------------------------------------------------------------- 1 | Mixed Monte Carlo 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Count-Based Exploration with Neural Density Models `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | In MMC, targets are calculated as a mixture between Double DQN targets and full Monte Carlo samples (total discounted returns).
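The exact target formulas follow below. As a rough illustration of the mixing step only, here is a minimal NumPy sketch; it is not Coach's implementation, the function names, the `terminals` handling and the default `alpha` value are illustrative assumptions, and the batch is assumed to be a single episode in temporal order so that full Monte Carlo returns can be computed:

```python
import numpy as np

def monte_carlo_returns(rewards, gamma):
    # Full-episode discounted returns: y_t^MC = sum_j gamma^j * r_{t+j}.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def mixed_targets(rewards, terminals, q_online_next, q_target_next, gamma=0.99, alpha=0.1):
    # Double DQN bootstrap: select the argmax action with the online network,
    # then evaluate that action with the target network.
    best_next = np.argmax(q_online_next, axis=1)
    bootstrap = q_target_next[np.arange(len(rewards)), best_next]
    y_ddqn = rewards + gamma * (1.0 - terminals) * bootstrap
    # Blend the bootstrapped targets with the full Monte Carlo returns.
    y_mc = monte_carlo_returns(rewards, gamma)
    return (1.0 - alpha) * y_ddqn + alpha * y_mc
```

The sketch only shows the arithmetic of the blended target; in Coach the mixing ratio itself is configured through the agent's algorithm parameters (see the autoclass reference below).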
20 | 21 | The DDQN targets are calculated in the same manner as in the DDQN agent: 22 | 23 | :math:`y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))` 24 | 25 | The Monte Carlo targets are calculated by summing up the discounted rewards across the entire episode: 26 | 27 | :math:`y_t^{MC}=\sum_{j=0}^T\gamma^j r(s_{t+j},a_{t+j} )` 28 | 29 | A mixing ratio :math:`\alpha` is then used to get the final targets: 30 | 31 | :math:`y_t=(1-\alpha)\cdot y_t^{DDQN}+\alpha \cdot y_t^{MC}` 32 | 33 | Finally, the online network is trained using the current states as inputs, and the calculated targets. 34 | Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/n_step.rst: -------------------------------------------------------------------------------- 1 | N-Step Q Learning 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | The :math:`N`-step Q learning algorithm works in a similar manner to DQN, except for the following changes: 21 | 22 | 1. No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every 23 | :math:`N` steps using the latest :math:`N` steps played by the agent. 24 | 25 | 2. In order to stabilize learning, multiple workers work together to update the network. 26 | This creates the same effect as decorrelating the samples used for training. 27 | 28 | 3. Instead of using single-step Q targets for the network, the rewards from :math:`N` consecutive steps are accumulated 29 | to form the :math:`N`-step Q targets, according to the following equation: 30 | :math:`R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})` 31 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 32 | 33 | 34 | 35 | .. autoclass:: rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters 36 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/naf.rst: -------------------------------------------------------------------------------- 1 | Normalized Advantage Functions 2 | ============================== 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Continuous Deep Q-Learning with Model-based Acceleration `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/naf.png 12 | :width: 600px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | Choosing an action 18 | ++++++++++++++++++ 19 | The current state is used as an input to the network. The action mean :math:`\mu(s_t )` is extracted from the output head. 20 | It is then passed to the exploration policy, which adds noise in order to encourage exploration.
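As a minimal sketch of this action-selection pattern, the snippet below perturbs the predicted action mean with Ornstein-Uhlenbeck noise during training and uses the mean as-is at evaluation time. This is not Coach's exploration-policy API; the class, the `choose_action` helper, and the noise parameters are illustrative assumptions:

```python
import numpy as np

class OrnsteinUhlenbeckNoise:
    # Temporally correlated noise, commonly used to perturb continuous action means.
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.state = np.full(action_dim, mu, dtype=np.float64)

    def sample(self):
        drift = self.theta * (self.mu - self.state) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.randn(*self.state.shape)
        self.state = self.state + drift + diffusion
        return self.state

def choose_action(action_mean, noise, low, high, training=True):
    # Training: explore around the predicted mean; evaluation: act greedily on the mean.
    action = (action_mean + noise.sample()) if training else action_mean
    return np.clip(action, low, high)

# Example usage with a 2-dimensional action bounded in [-1, 1]:
noise = OrnsteinUhlenbeckNoise(action_dim=2)
mu_s_t = np.array([0.3, -0.7])  # stand-in for the network's predicted action mean
print(choose_action(mu_s_t, noise, low=-1.0, high=1.0, training=True))
```

Temporally correlated noise tends to produce smoother exploration trajectories than independent Gaussian noise, which is why it is a common choice for continuous-control agents.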
21 | 22 | Training the network 23 | ++++++++++++++++++++ 24 | The network is trained by using the following targets: 25 | :math:`y_t=r(s_t,a_t )+\gamma\cdot V(s_{t+1})` 26 | Use the next states as the inputs to the target network and extract the :math:`V` value, from within the head, 27 | to get :math:`V(s_{t+1} )`. Then, update the online network using the current states and actions as inputs, 28 | and :math:`y_t` as the targets. 29 | After every training step, use a soft update in order to copy the weights from the online network to the target network. 30 | 31 | 32 | 33 | .. autoclass:: rl_coach.agents.naf_agent.NAFAlgorithmParameters 34 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/qr_dqn.rst: -------------------------------------------------------------------------------- 1 | Quantile Regression DQN 2 | ======================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Distributional Reinforcement Learning with Quantile Regression `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/qr_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. First, the next state quantiles are predicted. These are used in order to calculate the targets for the network, 23 | by following the Bellman equation. 24 | Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the 25 | quantile midpoints targets. 26 | 27 | 3. The network is trained with the quantile regression loss between the resulting quantile locations and the target 28 | quantile locations. Only the targets of the actions that were actually taken are updated. 29 | 30 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 31 | 32 | 33 | .. autoclass:: rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters -------------------------------------------------------------------------------- /docs_raw/source/components/architectures/index.rst: -------------------------------------------------------------------------------- 1 | Architectures 2 | ============= 3 | 4 | Architectures contain all the classes that implement the neural network related stuff for the agent. 5 | Since Coach is intended to work with multiple neural network frameworks, each framework will implement its 6 | own components under a dedicated directory. For example, tensorflow components will contain all the neural network 7 | parts that are implemented using TensorFlow. 8 | 9 | .. autoclass:: rl_coach.base_parameters.NetworkParameters 10 | 11 | Architecture 12 | ------------ 13 | .. autoclass:: rl_coach.architectures.architecture.Architecture 14 | :members: 15 | :inherited-members: 16 | 17 | NetworkWrapper 18 | -------------- 19 | 20 | .. image:: /_static/img/distributed.png 21 | :width: 600px 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.architectures.network_wrapper.NetworkWrapper 25 | :members: 26 | :inherited-members: 27 | 28 | -------------------------------------------------------------------------------- /docs_raw/source/components/core_types.rst: -------------------------------------------------------------------------------- 1 | Core Types 2 | ========== 3 | 4 | ActionInfo 5 | ---------- 6 | .. 
autoclass:: rl_coach.core_types.ActionInfo 7 | :members: 8 | :inherited-members: 9 | 10 | Batch 11 | ----- 12 | .. autoclass:: rl_coach.core_types.Batch 13 | :members: 14 | :inherited-members: 15 | 16 | EnvResponse 17 | ----------- 18 | .. autoclass:: rl_coach.core_types.EnvResponse 19 | :members: 20 | :inherited-members: 21 | 22 | Episode 23 | ------- 24 | .. autoclass:: rl_coach.core_types.Episode 25 | :members: 26 | :inherited-members: 27 | 28 | Transition 29 | ---------- 30 | .. autoclass:: rl_coach.core_types.Transition 31 | :members: 32 | :inherited-members: 33 | 34 | -------------------------------------------------------------------------------- /docs_raw/source/components/data_stores/index.rst: -------------------------------------------------------------------------------- 1 | Data Stores 2 | =========== 3 | 4 | S3DataStore 5 | ----------- 6 | .. autoclass:: rl_coach.data_stores.s3_data_store.S3DataStore 7 | 8 | NFSDataStore 9 | ------------ 10 | .. autoclass:: rl_coach.data_stores.nfs_data_store.NFSDataStore 11 | -------------------------------------------------------------------------------- /docs_raw/source/components/filters/index.rst: -------------------------------------------------------------------------------- 1 | Filters 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Filters 7 | 8 | input_filters 9 | output_filters 10 | 11 | Filters are a mechanism in Coach that allows doing pre-processing and post-processing of the internal agent information. 12 | There are two filter categories - 13 | 14 | * **Input filters** - these are filters that process the information passed **into** the agent from the environment. 15 | This information includes the observation and the reward. Input filters therefore allow rescaling observations, 16 | normalizing rewards, stack observations, etc. 17 | 18 | * **Output filters** - these are filters that process the information going **out** of the agent into the environment. 19 | This information includes the action the agent chooses to take. Output filters therefore allow conversion of 20 | actions from one space into another. For example, the agent can take :math:`N` discrete actions, that will be mapped by 21 | the output filter onto :math:`N` continuous actions. 22 | 23 | Filters can be stacked on top of each other in order to build complex processing flows of the inputs or outputs. 24 | 25 | .. image:: /_static/img/filters.png 26 | :width: 350px 27 | :align: center 28 | 29 | -------------------------------------------------------------------------------- /docs_raw/source/components/filters/output_filters.rst: -------------------------------------------------------------------------------- 1 | Output Filters 2 | -------------- 3 | 4 | The output filters only process the actions. 5 | 6 | Action Filters 7 | ++++++++++++++ 8 | 9 | .. autoclass:: rl_coach.filters.action.AttentionDiscretization 10 | 11 | .. image:: /_static/img/attention_discretization.png 12 | :align: center 13 | 14 | .. autoclass:: rl_coach.filters.action.BoxDiscretization 15 | 16 | .. image:: /_static/img/box_discretization.png 17 | :align: center 18 | 19 | .. autoclass:: rl_coach.filters.action.BoxMasking 20 | 21 | .. image:: /_static/img/box_masking.png 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.filters.action.PartialDiscreteActionSpaceMap 25 | 26 | .. image:: /_static/img/partial_discrete_action_space_map.png 27 | :align: center 28 | 29 | .. autoclass:: rl_coach.filters.action.FullDiscreteActionSpaceMap 30 | 31 | .. 
image:: /_static/img/full_discrete_action_space_map.png 32 | :align: center 33 | 34 | .. autoclass:: rl_coach.filters.action.LinearBoxToBoxMap 35 | 36 | .. image:: /_static/img/linear_box_to_box_map.png 37 | :align: center -------------------------------------------------------------------------------- /docs_raw/source/components/memories/index.rst: -------------------------------------------------------------------------------- 1 | Memories 2 | ======== 3 | 4 | Episodic Memories 5 | ----------------- 6 | 7 | EpisodicExperienceReplay 8 | ++++++++++++++++++++++++ 9 | .. autoclass:: rl_coach.memories.episodic.EpisodicExperienceReplay 10 | 11 | EpisodicHindsightExperienceReplay 12 | +++++++++++++++++++++++++++++++++ 13 | .. autoclass:: rl_coach.memories.episodic.EpisodicHindsightExperienceReplay 14 | 15 | EpisodicHRLHindsightExperienceReplay 16 | ++++++++++++++++++++++++++++++++++++ 17 | .. autoclass:: rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay 18 | 19 | SingleEpisodeBuffer 20 | +++++++++++++++++++ 21 | .. autoclass:: rl_coach.memories.episodic.SingleEpisodeBuffer 22 | 23 | 24 | Non-Episodic Memories 25 | --------------------- 26 | BalancedExperienceReplay 27 | ++++++++++++++++++++++++ 28 | .. autoclass:: rl_coach.memories.non_episodic.BalancedExperienceReplay 29 | 30 | QDND 31 | ++++ 32 | .. autoclass:: rl_coach.memories.non_episodic.QDND 33 | 34 | ExperienceReplay 35 | ++++++++++++++++ 36 | .. autoclass:: rl_coach.memories.non_episodic.ExperienceReplay 37 | 38 | PrioritizedExperienceReplay 39 | +++++++++++++++++++++++++++ 40 | .. autoclass:: rl_coach.memories.non_episodic.PrioritizedExperienceReplay 41 | 42 | TransitionCollection 43 | ++++++++++++++++++++ 44 | .. autoclass:: rl_coach.memories.non_episodic.TransitionCollection 45 | -------------------------------------------------------------------------------- /docs_raw/source/components/memory_backends/index.rst: -------------------------------------------------------------------------------- 1 | Memory Backends 2 | =============== 3 | 4 | RedisPubSubBackend 5 | ------------------ 6 | .. autoclass:: rl_coach.memories.backend.redis.RedisPubSubBackend 7 | -------------------------------------------------------------------------------- /docs_raw/source/components/orchestrators/index.rst: -------------------------------------------------------------------------------- 1 | Orchestrators 2 | ============= 3 | 4 | 5 | Kubernetes 6 | ---------- 7 | .. autoclass:: rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes 8 | -------------------------------------------------------------------------------- /docs_raw/source/components/spaces.rst: -------------------------------------------------------------------------------- 1 | Spaces 2 | ====== 3 | 4 | Space 5 | ----- 6 | .. autoclass:: rl_coach.spaces.Space 7 | :members: 8 | :inherited-members: 9 | 10 | 11 | 12 | Observation Spaces 13 | ------------------ 14 | .. autoclass:: rl_coach.spaces.ObservationSpace 15 | :members: 16 | :inherited-members: 17 | 18 | VectorObservationSpace 19 | ++++++++++++++++++++++ 20 | .. autoclass:: rl_coach.spaces.VectorObservationSpace 21 | 22 | PlanarMapsObservationSpace 23 | ++++++++++++++++++++++++++ 24 | .. autoclass:: rl_coach.spaces.PlanarMapsObservationSpace 25 | 26 | ImageObservationSpace 27 | +++++++++++++++++++++ 28 | .. autoclass:: rl_coach.spaces.ImageObservationSpace 29 | 30 | 31 | 32 | Action Spaces 33 | ------------- 34 | .. 
autoclass:: rl_coach.spaces.ActionSpace 35 | :members: 36 | :inherited-members: 37 | 38 | AttentionActionSpace 39 | ++++++++++++++++++++ 40 | .. autoclass:: rl_coach.spaces.AttentionActionSpace 41 | 42 | BoxActionSpace 43 | ++++++++++++++ 44 | .. autoclass:: rl_coach.spaces.BoxActionSpace 45 | 46 | DiscreteActionSpace 47 | ++++++++++++++++++++ 48 | .. autoclass:: rl_coach.spaces.DiscreteActionSpace 49 | 50 | MultiSelectActionSpace 51 | ++++++++++++++++++++++ 52 | .. autoclass:: rl_coach.spaces.MultiSelectActionSpace 53 | 54 | CompoundActionSpace 55 | +++++++++++++++++++ 56 | .. autoclass:: rl_coach.spaces.CompoundActionSpace 57 | 58 | 59 | 60 | Goal Spaces 61 | ----------- 62 | .. autoclass:: rl_coach.spaces.GoalsSpace 63 | :members: 64 | :inherited-members: 65 | -------------------------------------------------------------------------------- /docs_raw/source/features/algorithms.rst: -------------------------------------------------------------------------------- 1 | Algorithms 2 | ========== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms may be found in the `agents <../components/agents/index.html>`_ section. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center -------------------------------------------------------------------------------- /docs_raw/source/features/batch_rl.rst: -------------------------------------------------------------------------------- 1 | Batch Reinforcement Learning 2 | ============================ 3 | 4 | Coach supports Batch Reinforcement Learning, where learning is based solely on a (fixed) batch of data. 5 | In Batch RL, we are given a dataset of experience, which was collected using some (one or more) deployed policies, and we would 6 | like to use it to learn a better policy than what was used to collect the dataset. 7 | There is no simulator to interact with, and so we cannot collect any new data, meaning we often cannot explore the MDP any further. 8 | To make things even harder, we would also like to use the dataset in order to evaluate the newly learned policy 9 | (using off-policy evaluation), since we do not have a simulator which we can use to evaluate the policy on. 10 | Batch RL is also often beneficial in cases where we just want to separate the inference (data collection) from the 11 | training process of a new policy. This is often the case where we have a system on which we could quite easily deploy a policy 12 | and collect experience data, but cannot easily use that system's setup to online train a new policy (as is often the 13 | case with more standard RL algorithms). 14 | 15 | Coach supports (almost) all of the integrated off-policy algorithms with Batch RL. 16 | 17 | A lot more details and example usage can be found in the 18 | `tutorial `_. -------------------------------------------------------------------------------- /docs_raw/source/features/benchmarks.rst: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | ========== 3 | 4 | Reinforcement learning is a developing field, and so far it has been particularly difficult to reproduce some of the 5 | results published in the original papers. Some reasons for this are: 6 | 7 | * Reinforcement learning algorithms are notoriously known as having an unstable learning process. 
8 | The data the neural networks trains on is dynamic, and depends on the random seed defined for the environment. 9 | 10 | * Reinforcement learning algorithms have many moving parts. For some environments and agents, there are many 11 | "tricks" which are needed to get the exact behavior the paper authors had seen. Also, there are **a lot** of 12 | hyper-parameters to set. 13 | 14 | In order for a reinforcement learning implementation to be useful for research or for data science, it must be 15 | shown that it achieves the expected behavior. For this reason, we collected a set of benchmark results from most 16 | of the algorithms implemented in Coach. The algorithms were tested on a subset of the same environments that were 17 | used in the original papers, and with multiple seed for each environment. 18 | Additionally, Coach uses some strict testing mechanisms to try and make sure the results we show for these 19 | benchmarks stay intact as Coach continues to develop. 20 | 21 | To see the benchmark results, please visit the 22 | `following GitHub page `_. -------------------------------------------------------------------------------- /docs_raw/source/features/index.rst: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Features 7 | 8 | algorithms 9 | environments 10 | benchmarks 11 | batch_rl -------------------------------------------------------------------------------- /docs_raw/source/test.rst: -------------------------------------------------------------------------------- 1 | test 2 | ---- 3 | 4 | .. important:: Its a note! in markdown! 5 | 6 | .. autoclass:: rl_coach.agents.dqn_agent.DQNAgent 7 | :members: 8 | :inherited-members: -------------------------------------------------------------------------------- /img/ant.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/ant.gif -------------------------------------------------------------------------------- /img/carla.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/carla.gif -------------------------------------------------------------------------------- /img/coach_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/coach_logo.png -------------------------------------------------------------------------------- /img/dashboard.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/dashboard.gif -------------------------------------------------------------------------------- /img/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/dashboard.png -------------------------------------------------------------------------------- /img/doom_deathmatch.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/doom_deathmatch.gif 
-------------------------------------------------------------------------------- /img/doom_health.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/doom_health.gif -------------------------------------------------------------------------------- /img/fetch_slide.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/fetch_slide.gif -------------------------------------------------------------------------------- /img/minitaur.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/minitaur.gif -------------------------------------------------------------------------------- /img/montezuma.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/montezuma.gif -------------------------------------------------------------------------------- /img/pendulum.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/pendulum.gif -------------------------------------------------------------------------------- /img/starcraft.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/starcraft.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annoy>=1.8.3 2 | Pillow>=9.0.1 3 | matplotlib>=2.0.2 4 | numpy>=1.14.5 5 | pandas>=0.22.0 6 | pygame>=1.9.3 7 | PyOpenGL>=3.1.0 8 | scipy>=0.19.0 9 | scikit-image>=0.13.0 10 | gym==0.12.5 11 | bokeh==1.0.4 12 | kubernetes>=8.0.0b1,<=8.0.1 13 | redis>=2.10.6 14 | minio>=4.0.5 15 | pytest>=3.8.2 16 | psutil>=5.5.0 17 | joblib>=0.17.0 18 | -------------------------------------------------------------------------------- /rl_coach/__init__.py: -------------------------------------------------------------------------------- 1 | # This gets rid of NumPy FutureWarnings that occur at TF import 2 | import warnings 3 | warnings.filterwarnings('ignore',category=FutureWarning) 4 | 5 | # This gets rid of TF 2.0 related deprecation warnings 6 | import tensorflow as tf 7 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 8 | -------------------------------------------------------------------------------- /rl_coach/agents/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/architectures/mxnet_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_embedder import ImageEmbedder 2 | from .tensor_embedder import TensorEmbedder 3 | from .vector_embedder import VectorEmbedder 4 | 5 | __all__ = ['ImageEmbedder', 6 | 'TensorEmbedder', 7 | 'VectorEmbedder'] 8 | -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .head import Head, HeadLoss 2 | from .q_head import QHead 3 | from .ppo_head import PPOHead 4 | from .ppo_v_head import PPOVHead 5 | from .v_head import VHead 6 | 7 | __all__ = [ 8 | 'Head', 9 | 'HeadLoss', 10 | 'QHead', 11 | 'PPOHead', 12 | 'PPOVHead', 13 | 'VHead' 14 | ] 15 | -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .fc_middleware import FCMiddleware 2 | from .lstm_middleware import LSTMMiddleware 3 | 4 | __all__ = ["FCMiddleware", "LSTMMiddleware"] -------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/architectures/tensorflow_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_embedder import ImageEmbedder 2 | from .vector_embedder import VectorEmbedder 3 | from .tensor_embedder import TensorEmbedder 4 | 5 | __all__ = ['ImageEmbedder', 'VectorEmbedder', 'TensorEmbedder'] 6 | 
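# ----------------------------------------------------------------------------
# Illustrative sketch only (not a file in this repository): the MXNet and
# TensorFlow backends both re-export ImageEmbedder, VectorEmbedder and
# TensorEmbedder from their respective `embedders` packages (see the two
# __init__.py files above), so the matching implementation can be resolved at
# runtime from a framework name. The helper name below is hypothetical.
import importlib


def load_embedder_classes(framework_package: str):
    """framework_package is assumed to be 'mxnet_components' or 'tensorflow_components'."""
    embedders = importlib.import_module(
        'rl_coach.architectures.{}.embedders'.format(framework_package))
    return embedders.ImageEmbedder, embedders.VectorEmbedder, embedders.TensorEmbedder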
-------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .q_head import QHead 2 | from .categorical_q_head import CategoricalQHead 3 | from .ddpg_actor_head import DDPGActor 4 | from .dnd_q_head import DNDQHead 5 | from .dueling_q_head import DuelingQHead 6 | from .measurements_prediction_head import MeasurementsPredictionHead 7 | from .naf_head import NAFHead 8 | from .policy_head import PolicyHead 9 | from .ppo_head import PPOHead 10 | from .ppo_v_head import PPOVHead 11 | from .quantile_regression_q_head import QuantileRegressionQHead 12 | from .rainbow_q_head import RainbowQHead 13 | from .v_head import VHead 14 | from .acer_policy_head import ACERPolicyHead 15 | from .sac_head import SACPolicyHead 16 | from .sac_q_head import SACQHead 17 | from .classification_head import ClassificationHead 18 | from .cil_head import RegressionHead 19 | from .td3_v_head import TD3VHead 20 | from .ddpg_v_head import DDPGVHead 21 | from .wolpertinger_actor_head import WolpertingerActorHead 22 | from .RND_head import RNDHead 23 | 24 | __all__ = [ 25 | 'CategoricalQHead', 26 | 'DDPGActor', 27 | 'DNDQHead', 28 | 'DuelingQHead', 29 | 'MeasurementsPredictionHead', 30 | 'NAFHead', 31 | 'PolicyHead', 32 | 'PPOHead', 33 | 'PPOVHead', 34 | 'QHead', 35 | 'QuantileRegressionQHead', 36 | 'RainbowQHead', 37 | 'VHead', 38 | 'ACERPolicyHead', 39 | 'SACPolicyHead', 40 | 'SACQHead', 41 | 'ClassificationHead', 42 | 'RegressionHead', 43 | 'TD3VHead', 44 | 'DDPGVHead', 45 | 'WolpertingerActorHead', 46 | 'RNDHead' 47 | ] 48 | -------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .fc_middleware import FCMiddleware 2 | from .lstm_middleware import LSTMMiddleware 3 | 4 | __all__ = ["FCMiddleware", "LSTMMiddleware"] 5 | -------------------------------------------------------------------------------- /rl_coach/dashboard_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/dashboard_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/dashboard_components/boards.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | from bokeh.layouts import column 19 | from bokeh.models.widgets import Panel, Tabs 20 | from rl_coach.dashboard_components.experiment_board import experiment_board_layout 21 | from rl_coach.dashboard_components.episodic_board import episodic_board_layout 22 | from rl_coach.dashboard_components.globals import spinner, layouts 23 | from bokeh.models.widgets import Div 24 | 25 | # ---------------- Build Website Layout ------------------- 26 | 27 | # title 28 | title = Div(text="""

Coach Dashboard

""") 29 | center = Div(text="""""") 30 | tab1 = Panel(child=experiment_board_layout, title='experiment board') 31 | # tab2 = Panel(child=episodic_board_layout, title='episodic board') 32 | # tabs = Tabs(tabs=[tab1, tab2]) 33 | tabs = Tabs(tabs=[tab1]) 34 | 35 | layout = column(title, center, tabs) 36 | layout = column(layout, spinner) 37 | 38 | layouts['boards'] = layout 39 | -------------------------------------------------------------------------------- /rl_coach/data_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/environments/README.md: -------------------------------------------------------------------------------- 1 | A custom environment implementation should look like this: 2 | 3 | ```bash 4 | from coach.filters.input_filter import InputFilter 5 | 6 | class CustomFilter(InputFilter): 7 | def __init__(self): 8 | ... 9 | def _filter(self, env_response: EnvResponse) -> EnvResponse: 10 | ... 11 | def _get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace: 12 | ... 13 | def _get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace: 14 | ... 15 | def _validate_input_observation_space(self, input_observation_space: ObservationSpace): 16 | ... 17 | def _reset(self): 18 | ... 19 | ``` 20 | -------------------------------------------------------------------------------- /rl_coach/environments/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | -------------------------------------------------------------------------------- /rl_coach/environments/doom/D2_navigation.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = D2_navigation.wad 6 | doom_map = map01 7 | 8 | # Rewards 9 | 10 | # Each step is good for you! 
11 | living_reward = 1 12 | # And death is not! 13 | death_penalty = 0 14 | 15 | # Rendering options 16 | screen_resolution = RES_160X120 17 | screen_format = GRAY8 18 | render_hud = false 19 | render_crosshair = false 20 | render_weapon = false 21 | render_decals = false 22 | render_particles = false 23 | window_visible = false 24 | 25 | # make episodes finish after 2100 actions (tics) 26 | episode_timeout = 2100 27 | 28 | # Available buttons 29 | available_buttons = 30 | { 31 | TURN_LEFT 32 | TURN_RIGHT 33 | MOVE_FORWARD 34 | } 35 | 36 | # Game variables that will be in the state 37 | available_game_variables = { HEALTH } 38 | 39 | mode = PLAYER 40 | -------------------------------------------------------------------------------- /rl_coach/environments/doom/D2_navigation.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/doom/D2_navigation.wad -------------------------------------------------------------------------------- /rl_coach/environments/doom/D3_battle.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | # modifty these to point to your vizdoom binary and freedoom2.wad 6 | doom_scenario_path = D3_battle.wad 7 | doom_map = map01 8 | 9 | # Rewards 10 | 11 | living_reward = 0 12 | death_penalty = 0 13 | 14 | # Rendering options 15 | screen_resolution = RES_320X240 16 | screen_format = CRCGCB 17 | render_hud = false 18 | render_crosshair = true 19 | render_weapon = true 20 | render_decals = false 21 | render_particles = false 22 | window_visible = false 23 | 24 | # make episodes finish after 2100 actions (tics) 25 | episode_timeout = 2100 26 | 27 | # Available buttons 28 | available_buttons = 29 | { 30 | MOVE_FORWARD 31 | MOVE_BACKWARD 32 | MOVE_RIGHT 33 | MOVE_LEFT 34 | TURN_LEFT 35 | TURN_RIGHT 36 | ATTACK 37 | SPEED 38 | } 39 | 40 | # Game variables that will be in the state 41 | available_game_variables = {AMMO2 HEALTH USER2} 42 | 43 | mode = PLAYER 44 | doom_skill = 2 45 | -------------------------------------------------------------------------------- /rl_coach/environments/doom/D3_battle.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/doom/D3_battle.wad -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/mujoco/__init__.py -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The dm_control Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """Functions to manage the common assets for domains.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | from dm_control.utils import resources 24 | 25 | _SUITE_DIR = os.path.dirname(os.path.dirname(__file__)) 26 | _FILENAMES = [ 27 | "common/materials.xml", 28 | "common/skybox.xml", 29 | "common/visual.xml", 30 | ] 31 | 32 | ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename)) 33 | for filename in _FILENAMES} 34 | 35 | 36 | def read_model(model_filename): 37 | """Reads a model XML file and returns its contents as a string.""" 38 | return resources.GetResource(os.path.join(_SUITE_DIR, model_filename)) 39 | -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/materials.xml: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/skybox.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/visual.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /rl_coach/environments/robosuite/osc_pose.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "OSC_POSE", 3 | "input_max": 1, 4 | "input_min": -1, 5 | "output_max": [0.125, 0.125, 0.125, 0.5, 0.5, 0.5], 6 | "output_min": [-0.125, -0.125, -0.125, -0.5, -0.5, -0.5], 7 | "kp": 150, 8 | "damping_ratio": 1, 9 | "impedance_mode": "fixed", 10 | "kp_limits": [0, 300], 11 | "damping_ratio_limits": [0, 10], 12 | "position_limits": [[-0.22, -0.35, 0.82], [0.22, 0.35, 1.3]], 13 | "orientation_limits": null, 14 | "uncouple_pos_ori": true, 15 | "control_delta": true, 16 | "interpolation": null, 17 | "ramp_ratio": 0.2 18 | } -------------------------------------------------------------------------------- /rl_coach/environments/toy_problems/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/toy_problems/__init__.py -------------------------------------------------------------------------------- /rl_coach/exploration_policies/README.md: -------------------------------------------------------------------------------- 1 | # Exploration Policy 2 | 3 | An exploration policy is a module that is responsible for choosing the action according to the action values, the 4 | current phase, its internal 
state and the specific exploration policy algorithm. 5 | 6 | A custom exploration policy should implement both the exploration policy class and the exploration policy parameters 7 | class, which defines the parameters and the location of the exploration policy module. 8 | The parameters of the exploration policy class should match the parameters in the exploration policy parameters class. 9 | 10 | Exploration policies typically have some control parameter that defines its current exploration state, and 11 | a schedule for this parameter. This schedule can be defined using the Schedule class which is defined in 12 | exploration_policy.py. 13 | 14 | A custom implementation should look as follows: 15 | 16 | ``` 17 | class CustomExplorationParameters(ExplorationParameters): 18 | def __init__(self): 19 | super().__init__() 20 | ... 21 | 22 | @property 23 | def path(self): 24 | return 'module_path:class_name' 25 | 26 | 27 | class CustomExplorationPolicy(ExplorationPolicy): 28 | def __init__(self, action_space: ActionSpace, ...): 29 | super().__init__(action_space) 30 | 31 | def reset(self): 32 | ... 33 | 34 | def get_action(self, action_values: List[ActionType]) -> ActionType: 35 | ... 36 | 37 | def change_phase(self, phase): 38 | ... 39 | 40 | def get_control_param(self): 41 | ... 42 | ``` -------------------------------------------------------------------------------- /rl_coach/exploration_policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/exploration_policies/__init__.py -------------------------------------------------------------------------------- /rl_coach/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/filters/__init__.py -------------------------------------------------------------------------------- /rl_coach/filters/action/__init__.py: -------------------------------------------------------------------------------- 1 | from .attention_discretization import AttentionDiscretization 2 | from .box_discretization import BoxDiscretization 3 | from .box_masking import BoxMasking 4 | from .full_discrete_action_space_map import FullDiscreteActionSpaceMap 5 | from .linear_box_to_box_map import LinearBoxToBoxMap 6 | from .partial_discrete_action_space_map import PartialDiscreteActionSpaceMap 7 | __all__ = [ 8 | 'AttentionDiscretization', 9 | 'BoxDiscretization', 10 | 'BoxMasking', 11 | 'FullDiscreteActionSpaceMap', 12 | 'LinearBoxToBoxMap', 13 | 'PartialDiscreteActionSpaceMap' 14 | ] -------------------------------------------------------------------------------- /rl_coach/filters/action/full_discrete_action_space_map.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from rl_coach.filters.action.partial_discrete_action_space_map import PartialDiscreteActionSpaceMap 18 | from rl_coach.spaces import ActionSpace, DiscreteActionSpace 19 | 20 | 21 | class FullDiscreteActionSpaceMap(PartialDiscreteActionSpaceMap): 22 | """ 23 | Full map of two countable action spaces. This works in a similar way to the 24 | PartialDiscreteActionSpaceMap, but maps the entire source action space into the entire target action space, without 25 | masking any actions. 26 | For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those 27 | multiselect actions. 28 | """ 29 | def __init__(self): 30 | super().__init__() 31 | 32 | def get_unfiltered_action_space(self, output_action_space: ActionSpace) -> DiscreteActionSpace: 33 | self.target_actions = output_action_space.actions 34 | return super().get_unfiltered_action_space(output_action_space) 35 | -------------------------------------------------------------------------------- /rl_coach/filters/observation/__init__.py: -------------------------------------------------------------------------------- 1 | from .observation_clipping_filter import ObservationClippingFilter 2 | from .observation_crop_filter import ObservationCropFilter 3 | from .observation_move_axis_filter import ObservationMoveAxisFilter 4 | from .observation_normalization_filter import ObservationNormalizationFilter 5 | from .observation_reduction_by_sub_parts_name_filter import ObservationReductionBySubPartsNameFilter 6 | from .observation_rescale_size_by_factor_filter import ObservationRescaleSizeByFactorFilter 7 | from .observation_rescale_to_size_filter import ObservationRescaleToSizeFilter 8 | from .observation_rgb_to_y_filter import ObservationRGBToYFilter 9 | from .observation_squeeze_filter import ObservationSqueezeFilter 10 | from .observation_stacking_filter import ObservationStackingFilter 11 | from .observation_to_uint8_filter import ObservationToUInt8Filter 12 | 13 | __all__ = [ 14 | 'ObservationClippingFilter', 15 | 'ObservationCropFilter', 16 | 'ObservationMoveAxisFilter', 17 | 'ObservationNormalizationFilter', 18 | 'ObservationReductionBySubPartsNameFilter', 19 | 'ObservationRescaleSizeByFactorFilter', 20 | 'ObservationRescaleToSizeFilter', 21 | 'ObservationRGBToYFilter', 22 | 'ObservationSqueezeFilter', 23 | 'ObservationStackingFilter', 24 | 'ObservationToUInt8Filter' 25 | ] -------------------------------------------------------------------------------- /rl_coach/filters/observation/observation_filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from rl_coach.filters.filter import Filter 18 | from rl_coach.spaces import ObservationSpace 19 | 20 | 21 | class ObservationFilter(Filter): 22 | def __init__(self): 23 | super().__init__() 24 | self.supports_batching = False 25 | 26 | def get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace: 27 | """ 28 | This function should contain the logic for getting the filtered observation space 29 | :param input_observation_space: the input observation space 30 | :return: the filtered observation space 31 | """ 32 | return input_observation_space 33 | 34 | def validate_input_observation_space(self, input_observation_space: ObservationSpace): 35 | """ 36 | A function that implements validation of the input observation space 37 | :param input_observation_space: the input observation space 38 | :return: None 39 | """ 40 | pass -------------------------------------------------------------------------------- /rl_coach/filters/reward/__init__.py: -------------------------------------------------------------------------------- 1 | from .reward_rescale_filter import RewardRescaleFilter 2 | from .reward_clipping_filter import RewardClippingFilter 3 | from .reward_normalization_filter import RewardNormalizationFilter 4 | from .reward_ewma_normalization_filter import RewardEwmaNormalizationFilter 5 | 6 | __all__ = [ 7 | 'RewardRescaleFilter', 8 | 'RewardClippingFilter', 9 | 'RewardNormalizationFilter', 10 | 'RewardEwmaNormalizationFilter' 11 | ] -------------------------------------------------------------------------------- /rl_coach/filters/reward/reward_filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from rl_coach.filters.filter import Filter 18 | from rl_coach.spaces import RewardSpace 19 | 20 | 21 | class RewardFilter(Filter): 22 | def __init__(self): 23 | super().__init__() 24 | self.supports_batching = False 25 | 26 | def get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace: 27 | """ 28 | This function should contain the logic for getting the filtered reward space 29 | :param input_reward_space: the input reward space 30 | :return: the filtered reward space 31 | """ 32 | return input_reward_space -------------------------------------------------------------------------------- /rl_coach/graph_managers/README.md: -------------------------------------------------------------------------------- 1 | # Block Factory 2 | 3 | The block factory is a class which creates a block that fits into a specific RL scheme. 4 | Example RL schemes are: self play, multi agent, HRL, basic RL, etc. 5 | The block factory should create all the components of the block and return the block scheduler. 6 | The block factory will then be used to create different combinations of components. 
7 | For example, an HRL factory can be later instantiated with: 8 | * env = Atari Breakout 9 | * master (top hierarchy level) agent = DDPG 10 | * slave (bottom hierarchy level) agent = DQN 11 | 12 | A custom block factory implementation should look as follows: 13 | 14 | ``` 15 | class CustomFactory(BlockFactory): 16 | def __init__(self, custom_params): 17 | super().__init__() 18 | 19 | def _create_block(self, task_index: int, device=None) -> BlockScheduler: 20 | """ 21 | Create all the block modules and the block scheduler 22 | :param task_index: the index of the process on which the worker will be run 23 | :return: the initialized block scheduler 24 | """ 25 | 26 | # Create env 27 | # Create composite agents 28 | # Create level managers 29 | # Create block scheduler 30 | 31 | return block_scheduler 32 | ``` -------------------------------------------------------------------------------- /rl_coach/graph_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/graph_managers/__init__.py -------------------------------------------------------------------------------- /rl_coach/memories/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/memories/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/memories/backend/__init__.py -------------------------------------------------------------------------------- /rl_coach/memories/backend/memory_impl.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | from rl_coach.memories.backend.memory import MemoryBackendParameters 19 | from rl_coach.memories.backend.redis import RedisPubSubBackend, RedisPubSubMemoryBackendParameters 20 | 21 | 22 | def get_memory_backend(params: MemoryBackendParameters): 23 | 24 | backend = None 25 | if type(params) == RedisPubSubMemoryBackendParameters: 26 | backend = RedisPubSubBackend(params) 27 | 28 | return backend 29 | 30 | 31 | def construct_memory_params(json: dict): 32 | 33 | if json['store_type'] == 'redispubsub': 34 | memory_params = RedisPubSubMemoryBackendParameters( 35 | json['redis_address'], json['redis_port'], channel=json.get('channel', ''), run_type=json['run_type'] 36 | ) 37 | return memory_params 38 | -------------------------------------------------------------------------------- /rl_coach/memories/episodic/__init__.py: -------------------------------------------------------------------------------- 1 | from .episodic_experience_replay import EpisodicExperienceReplayParameters, EpisodicExperienceReplay 2 | from .episodic_hindsight_experience_replay import EpisodicHindsightExperienceReplayParameters, EpisodicHindsightExperienceReplay 3 | from .episodic_hrl_hindsight_experience_replay import EpisodicHRLHindsightExperienceReplayParameters, EpisodicHRLHindsightExperienceReplay 4 | from .single_episode_buffer import SingleEpisodeBufferParameters, SingleEpisodeBuffer 5 | __all__ = [ 6 | 'EpisodicExperienceReplayParameters', 7 | 'EpisodicHindsightExperienceReplayParameters', 8 | 'EpisodicHRLHindsightExperienceReplayParameters', 9 | 'SingleEpisodeBufferParameters', 10 | 'EpisodicExperienceReplay', 11 | 'EpisodicHindsightExperienceReplay', 12 | 'EpisodicHRLHindsightExperienceReplay', 13 | 'SingleEpisodeBuffer' 14 | ] 15 | -------------------------------------------------------------------------------- /rl_coach/memories/episodic/single_episode_buffer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay 18 | from rl_coach.memories.memory import MemoryGranularity, MemoryParameters 19 | 20 | 21 | class SingleEpisodeBufferParameters(MemoryParameters): 22 | def __init__(self): 23 | super().__init__() 24 | del self.max_size 25 | 26 | @property 27 | def path(self): 28 | return 'rl_coach.memories.episodic.single_episode_buffer:SingleEpisodeBuffer' 29 | 30 | 31 | class SingleEpisodeBuffer(EpisodicExperienceReplay): 32 | def __init__(self): 33 | super().__init__((MemoryGranularity.Episodes, 1)) 34 | -------------------------------------------------------------------------------- /rl_coach/memories/non_episodic/__init__.py: -------------------------------------------------------------------------------- 1 | from .balanced_experience_replay import BalancedExperienceReplayParameters, BalancedExperienceReplay 2 | from .differentiable_neural_dictionary import QDND 3 | from .experience_replay import ExperienceReplayParameters, ExperienceReplay 4 | from .prioritized_experience_replay import PrioritizedExperienceReplayParameters, PrioritizedExperienceReplay 5 | from .transition_collection import TransitionCollection 6 | __all__ = [ 7 | 'BalancedExperienceReplayParameters', 8 | 'BalancedExperienceReplay', 9 | 'QDND', 10 | 'ExperienceReplay', 11 | 'PrioritizedExperienceReplay', 12 | 'TransitionCollection' 13 | ] 14 | -------------------------------------------------------------------------------- /rl_coach/off_policy_evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/off_policy_evaluators/bandits/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/off_policy_evaluators/rl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/off_policy_evaluators/rl/__init__.py -------------------------------------------------------------------------------- /rl_coach/orchestrators/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/orchestrators/deploy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | 19 | class DeployParameters(object): 20 | 21 | def __init__(self): 22 | pass 23 | 24 | 25 | class Deploy(object): 26 | 27 | def __init__(self, deploy_parameters): 28 | self.deploy_parameters = deploy_parameters 29 | 30 | def setup(self) -> bool: 31 | pass 32 | 33 | def deploy(self) -> bool: 34 | pass 35 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_C51.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.categorical_dqn_agent import CategoricalDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = CategoricalDQNAgentParameters() 11 | agent_params.network_wrappers['main'].learning_rate = 0.00025 12 | 13 | ############### 14 | # Environment # 15 | ############### 16 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 17 | 18 | ######## 19 | # Test # 20 | ######## 21 | preset_validation_params = PresetValidationParameters() 22 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 23 | 24 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 25 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 26 | preset_validation_params=preset_validation_params) 27 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DDQN.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.ddqn_agent import DDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = DDQNAgentParameters() 11 | agent_params.network_wrappers['main'].learning_rate = 0.00025 12 | 13 | ############### 14 | # Environment # 15 | ############### 16 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 17 | 18 | ######## 19 | # Test # 20 | ######## 21 | preset_validation_params = PresetValidationParameters() 22 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 23 | 24 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 25 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 26 | preset_validation_params=preset_validation_params) -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DDQN_with_PER.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.ddqn_agent import DDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | 
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters 7 | from rl_coach.schedules import LinearSchedule 8 | 9 | ######### 10 | # Agent # 11 | ######### 12 | agent_params = DDQNAgentParameters() 13 | agent_params.network_wrappers['main'].learning_rate = 0.00025/4 14 | agent_params.memory = PrioritizedExperienceReplayParameters() 15 | agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000) # 12.5M training iterations = 50M steps = 200M frames 16 | 17 | ############### 18 | # Environment # 19 | ############### 20 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 21 | 22 | ######## 23 | # Test # 24 | ######## 25 | preset_validation_params = PresetValidationParameters() 26 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 27 | 28 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 29 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 30 | preset_validation_params=preset_validation_params) 31 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DQN.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.dqn_agent import DQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = DQNAgentParameters() 11 | # since we are using Adam instead of RMSProp, we adjust the learning rate as well 12 | agent_params.network_wrappers['main'].learning_rate = 0.0001 13 | 14 | ############### 15 | # Environment # 16 | ############### 17 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 18 | 19 | ######## 20 | # Test # 21 | ######## 22 | preset_validation_params = PresetValidationParameters() 23 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 24 | 25 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 26 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 27 | preset_validation_params=preset_validation_params) 28 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DQN_with_PER.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.dqn_agent import DQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters 7 | from rl_coach.schedules import LinearSchedule 8 | 9 | 10 | ######### 11 | # Agent # 12 | ######### 13 | agent_params = DQNAgentParameters() 14 | agent_params.network_wrappers['main'].learning_rate = 0.00025 15 | agent_params.memory = 
PrioritizedExperienceReplayParameters() 16 | agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000) # 12.5M training iterations = 50M steps = 200M frames 17 | 18 | ############### 19 | # Environment # 20 | ############### 21 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 22 | 23 | ######## 24 | # Test # 25 | ######## 26 | preset_validation_params = PresetValidationParameters() 27 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 28 | 29 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 30 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 31 | preset_validation_params=preset_validation_params) 32 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_QR_DQN.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.qr_dqn_agent import QuantileRegressionDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = QuantileRegressionDQNAgentParameters() 11 | agent_params.network_wrappers['main'].learning_rate = 0.00005 # called alpha in the paper 12 | agent_params.algorithm.huber_loss_interval = 1 # k = 0 for strict quantile loss, k = 1 for Huber quantile loss 13 | 14 | ############### 15 | # Environment # 16 | ############### 17 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 18 | 19 | ######## 20 | # Test # 21 | ######## 22 | preset_validation_params = PresetValidationParameters() 23 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 24 | 25 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 26 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 27 | preset_validation_params=preset_validation_params) 28 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_UCB_with_Q_Ensembles.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.exploration_policies.ucb import UCBParameters 6 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 7 | 8 | ######### 9 | # Agent # 10 | ######### 11 | agent_params = BootstrappedDQNAgentParameters() 12 | agent_params.network_wrappers['main'].learning_rate = 0.00025 13 | agent_params.exploration = UCBParameters() 14 | 15 | ############### 16 | # Environment # 17 | ############### 18 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 19 | 20 | ######## 21 | # Test # 22 | ######## 23 | preset_validation_params = PresetValidationParameters() 24 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 25 | 26 | graph_manager = 
BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 27 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 28 | preset_validation_params=preset_validation_params) 29 | -------------------------------------------------------------------------------- /rl_coach/presets/README.md: -------------------------------------------------------------------------------- 1 | # Defining Presets 2 | 3 | In Coach, we use the Preset mechanism to define reproducible experiments. 4 | A Preset defines all the parameters of an experiment in a single file, and can be executed from the command 5 | line using the file name. 6 | A preset can be very simple, relying on the default parameters of the algorithm and the environment. 7 | It can also be fully explicit and define every parameter, so that no logic is hidden. 8 | The outcome of a preset is a GraphManager. 9 | 10 | 11 | Let's start with the simplest preset possible. 12 | We will define a preset for training the CartPole environment using Clipped PPO. 13 | At a minimum, every preset must define three things: the agent, the environment and a schedule. 14 | 15 | ``` 16 | from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters 17 | from rl_coach.environments.gym_environment import GymVectorEnvironment 18 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 19 | from rl_coach.graph_managers.graph_manager import SimpleSchedule 20 | 21 | graph_manager = BasicRLGraphManager( 22 | agent_params=ClippedPPOAgentParameters(), 23 | env_params=GymVectorEnvironment(level='CartPole-v0'), 24 | schedule_params=SimpleSchedule() 25 | ) 26 | ``` 27 | 28 | Most presets in Coach are far more explicit than this one. The motivation is to be as transparent as 29 | possible about every change made relative to the base parameters defined in the algorithm's paper.
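A preset file placed under `rl_coach/presets` can then be launched by name from the command line. As a short usage sketch, with the `-p` and `-lvl` flags taken from the main Coach README (treat the exact invocation as an assumption rather than part of this file):

```
coach -p CartPole_ClippedPPO
coach -p Atari_DQN -lvl breakout
```

The second form selects a specific level for presets that use SingleLevelSelection, such as the Atari presets shown above.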
-------------------------------------------------------------------------------- /rl_coach/presets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/presets/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/agents/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/agents/test_agent_external_communication.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from rl_coach.base_parameters import TaskParameters, Frameworks 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 7 | import tensorflow as tf 8 | from tensorflow import logging 9 | import pytest 10 | logging.set_verbosity(logging.INFO) 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_get_QActionStateValue_predictions(): 15 | tf.reset_default_graph() 16 | from rl_coach.presets.CartPole_DQN import graph_manager as cartpole_dqn_graph_manager 17 | assert cartpole_dqn_graph_manager 18 | cartpole_dqn_graph_manager.create_graph(task_parameters= 19 | TaskParameters(framework_type=Frameworks.tensorflow, 20 | experiment_path="./experiments/test")) 21 | cartpole_dqn_graph_manager.improve_steps.num_steps = 1 22 | cartpole_dqn_graph_manager.steps_between_evaluation_periods.num_steps = 5 23 | 24 | # graph_manager.improve() 25 | # 26 | # agent = graph_manager.level_managers[0].composite_agents['simple_rl_agent'].agents['simple_rl_agent/agent'] 27 | # some_state = agent.memory.sample(1)[0].state 28 | # cartpole_dqn_predictions = agent.get_predictions(states=some_state, prediction_type=QActionStateValue) 29 | # assert cartpole_dqn_predictions.shape == (1, 2) 30 | 31 | 32 | if __name__ == '__main__': 33 | test_get_QActionStateValue_predictions() 34 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/embedders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/embedders/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/embedders/test_image_embedder.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.base_parameters import EmbedderScheme 9 | from rl_coach.architectures.embedder_parameters import InputEmbedderParameters 10 | from rl_coach.architectures.mxnet_components.embedders.image_embedder import ImageEmbedder 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_image_embedder(): 15 | params = InputEmbedderParameters(scheme=EmbedderScheme.Medium) 16 | emb = ImageEmbedder(params=params) 17 | emb.initialize() 18 | # input is NHWC, and not MXNet default NCHW 19 | input_data = mx.nd.random.uniform(low=0, high=1, shape=(10, 244, 244, 3)) 20 | output = emb(input_data) 21 | assert len(output.shape) == 2 # since last block was flatten 22 | assert output.shape[0] == 10 # since batch_size is 10 23 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/embedders/test_vector_embedder.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.architectures.embedder_parameters import InputEmbedderParameters 9 | from rl_coach.architectures.mxnet_components.embedders.vector_embedder import VectorEmbedder 10 | from rl_coach.base_parameters import EmbedderScheme 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_vector_embedder(): 15 | params = InputEmbedderParameters(scheme=EmbedderScheme.Medium) 16 | emb = VectorEmbedder(params=params) 17 | emb.initialize() 18 | input_data = mx.nd.random.uniform(low=0, high=255, shape=(10, 100)) 19 | output = emb(input_data) 20 | assert len(output.shape) == 2 # since last block was flatten 21 | assert output.shape[0] == 10 # since batch_size is 10 22 | assert output.shape[1] == 256 # since last dense layer has 256 units 23 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/heads/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/heads/test_head.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | import os 4 | import pytest 5 | import sys 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 7 | 8 | 9 | from rl_coach.architectures.mxnet_components.heads.head import NormalizedRSSInitializer 10 | 11 | 12 | @pytest.mark.unit_test 13 | def test_normalized_rss_initializer(): 14 | target_rss = 0.5 15 | units = 10 16 | dense = mx.gluon.nn.Dense(units=units, weight_initializer=NormalizedRSSInitializer(target_rss)) 17 | 
dense.initialize() 18 | 19 | input_data = mx.random.uniform(shape=(25, 5)) 20 | output_data = dense(input_data) 21 | 22 | weights = dense.weight.data() 23 | assert weights.shape == (10, 5) 24 | rss = weights.square().sum(axis=1).sqrt() 25 | np.testing.assert_almost_equal(rss.asnumpy(), np.tile(target_rss, units)) 26 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/middlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/middlewares/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/middlewares/test_fc_middleware.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.base_parameters import MiddlewareScheme 9 | from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters 10 | from rl_coach.architectures.mxnet_components.middlewares.fc_middleware import FCMiddleware 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_fc_middleware(): 15 | params = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium) 16 | mid = FCMiddleware(params=params) 17 | mid.initialize() 18 | embedded_data = mx.nd.random.uniform(low=0, high=1, shape=(10, 100)) 19 | output = mid(embedded_data) 20 | assert output.ndim == 2 # since last block was flatten 21 | assert output.shape[0] == 10 # since batch_size is 10 22 | assert output.shape[1] == 512 # since last layer of middleware (middle scheme) had 512 units 23 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/middlewares/test_lstm_middleware.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.base_parameters import MiddlewareScheme 9 | from rl_coach.architectures.middleware_parameters import LSTMMiddlewareParameters 10 | from rl_coach.architectures.mxnet_components.middlewares.lstm_middleware import LSTMMiddleware 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_lstm_middleware(): 15 | params = LSTMMiddlewareParameters(number_of_lstm_cells=25, scheme=MiddlewareScheme.Medium) 16 | mid = LSTMMiddleware(params=params) 17 | mid.initialize() 18 | # NTC 19 | embedded_data = mx.nd.random.uniform(low=0, high=1, shape=(10, 15, 20)) 20 | # NTC -> TNC 21 | output = mid(embedded_data) 22 | assert output.ndim == 3 # since last block was flatten 23 | assert output.shape[0] == 15 # since t is 15 24 | assert output.shape[1] == 10 # since batch_size is 10 25 | assert output.shape[2] == 25 # since number_of_lstm_cells is 25 26 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/tensorflow_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/tensorflow_components/__init__.py 
-------------------------------------------------------------------------------- /rl_coach/tests/architectures/tensorflow_components/embedders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/tensorflow_components/embedders/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/tensorflow_components/embedders/test_identity_embedder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from rl_coach.base_parameters import EmbedderScheme 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 7 | 8 | import pytest 9 | import numpy as np 10 | from rl_coach.architectures.tensorflow_components.embedders.vector_embedder import VectorEmbedder 11 | import tensorflow as tf 12 | from tensorflow import logging 13 | 14 | logging.set_verbosity(logging.INFO) 15 | 16 | @pytest.fixture 17 | def reset(): 18 | tf.reset_default_graph() 19 | 20 | 21 | @pytest.mark.unit_test 22 | def test_embedder(reset): 23 | embedder = VectorEmbedder(np.array([10, 10]), name="test", scheme=EmbedderScheme.Empty) 24 | 25 | # make sure the ops where not created yet 26 | assert len(tf.get_default_graph().get_operations()) == 0 27 | 28 | # call the embedder 29 | input_ph, output_ph = embedder() 30 | 31 | # make sure that now the ops were created 32 | assert len(tf.get_default_graph().get_operations()) > 0 33 | 34 | # try feeding a batch of one example # TODO: consider auto converting to batch 35 | input = np.random.rand(1, 10, 10) 36 | sess = tf.Session() 37 | output = sess.run(embedder.output, {embedder.input: input}) 38 | assert output.shape == (1, 100) # should have flattened the input 39 | 40 | # now make sure the returned placeholders behave the same 41 | output = sess.run(output_ph, {input_ph: input}) 42 | assert output.shape == (1, 100) # should have flattened the input 43 | 44 | # make sure the naming is correct 45 | assert embedder.get_name() == "test" 46 | -------------------------------------------------------------------------------- /rl_coach/tests/environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/environments/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/exploration_policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/exploration_policies/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/exploration_policies/test_additive_noise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | 7 | from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace 8 | from rl_coach.exploration_policies.additive_noise import AdditiveNoise 9 | from rl_coach.schedules import LinearSchedule 10 | import numpy as np 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_init(): 15 | # discrete control 16 | action_space = 
DiscreteActionSpace(3) 17 | noise_schedule = LinearSchedule(1.0, 1.0, 1000) 18 | 19 | # additive noise requires a bounded range for the actions 20 | action_space = BoxActionSpace(np.array([10])) 21 | with pytest.raises(ValueError): 22 | policy = AdditiveNoise(action_space, noise_schedule, 0) 23 | 24 | 25 | @pytest.mark.unit_test 26 | def test_get_action(): 27 | # make sure noise is in range 28 | action_space = BoxActionSpace(np.array([10]), -1, 1) 29 | noise_schedule = LinearSchedule(1.0, 1.0, 1000) 30 | policy = AdditiveNoise(action_space, noise_schedule, 0) 31 | 32 | # the action range is 2, so there is a ~0.1% chance that the noise will be larger than 3*std=3*2=6 33 | for i in range(1000): 34 | action = policy.get_action(np.zeros([10])) 35 | assert np.all(action < 10) 36 | # make sure there is no clipping of the action since it should be the environment that clips actions 37 | assert np.all(action != 1.0) 38 | assert np.all(action != -1.0) 39 | # make sure that each action element has a different value 40 | assert np.all(action[0] != action[1:]) 41 | -------------------------------------------------------------------------------- /rl_coach/tests/exploration_policies/test_greedy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | 7 | from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace 8 | from rl_coach.exploration_policies.greedy import Greedy 9 | import numpy as np 10 | 11 | 12 | @pytest.mark.unit_test 13 | def test_get_action(): 14 | # discrete control 15 | action_space = DiscreteActionSpace(3) 16 | policy = Greedy(action_space) 17 | 18 | best_action, _ = policy.get_action(np.array([10, 20, 30])) 19 | assert best_action == 2 20 | 21 | # continuous control 22 | action_space = BoxActionSpace(np.array([10])) 23 | policy = Greedy(action_space) 24 | 25 | best_action = policy.get_action(np.array([1, 1, 1])) 26 | assert np.all(best_action == np.array([1, 1, 1])) 27 | 28 | 29 | @pytest.mark.unit_test 30 | def test_get_control_param(): 31 | action_space = DiscreteActionSpace(3) 32 | policy = Greedy(action_space) 33 | assert policy.get_control_param() == 0 34 | 35 | -------------------------------------------------------------------------------- /rl_coach/tests/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/filters/action/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/action/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/filters/action/test_box_masking.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | from rl_coach.filters.action.box_masking import BoxMasking 7 | from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace 8 | import numpy as np 9 | 10 | 11 | @pytest.mark.unit_test 12 | def test_filter(): 13 | filter = BoxMasking(10, 20) 14 | 15 | # 
passing an output space that is wrong 16 | with pytest.raises(ValueError): 17 | filter.validate_output_action_space(DiscreteActionSpace(10)) 18 | 19 | # 1 dimensional box 20 | output_space = BoxActionSpace(1, 5, 30) 21 | input_space = filter.get_unfiltered_action_space(output_space) 22 | 23 | action = np.array([2]) 24 | result = filter.filter(action) 25 | assert result == np.array([12]) 26 | assert output_space.contains(result) 27 | 28 | -------------------------------------------------------------------------------- /rl_coach/tests/filters/action/test_linear_box_to_box_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | from rl_coach.filters.action.linear_box_to_box_map import LinearBoxToBoxMap 7 | from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace 8 | import numpy as np 9 | 10 | 11 | @pytest.mark.unit_test 12 | def test_filter(): 13 | filter = LinearBoxToBoxMap(10, 20) 14 | 15 | # passing an output space that is wrong 16 | with pytest.raises(ValueError): 17 | filter.validate_output_action_space(DiscreteActionSpace(10)) 18 | 19 | # 1 dimensional box 20 | output_space = BoxActionSpace(1, 5, 35) 21 | input_space = filter.get_unfiltered_action_space(output_space) 22 | 23 | action = np.array([2]) 24 | 25 | action = np.array([12]) 26 | result = filter.filter(action) 27 | assert result == np.array([11]) 28 | assert output_space.contains(result) 29 | 30 | -------------------------------------------------------------------------------- /rl_coach/tests/filters/observation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/observation/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/filters/reward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/reward/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/graph_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/graph_managers/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/memories/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/memories/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/presets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/presets/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # content of pytest.ini 2 | [pytest] 3 | markers = 4 | unit_test: short test that checks that a module is acting correctly 
5 | integration_test: long test that checks that the complete framework is running correctly 6 | filterwarnings = 7 | ignore::DeprecationWarning 8 | norecursedirs = 9 | *mxnet* 10 | -------------------------------------------------------------------------------- /rl_coach/tests/test_saver.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rl_coach.saver import Saver, SaverCollection 4 | 5 | 6 | @pytest.mark.unit_test 7 | def test_checkpoint_collection(): 8 | class SaverTest(Saver): 9 | def __init__(self, path): 10 | self._path = path 11 | self._count = 1 12 | 13 | @property 14 | def path(self): 15 | return self._path 16 | 17 | def merge(self, other: 'Saver'): 18 | assert isinstance(other, SaverTest) 19 | assert self.path == other.path 20 | self._count += other._count 21 | 22 | # test add 23 | savers = SaverCollection(SaverTest('123')) 24 | savers.add(SaverTest('123')) 25 | savers.add(SaverTest('456')) 26 | 27 | def check_collection(mul): 28 | paths = ['123', '456'] 29 | for c in savers: 30 | paths.remove(c.path) 31 | if c.path == '123': 32 | assert c._count == 2 * mul 33 | elif c.path == '456': 34 | assert c._count == 1 * mul 35 | else: 36 | assert False, "invalid path" 37 | 38 | check_collection(1) 39 | 40 | # test update 41 | savers.update(savers) 42 | check_collection(2) 43 | -------------------------------------------------------------------------------- /rl_coach/tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/utils/__init__.py -------------------------------------------------------------------------------- /rl_coach/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/utilities/__init__.py -------------------------------------------------------------------------------- /tutorials/Resources/exploration.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from typing import List 4 | from rl_coach.core_types import ActionType 5 | from rl_coach.spaces import ActionSpace 6 | from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters 7 | 8 | 9 | class MyExplorationPolicy(ExplorationPolicy): 10 | """ 11 | An exploration policy takes the predicted actions or action values from the agent, and selects the action to 12 | actually apply to the environment using some predefined algorithm. 
13 | """ 14 | def __init__(self, action_space: ActionSpace): 15 | #self.phase = RunPhase.HEATUP 16 | self.action_space = action_space 17 | super().__init__(action_space) 18 | 19 | def get_action(self, action_values: List[ActionType]) -> ActionType: 20 | if (np.random.rand() < 0.5): 21 | chosen_action = self.action_space.sample() 22 | else: 23 | chosen_action = np.argmax(action_values) 24 | probabilities = np.zeros(len(self.action_space.actions)) 25 | probabilities[chosen_action] = 1 26 | return chosen_action, probabilities 27 | 28 | def get_control_param(self): 29 | return 0 30 | 31 | 32 | 33 | class MyExplorationParameters(ExplorationParameters): 34 | def __init__(self): 35 | super().__init__() 36 | 37 | @property 38 | def path(self): 39 | return 'exploration:MyExplorationPolicy' 40 | -------------------------------------------------------------------------------- /tutorials/Resources/img/dr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/tutorials/Resources/img/dr.png -------------------------------------------------------------------------------- /tutorials/Resources/img/model_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/tutorials/Resources/img/model_selection.png -------------------------------------------------------------------------------- /tutorials/Resources/img/wis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/tutorials/Resources/img/wis.png -------------------------------------------------------------------------------- /tutorials/python_invocation_example.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters 2 | from rl_coach.core_types import EnvironmentSteps 3 | from rl_coach.environments.gym_environment import GymVectorEnvironment 4 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 5 | from rl_coach.graph_managers.graph_manager import SimpleSchedule 6 | 7 | graph_manager = BasicRLGraphManager( 8 | agent_params=ClippedPPOAgentParameters(), 9 | env_params=GymVectorEnvironment(level='CartPole-v0'), 10 | schedule_params=SimpleSchedule() 11 | ) 12 | 13 | graph_manager.heatup(EnvironmentSteps(100)) 14 | graph_manager.train_and_act(EnvironmentSteps(100)) --------------------------------------------------------------------------------