├── .circleci └── config.yml ├── .gitignore ├── .nojekyll ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── __init__.py ├── benchmarks ├── README.md ├── a3c │ ├── README.md │ ├── ant_a3c_16_workers.png │ ├── half_cheetah_a3c_16_workers.png │ ├── hopper_a3c_16_workers.png │ ├── inverted_pendulum_a3c.png │ ├── space_invaders_a3c_16_workers.png │ └── walker2d_a3c_16_workers.png ├── acer │ ├── README.md │ ├── breakout_acer_16_workers.png │ ├── pong_acer_16_workers.png │ └── space_invaders_acer_16_workers.png ├── bootstrapped_dqn │ ├── README.md │ ├── breakout_bootstrapped_dqn.png │ ├── pong_bootstrapped_dqn.png │ └── space_invaders_bootstrapped_dqn.png ├── clipped_ppo │ ├── README.md │ ├── ant_clipped_ppo.png │ ├── half_cheetah_clipped_ppo.png │ ├── hopper_clipped_ppo.png │ ├── humanoid_clipped_ppo.png │ ├── inverted_double_pendulum_clipped_ppo.png │ ├── inverted_pendulum_clipped_ppo.png │ ├── reacher_clipped_ppo.png │ ├── swimmer_clipped_ppo.png │ └── walker2d_clipped_ppo.png ├── ddpg │ ├── README.md │ ├── ant_ddpg.png │ ├── half_cheetah_ddpg.png │ ├── hopper_ddpg.png │ ├── humanoid_ddpg.png │ ├── inverted_double_pendulum_ddpg.png │ ├── inverted_pendulum_ddpg.png │ ├── reacher_ddpg.png │ ├── swimmer_ddpg.png │ └── walker2d_ddpg.png ├── ddpg_her │ ├── README.md │ ├── fetch_ddpg_her_pick_and_place_8_workers.png │ ├── fetch_ddpg_her_push_8_workers.png │ ├── fetch_ddpg_her_reach_1_worker.png │ └── fetch_ddpg_her_slide_8_workers.png ├── dfp │ ├── README.md │ ├── doom_basic_dfp_8_workers.png │ ├── doom_health_dfp_8_workers.png │ └── doom_health_supreme_dfp_8_workers.png ├── dqn │ ├── README.md │ ├── breakout_dqn.png │ ├── pong_dqn.png │ └── space_invaders_dqn.png ├── dueling_ddqn │ ├── README.md │ ├── breakout_dueling_ddqn.png │ ├── pong_dueling_ddqn.png │ └── space_invaders_dueling_ddqn.png ├── dueling_ddqn_with_per │ ├── README.md │ ├── breakout_dueling_ddqn_with_per.png │ ├── pong_dueling_ddqn_with_per.png │ └── space_invaders_dueling_ddqn_with_per.png ├── qr_dqn │ ├── README.md │ ├── breakout_qr_dqn.png │ └── pong_qr_dqn.png ├── sac │ ├── README.md │ ├── half_cheetah_sac.png │ ├── hopper_sac.png │ ├── humanoid_sac.png │ ├── inverted_pendulum_sac.png │ └── walker2d_sac.png └── td3 │ ├── README.md │ ├── ant.png │ ├── half_cheetah.png │ ├── hopper.png │ ├── reacher.png │ └── walker2d.png ├── dist-coach-config.template ├── docker ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.doom_environment ├── Dockerfile.gym_environment ├── Dockerfile.mujoco_environment ├── Dockerfile.starcraft_environment ├── Makefile └── README.md ├── docs ├── .nojekyll ├── _images │ ├── ac.png │ ├── acer.png │ ├── act.png │ ├── algorithms.png │ ├── attention_discretization.png │ ├── bollinger_bands.png │ ├── box_discretization.png │ ├── box_masking.png │ ├── bs_dqn.png │ ├── cil.png │ ├── compare_by_num_episodes.png │ ├── compare_by_time.png │ ├── ddpg.png │ ├── design.png │ ├── dfp.png │ ├── distributed.png │ ├── distributional_dqn.png │ ├── dqn.png │ ├── dueling_dqn.png │ ├── filters.png │ ├── full_discrete_action_space_map.png │ ├── horizontal-scale-out.png │ ├── improve.png │ ├── linear_box_to_box_map.png │ ├── naf.png │ ├── nec.png │ ├── network.png │ ├── observe.png │ ├── partial_discrete_action_space_map.png │ ├── pg.png │ ├── ppo.png │ ├── qr_dqn.png │ ├── rainbow.png │ ├── sac.png │ ├── separate_signals.png │ ├── td3.png │ ├── train.png │ ├── updating_dynamically.gif │ └── wolpertinger.png ├── _modules │ ├── index.html │ └── rl_coach │ │ ├── agents │ │ ├── acer_agent.html │ │ ├── 
actor_critic_agent.html │ │ ├── agent.html │ │ ├── agent_interface.html │ │ ├── bc_agent.html │ │ ├── categorical_dqn_agent.html │ │ ├── cil_agent.html │ │ ├── clipped_ppo_agent.html │ │ ├── ddpg_agent.html │ │ ├── dfp_agent.html │ │ ├── dqn_agent.html │ │ ├── mmc_agent.html │ │ ├── n_step_q_agent.html │ │ ├── naf_agent.html │ │ ├── nec_agent.html │ │ ├── pal_agent.html │ │ ├── policy_gradients_agent.html │ │ ├── ppo_agent.html │ │ ├── qr_dqn_agent.html │ │ ├── rainbow_dqn_agent.html │ │ ├── soft_actor_critic_agent.html │ │ ├── td3_agent.html │ │ ├── value_optimization_agent.html │ │ └── wolpertinger_agent.html │ │ ├── architectures │ │ ├── architecture.html │ │ └── network_wrapper.html │ │ ├── base_parameters.html │ │ ├── core_types.html │ │ ├── data_stores │ │ ├── nfs_data_store.html │ │ └── s3_data_store.html │ │ ├── environments │ │ ├── carla_environment.html │ │ ├── control_suite_environment.html │ │ ├── doom_environment.html │ │ ├── environment.html │ │ ├── gym_environment.html │ │ └── starcraft2_environment.html │ │ ├── exploration_policies │ │ ├── additive_noise.html │ │ ├── boltzmann.html │ │ ├── bootstrapped.html │ │ ├── categorical.html │ │ ├── continuous_entropy.html │ │ ├── e_greedy.html │ │ ├── exploration_policy.html │ │ ├── greedy.html │ │ ├── ou_process.html │ │ ├── parameter_noise.html │ │ ├── truncated_normal.html │ │ └── ucb.html │ │ ├── filters │ │ ├── action │ │ │ ├── attention_discretization.html │ │ │ ├── box_discretization.html │ │ │ ├── box_masking.html │ │ │ ├── full_discrete_action_space_map.html │ │ │ ├── linear_box_to_box_map.html │ │ │ └── partial_discrete_action_space_map.html │ │ ├── observation │ │ │ ├── observation_clipping_filter.html │ │ │ ├── observation_crop_filter.html │ │ │ ├── observation_move_axis_filter.html │ │ │ ├── observation_normalization_filter.html │ │ │ ├── observation_reduction_by_sub_parts_name_filter.html │ │ │ ├── observation_rescale_size_by_factor_filter.html │ │ │ ├── observation_rescale_to_size_filter.html │ │ │ ├── observation_rgb_to_y_filter.html │ │ │ ├── observation_squeeze_filter.html │ │ │ ├── observation_stacking_filter.html │ │ │ └── observation_to_uint8_filter.html │ │ └── reward │ │ │ ├── reward_clipping_filter.html │ │ │ ├── reward_normalization_filter.html │ │ │ └── reward_rescale_filter.html │ │ ├── memories │ │ ├── backend │ │ │ └── redis.html │ │ ├── episodic │ │ │ ├── episodic_experience_replay.html │ │ │ ├── episodic_hindsight_experience_replay.html │ │ │ ├── episodic_hrl_hindsight_experience_replay.html │ │ │ └── single_episode_buffer.html │ │ └── non_episodic │ │ │ ├── balanced_experience_replay.html │ │ │ ├── differentiable_neural_dictionary.html │ │ │ ├── experience_replay.html │ │ │ ├── prioritized_experience_replay.html │ │ │ └── transition_collection.html │ │ ├── orchestrators │ │ └── kubernetes_orchestrator.html │ │ └── spaces.html ├── _sources │ ├── components │ │ ├── additional_parameters.rst.txt │ │ ├── agents │ │ │ ├── imitation │ │ │ │ ├── bc.rst.txt │ │ │ │ └── cil.rst.txt │ │ │ ├── index.rst.txt │ │ │ ├── other │ │ │ │ └── dfp.rst.txt │ │ │ ├── policy_optimization │ │ │ │ ├── ac.rst.txt │ │ │ │ ├── acer.rst.txt │ │ │ │ ├── cppo.rst.txt │ │ │ │ ├── ddpg.rst.txt │ │ │ │ ├── hac.rst.txt │ │ │ │ ├── pg.rst.txt │ │ │ │ ├── ppo.rst.txt │ │ │ │ ├── sac.rst.txt │ │ │ │ ├── td3.rst.txt │ │ │ │ └── wolpertinger.rst.txt │ │ │ └── value_optimization │ │ │ │ ├── bs_dqn.rst.txt │ │ │ │ ├── categorical_dqn.rst.txt │ │ │ │ ├── double_dqn.rst.txt │ │ │ │ ├── dqn.rst.txt │ │ │ │ ├── dueling_dqn.rst.txt │ │ │ │ ├── 
mmc.rst.txt │ │ │ │ ├── n_step.rst.txt │ │ │ │ ├── naf.rst.txt │ │ │ │ ├── nec.rst.txt │ │ │ │ ├── pal.rst.txt │ │ │ │ ├── qr_dqn.rst.txt │ │ │ │ └── rainbow.rst.txt │ │ ├── architectures │ │ │ └── index.rst.txt │ │ ├── core_types.rst.txt │ │ ├── data_stores │ │ │ └── index.rst.txt │ │ ├── environments │ │ │ └── index.rst.txt │ │ ├── exploration_policies │ │ │ └── index.rst.txt │ │ ├── filters │ │ │ ├── index.rst.txt │ │ │ ├── input_filters.rst.txt │ │ │ └── output_filters.rst.txt │ │ ├── memories │ │ │ └── index.rst.txt │ │ ├── memory_backends │ │ │ └── index.rst.txt │ │ ├── orchestrators │ │ │ └── index.rst.txt │ │ └── spaces.rst.txt │ ├── contributing │ │ ├── add_agent.rst.txt │ │ └── add_env.rst.txt │ ├── dashboard.rst.txt │ ├── design │ │ ├── control_flow.rst.txt │ │ ├── horizontal_scaling.rst.txt │ │ └── network.rst.txt │ ├── dist_usage.rst.txt │ ├── features │ │ ├── algorithms.rst.txt │ │ ├── batch_rl.rst.txt │ │ ├── benchmarks.rst.txt │ │ ├── environments.rst.txt │ │ └── index.rst.txt │ ├── index.rst.txt │ ├── selecting_an_algorithm.rst.txt │ ├── test.rst.txt │ └── usage.rst.txt ├── _static │ ├── basic.css │ ├── css │ │ ├── badge_only.css │ │ ├── custom.css │ │ └── theme.css │ ├── dark_logo.png │ ├── doctools.js │ ├── documentation_options.js │ ├── file.png │ ├── fonts │ │ ├── Inconsolata-Bold.ttf │ │ ├── Inconsolata-Regular.ttf │ │ ├── Inconsolata.ttf │ │ ├── Lato-Bold.ttf │ │ ├── Lato-Regular.ttf │ │ ├── Lato │ │ │ ├── lato-bold.eot │ │ │ ├── lato-bold.ttf │ │ │ ├── lato-bold.woff │ │ │ ├── lato-bold.woff2 │ │ │ ├── lato-bolditalic.eot │ │ │ ├── lato-bolditalic.ttf │ │ │ ├── lato-bolditalic.woff │ │ │ ├── lato-bolditalic.woff2 │ │ │ ├── lato-italic.eot │ │ │ ├── lato-italic.ttf │ │ │ ├── lato-italic.woff │ │ │ ├── lato-italic.woff2 │ │ │ ├── lato-regular.eot │ │ │ ├── lato-regular.ttf │ │ │ ├── lato-regular.woff │ │ │ └── lato-regular.woff2 │ │ ├── RobotoSlab-Bold.ttf │ │ ├── RobotoSlab-Regular.ttf │ │ ├── RobotoSlab │ │ │ ├── roboto-slab-v7-bold.eot │ │ │ ├── roboto-slab-v7-bold.ttf │ │ │ ├── roboto-slab-v7-bold.woff │ │ │ ├── roboto-slab-v7-bold.woff2 │ │ │ ├── roboto-slab-v7-regular.eot │ │ │ ├── roboto-slab-v7-regular.ttf │ │ │ ├── roboto-slab-v7-regular.woff │ │ │ └── roboto-slab-v7-regular.woff2 │ │ ├── fontawesome-webfont.eot │ │ ├── fontawesome-webfont.svg │ │ ├── fontawesome-webfont.ttf │ │ ├── fontawesome-webfont.woff │ │ └── fontawesome-webfont.woff2 │ ├── jquery-3.2.1.js │ ├── jquery.js │ ├── js │ │ ├── modernizr.min.js │ │ └── theme.js │ ├── language_data.js │ ├── minus.png │ ├── plus.png │ ├── pygments.css │ ├── searchtools.js │ ├── underscore-1.3.1.js │ └── underscore.js ├── components │ ├── additional_parameters.html │ ├── agents │ │ ├── imitation │ │ │ ├── bc.html │ │ │ └── cil.html │ │ ├── index.html │ │ ├── other │ │ │ └── dfp.html │ │ ├── policy_optimization │ │ │ ├── ac.html │ │ │ ├── acer.html │ │ │ ├── cppo.html │ │ │ ├── ddpg.html │ │ │ ├── hac.html │ │ │ ├── pg.html │ │ │ ├── ppo.html │ │ │ ├── sac.html │ │ │ ├── td3.html │ │ │ └── wolpertinger.html │ │ └── value_optimization │ │ │ ├── bs_dqn.html │ │ │ ├── categorical_dqn.html │ │ │ ├── double_dqn.html │ │ │ ├── dqn.html │ │ │ ├── dueling_dqn.html │ │ │ ├── mmc.html │ │ │ ├── n_step.html │ │ │ ├── naf.html │ │ │ ├── nec.html │ │ │ ├── pal.html │ │ │ ├── qr_dqn.html │ │ │ └── rainbow.html │ ├── architectures │ │ └── index.html │ ├── core_types.html │ ├── data_stores │ │ └── index.html │ ├── environments │ │ └── index.html │ ├── exploration_policies │ │ └── index.html │ ├── filters │ │ ├── index.html │ │ 
├── input_filters.html │ │ └── output_filters.html │ ├── memories │ │ └── index.html │ ├── memory_backends │ │ └── index.html │ ├── orchestrators │ │ └── index.html │ └── spaces.html ├── contributing │ ├── add_agent.html │ └── add_env.html ├── dashboard.html ├── design │ ├── control_flow.html │ ├── horizontal_scaling.html │ └── network.html ├── dist_usage.html ├── features │ ├── algorithms.html │ ├── batch_rl.html │ ├── benchmarks.html │ ├── environments.html │ └── index.html ├── genindex.html ├── index.html ├── objects.inv ├── search.html ├── searchindex.js ├── selecting_an_algorithm.html ├── test.html └── usage.html ├── docs_raw ├── Makefile ├── README.md ├── __init__.py ├── build_docs.sh ├── make.bat └── source │ ├── __init__.py │ ├── _static │ ├── css │ │ └── custom.css │ └── img │ │ ├── act.png │ │ ├── algorithms.png │ │ ├── attention_discretization.png │ │ ├── bollinger_bands.png │ │ ├── box_discretization.png │ │ ├── box_masking.png │ │ ├── compare_by_num_episodes.png │ │ ├── compare_by_time.png │ │ ├── dark_logo.png │ │ ├── design.png │ │ ├── design_imgs │ │ ├── ac.png │ │ ├── acer.png │ │ ├── bs_dqn.png │ │ ├── cil.png │ │ ├── ddpg.png │ │ ├── dfp.png │ │ ├── distributional_dqn.png │ │ ├── dqn.png │ │ ├── dueling_dqn.png │ │ ├── naf.png │ │ ├── nec.png │ │ ├── pg.png │ │ ├── ppo.png │ │ ├── qr_dqn.png │ │ ├── rainbow.png │ │ ├── sac.png │ │ ├── td3.png │ │ └── wolpertinger.png │ │ ├── diagrams.xml │ │ ├── distributed.png │ │ ├── filters.png │ │ ├── full_discrete_action_space_map.png │ │ ├── graph.png │ │ ├── horizontal-scale-out.png │ │ ├── improve.png │ │ ├── level.png │ │ ├── linear_box_to_box_map.png │ │ ├── network.png │ │ ├── observe.png │ │ ├── output_filters.xml │ │ ├── partial_discrete_action_space_map.png │ │ ├── separate_signals.png │ │ ├── train.png │ │ └── updating_dynamically.gif │ ├── _templates │ └── layout.html │ ├── algorithms.xml │ ├── components │ ├── additional_parameters.rst │ ├── agents │ │ ├── imitation │ │ │ ├── bc.rst │ │ │ └── cil.rst │ │ ├── index.rst │ │ ├── other │ │ │ └── dfp.rst │ │ ├── policy_optimization │ │ │ ├── ac.rst │ │ │ ├── acer.rst │ │ │ ├── cppo.rst │ │ │ ├── ddpg.rst │ │ │ ├── hac.rst │ │ │ ├── pg.rst │ │ │ ├── ppo.rst │ │ │ ├── sac.rst │ │ │ ├── td3.rst │ │ │ └── wolpertinger.rst │ │ └── value_optimization │ │ │ ├── bs_dqn.rst │ │ │ ├── categorical_dqn.rst │ │ │ ├── double_dqn.rst │ │ │ ├── dqn.rst │ │ │ ├── dueling_dqn.rst │ │ │ ├── mmc.rst │ │ │ ├── n_step.rst │ │ │ ├── naf.rst │ │ │ ├── nec.rst │ │ │ ├── pal.rst │ │ │ ├── qr_dqn.rst │ │ │ └── rainbow.rst │ ├── architectures │ │ └── index.rst │ ├── core_types.rst │ ├── data_stores │ │ └── index.rst │ ├── environments │ │ └── index.rst │ ├── exploration_policies │ │ └── index.rst │ ├── filters │ │ ├── index.rst │ │ ├── input_filters.rst │ │ └── output_filters.rst │ ├── memories │ │ └── index.rst │ ├── memory_backends │ │ └── index.rst │ ├── orchestrators │ │ └── index.rst │ └── spaces.rst │ ├── conf.py │ ├── contributing │ ├── add_agent.rst │ └── add_env.rst │ ├── dashboard.rst │ ├── design │ ├── control_flow.rst │ ├── horizontal_scaling.rst │ └── network.rst │ ├── diagrams.xml │ ├── dist_usage.rst │ ├── features │ ├── algorithms.rst │ ├── batch_rl.rst │ ├── benchmarks.rst │ ├── environments.rst │ └── index.rst │ ├── index.rst │ ├── selecting_an_algorithm.rst │ ├── test.rst │ └── usage.rst ├── img ├── ant.gif ├── carla.gif ├── coach_logo.png ├── dashboard.gif ├── dashboard.png ├── doom_deathmatch.gif ├── doom_health.gif ├── fetch_slide.gif ├── minitaur.gif ├── montezuma.gif ├── 
pendulum.gif └── starcraft.gif ├── requirements.txt ├── rl_coach ├── __init__.py ├── agents │ ├── __init__.py │ ├── acer_agent.py │ ├── actor_critic_agent.py │ ├── agent.py │ ├── agent_interface.py │ ├── bc_agent.py │ ├── bootstrapped_dqn_agent.py │ ├── categorical_dqn_agent.py │ ├── cil_agent.py │ ├── clipped_ppo_agent.py │ ├── composite_agent.py │ ├── ddpg_agent.py │ ├── ddqn_agent.py │ ├── ddqn_bcq_agent.py │ ├── dfp_agent.py │ ├── dqn_agent.py │ ├── hac_ddpg_agent.py │ ├── human_agent.py │ ├── imitation_agent.py │ ├── mmc_agent.py │ ├── n_step_q_agent.py │ ├── naf_agent.py │ ├── nec_agent.py │ ├── pal_agent.py │ ├── policy_gradients_agent.py │ ├── policy_optimization_agent.py │ ├── ppo_agent.py │ ├── qr_dqn_agent.py │ ├── rainbow_dqn_agent.py │ ├── soft_actor_critic_agent.py │ ├── td3_agent.py │ ├── td3_exp_agent.py │ ├── value_optimization_agent.py │ └── wolpertinger_agent.py ├── architectures │ ├── __init__.py │ ├── architecture.py │ ├── embedder_parameters.py │ ├── head_parameters.py │ ├── layers.py │ ├── middleware_parameters.py │ ├── mxnet_components │ │ ├── __init__.py │ │ ├── architecture.py │ │ ├── embedders │ │ │ ├── __init__.py │ │ │ ├── embedder.py │ │ │ ├── image_embedder.py │ │ │ ├── tensor_embedder.py │ │ │ └── vector_embedder.py │ │ ├── general_network.py │ │ ├── heads │ │ │ ├── __init__.py │ │ │ ├── head.py │ │ │ ├── ppo_head.py │ │ │ ├── ppo_v_head.py │ │ │ ├── q_head.py │ │ │ └── v_head.py │ │ ├── layers.py │ │ ├── middlewares │ │ │ ├── __init__.py │ │ │ ├── fc_middleware.py │ │ │ ├── lstm_middleware.py │ │ │ └── middleware.py │ │ ├── savers.py │ │ └── utils.py │ ├── network_wrapper.py │ └── tensorflow_components │ │ ├── __init__.py │ │ ├── architecture.py │ │ ├── distributed_tf_utils.py │ │ ├── embedders │ │ ├── __init__.py │ │ ├── embedder.py │ │ ├── image_embedder.py │ │ ├── tensor_embedder.py │ │ └── vector_embedder.py │ │ ├── general_network.py │ │ ├── heads │ │ ├── RND_head.py │ │ ├── __init__.py │ │ ├── acer_policy_head.py │ │ ├── categorical_q_head.py │ │ ├── cil_head.py │ │ ├── classification_head.py │ │ ├── ddpg_actor_head.py │ │ ├── ddpg_v_head.py │ │ ├── dnd_q_head.py │ │ ├── dueling_q_head.py │ │ ├── head.py │ │ ├── measurements_prediction_head.py │ │ ├── naf_head.py │ │ ├── policy_head.py │ │ ├── ppo_head.py │ │ ├── ppo_v_head.py │ │ ├── q_head.py │ │ ├── quantile_regression_q_head.py │ │ ├── rainbow_q_head.py │ │ ├── sac_head.py │ │ ├── sac_q_head.py │ │ ├── td3_v_head.py │ │ ├── v_head.py │ │ └── wolpertinger_actor_head.py │ │ ├── layers.py │ │ ├── middlewares │ │ ├── __init__.py │ │ ├── fc_middleware.py │ │ ├── lstm_middleware.py │ │ └── middleware.py │ │ ├── savers.py │ │ ├── shared_variables.py │ │ └── utils.py ├── base_parameters.py ├── checkpoint.py ├── coach.py ├── core_types.py ├── dashboard.py ├── dashboard_components │ ├── __init__.py │ ├── boards.py │ ├── episodic_board.py │ ├── experiment_board.py │ ├── globals.py │ ├── landing_page.py │ ├── signals.py │ ├── signals_file.py │ ├── signals_file_base.py │ ├── signals_files_group.py │ └── spinner.css ├── data_stores │ ├── __init__.py │ ├── checkpoint_data_store.py │ ├── data_store.py │ ├── data_store_impl.py │ ├── nfs_data_store.py │ ├── redis_data_store.py │ └── s3_data_store.py ├── debug_utils.py ├── environments │ ├── CarlaSettings.ini │ ├── README.md │ ├── __init__.py │ ├── carla_environment.py │ ├── control_suite_environment.py │ ├── doom │ │ ├── D2_navigation.cfg │ │ ├── D2_navigation.wad │ │ ├── D3_battle.cfg │ │ └── D3_battle.wad │ ├── doom_environment.py │ ├── environment.py │ ├── 
environment_interface.py │ ├── gym_environment.py │ ├── mujoco │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── materials.xml │ │ │ ├── skybox.xml │ │ │ └── visual.xml │ │ ├── pendulum_with_goals.py │ │ └── pendulum_with_goals.xml │ ├── robosuite │ │ ├── cube_exp.py │ │ └── osc_pose.json │ ├── robosuite_environment.py │ ├── starcraft2_environment.py │ └── toy_problems │ │ ├── __init__.py │ │ ├── bit_flip.py │ │ └── exploration_chain.py ├── exploration_policies │ ├── README.md │ ├── __init__.py │ ├── additive_noise.py │ ├── boltzmann.py │ ├── bootstrapped.py │ ├── categorical.py │ ├── continuous_entropy.py │ ├── e_greedy.py │ ├── exploration_policy.py │ ├── greedy.py │ ├── ou_process.py │ ├── parameter_noise.py │ ├── truncated_normal.py │ └── ucb.py ├── filters │ ├── README.md │ ├── __init__.py │ ├── action │ │ ├── __init__.py │ │ ├── action_filter.py │ │ ├── attention_discretization.py │ │ ├── box_discretization.py │ │ ├── box_masking.py │ │ ├── full_discrete_action_space_map.py │ │ ├── linear_box_to_box_map.py │ │ └── partial_discrete_action_space_map.py │ ├── filter.py │ ├── observation │ │ ├── __init__.py │ │ ├── observation_clipping_filter.py │ │ ├── observation_crop_filter.py │ │ ├── observation_filter.py │ │ ├── observation_move_axis_filter.py │ │ ├── observation_normalization_filter.py │ │ ├── observation_reduction_by_sub_parts_name_filter.py │ │ ├── observation_rescale_size_by_factor_filter.py │ │ ├── observation_rescale_to_size_filter.py │ │ ├── observation_rgb_to_y_filter.py │ │ ├── observation_squeeze_filter.py │ │ ├── observation_stacking_filter.py │ │ └── observation_to_uint8_filter.py │ └── reward │ │ ├── __init__.py │ │ ├── reward_clipping_filter.py │ │ ├── reward_ewma_normalization_filter.py │ │ ├── reward_filter.py │ │ ├── reward_normalization_filter.py │ │ └── reward_rescale_filter.py ├── graph_managers │ ├── README.md │ ├── __init__.py │ ├── basic_rl_graph_manager.py │ ├── batch_rl_graph_manager.py │ ├── graph_manager.py │ ├── hac_graph_manager.py │ └── hrl_graph_manager.py ├── level_manager.py ├── logger.py ├── memories │ ├── __init__.py │ ├── backend │ │ ├── __init__.py │ │ ├── memory.py │ │ ├── memory_impl.py │ │ └── redis.py │ ├── episodic │ │ ├── __init__.py │ │ ├── episodic_experience_replay.py │ │ ├── episodic_hindsight_experience_replay.py │ │ ├── episodic_hrl_hindsight_experience_replay.py │ │ └── single_episode_buffer.py │ ├── memory.py │ └── non_episodic │ │ ├── __init__.py │ │ ├── balanced_experience_replay.py │ │ ├── differentiable_neural_dictionary.py │ │ ├── experience_replay.py │ │ ├── prioritized_experience_replay.py │ │ └── transition_collection.py ├── off_policy_evaluators │ ├── __init__.py │ ├── bandits │ │ ├── __init__.py │ │ └── doubly_robust.py │ ├── ope_manager.py │ └── rl │ │ ├── __init__.py │ │ ├── sequential_doubly_robust.py │ │ └── weighted_importance_sampling.py ├── orchestrators │ ├── __init__.py │ ├── deploy.py │ └── kubernetes_orchestrator.py ├── plot_atari.py ├── presets │ ├── Acrobot_DDQN_BCQ_BatchRL.py │ ├── Atari_A3C.py │ ├── Atari_A3C_LSTM.py │ ├── Atari_ACER.py │ ├── Atari_Bootstrapped_DQN.py │ ├── Atari_C51.py │ ├── Atari_DDQN.py │ ├── Atari_DDQN_with_PER.py │ ├── Atari_DQN.py │ ├── Atari_DQN_with_PER.py │ ├── Atari_Dueling_DDQN.py │ ├── Atari_Dueling_DDQN_with_PER_OpenAI.py │ ├── Atari_NEC.py │ ├── Atari_NStepQ.py │ ├── Atari_QR_DQN.py │ ├── Atari_Rainbow.py │ ├── Atari_UCB_with_Q_Ensembles.py │ ├── BitFlip_DQN.py │ ├── BitFlip_DQN_HER.py │ ├── CARLA_3_Cameras_DDPG.py │ ├── CARLA_CIL.py │ ├── CARLA_DDPG.py │ ├── 
CARLA_Dueling_DDQN.py │ ├── CartPole_A3C.py │ ├── CartPole_ACER.py │ ├── CartPole_ClippedPPO.py │ ├── CartPole_DDQN_BCQ_BatchRL.py │ ├── CartPole_DDQN_BatchRL.py │ ├── CartPole_DFP.py │ ├── CartPole_DQN.py │ ├── CartPole_Dueling_DDQN.py │ ├── CartPole_NEC.py │ ├── CartPole_NStepQ.py │ ├── CartPole_PAL.py │ ├── CartPole_PG.py │ ├── CartPole_QR_DQN.py │ ├── CartPole_Rainbow.py │ ├── ControlSuite_DDPG.py │ ├── Doom_Basic_A3C.py │ ├── Doom_Basic_ACER.py │ ├── Doom_Basic_BC.py │ ├── Doom_Basic_DFP.py │ ├── Doom_Basic_DQN.py │ ├── Doom_Basic_Dueling_DDQN.py │ ├── Doom_Battle_DFP.py │ ├── Doom_Health_DFP.py │ ├── Doom_Health_MMC.py │ ├── Doom_Health_Supreme_DFP.py │ ├── ExplorationChain_Bootstrapped_DQN.py │ ├── ExplorationChain_Dueling_DDQN.py │ ├── ExplorationChain_UCB_Q_ensembles.py │ ├── Fetch_DDPG_HER_baselines.py │ ├── InvertedPendulum_PG.py │ ├── MontezumaRevenge_BC.py │ ├── Mujoco_A3C.py │ ├── Mujoco_A3C_LSTM.py │ ├── Mujoco_ClippedPPO.py │ ├── Mujoco_DDPG.py │ ├── Mujoco_NAF.py │ ├── Mujoco_PPO.py │ ├── Mujoco_SAC.py │ ├── Mujoco_TD3.py │ ├── Mujoco_Wolpertinger.py │ ├── Pendulum_HAC.py │ ├── README.md │ ├── RoboSuite_CubeExp_Random.py │ ├── RoboSuite_CubeExp_TD3_Goal_Based.py │ ├── RoboSuite_CubeExp_TD3_Intrinsic_Reward.py │ ├── Starcraft_CollectMinerals_A3C.py │ ├── Starcraft_CollectMinerals_Dueling_DDQN.py │ └── __init__.py ├── renderer.py ├── rollout_worker.py ├── run_multiple_seeds.py ├── saver.py ├── schedules.py ├── spaces.py ├── tests │ ├── README.md │ ├── __init__.py │ ├── agents │ │ ├── __init__.py │ │ └── test_agent_external_communication.py │ ├── architectures │ │ ├── __init__.py │ │ ├── mxnet_components │ │ │ ├── __init__.py │ │ │ ├── embedders │ │ │ │ ├── __init__.py │ │ │ │ ├── test_image_embedder.py │ │ │ │ └── test_vector_embedder.py │ │ │ ├── heads │ │ │ │ ├── __init__.py │ │ │ │ ├── test_head.py │ │ │ │ ├── test_ppo_head.py │ │ │ │ ├── test_ppo_v_head.py │ │ │ │ ├── test_q_head.py │ │ │ │ └── test_v_head.py │ │ │ ├── middlewares │ │ │ │ ├── __init__.py │ │ │ │ ├── test_fc_middleware.py │ │ │ │ └── test_lstm_middleware.py │ │ │ └── test_utils.py │ │ └── tensorflow_components │ │ │ ├── __init__.py │ │ │ └── embedders │ │ │ ├── __init__.py │ │ │ ├── test_identity_embedder.py │ │ │ ├── test_image_embedder.py │ │ │ └── test_vector_embedder.py │ ├── conftest.py │ ├── environments │ │ ├── __init__.py │ │ └── test_gym_environment.py │ ├── exploration_policies │ │ ├── __init__.py │ │ ├── test_additive_noise.py │ │ ├── test_e_greedy.py │ │ ├── test_greedy.py │ │ └── test_ou_process.py │ ├── filters │ │ ├── __init__.py │ │ ├── action │ │ │ ├── __init__.py │ │ │ ├── test_attention_discretization.py │ │ │ ├── test_box_discretization.py │ │ │ ├── test_box_masking.py │ │ │ └── test_linear_box_to_box_map.py │ │ ├── observation │ │ │ ├── __init__.py │ │ │ ├── test_observation_crop_filter.py │ │ │ ├── test_observation_reduction_by_sub_parts_name_filter.py │ │ │ ├── test_observation_rescale_size_by_factor_filter.py │ │ │ ├── test_observation_rescale_to_size_filter.py │ │ │ ├── test_observation_rgb_to_y_filter.py │ │ │ ├── test_observation_squeeze_filter.py │ │ │ ├── test_observation_stacking_filter.py │ │ │ └── test_observation_to_uint8_filter.py │ │ ├── reward │ │ │ ├── __init__.py │ │ │ ├── test_reward_clipping_filter.py │ │ │ └── test_reward_rescale_filter.py │ │ └── test_filters_stacking.py │ ├── graph_managers │ │ ├── __init__.py │ │ ├── test_basic_rl_graph_manager.py │ │ └── test_graph_manager.py │ ├── memories │ │ ├── __init__.py │ │ ├── test_differential_neural_dictionary.py │ │ 
├── test_hindsight_experience_replay.py │ │ ├── test_prioritized_experience_replay.py │ │ └── test_single_episode_buffer.py │ ├── presets │ │ ├── __init__.py │ │ └── test_presets.py │ ├── pytest.ini │ ├── test_checkpoint.py │ ├── test_coach_args.py │ ├── test_core_types.py │ ├── test_dist_coach.py │ ├── test_eks.py │ ├── test_global_variable_saver.py │ ├── test_golden.py │ ├── test_saver.py │ ├── test_schedules.py │ ├── test_spaces.py │ ├── trace_tests.py │ └── utils │ │ ├── __init__.py │ │ ├── args_utils.py │ │ ├── definitions.py │ │ ├── presets_utils.py │ │ └── test_utils.py ├── training_worker.py ├── utilities │ ├── __init__.py │ ├── carla_dataset_to_replay_buffer.py │ └── shared_running_stats.py └── utils.py ├── setup.py └── tutorials ├── 0. Quick Start Guide.ipynb ├── 1. Implementing an Algorithm.ipynb ├── 2. Adding an Environment.ipynb ├── 3. Implementing a Hierarchical RL Graph.ipynb ├── 4. Batch Reinforcement Learning.ipynb ├── 5. Goal-Based Data Collection.ipynb ├── Resources ├── acrobot_dataset.csv ├── exploration.py └── img │ ├── dr.png │ ├── model_selection.png │ └── wis.png └── python_invocation_example.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | experiments 3 | *.pyc 4 | checkpoints 5 | _vizdoom.ini 6 | *.*~ 7 | MUJOCO_LOG.TXT 8 | test_log.txt 9 | .test 10 | tf_logs 11 | bullet3 12 | roboschool 13 | *.csv 14 | *.doc 15 | *.orig 16 | docs/site 17 | coach_env 18 | venv 19 | build 20 | rl_coach.egg* 21 | rl_coach_slim.egg* 22 | contrib 23 | test_log_* 24 | dist 25 | .DS_Store 26 | datasets 27 | .cache 28 | .pytest_cache 29 | core 30 | trace_test* 31 | *.swp 32 | *.swo 33 | .cache/ 34 | *.pyc 35 | coachenv 36 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/.nojekyll -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include rl_coach/dashboard_components/*.css 2 | include rl_coach/environments/doom/*.cfg 3 | include rl_coach/environments/doom/*.wad 4 | include rl_coach/environments/mujoco/common/*.xml 5 | include rl_coach/environments/mujoco/*.xml 6 | include rl_coach/environments/*.ini 7 | include rl_coach/tests/*.ini 8 | include requirements.txt -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/__init__.py -------------------------------------------------------------------------------- /benchmarks/a3c/ant_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/ant_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/a3c/half_cheetah_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/half_cheetah_a3c_16_workers.png -------------------------------------------------------------------------------- 
/benchmarks/a3c/hopper_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/hopper_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/a3c/inverted_pendulum_a3c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/inverted_pendulum_a3c.png -------------------------------------------------------------------------------- /benchmarks/a3c/space_invaders_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/space_invaders_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/a3c/walker2d_a3c_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/a3c/walker2d_a3c_16_workers.png -------------------------------------------------------------------------------- /benchmarks/acer/README.md: -------------------------------------------------------------------------------- 1 | # ACER 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for ACER are the same parameters as described in the [original paper](https://arxiv.org/abs/1611.01224), except for the optimizer (changed to ADAM) and learning rate (1e-4) used. 5 | 6 | ### Breakout ACER - 16 workers 7 | 8 | ```bash 9 | coach -p Atari_ACER -lvl breakout -n 16 10 | ``` 11 | 12 | Breakout ACER 13 | 14 | ### Space Invaders ACER - 16 workers 15 | 16 | ```bash 17 | coach -p Atari_ACER -lvl space_invaders -n 16 18 | ``` 19 | 20 | Space Invaders ACER 21 | 22 | ### Pong ACER - 16 workers 23 | 24 | ```bash 25 | coach -p Atari_ACER -lvl pong -n 16 26 | ``` 27 | 28 | Pong ACER 29 | -------------------------------------------------------------------------------- /benchmarks/acer/breakout_acer_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/acer/breakout_acer_16_workers.png -------------------------------------------------------------------------------- /benchmarks/acer/pong_acer_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/acer/pong_acer_16_workers.png -------------------------------------------------------------------------------- /benchmarks/acer/space_invaders_acer_16_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/acer/space_invaders_acer_16_workers.png -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/README.md: -------------------------------------------------------------------------------- 1 | # Bootstrapped DQN 2 | 3 | Each experiment uses 3 seeds. 
4 | The parameters used for Bootstrapped DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1602.04621.pdf). 5 | 6 | ### Breakout Bootstrapped DQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_Bootstrapped_DQN -lvl breakout 10 | ``` 11 | 12 | Breakout Bootstrapped DQN 13 | 14 | 15 | ### Pong Bootstrapped DQN - single worker 16 | 17 | ```bash 18 | coach -p Atari_Bootstrapped_DQN -lvl pong 19 | ``` 20 | 21 | Pong Bootstrapped DQN 22 | 23 | 24 | ### Space Invaders Bootstrapped DQN - single worker 25 | 26 | ```bash 27 | coach -p Atari_Bootstrapped_DQN -lvl space_invaders 28 | ``` 29 | 30 | Space Invaders Bootstrapped DQN 31 | 32 | -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/bootstrapped_dqn/breakout_bootstrapped_dqn.png -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/bootstrapped_dqn/pong_bootstrapped_dqn.png -------------------------------------------------------------------------------- /benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/bootstrapped_dqn/space_invaders_bootstrapped_dqn.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/ant_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/ant_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/half_cheetah_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/hopper_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/hopper_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/humanoid_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/humanoid_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/inverted_double_pendulum_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/inverted_pendulum_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/reacher_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/reacher_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/swimmer_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/swimmer_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/clipped_ppo/walker2d_clipped_ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/clipped_ppo/walker2d_clipped_ppo.png -------------------------------------------------------------------------------- /benchmarks/ddpg/ant_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/ant_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/half_cheetah_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/half_cheetah_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/hopper_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/hopper_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/humanoid_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/humanoid_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/inverted_double_pendulum_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/inverted_double_pendulum_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/inverted_pendulum_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/inverted_pendulum_ddpg.png 
-------------------------------------------------------------------------------- /benchmarks/ddpg/reacher_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/reacher_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/swimmer_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/swimmer_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg/walker2d_ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg/walker2d_ddpg.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/README.md: -------------------------------------------------------------------------------- 1 | # DDPG with Hindsight Experience Replay 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for DDPG HER are the same parameters as described in the [following paper](https://arxiv.org/abs/1802.09464). 5 | 6 | ### Fetch Reach DDPG HER - single worker 7 | 8 | ```bash 9 | coach -p Fetch_DDPG_HER_baselines -lvl reach 10 | ``` 11 | 12 | Fetch DDPG HER Reach 1 Worker 13 | 14 | 15 | ### Fetch Push DDPG HER - 8 workers 16 | 17 | ```bash 18 | coach -p Fetch_DDPG_HER_baselines -lvl push -n 8 19 | ``` 20 | 21 | Fetch DDPG HER Push 8 Worker 22 | 23 | 24 | ### Fetch Slide DDPG HER - 8 workers 25 | 26 | ```bash 27 | coach -p Fetch_DDPG_HER_baselines -lvl slide -n 8 28 | ``` 29 | 30 | Fetch DDPG HER Slide 8 Worker 31 | 32 | 33 | ### Fetch Pick And Place DDPG HER - 8 workers 34 | 35 | ```bash 36 | coach -p Fetch_DDPG_HER -lvl pick_and_place -n 8 37 | ``` 38 | 39 | Fetch DDPG HER Pick And Place 8 Workers 40 | 41 | -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_pick_and_place_8_workers.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_push_8_workers.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_reach_1_worker.png -------------------------------------------------------------------------------- /benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/ddpg_her/fetch_ddpg_her_slide_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dfp/README.md: -------------------------------------------------------------------------------- 1 | # DFP 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for DFP are the same parameters as described in the [original paper](https://arxiv.org/abs/1611.01779). 5 | 6 | ### Doom Basic DFP - 8 workers 7 | 8 | ```bash 9 | coach -p Doom_Basic_DFP -n 8 10 | ``` 11 | 12 | Doom Basic DFP 8 workers 13 | 14 | 15 | ### Doom Health (D1: Basic) DFP - 8 workers 16 | 17 | ```bash 18 | coach -p Doom_Health_DFP -n 8 19 | ``` 20 | 21 | Doom Health DFP 8 workers 22 | 23 | 24 | 25 | ### Doom Health Supreme (D2: Navigation) DFP - 8 workers 26 | 27 | ```bash 28 | coach -p Doom_Health_Supreme_DFP -n 8 29 | ``` 30 | 31 | Doom Health Supreme DFP 8 workers 32 | -------------------------------------------------------------------------------- /benchmarks/dfp/doom_basic_dfp_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dfp/doom_basic_dfp_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dfp/doom_health_dfp_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dfp/doom_health_dfp_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dfp/doom_health_supreme_dfp_8_workers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dfp/doom_health_supreme_dfp_8_workers.png -------------------------------------------------------------------------------- /benchmarks/dqn/README.md: -------------------------------------------------------------------------------- 1 | # DQN 2 | 3 | Each experiment uses 3 seeds. 4 | The parameters used for DQN are the same parameters as described in the [original paper](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf), except for the optimizer (changed to ADAM) and learning rate (1e-4) used. 
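In a Coach preset, those two deviations (Adam instead of RMSProp, and the 1e-4 learning rate) would sit on the agent's main network wrapper. A minimal sketch, assuming the field names used by the bundled Atari presets; this is not the Atari_DQN preset itself:

```python
# Minimal sketch of overriding the optimizer and learning rate in a preset,
# following the conventions of the bundled Atari presets (assumed, not verbatim).
from rl_coach.agents.dqn_agent import DQNAgentParameters

agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].optimizer_type = 'Adam'   # paper uses RMSProp
agent_params.network_wrappers['main'].learning_rate = 0.0001    # the 1e-4 noted above
```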
5 | 6 | ### Breakout DQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_DQN -lvl breakout 10 | ``` 11 | 12 | Breakout DQN 13 | 14 | ### Pong DQN - single worker 15 | 16 | ```bash 17 | coach -p Atari_DQN -lvl pong 18 | ``` 19 | 20 | Pong DQN 21 | 22 | ### Space Invaders DQN - single worker 23 | 24 | ```bash 25 | coach -p Atari_DQN -lvl space_invaders 26 | ``` 27 | 28 | Space Invaders DQN 29 | 30 | 31 | -------------------------------------------------------------------------------- /benchmarks/dqn/breakout_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dqn/breakout_dqn.png -------------------------------------------------------------------------------- /benchmarks/dqn/pong_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dqn/pong_dqn.png -------------------------------------------------------------------------------- /benchmarks/dqn/space_invaders_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dqn/space_invaders_dqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/README.md: -------------------------------------------------------------------------------- 1 | # Dueling DDQN 2 | 3 | Each experiment uses 3 seeds and is trained for 10k environment steps. 4 | The parameters used for Dueling DDQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1706.01502). 
5 | 6 | ### Pong Dueling DDQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_Dueling_DDQN -lvl pong 10 | ``` 11 | 12 | Pong Dueling DDQN 13 | 14 | 15 | ### Breakout Dueling DDQN - single worker 16 | 17 | ```bash 18 | coach -p Atari_Dueling_DDQN -lvl breakout 19 | ``` 20 | 21 | Breakout Dueling DDQN 22 | 23 | 24 | ### Space Invaders Dueling DDQN - single worker 25 | 26 | ```bash 27 | coach -p Atari_Dueling_DDQN -lvl space_invaders 28 | ``` 29 | 30 | Space Invaders Dueling DDQN 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/breakout_dueling_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn/breakout_dueling_ddqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/pong_dueling_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn/pong_dueling_ddqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn/space_invaders_dueling_ddqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn/space_invaders_dueling_ddqn.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/README.md: -------------------------------------------------------------------------------- 1 | # Dueling DDQN with Prioritized Experience Replay 2 | 3 | Each experiment uses 3 seeds and is trained for 10k environment steps. 4 | The parameters used for Dueling DDQN with PER are the same parameters as described in the [following paper](https://arxiv.org/abs/1511.05952). 
5 | 6 | ### Breakout Dueling DDQN with PER - single worker 7 | 8 | ```bash 9 | coach -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl breakout 10 | ``` 11 | 12 | Breakout Dueling DDQN with PER 13 | 14 | 15 | ### Pong Dueling DDQN with PER - single worker 16 | 17 | ```bash 18 | coach -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl pong 19 | ``` 20 | 21 | Pong Dueling DDQN with PER 22 | 23 | 24 | ### Space Invaders Dueling DDQN with PER - single worker 25 | 26 | ```bash 27 | coach -p Atari_Dueling_DDQN_with_PER_OpenAI -lvl space_invaders 28 | ``` 29 | 30 | Space Invaders Dueling DDQN with PER 31 | 32 | -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn_with_per/breakout_dueling_ddqn_with_per.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn_with_per/pong_dueling_ddqn_with_per.png -------------------------------------------------------------------------------- /benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/dueling_ddqn_with_per/space_invaders_dueling_ddqn_with_per.png -------------------------------------------------------------------------------- /benchmarks/qr_dqn/README.md: -------------------------------------------------------------------------------- 1 | # Quantile Regression DQN 2 | 3 | Each experiment uses 3 seeds and is trained for 10k environment steps. 4 | The parameters used for QR-DQN are the same parameters as described in the [original paper](https://arxiv.org/abs/1710.10044.pdf). 5 | 6 | ### Breakout QR-DQN - single worker 7 | 8 | ```bash 9 | coach -p Atari_QR_DQN -lvl breakout 10 | ``` 11 | 12 | Breakout QR-DQN 13 | 14 | 15 | ### Pong QR-DQN - single worker 16 | 17 | ```bash 18 | coach -p Atari_QR_DQN -lvl pong 19 | ``` 20 | 21 | Pong QR-DQN 22 | -------------------------------------------------------------------------------- /benchmarks/qr_dqn/breakout_qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/qr_dqn/breakout_qr_dqn.png -------------------------------------------------------------------------------- /benchmarks/qr_dqn/pong_qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/qr_dqn/pong_qr_dqn.png -------------------------------------------------------------------------------- /benchmarks/sac/README.md: -------------------------------------------------------------------------------- 1 | # Soft Actor Critic 2 | 3 | Each experiment uses 3 seeds and is trained for 3M environment steps. 
4 | The parameters used for SAC are the same parameters as described in the [original paper](https://arxiv.org/abs/1801.01290). 5 | 6 | ### Inverted Pendulum SAC - single worker 7 | 8 | ```bash 9 | coach -p Mujoco_SAC -lvl inverted_pendulum 10 | ``` 11 | 12 | Inverted Pendulum SAC 13 | 14 | 15 | ### Hopper Clipped SAC - single worker 16 | 17 | ```bash 18 | coach -p Mujoco_SAC -lvl hopper 19 | ``` 20 | 21 | Hopper SAC 22 | 23 | 24 | ### Half Cheetah Clipped SAC - single worker 25 | 26 | ```bash 27 | coach -p Mujoco_SAC -lvl half_cheetah 28 | ``` 29 | 30 | Half Cheetah SAC 31 | 32 | 33 | ### Walker 2D Clipped SAC - single worker 34 | 35 | ```bash 36 | coach -p Mujoco_SAC -lvl walker2d 37 | ``` 38 | 39 | Walker 2D SAC 40 | 41 | 42 | ### Humanoid Clipped SAC - single worker 43 | 44 | ```bash 45 | coach -p Mujoco_SAC -lvl humanoid 46 | ``` 47 | 48 | Humanoid SAC 49 | -------------------------------------------------------------------------------- /benchmarks/sac/half_cheetah_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/half_cheetah_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/hopper_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/hopper_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/humanoid_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/humanoid_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/inverted_pendulum_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/inverted_pendulum_sac.png -------------------------------------------------------------------------------- /benchmarks/sac/walker2d_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/sac/walker2d_sac.png -------------------------------------------------------------------------------- /benchmarks/td3/README.md: -------------------------------------------------------------------------------- 1 | # Twin Delayed DDPG 2 | 3 | Each experiment uses 5 seeds and is trained for 1M environment steps. 4 | The parameters used for TD3 are the same parameters as described in the [original paper](https://arxiv.org/pdf/1802.09477.pdf), and [repository](https://github.com/sfujim/TD3). 
5 | 6 | ### Ant TD3 - single worker 7 | 8 | ```bash 9 | coach -p Mujoco_TD3 -lvl ant 10 | ``` 11 | 12 | Ant TD3 13 | 14 | 15 | ### Hopper TD3 - single worker 16 | 17 | ```bash 18 | coach -p Mujoco_TD3 -lvl hopper 19 | ``` 20 | 21 | Hopper TD3 22 | 23 | 24 | ### Half Cheetah TD3 - single worker 25 | 26 | ```bash 27 | coach -p Mujoco_TD3 -lvl half_cheetah 28 | ``` 29 | 30 | Half Cheetah TD3 31 | 32 | 33 | ### Reacher TD3 - single worker 34 | 35 | ```bash 36 | coach -p Mujoco_TD3 -lvl reacher 37 | ``` 38 | 39 | Reacher TD3 40 | 41 | 42 | ### Walker2D TD3 - single worker 43 | 44 | ```bash 45 | coach -p Mujoco_TD3 -lvl walker2d 46 | ``` 47 | 48 | Walker2D TD3 49 | -------------------------------------------------------------------------------- /benchmarks/td3/ant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/ant.png -------------------------------------------------------------------------------- /benchmarks/td3/half_cheetah.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/half_cheetah.png -------------------------------------------------------------------------------- /benchmarks/td3/hopper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/hopper.png -------------------------------------------------------------------------------- /benchmarks/td3/reacher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/reacher.png -------------------------------------------------------------------------------- /benchmarks/td3/walker2d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/benchmarks/td3/walker2d.png -------------------------------------------------------------------------------- /dist-coach-config.template: -------------------------------------------------------------------------------- 1 | [coach] 2 | image = 3 | memory_backend = redispubsub 4 | data_store = s3 5 | s3_end_point = s3.amazonaws.com 6 | s3_bucket_name = 7 | s3_creds_file = 8 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep some of the more common environments 4 | # Gym (installed with coach) 5 | RUN pip3 install gym[atari]==0.12.5 box2d 6 | # Mujoco 7 | RUN mkdir -p ~/.mujoco \ 8 | && wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \ 9 | && unzip mujoco.zip -d ~/.mujoco \ 10 | && rm mujoco.zip 11 | ARG MUJOCO_KEY 12 | ENV MUJOCO_KEY=$MUJOCO_KEY 13 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 14 | RUN echo $MUJOCO_KEY | base64 --decode > /root/.mujoco/mjkey.txt 15 | RUN pip3 install mujoco_py==1.50.1.68 16 | # Vizdoom 17 | RUN pip3 install vizdoom==1.1.7 18 | 19 | RUN mkdir /root/src 20 | COPY setup.py /root/src/. 21 | COPY requirements.txt /root/src/. 
22 | RUN pip3 install -r /root/src/requirements.txt 23 | 24 | FROM coach-base:master 25 | WORKDIR /root/src 26 | COPY --from=builder /root/.mujoco /root/.mujoco 27 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 28 | COPY --from=builder /root/.cache /root/.cache 29 | COPY setup.py /root/src/. 30 | COPY requirements.txt /root/src/. 31 | COPY README.md /root/src/. 32 | RUN pip3 install gym[atari]==0.12.5 box2d mujoco_py==1.50.1.68 vizdoom==1.1.7 && pip3 install -e .[all] && rm -rf /root/.cache 33 | COPY . /root/src 34 | -------------------------------------------------------------------------------- /docker/Dockerfile.doom_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep vizdoom and any of its related requirements. 4 | RUN pip3 install vizdoom==1.1.7 5 | 6 | # add coach source starting with files that could trigger 7 | # re-build if dependencies change. 8 | RUN mkdir /root/src 9 | COPY setup.py /root/src/. 10 | COPY requirements.txt /root/src/. 11 | RUN pip3 install -r /root/src/requirements.txt 12 | 13 | FROM coach-base:master 14 | WORKDIR /root/src 15 | COPY --from=builder /root/.cache /root/.cache 16 | COPY setup.py /root/src/. 17 | COPY requirements.txt /root/src/. 18 | COPY README.md /root/src/. 19 | RUN pip3 install vizdoom==1.1.7 && pip3 install -e .[all] && rm -rf /root/.cache 20 | COPY . /root/src 21 | -------------------------------------------------------------------------------- /docker/Dockerfile.gym_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep gym and any of its related requirements. 4 | RUN pip3 install gym[atari,box2d,classic_control]==0.12.5 5 | 6 | # add coach source starting with files that could trigger 7 | # re-build if dependencies change. 8 | RUN mkdir /root/src 9 | COPY setup.py /root/src/. 10 | COPY requirements.txt /root/src/. 11 | RUN pip3 install -r /root/src/requirements.txt 12 | 13 | FROM coach-base:master 14 | WORKDIR /root/src 15 | COPY --from=builder /root/.cache /root/.cache 16 | COPY setup.py /root/src/. 17 | COPY requirements.txt /root/src/. 18 | COPY README.md /root/src/. 19 | RUN pip3 install gym[atari,box2d,classic_control]==0.12.5 && pip3 install -e .[all] && rm -rf /root/.cache 20 | COPY . /root/src 21 | -------------------------------------------------------------------------------- /docker/Dockerfile.mujoco_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep mujoco and any of its related requirements. 4 | # Mujoco 5 | RUN mkdir -p ~/.mujoco \ 6 | && wget https://www.roboti.us/download/mjpro150_linux.zip -O mujoco.zip \ 7 | && unzip -n mujoco.zip -d ~/.mujoco \ 8 | && rm mujoco.zip 9 | ARG MUJOCO_KEY 10 | ENV MUJOCO_KEY=$MUJOCO_KEY 11 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 12 | RUN echo $MUJOCO_KEY | base64 --decode > /root/.mujoco/mjkey.txt 13 | RUN pip3 install mujoco_py==1.50.1.68 14 | 15 | # add coach source starting with files that could trigger 16 | # re-build if dependencies change. 17 | RUN mkdir /root/src 18 | COPY setup.py /root/src/. 19 | COPY requirements.txt /root/src/. 
20 | RUN pip3 install -r /root/src/requirements.txt 21 | 22 | FROM coach-base:master 23 | WORKDIR /root/src 24 | COPY --from=builder /root/.mujoco /root/.mujoco 25 | ENV LD_LIBRARY_PATH /root/.mujoco/mjpro150/bin:$LD_LIBRARY_PATH 26 | COPY --from=builder /root/.cache /root/.cache 27 | COPY setup.py /root/src/. 28 | COPY requirements.txt /root/src/. 29 | COPY README.md /root/src/. 30 | RUN pip3 install mujoco_py==1.50.1.68 && pip3 install -e .[all] && rm -rf /root/.cache 31 | COPY . /root/src 32 | -------------------------------------------------------------------------------- /docker/Dockerfile.starcraft_environment: -------------------------------------------------------------------------------- 1 | FROM coach-base:master as builder 2 | 3 | # prep pysc2 and any of its related requirements. 4 | RUN wget http://blzdistsc2-a.akamaihd.net/Linux/SC2.3.17.zip -O sc2.zip \ 5 | && unzip -P 'iagreetotheeula' -d ~ sc2.zip \ 6 | && rm sc2.zip 7 | RUN wget https://github.com/deepmind/pysc2/releases/download/v1.2/mini_games.zip -O mini_games.zip \ 8 | && unzip -d ~/StarCraftII/Maps mini_games.zip \ 9 | && rm mini_games.zip 10 | RUN pip3 install pysc2 11 | 12 | # add coach source starting with files that could trigger 13 | # re-build if dependencies change. 14 | RUN mkdir /root/src 15 | COPY setup.py /root/src/. 16 | COPY requirements.txt /root/src/. 17 | RUN pip3 install -r /root/src/requirements.txt 18 | 19 | FROM coach-base:master 20 | WORKDIR /root/src 21 | COPY --from=builder /root/StarCraftII /root/StarCraftII 22 | COPY --from=builder /root/.cache /root/.cache 23 | COPY setup.py /root/src/. 24 | COPY requirements.txt /root/src/. 25 | COPY README.md /root/src/. 26 | RUN pip3 install pysc2 && pip3 install -e .[all] && rm -rf /root/.cache 27 | COPY . /root/src 28 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/.nojekyll -------------------------------------------------------------------------------- /docs/_images/ac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/ac.png -------------------------------------------------------------------------------- /docs/_images/acer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/acer.png -------------------------------------------------------------------------------- /docs/_images/act.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/act.png -------------------------------------------------------------------------------- /docs/_images/algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/algorithms.png -------------------------------------------------------------------------------- /docs/_images/attention_discretization.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/attention_discretization.png -------------------------------------------------------------------------------- /docs/_images/bollinger_bands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/bollinger_bands.png -------------------------------------------------------------------------------- /docs/_images/box_discretization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/box_discretization.png -------------------------------------------------------------------------------- /docs/_images/box_masking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/box_masking.png -------------------------------------------------------------------------------- /docs/_images/bs_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/bs_dqn.png -------------------------------------------------------------------------------- /docs/_images/cil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/cil.png -------------------------------------------------------------------------------- /docs/_images/compare_by_num_episodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/compare_by_num_episodes.png -------------------------------------------------------------------------------- /docs/_images/compare_by_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/compare_by_time.png -------------------------------------------------------------------------------- /docs/_images/ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/ddpg.png -------------------------------------------------------------------------------- /docs/_images/design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/design.png -------------------------------------------------------------------------------- /docs/_images/dfp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/dfp.png -------------------------------------------------------------------------------- /docs/_images/distributed.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/distributed.png -------------------------------------------------------------------------------- /docs/_images/distributional_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/distributional_dqn.png -------------------------------------------------------------------------------- /docs/_images/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/dqn.png -------------------------------------------------------------------------------- /docs/_images/dueling_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/dueling_dqn.png -------------------------------------------------------------------------------- /docs/_images/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/filters.png -------------------------------------------------------------------------------- /docs/_images/full_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/full_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs/_images/horizontal-scale-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/horizontal-scale-out.png -------------------------------------------------------------------------------- /docs/_images/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/improve.png -------------------------------------------------------------------------------- /docs/_images/linear_box_to_box_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/linear_box_to_box_map.png -------------------------------------------------------------------------------- /docs/_images/naf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/naf.png -------------------------------------------------------------------------------- /docs/_images/nec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/nec.png -------------------------------------------------------------------------------- /docs/_images/network.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/network.png -------------------------------------------------------------------------------- /docs/_images/observe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/observe.png -------------------------------------------------------------------------------- /docs/_images/partial_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/partial_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs/_images/pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/pg.png -------------------------------------------------------------------------------- /docs/_images/ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/ppo.png -------------------------------------------------------------------------------- /docs/_images/qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/qr_dqn.png -------------------------------------------------------------------------------- /docs/_images/rainbow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/rainbow.png -------------------------------------------------------------------------------- /docs/_images/sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/sac.png -------------------------------------------------------------------------------- /docs/_images/separate_signals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/separate_signals.png -------------------------------------------------------------------------------- /docs/_images/td3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/td3.png -------------------------------------------------------------------------------- /docs/_images/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/train.png -------------------------------------------------------------------------------- /docs/_images/updating_dynamically.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/updating_dynamically.gif 
-------------------------------------------------------------------------------- /docs/_images/wolpertinger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_images/wolpertinger.png -------------------------------------------------------------------------------- /docs/_sources/components/additional_parameters.rst.txt: -------------------------------------------------------------------------------- 1 | Additional Parameters 2 | ===================== 3 | 4 | VisualizationParameters 5 | ----------------------- 6 | .. autoclass:: rl_coach.base_parameters.VisualizationParameters 7 | 8 | PresetValidationParameters 9 | -------------------------- 10 | .. autoclass:: rl_coach.base_parameters.PresetValidationParameters 11 | 12 | TaskParameters 13 | -------------- 14 | .. autoclass:: rl_coach.base_parameters.TaskParameters 15 | 16 | DistributedTaskParameters 17 | ------------------------- 18 | .. autoclass:: rl_coach.base_parameters.DistributedTaskParameters 19 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/imitation/bc.rst.txt: -------------------------------------------------------------------------------- 1 | Behavioral Cloning 2 | ================== 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | Network Structure 7 | ----------------- 8 | 9 | .. image:: /_static/img/design_imgs/pg.png 10 | :align: center 11 | 12 | 13 | Algorithm Description 14 | --------------------- 15 | 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | The replay buffer contains the expert demonstrations for the task. 20 | These demonstrations are given as state, action tuples, and with no reward. 21 | The training goal is to reduce the difference between the actions predicted by the network and the actions taken by 22 | the expert for each state. 23 | 24 | 1. Sample a batch of transitions from the replay buffer. 25 | 2. Use the current states as input to the network, and the expert actions as the targets of the network. 26 | 3. For the network head, we use the policy head, which uses the cross entropy loss function. 27 | 28 | 29 | .. autoclass:: rl_coach.agents.bc_agent.BCAlgorithmParameters -------------------------------------------------------------------------------- /docs/_sources/components/agents/index.rst.txt: -------------------------------------------------------------------------------- 1 | Agents 2 | ====== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms can be found by navigating to each of the algorithm pages. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center 11 | 12 | .. 
toctree:: 13 | :maxdepth: 1 14 | :caption: Agents 15 | 16 | policy_optimization/ac 17 | policy_optimization/acer 18 | imitation/bc 19 | value_optimization/bs_dqn 20 | value_optimization/categorical_dqn 21 | imitation/cil 22 | policy_optimization/cppo 23 | policy_optimization/ddpg 24 | other/dfp 25 | value_optimization/double_dqn 26 | value_optimization/dqn 27 | value_optimization/dueling_dqn 28 | value_optimization/mmc 29 | value_optimization/n_step 30 | value_optimization/naf 31 | value_optimization/nec 32 | value_optimization/pal 33 | policy_optimization/pg 34 | policy_optimization/ppo 35 | value_optimization/rainbow 36 | value_optimization/qr_dqn 37 | policy_optimization/sac 38 | policy_optimization/td3 39 | policy_optimization/wolpertinger 40 | 41 | 42 | 43 | .. autoclass:: rl_coach.base_parameters.AgentParameters 44 | 45 | .. autoclass:: rl_coach.agents.agent.Agent 46 | :members: 47 | :inherited-members: 48 | 49 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/policy_optimization/ac.rst.txt: -------------------------------------------------------------------------------- 1 | Actor-Critic 2 | ============ 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/ac.png 12 | :width: 500px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | 18 | Choosing an action - Discrete actions 19 | +++++++++++++++++++++++++++++++++++++ 20 | 21 | The policy network is used to predict action probabilities. While training, a sample is taken from a categorical 22 | distribution assigned with these probabilities. When testing, the action with the highest probability is used. 23 | 24 | Training the network 25 | ++++++++++++++++++++ 26 | A batch of :math:`T_{max}` transitions is used, and the advantages are calculated over it. 27 | 28 | Advantages can be calculated by either of the following methods (configured by the selected preset) - 29 | 30 | 1. **A_VALUE** - Estimating advantage directly: 31 | :math:`A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)` 32 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 33 | 34 | 2. **GAE** - By following the `Generalized Advantage Estimation `_ paper. 35 | 36 | The advantages are then used in order to accumulate gradients according to 37 | :math:`L = -\mathop{\mathbb{E}} [\log (\pi) \cdot A]` 38 | 39 | 40 | .. autoclass:: rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters -------------------------------------------------------------------------------- /docs/_sources/components/agents/policy_optimization/hac.rst.txt: -------------------------------------------------------------------------------- 1 | Hierarchical Actor Critic 2 | ========================= 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Hierarchical Reinforcement Learning with Hindsight `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/ddpg.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Choosing an action 17 | ++++++++++++++++++ 18 | 19 | Pass the current states through the actor network, and get an action mean vector :math:`\mu`.
20 | While in training phase, use a continuous exploration policy, such as the Ornstein-Uhlenbeck process, 21 | to add exploration noise to the action. When testing, use the mean vector :math:`\mu` as-is. 22 | 23 | Training the network 24 | ++++++++++++++++++++ 25 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/categorical_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Categorical DQN 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `A Distributional Perspective on Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/distributional_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. The Bellman update is projected to the set of atoms representing the :math:`Q` values distribution, such 23 | that the :math:`i-th` component of the projected update is calculated as follows: 24 | 25 | :math:`(\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))` 26 | 27 | where: 28 | * :math:`[ \cdot ]` bounds its argument in the range :math:`[a, b]` 29 | * :math:`\hat{T}_{z_{j}}` is the Bellman update for atom :math:`z_j`: :math:`\hat{T}_{z_{j}} := r+\gamma z_j` 30 | 31 | 32 | 3. Network is trained with the cross entropy loss between the resulting probability distribution and the target 33 | probability distribution. Only the target of the actions that were actually taken is updated. 34 | 35 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 36 | 37 | 38 | 39 | .. autoclass:: rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters 40 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/double_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Double DQN 2 | ========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Deep Reinforcement Learning with Double Q-learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. Using the next states from the sampled batch, run the online network in order to find the :math:`Q` maximizing 23 | action :math:`argmax_a Q(s_{t+1},a)`. For these actions, use the corresponding next states and run the target 24 | network to calculate :math:`Q(s_{t+1},argmax_a Q(s_{t+1},a))`. 25 | 26 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 27 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 28 | Set those values as the targets for the actions that were not actually played. 29 | 30 | 4. For each action that was played, use the following equation for calculating the targets of the network: 31 | :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))` 32 | 33 | 5. 
Finally, train the online network using the current states as inputs, and with the aforementioned targets. 34 | 35 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 36 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Deep Q Networks 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Playing Atari with Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. Using the next states from the sampled batch, run the target network to calculate the :math:`Q` values for each of 23 | the actions :math:`Q(s_{t+1},a)`, and keep only the maximum value for each state. 24 | 25 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 26 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 27 | Set those values as the targets for the actions that were not actually played. 28 | 29 | 4. For each action that was played, use the following equation for calculating the targets of the network: 30 | :math:`y_t=r(s_t,a_t )+\gamma \cdot \max_a Q(s_{t+1},a)` 31 | 32 | 5. Finally, train the online network using the current states as inputs, and with the aforementioned targets. 33 | 34 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.dqn_agent.DQNAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/dueling_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Dueling DQN 2 | =========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Dueling Network Architectures for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dueling_dqn.png 12 | :align: center 13 | 14 | General Description 15 | ------------------- 16 | Dueling DQN presents a change in the network structure compared to DQN. 17 | 18 | Dueling DQN uses a specialized *Dueling Q Head* to separate :math:`Q` into an :math:`A` (advantage) 19 | stream and a :math:`V` (value) stream. Adding this type of structure to the network head allows the network to better differentiate 20 | actions from one another, and significantly improves learning. 21 | 22 | In many states, the values of the different actions are very similar, and it is less important which action to take. 23 | This is especially important in environments where there are many actions to choose from. In DQN, on each training 24 | iteration, for each of the states in the batch, we update the :math:`Q` values only for the specific actions taken in 25 | those states. This results in slower learning, as we do not learn the :math:`Q` values for actions that have not yet been taken.
26 | In the dueling architecture, on the other hand, learning is faster, as we start learning the state value even if only a 27 | single action has been taken in this state. -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/mmc.rst.txt: -------------------------------------------------------------------------------- 1 | Mixed Monte Carlo 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Count-Based Exploration with Neural Density Models `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | In MMC, targets are calculated as a mixture between Double DQN targets and full Monte Carlo samples (total discounted returns). 20 | 21 | The DDQN targets are calculated in the same manner as in the DDQN agent: 22 | 23 | :math:`y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))` 24 | 25 | The Monte Carlo targets are calculated by summing up the discounted rewards across the entire episode: 26 | 27 | :math:`y_t^{MC}=\sum_{j=0}^T\gamma^j r(s_{t+j},a_{t+j} )` 28 | 29 | A mixing ratio :math:`\alpha` is then used to get the final targets: 30 | 31 | :math:`y_t=(1-\alpha)\cdot y_t^{DDQN}+\alpha \cdot y_t^{MC}` 32 | 33 | Finally, the online network is trained using the current states as inputs, and the calculated targets. 34 | Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/n_step.rst.txt: -------------------------------------------------------------------------------- 1 | N-Step Q Learning 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | The :math:`N`-step Q learning algorithm works in a similar manner to DQN except for the following changes: 21 | 22 | 1. No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every 23 | :math:`N` steps using the latest :math:`N` steps played by the agent. 24 | 25 | 2. In order to stabilize the learning, multiple workers work together to update the network. 26 | This creates the same effect as decorrelating the samples used for training. 27 | 28 | 3. Instead of using single-step Q targets for the network, the rewards from :math:`N` consecutive steps are accumulated 29 | to form the :math:`N`-step Q targets, according to the following equation: 30 | :math:`R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})` 31 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 32 | 33 | 34 | 35 | .. 
autoclass:: rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters 36 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/naf.rst.txt: -------------------------------------------------------------------------------- 1 | Normalized Advantage Functions 2 | ============================== 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Continuous Deep Q-Learning with Model-based Acceleration `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/naf.png 12 | :width: 600px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | Choosing an action 18 | ++++++++++++++++++ 19 | The current state is used as an input to the network. The action mean :math:`\mu(s_t )` is extracted from the output head. 20 | It is then passed to the exploration policy which adds noise in order to encourage exploration. 21 | 22 | Training the network 23 | ++++++++++++++++++++ 24 | The network is trained by using the following targets: 25 | :math:`y_t=r(s_t,a_t )+\gamma\cdot V(s_{t+1})` 26 | Use the next states as the inputs to the target network and extract the :math:`V` value, from within the head, 27 | to get :math:`V(s_{t+1} )`. Then, update the online network using the current states and actions as inputs, 28 | and :math:`y_t` as the targets. 29 | After every training step, use a soft update in order to copy the weights from the online network to the target network. 30 | 31 | 32 | 33 | .. autoclass:: rl_coach.agents.naf_agent.NAFAlgorithmParameters 34 | -------------------------------------------------------------------------------- /docs/_sources/components/agents/value_optimization/qr_dqn.rst.txt: -------------------------------------------------------------------------------- 1 | Quantile Regression DQN 2 | ======================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Distributional Reinforcement Learning with Quantile Regression `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/qr_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. First, the next state quantiles are predicted. These are used in order to calculate the targets for the network, 23 | by following the Bellman equation. 24 | Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the 25 | quantile midpoints targets. 26 | 27 | 3. The network is trained with the quantile regression loss between the resulting quantile locations and the target 28 | quantile locations. Only the targets of the actions that were actually taken are updated. 29 | 30 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 31 | 32 | 33 | .. autoclass:: rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters -------------------------------------------------------------------------------- /docs/_sources/components/architectures/index.rst.txt: -------------------------------------------------------------------------------- 1 | Architectures 2 | ============= 3 | 4 | Architectures contain all the classes that implement the neural network related stuff for the agent. 
5 | Since Coach is intended to work with multiple neural network frameworks, each framework will implement its 6 | own components under a dedicated directory. For example, the TensorFlow components will contain all the neural network 7 | parts that are implemented using TensorFlow. 8 | 9 | .. autoclass:: rl_coach.base_parameters.NetworkParameters 10 | 11 | Architecture 12 | ------------ 13 | .. autoclass:: rl_coach.architectures.architecture.Architecture 14 | :members: 15 | :inherited-members: 16 | 17 | NetworkWrapper 18 | -------------- 19 | 20 | .. image:: /_static/img/distributed.png 21 | :width: 600px 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.architectures.network_wrapper.NetworkWrapper 25 | :members: 26 | :inherited-members: 27 | 28 | -------------------------------------------------------------------------------- /docs/_sources/components/core_types.rst.txt: -------------------------------------------------------------------------------- 1 | Core Types 2 | ========== 3 | 4 | ActionInfo 5 | ---------- 6 | .. autoclass:: rl_coach.core_types.ActionInfo 7 | :members: 8 | :inherited-members: 9 | 10 | Batch 11 | ----- 12 | .. autoclass:: rl_coach.core_types.Batch 13 | :members: 14 | :inherited-members: 15 | 16 | EnvResponse 17 | ----------- 18 | .. autoclass:: rl_coach.core_types.EnvResponse 19 | :members: 20 | :inherited-members: 21 | 22 | Episode 23 | ------- 24 | .. autoclass:: rl_coach.core_types.Episode 25 | :members: 26 | :inherited-members: 27 | 28 | Transition 29 | ---------- 30 | .. autoclass:: rl_coach.core_types.Transition 31 | :members: 32 | :inherited-members: 33 | 34 | -------------------------------------------------------------------------------- /docs/_sources/components/data_stores/index.rst.txt: -------------------------------------------------------------------------------- 1 | Data Stores 2 | =========== 3 | 4 | S3DataStore 5 | ----------- 6 | .. autoclass:: rl_coach.data_stores.s3_data_store.S3DataStore 7 | 8 | NFSDataStore 9 | ------------ 10 | .. autoclass:: rl_coach.data_stores.nfs_data_store.NFSDataStore 11 | -------------------------------------------------------------------------------- /docs/_sources/components/filters/index.rst.txt: -------------------------------------------------------------------------------- 1 | Filters 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Filters 7 | 8 | input_filters 9 | output_filters 10 | 11 | Filters are a mechanism in Coach that allows pre-processing and post-processing of the internal agent information. 12 | There are two filter categories - 13 | 14 | * **Input filters** - these are filters that process the information passed **into** the agent from the environment. 15 | This information includes the observation and the reward. Input filters therefore allow rescaling observations, 16 | normalizing rewards, stacking observations, etc. 17 | 18 | * **Output filters** - these are filters that process the information going **out** of the agent into the environment. 19 | This information includes the action the agent chooses to take. Output filters therefore allow conversion of 20 | actions from one space into another. For example, the agent can take :math:`N` discrete actions, which will be mapped by 21 | the output filter onto :math:`N` continuous actions. 22 | 23 | Filters can be stacked on top of each other in order to build complex processing flows of the inputs or outputs. 24 | 25 | .. 
image:: /_static/img/filters.png 26 | :width: 350px 27 | :align: center 28 | 29 | -------------------------------------------------------------------------------- /docs/_sources/components/filters/output_filters.rst.txt: -------------------------------------------------------------------------------- 1 | Output Filters 2 | -------------- 3 | 4 | The output filters only process the actions. 5 | 6 | Action Filters 7 | ++++++++++++++ 8 | 9 | .. autoclass:: rl_coach.filters.action.AttentionDiscretization 10 | 11 | .. image:: /_static/img/attention_discretization.png 12 | :align: center 13 | 14 | .. autoclass:: rl_coach.filters.action.BoxDiscretization 15 | 16 | .. image:: /_static/img/box_discretization.png 17 | :align: center 18 | 19 | .. autoclass:: rl_coach.filters.action.BoxMasking 20 | 21 | .. image:: /_static/img/box_masking.png 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.filters.action.PartialDiscreteActionSpaceMap 25 | 26 | .. image:: /_static/img/partial_discrete_action_space_map.png 27 | :align: center 28 | 29 | .. autoclass:: rl_coach.filters.action.FullDiscreteActionSpaceMap 30 | 31 | .. image:: /_static/img/full_discrete_action_space_map.png 32 | :align: center 33 | 34 | .. autoclass:: rl_coach.filters.action.LinearBoxToBoxMap 35 | 36 | .. image:: /_static/img/linear_box_to_box_map.png 37 | :align: center -------------------------------------------------------------------------------- /docs/_sources/components/memories/index.rst.txt: -------------------------------------------------------------------------------- 1 | Memories 2 | ======== 3 | 4 | Episodic Memories 5 | ----------------- 6 | 7 | EpisodicExperienceReplay 8 | ++++++++++++++++++++++++ 9 | .. autoclass:: rl_coach.memories.episodic.EpisodicExperienceReplay 10 | 11 | EpisodicHindsightExperienceReplay 12 | +++++++++++++++++++++++++++++++++ 13 | .. autoclass:: rl_coach.memories.episodic.EpisodicHindsightExperienceReplay 14 | 15 | EpisodicHRLHindsightExperienceReplay 16 | ++++++++++++++++++++++++++++++++++++ 17 | .. autoclass:: rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay 18 | 19 | SingleEpisodeBuffer 20 | +++++++++++++++++++ 21 | .. autoclass:: rl_coach.memories.episodic.SingleEpisodeBuffer 22 | 23 | 24 | Non-Episodic Memories 25 | --------------------- 26 | BalancedExperienceReplay 27 | ++++++++++++++++++++++++ 28 | .. autoclass:: rl_coach.memories.non_episodic.BalancedExperienceReplay 29 | 30 | QDND 31 | ++++ 32 | .. autoclass:: rl_coach.memories.non_episodic.QDND 33 | 34 | ExperienceReplay 35 | ++++++++++++++++ 36 | .. autoclass:: rl_coach.memories.non_episodic.ExperienceReplay 37 | 38 | PrioritizedExperienceReplay 39 | +++++++++++++++++++++++++++ 40 | .. autoclass:: rl_coach.memories.non_episodic.PrioritizedExperienceReplay 41 | 42 | TransitionCollection 43 | ++++++++++++++++++++ 44 | .. autoclass:: rl_coach.memories.non_episodic.TransitionCollection 45 | -------------------------------------------------------------------------------- /docs/_sources/components/memory_backends/index.rst.txt: -------------------------------------------------------------------------------- 1 | Memory Backends 2 | =============== 3 | 4 | RedisPubSubBackend 5 | ------------------ 6 | .. 
autoclass:: rl_coach.memories.backend.redis.RedisPubSubBackend 7 | -------------------------------------------------------------------------------- /docs/_sources/components/orchestrators/index.rst.txt: -------------------------------------------------------------------------------- 1 | Orchestrators 2 | ============= 3 | 4 | 5 | Kubernetes 6 | ---------- 7 | .. autoclass:: rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes 8 | -------------------------------------------------------------------------------- /docs/_sources/components/spaces.rst.txt: -------------------------------------------------------------------------------- 1 | Spaces 2 | ====== 3 | 4 | Space 5 | ----- 6 | .. autoclass:: rl_coach.spaces.Space 7 | :members: 8 | :inherited-members: 9 | 10 | 11 | 12 | Observation Spaces 13 | ------------------ 14 | .. autoclass:: rl_coach.spaces.ObservationSpace 15 | :members: 16 | :inherited-members: 17 | 18 | VectorObservationSpace 19 | ++++++++++++++++++++++ 20 | .. autoclass:: rl_coach.spaces.VectorObservationSpace 21 | 22 | PlanarMapsObservationSpace 23 | ++++++++++++++++++++++++++ 24 | .. autoclass:: rl_coach.spaces.PlanarMapsObservationSpace 25 | 26 | ImageObservationSpace 27 | +++++++++++++++++++++ 28 | .. autoclass:: rl_coach.spaces.ImageObservationSpace 29 | 30 | 31 | 32 | Action Spaces 33 | ------------- 34 | .. autoclass:: rl_coach.spaces.ActionSpace 35 | :members: 36 | :inherited-members: 37 | 38 | AttentionActionSpace 39 | ++++++++++++++++++++ 40 | .. autoclass:: rl_coach.spaces.AttentionActionSpace 41 | 42 | BoxActionSpace 43 | ++++++++++++++ 44 | .. autoclass:: rl_coach.spaces.BoxActionSpace 45 | 46 | DiscreteActionSpace 47 | ++++++++++++++++++++ 48 | .. autoclass:: rl_coach.spaces.DiscreteActionSpace 49 | 50 | MultiSelectActionSpace 51 | ++++++++++++++++++++++ 52 | .. autoclass:: rl_coach.spaces.MultiSelectActionSpace 53 | 54 | CompoundActionSpace 55 | +++++++++++++++++++ 56 | .. autoclass:: rl_coach.spaces.CompoundActionSpace 57 | 58 | 59 | 60 | Goal Spaces 61 | ----------- 62 | .. autoclass:: rl_coach.spaces.GoalsSpace 63 | :members: 64 | :inherited-members: 65 | -------------------------------------------------------------------------------- /docs/_sources/features/algorithms.rst.txt: -------------------------------------------------------------------------------- 1 | Algorithms 2 | ========== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms may be found in the `agents <../components/agents/index.html>`_ section. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center -------------------------------------------------------------------------------- /docs/_sources/features/batch_rl.rst.txt: -------------------------------------------------------------------------------- 1 | Batch Reinforcement Learning 2 | ============================ 3 | 4 | Coach supports Batch Reinforcement Learning, where learning is based solely on a (fixed) batch of data. 5 | In Batch RL, we are given a dataset of experience, which was collected using some (one or more) deployed policies, and we would 6 | like to use it to learn a better policy than what was used to collect the dataset. 7 | There is no simulator to interact with, and so we cannot collect any new data, meaning we often cannot explore the MDP any further. 
8 | To make things even harder, we would also like to use the dataset in order to evaluate the newly learned policy 9 | (using off-policy evaluation), since we do not have a simulator on which we can evaluate the policy. 10 | Batch RL is also often beneficial in cases where we just want to separate the inference (data collection) from the 11 | training process of a new policy. This is often the case where we have a system on which we could quite easily deploy a policy 12 | and collect experience data, but cannot easily use that system's setup to train a new policy online (as is often the 13 | case with more standard RL algorithms). 14 | 15 | Coach supports (almost) all of the integrated off-policy algorithms with Batch RL. 16 | 17 | Many more details and example usage can be found in the 18 | `tutorial `_. -------------------------------------------------------------------------------- /docs/_sources/features/benchmarks.rst.txt: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | ========== 3 | 4 | Reinforcement learning is a developing field, and so far it has been particularly difficult to reproduce some of the 5 | results published in the original papers. Some reasons for this are: 6 | 7 | * Reinforcement learning algorithms are notorious for having an unstable learning process. 8 | The data the neural network trains on is dynamic, and depends on the random seed defined for the environment. 9 | 10 | * Reinforcement learning algorithms have many moving parts. For some environments and agents, there are many 11 | "tricks" which are needed to get the exact behavior the paper authors had seen. Also, there are **a lot** of 12 | hyper-parameters to set. 13 | 14 | In order for a reinforcement learning implementation to be useful for research or for data science, it must be 15 | shown that it achieves the expected behavior. For this reason, we collected a set of benchmark results from most 16 | of the algorithms implemented in Coach. The algorithms were tested on a subset of the same environments that were 17 | used in the original papers, and with multiple seeds for each environment. 18 | Additionally, Coach uses some strict testing mechanisms to try to make sure the results we show for these 19 | benchmarks stay intact as Coach continues to develop. 20 | 21 | To see the benchmark results, please visit the 22 | `following GitHub page `_. -------------------------------------------------------------------------------- /docs/_sources/features/index.rst.txt: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Features 7 | 8 | algorithms 9 | environments 10 | benchmarks 11 | batch_rl -------------------------------------------------------------------------------- /docs/_sources/test.rst.txt: -------------------------------------------------------------------------------- 1 | test 2 | ---- 3 | 4 | .. important:: Its a note! in markdown! 5 | 6 | .. 
autoclass:: rl_coach.agents.dqn_agent.DQNAgent 7 | :members: 8 | :inherited-members: -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Docs background */ 2 | .wy-side-nav-search{ 3 | background-color: #043c74; 4 | } 5 | 6 | /* Mobile version */ 7 | .wy-nav-top{ 8 | background-color: #043c74; 9 | } 10 | 11 | 12 | .green { 13 | color: green; 14 | } 15 | 16 | .red { 17 | color: red; 18 | } 19 | 20 | .blue { 21 | color: blue; 22 | } 23 | 24 | .yellow { 25 | color: yellow; 26 | } 27 | 28 | .badge { 29 | border: 2px; 30 | border-style: solid; 31 | border-color: #6C8EBF; 32 | border-radius: 5px; 33 | padding: 3px 15px 3px 15px; 34 | margin: 5px; 35 | display: inline-block; 36 | font-weight: bold; 37 | font-size: 16px; 38 | background: #DAE8FC; 39 | } 40 | 41 | .badge:hover { 42 | cursor: pointer; 43 | } 44 | 45 | .badge > a { 46 | color: black; 47 | } 48 | 49 | .bordered-container { 50 | border: 0px; 51 | border-style: solid; 52 | border-radius: 8px; 53 | padding: 15px; 54 | margin-bottom: 20px; 55 | background: #f2f2f2; 56 | } 57 | 58 | .questionnaire { 59 | font-size: 1.2em; 60 | line-height: 1.5em; 61 | } -------------------------------------------------------------------------------- /docs/_static/dark_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/dark_logo.png -------------------------------------------------------------------------------- /docs/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '0.12.0', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | FILE_SUFFIX: '.html', 7 | HAS_SOURCE: true, 8 | SOURCELINK_SUFFIX: '.txt', 9 | NAVIGATION_WITH_KEYS: false 10 | }; -------------------------------------------------------------------------------- /docs/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/file.png -------------------------------------------------------------------------------- /docs/_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Inconsolata.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Inconsolata.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato-Bold.ttf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato-Bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato-Regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bold.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-bolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-bolditalic.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-italic.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.eot -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.woff -------------------------------------------------------------------------------- /docs/_static/fonts/Lato/lato-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/Lato/lato-regular.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab-Bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab-Regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot 
-------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff -------------------------------------------------------------------------------- /docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2 -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /docs/_static/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /docs/_static/minus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/minus.png -------------------------------------------------------------------------------- /docs/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/_static/plus.png -------------------------------------------------------------------------------- /docs/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs/objects.inv -------------------------------------------------------------------------------- /docs_raw/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = source 8 | BUILDDIR = build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs_raw/README.md: -------------------------------------------------------------------------------- 1 | # Coach Documentation 2 | 3 | Coach uses Sphinx with a Read The Docs theme for its documentation website. 4 | The website is hosted on GitHub Pages, and is automatically pulled from the repository through the built docs directory. 5 | 6 | To build automatically, first go to the 'docs_raw' directory; the script below installs all required packages, builds the HTML, 7 | and copies all new docs into 'coach/docs/'. 8 | 9 | Run the following command (make sure it's an executable file): 10 | ``` 11 | ./build_docs.sh 12 | ``` 13 | 14 | To build the documentation website manually and locally, first install the following requirements: 15 | 16 | ``` 17 | pip install Sphinx 18 | pip install recommonmark 19 | pip install sphinx_rtd_theme 20 | pip install sphinx-autobuild 21 | pip install sphinx-argparse 22 | ``` 23 | 24 | Then there are two options to build: 25 | 1. Build using the make file (recommended). Run from within the `docs_raw` directory: 26 | 27 | ``` 28 | make html 29 | cp source/_static/css/custom.css build/html/_static/css/ 30 | rm -rf ../docs/ 31 | mkdir ../docs 32 | touch ../docs/.nojekyll 33 | cp -R build/html/* ../docs/ 34 | ``` 35 | 36 | 2. 
Build automatically after every change while editing the files: 37 | 38 | ``` 39 | sphinx-autobuild source build/html 40 | ``` 41 | -------------------------------------------------------------------------------- /docs_raw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/__init__.py -------------------------------------------------------------------------------- /docs_raw/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "installing requirements..." 4 | 5 | pip3 install Sphinx 6 | pip3 install recommonmark 7 | pip3 install sphinx_rtd_theme 8 | pip3 install sphinx-autobuild 9 | pip3 install sphinx-argparse 10 | 11 | echo "Making docs..." 12 | 13 | make html 14 | 15 | echo "Copying new docs into coach/docs/" 16 | 17 | cp source/_static/css/custom.css build/html/_static/css/ 18 | rm -rf ../docs/ 19 | mkdir ../docs 20 | touch ../docs/.nojekyll 21 | cp -R build/html/* ../docs/ 22 | rm -r build 23 | 24 | echo "Finished!" -------------------------------------------------------------------------------- /docs_raw/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs_raw/source/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/__init__.py -------------------------------------------------------------------------------- /docs_raw/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | /* Docs background */ 2 | .wy-side-nav-search{ 3 | background-color: #043c74; 4 | } 5 | 6 | /* Mobile version */ 7 | .wy-nav-top{ 8 | background-color: #043c74; 9 | } 10 | 11 | 12 | .green { 13 | color: green; 14 | } 15 | 16 | .red { 17 | color: red; 18 | } 19 | 20 | .blue { 21 | color: blue; 22 | } 23 | 24 | .yellow { 25 | color: yellow; 26 | } 27 | 28 | .badge { 29 | border: 2px; 30 | border-style: solid; 31 | border-color: #6C8EBF; 32 | border-radius: 5px; 33 | padding: 3px 15px 3px 15px; 34 | margin: 5px; 35 | display: inline-block; 36 | font-weight: bold; 37 | font-size: 16px; 38 | background: #DAE8FC; 39 | } 40 | 41 | .badge:hover { 42 | cursor: pointer; 43 | } 44 | 45 | .badge > a { 46 | color: black; 47 | } 48 | 49 | .bordered-container { 50 | border: 0px; 51 | border-style: solid; 52 | border-radius: 8px; 53 | padding: 15px; 54 | margin-bottom: 20px; 55 | background: #f2f2f2; 56 | } 57 | 58 | .questionnaire { 59 | font-size: 1.2em; 60 | line-height: 1.5em; 61 | } -------------------------------------------------------------------------------- /docs_raw/source/_static/img/act.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/act.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/algorithms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/algorithms.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/attention_discretization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/attention_discretization.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/bollinger_bands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/bollinger_bands.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/box_discretization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/box_discretization.png 
-------------------------------------------------------------------------------- /docs_raw/source/_static/img/box_masking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/box_masking.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/compare_by_num_episodes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/compare_by_num_episodes.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/compare_by_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/compare_by_time.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/dark_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/dark_logo.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/ac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/ac.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/acer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/acer.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/bs_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/bs_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/cil.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/cil.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/ddpg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/ddpg.png 
-------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/dfp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/dfp.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/distributional_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/distributional_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/dueling_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/dueling_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/naf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/naf.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/nec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/nec.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/pg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/pg.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/ppo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/ppo.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/qr_dqn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/qr_dqn.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/rainbow.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/rainbow.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/sac.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/td3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/td3.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/design_imgs/wolpertinger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/design_imgs/wolpertinger.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/distributed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/distributed.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/filters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/filters.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/full_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/full_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/graph.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/horizontal-scale-out.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/horizontal-scale-out.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/improve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/improve.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/level.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/level.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/linear_box_to_box_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/linear_box_to_box_map.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/network.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/observe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/observe.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/partial_discrete_action_space_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/partial_discrete_action_space_map.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/separate_signals.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/separate_signals.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/train.png -------------------------------------------------------------------------------- /docs_raw/source/_static/img/updating_dynamically.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/docs_raw/source/_static/img/updating_dynamically.gif -------------------------------------------------------------------------------- /docs_raw/source/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | {% block extrahead %} 3 | 4 | {% endblock %} -------------------------------------------------------------------------------- /docs_raw/source/components/additional_parameters.rst: -------------------------------------------------------------------------------- 1 | Additional Parameters 2 | ===================== 3 | 4 | VisualizationParameters 5 | ----------------------- 6 | .. autoclass:: rl_coach.base_parameters.VisualizationParameters 7 | 8 | PresetValidationParameters 9 | -------------------------- 10 | .. autoclass:: rl_coach.base_parameters.PresetValidationParameters 11 | 12 | TaskParameters 13 | -------------- 14 | .. 
autoclass:: rl_coach.base_parameters.TaskParameters 15 | 16 | DistributedTaskParameters 17 | ------------------------- 18 | .. autoclass:: rl_coach.base_parameters.DistributedTaskParameters 19 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/imitation/bc.rst: -------------------------------------------------------------------------------- 1 | Behavioral Cloning 2 | ================== 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | Network Structure 7 | ----------------- 8 | 9 | .. image:: /_static/img/design_imgs/pg.png 10 | :align: center 11 | 12 | 13 | Algorithm Description 14 | --------------------- 15 | 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | The replay buffer contains the expert demonstrations for the task. 20 | These demonstrations are given as state, action tuples, and with no reward. 21 | The training goal is to reduce the difference between the actions predicted by the network and the actions taken by 22 | the expert for each state. 23 | 24 | 1. Sample a batch of transitions from the replay buffer. 25 | 2. Use the current states as input to the network, and the expert actions as the targets of the network. 26 | 3. For the network head, we use the policy head, which uses the cross entropy loss function. 27 | 28 | 29 | .. autoclass:: rl_coach.agents.bc_agent.BCAlgorithmParameters -------------------------------------------------------------------------------- /docs_raw/source/components/agents/index.rst: -------------------------------------------------------------------------------- 1 | Agents 2 | ====== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms can be found by navigating to each of the algorithm pages. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Agents 15 | 16 | policy_optimization/ac 17 | policy_optimization/acer 18 | imitation/bc 19 | value_optimization/bs_dqn 20 | value_optimization/categorical_dqn 21 | imitation/cil 22 | policy_optimization/cppo 23 | policy_optimization/ddpg 24 | other/dfp 25 | value_optimization/double_dqn 26 | value_optimization/dqn 27 | value_optimization/dueling_dqn 28 | value_optimization/mmc 29 | value_optimization/n_step 30 | value_optimization/naf 31 | value_optimization/nec 32 | value_optimization/pal 33 | policy_optimization/pg 34 | policy_optimization/ppo 35 | value_optimization/rainbow 36 | value_optimization/qr_dqn 37 | policy_optimization/sac 38 | policy_optimization/td3 39 | policy_optimization/wolpertinger 40 | 41 | 42 | 43 | .. autoclass:: rl_coach.base_parameters.AgentParameters 44 | 45 | .. autoclass:: rl_coach.agents.agent.Agent 46 | :members: 47 | :inherited-members: 48 | 49 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/policy_optimization/ac.rst: -------------------------------------------------------------------------------- 1 | Actor-Critic 2 | ============ 3 | 4 | **Actions space:** Discrete | Continuous 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. 
image:: /_static/img/design_imgs/ac.png 12 | :width: 500px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | 18 | Choosing an action - Discrete actions 19 | +++++++++++++++++++++++++++++++++++++ 20 | 21 | The policy network is used in order to predict action probabilites. While training, a sample is taken from a categorical 22 | distribution assigned with these probabilities. When testing, the action with the highest probability is used. 23 | 24 | Training the network 25 | ++++++++++++++++++++ 26 | A batch of :math:`T_{max}` transitions is used, and the advantages are calculated upon it. 27 | 28 | Advantages can be calculated by either of the following methods (configured by the selected preset) - 29 | 30 | 1. **A_VALUE** - Estimating advantage directly: 31 | :math:`A(s_t, a_t) = \underbrace{\sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})}_{Q(s_t, a_t)} - V(s_t)` 32 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 33 | 34 | 2. **GAE** - By following the `Generalized Advantage Estimation `_ paper. 35 | 36 | The advantages are then used in order to accumulate gradients according to 37 | :math:`L = -\mathop{\mathbb{E}} [log (\pi) \cdot A]` 38 | 39 | 40 | .. autoclass:: rl_coach.agents.actor_critic_agent.ActorCriticAlgorithmParameters -------------------------------------------------------------------------------- /docs_raw/source/components/agents/policy_optimization/hac.rst: -------------------------------------------------------------------------------- 1 | Hierarchical Actor Critic 2 | ========================= 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Hierarchical Reinforcement Learning with Hindsight `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/ddpg.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Choosing an action 17 | ++++++++++++++++++ 18 | 19 | Pass the current states through the actor network, and get an action mean vector :math:`\mu`. 20 | While in training phase, use a continuous exploration policy, such as the Ornstein-Uhlenbeck process, 21 | to add exploration noise to the action. When testing, use the mean vector :math:`\mu` as-is. 22 | 23 | Training the network 24 | ++++++++++++++++++++ 25 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/categorical_dqn.rst: -------------------------------------------------------------------------------- 1 | Categorical DQN 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `A Distributional Perspective on Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/distributional_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. 
The Bellman update is projected to the set of atoms representing the :math:`Q` values distribution, such 23 | that the :math:`i-th` component of the projected update is calculated as follows: 24 | 25 | :math:`(\Phi \hat{T} Z_{\theta}(s_t,a_t))_i=\sum_{j=0}^{N-1}\Big[1-\frac{\lvert[\hat{T}_{z_{j}}]^{V_{MAX}}_{V_{MIN}}-z_i\rvert}{\Delta z}\Big]^1_0 \ p_j(s_{t+1}, \pi(s_{t+1}))` 26 | 27 | where: 28 | * :math:`[ \cdot ]` bounds its argument in the range :math:`[a, b]` 29 | * :math:`\hat{T}_{z_{j}}` is the Bellman update for atom :math:`z_j`: :math:`\hat{T}_{z_{j}} := r+\gamma z_j` 30 | 31 | 32 | 3. Network is trained with the cross entropy loss between the resulting probability distribution and the target 33 | probability distribution. Only the target of the actions that were actually taken is updated. 34 | 35 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 36 | 37 | 38 | 39 | .. autoclass:: rl_coach.agents.categorical_dqn_agent.CategoricalDQNAlgorithmParameters 40 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/double_dqn.rst: -------------------------------------------------------------------------------- 1 | Double DQN 2 | ========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Deep Reinforcement Learning with Double Q-learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. Using the next states from the sampled batch, run the online network in order to find the :math:`Q` maximizing 23 | action :math:`argmax_a Q(s_{t+1},a)`. For these actions, use the corresponding next states and run the target 24 | network to calculate :math:`Q(s_{t+1},argmax_a Q(s_{t+1},a))`. 25 | 26 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 27 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 28 | Set those values as the targets for the actions that were not actually played. 29 | 30 | 4. For each action that was played, use the following equation for calculating the targets of the network: 31 | :math:`y_t=r(s_t,a_t )+\gamma \cdot Q(s_{t+1},argmax_a Q(s_{t+1},a))` 32 | 33 | 5. Finally, train the online network using the current states as inputs, and with the aforementioned targets. 34 | 35 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 36 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/dqn.rst: -------------------------------------------------------------------------------- 1 | Deep Q Networks 2 | =============== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Playing Atari with Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. 
Using the next states from the sampled batch, run the target network to calculate the :math:`Q` values for each of 23 | the actions :math:`Q(s_{t+1},a)`, and keep only the maximum value for each state. 24 | 25 | 3. In order to zero out the updates for the actions that were not played (resulting from zeroing the MSE loss), 26 | use the current states from the sampled batch, and run the online network to get the current Q values predictions. 27 | Set those values as the targets for the actions that were not actually played. 28 | 29 | 4. For each action that was played, use the following equation for calculating the targets of the network: 30 | :math:`y_t=r(s_t,a_t )+\gamma \cdot \max_a Q(s_{t+1},a)` 31 | 32 | 5. Finally, train the online network using the current states as inputs, and with the aforementioned targets. 33 | 34 | 6. Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.dqn_agent.DQNAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/dueling_dqn.rst: -------------------------------------------------------------------------------- 1 | Dueling DQN 2 | =========== 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Dueling Network Architectures for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dueling_dqn.png 12 | :align: center 13 | 14 | General Description 15 | ------------------- 16 | Dueling DQN presents a change in the network structure compared to DQN. 17 | 18 | Dueling DQN uses a specialized *Dueling Q Head* in order to separate :math:`Q` into an :math:`A` (advantage) 19 | stream and a :math:`V` stream. Adding this type of structure to the network head allows the network to better differentiate 20 | actions from one another, and significantly improves learning. 21 | 22 | In many states, the values of the different actions are very similar, and it is less important which action to take. 23 | This is especially important in environments where there are many actions to choose from. In DQN, on each training 24 | iteration, for each of the states in the batch, we update the :math:`Q` values only for the specific actions taken in 25 | those states. This results in slower learning, as we do not learn the :math:`Q` values for actions that were not taken yet. 26 | With the dueling architecture, on the other hand, learning is faster, as we start learning the state-value even if only a 27 | single action has been taken at this state. -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/mmc.rst: -------------------------------------------------------------------------------- 1 | Mixed Monte Carlo 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Count-Based Exploration with Neural Density Models `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | Training the network 17 | ++++++++++++++++++++ 18 | 19 | In MMC, targets are calculated as a mixture between Double DQN targets and full Monte Carlo samples (total discounted returns).
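The exact target formulas follow below. As a rough illustration of the mixing step only, here is a minimal NumPy sketch; it is not Coach's implementation, the function names, the `terminals` handling and the default `alpha` value are illustrative assumptions, and the batch is assumed to be a single episode in temporal order so that full Monte Carlo returns can be computed:

```python
import numpy as np

def monte_carlo_returns(rewards, gamma):
    # Full-episode discounted returns: y_t^MC = sum_j gamma^j * r_{t+j}.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def mixed_targets(rewards, terminals, q_online_next, q_target_next, gamma=0.99, alpha=0.1):
    # Double DQN bootstrap: select the argmax action with the online network,
    # then evaluate that action with the target network.
    best_next = np.argmax(q_online_next, axis=1)
    bootstrap = q_target_next[np.arange(len(rewards)), best_next]
    y_ddqn = rewards + gamma * (1.0 - terminals) * bootstrap
    # Blend the bootstrapped targets with the full Monte Carlo returns.
    y_mc = monte_carlo_returns(rewards, gamma)
    return (1.0 - alpha) * y_ddqn + alpha * y_mc
```

The sketch only shows the arithmetic of the blended target; in Coach the mixing ratio itself is configured through the agent's algorithm parameters (see the autoclass reference below).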
20 | 21 | The DDQN targets are calculated in the same manner as in the DDQN agent: 22 | 23 | :math:`y_t^{DDQN}=r(s_t,a_t )+\gamma Q(s_{t+1},argmax_a Q(s_{t+1},a))` 24 | 25 | The Monte Carlo targets are calculated by summing up the discounted rewards across the entire episode: 26 | 27 | :math:`y_t^{MC}=\sum_{j=0}^T\gamma^j r(s_{t+j},a_{t+j} )` 28 | 29 | A mixing ratio :math:`\alpha` is then used to get the final targets: 30 | 31 | :math:`y_t=(1-\alpha)\cdot y_t^{DDQN}+\alpha \cdot y_t^{MC}` 32 | 33 | Finally, the online network is trained using the current states as inputs, and the calculated targets. 34 | Once in every few thousand steps, copy the weights from the online network to the target network. 35 | 36 | 37 | .. autoclass:: rl_coach.agents.mmc_agent.MixedMonteCarloAlgorithmParameters 38 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/n_step.rst: -------------------------------------------------------------------------------- 1 | N-Step Q Learning 2 | ================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Asynchronous Methods for Deep Reinforcement Learning `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | The :math:`N`-step Q learning algorithm works in a similar manner to DQN, except for the following changes: 21 | 22 | 1. No replay buffer is used. Instead of sampling random batches of transitions, the network is trained every 23 | :math:`N` steps using the latest :math:`N` steps played by the agent. 24 | 25 | 2. In order to stabilize learning, multiple workers work together to update the network. 26 | This creates the same effect as decorrelating the samples used for training. 27 | 28 | 3. Instead of using single-step Q targets for the network, the rewards from :math:`N` consecutive steps are accumulated 29 | to form the :math:`N`-step Q targets, according to the following equation: 30 | :math:`R(s_t, a_t) = \sum_{i=t}^{i=t + k - 1} \gamma^{i-t}r_i +\gamma^{k} V(s_{t+k})` 31 | where :math:`k` is :math:`T_{max} - State\_Index` for each state in the batch. 32 | 33 | 34 | 35 | .. autoclass:: rl_coach.agents.n_step_q_agent.NStepQAlgorithmParameters 36 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/naf.rst: -------------------------------------------------------------------------------- 1 | Normalized Advantage Functions 2 | ============================== 3 | 4 | **Actions space:** Continuous 5 | 6 | **References:** `Continuous Deep Q-Learning with Model-based Acceleration `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/naf.png 12 | :width: 600px 13 | :align: center 14 | 15 | Algorithm Description 16 | --------------------- 17 | Choosing an action 18 | ++++++++++++++++++ 19 | The current state is used as an input to the network. The action mean :math:`\mu(s_t )` is extracted from the output head. 20 | It is then passed to the exploration policy, which adds noise in order to encourage exploration.
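As a minimal sketch of this action-selection pattern, the snippet below perturbs the predicted action mean with Ornstein-Uhlenbeck noise during training and uses the mean as-is at evaluation time. This is not Coach's exploration-policy API; the class, the `choose_action` helper, and the noise parameters are illustrative assumptions:

```python
import numpy as np

class OrnsteinUhlenbeckNoise:
    # Temporally correlated noise, commonly used to perturb continuous action means.
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, dt=1.0):
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.state = np.full(action_dim, mu, dtype=np.float64)

    def sample(self):
        drift = self.theta * (self.mu - self.state) * self.dt
        diffusion = self.sigma * np.sqrt(self.dt) * np.random.randn(*self.state.shape)
        self.state = self.state + drift + diffusion
        return self.state

def choose_action(action_mean, noise, low, high, training=True):
    # Training: explore around the predicted mean; evaluation: act greedily on the mean.
    action = (action_mean + noise.sample()) if training else action_mean
    return np.clip(action, low, high)

# Example usage with a 2-dimensional action bounded in [-1, 1]:
noise = OrnsteinUhlenbeckNoise(action_dim=2)
mu_s_t = np.array([0.3, -0.7])  # stand-in for the network's predicted action mean
print(choose_action(mu_s_t, noise, low=-1.0, high=1.0, training=True))
```

Temporally correlated noise tends to produce smoother exploration trajectories than independent Gaussian noise, which is why it is a common choice for continuous-control agents.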
21 | 22 | Training the network 23 | ++++++++++++++++++++ 24 | The network is trained by using the following targets: 25 | :math:`y_t=r(s_t,a_t )+\gamma\cdot V(s_{t+1})` 26 | Use the next states as the inputs to the target network and extract the :math:`V` value, from within the head, 27 | to get :math:`V(s_{t+1} )`. Then, update the online network using the current states and actions as inputs, 28 | and :math:`y_t` as the targets. 29 | After every training step, use a soft update in order to copy the weights from the online network to the target network. 30 | 31 | 32 | 33 | .. autoclass:: rl_coach.agents.naf_agent.NAFAlgorithmParameters 34 | -------------------------------------------------------------------------------- /docs_raw/source/components/agents/value_optimization/qr_dqn.rst: -------------------------------------------------------------------------------- 1 | Quantile Regression DQN 2 | ======================= 3 | 4 | **Actions space:** Discrete 5 | 6 | **References:** `Distributional Reinforcement Learning with Quantile Regression `_ 7 | 8 | Network Structure 9 | ----------------- 10 | 11 | .. image:: /_static/img/design_imgs/qr_dqn.png 12 | :align: center 13 | 14 | Algorithm Description 15 | --------------------- 16 | 17 | Training the network 18 | ++++++++++++++++++++ 19 | 20 | 1. Sample a batch of transitions from the replay buffer. 21 | 22 | 2. First, the next state quantiles are predicted. These are used in order to calculate the targets for the network, 23 | by following the Bellman equation. 24 | Next, the current quantile locations for the current states are predicted, sorted, and used for calculating the 25 | quantile midpoints targets. 26 | 27 | 3. The network is trained with the quantile regression loss between the resulting quantile locations and the target 28 | quantile locations. Only the targets of the actions that were actually taken are updated. 29 | 30 | 4. Once in every few thousand steps, weights are copied from the online network to the target network. 31 | 32 | 33 | .. autoclass:: rl_coach.agents.qr_dqn_agent.QuantileRegressionDQNAlgorithmParameters -------------------------------------------------------------------------------- /docs_raw/source/components/architectures/index.rst: -------------------------------------------------------------------------------- 1 | Architectures 2 | ============= 3 | 4 | Architectures contain all the classes that implement the neural network related stuff for the agent. 5 | Since Coach is intended to work with multiple neural network frameworks, each framework will implement its 6 | own components under a dedicated directory. For example, tensorflow components will contain all the neural network 7 | parts that are implemented using TensorFlow. 8 | 9 | .. autoclass:: rl_coach.base_parameters.NetworkParameters 10 | 11 | Architecture 12 | ------------ 13 | .. autoclass:: rl_coach.architectures.architecture.Architecture 14 | :members: 15 | :inherited-members: 16 | 17 | NetworkWrapper 18 | -------------- 19 | 20 | .. image:: /_static/img/distributed.png 21 | :width: 600px 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.architectures.network_wrapper.NetworkWrapper 25 | :members: 26 | :inherited-members: 27 | 28 | -------------------------------------------------------------------------------- /docs_raw/source/components/core_types.rst: -------------------------------------------------------------------------------- 1 | Core Types 2 | ========== 3 | 4 | ActionInfo 5 | ---------- 6 | .. 
autoclass:: rl_coach.core_types.ActionInfo 7 | :members: 8 | :inherited-members: 9 | 10 | Batch 11 | ----- 12 | .. autoclass:: rl_coach.core_types.Batch 13 | :members: 14 | :inherited-members: 15 | 16 | EnvResponse 17 | ----------- 18 | .. autoclass:: rl_coach.core_types.EnvResponse 19 | :members: 20 | :inherited-members: 21 | 22 | Episode 23 | ------- 24 | .. autoclass:: rl_coach.core_types.Episode 25 | :members: 26 | :inherited-members: 27 | 28 | Transition 29 | ---------- 30 | .. autoclass:: rl_coach.core_types.Transition 31 | :members: 32 | :inherited-members: 33 | 34 | -------------------------------------------------------------------------------- /docs_raw/source/components/data_stores/index.rst: -------------------------------------------------------------------------------- 1 | Data Stores 2 | =========== 3 | 4 | S3DataStore 5 | ----------- 6 | .. autoclass:: rl_coach.data_stores.s3_data_store.S3DataStore 7 | 8 | NFSDataStore 9 | ------------ 10 | .. autoclass:: rl_coach.data_stores.nfs_data_store.NFSDataStore 11 | -------------------------------------------------------------------------------- /docs_raw/source/components/filters/index.rst: -------------------------------------------------------------------------------- 1 | Filters 2 | ======= 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Filters 7 | 8 | input_filters 9 | output_filters 10 | 11 | Filters are a mechanism in Coach that allows doing pre-processing and post-processing of the internal agent information. 12 | There are two filter categories - 13 | 14 | * **Input filters** - these are filters that process the information passed **into** the agent from the environment. 15 | This information includes the observation and the reward. Input filters therefore allow rescaling observations, 16 | normalizing rewards, stack observations, etc. 17 | 18 | * **Output filters** - these are filters that process the information going **out** of the agent into the environment. 19 | This information includes the action the agent chooses to take. Output filters therefore allow conversion of 20 | actions from one space into another. For example, the agent can take :math:`N` discrete actions, that will be mapped by 21 | the output filter onto :math:`N` continuous actions. 22 | 23 | Filters can be stacked on top of each other in order to build complex processing flows of the inputs or outputs. 24 | 25 | .. image:: /_static/img/filters.png 26 | :width: 350px 27 | :align: center 28 | 29 | -------------------------------------------------------------------------------- /docs_raw/source/components/filters/output_filters.rst: -------------------------------------------------------------------------------- 1 | Output Filters 2 | -------------- 3 | 4 | The output filters only process the actions. 5 | 6 | Action Filters 7 | ++++++++++++++ 8 | 9 | .. autoclass:: rl_coach.filters.action.AttentionDiscretization 10 | 11 | .. image:: /_static/img/attention_discretization.png 12 | :align: center 13 | 14 | .. autoclass:: rl_coach.filters.action.BoxDiscretization 15 | 16 | .. image:: /_static/img/box_discretization.png 17 | :align: center 18 | 19 | .. autoclass:: rl_coach.filters.action.BoxMasking 20 | 21 | .. image:: /_static/img/box_masking.png 22 | :align: center 23 | 24 | .. autoclass:: rl_coach.filters.action.PartialDiscreteActionSpaceMap 25 | 26 | .. image:: /_static/img/partial_discrete_action_space_map.png 27 | :align: center 28 | 29 | .. autoclass:: rl_coach.filters.action.FullDiscreteActionSpaceMap 30 | 31 | .. 
image:: /_static/img/full_discrete_action_space_map.png 32 | :align: center 33 | 34 | .. autoclass:: rl_coach.filters.action.LinearBoxToBoxMap 35 | 36 | .. image:: /_static/img/linear_box_to_box_map.png 37 | :align: center -------------------------------------------------------------------------------- /docs_raw/source/components/memories/index.rst: -------------------------------------------------------------------------------- 1 | Memories 2 | ======== 3 | 4 | Episodic Memories 5 | ----------------- 6 | 7 | EpisodicExperienceReplay 8 | ++++++++++++++++++++++++ 9 | .. autoclass:: rl_coach.memories.episodic.EpisodicExperienceReplay 10 | 11 | EpisodicHindsightExperienceReplay 12 | +++++++++++++++++++++++++++++++++ 13 | .. autoclass:: rl_coach.memories.episodic.EpisodicHindsightExperienceReplay 14 | 15 | EpisodicHRLHindsightExperienceReplay 16 | ++++++++++++++++++++++++++++++++++++ 17 | .. autoclass:: rl_coach.memories.episodic.EpisodicHRLHindsightExperienceReplay 18 | 19 | SingleEpisodeBuffer 20 | +++++++++++++++++++ 21 | .. autoclass:: rl_coach.memories.episodic.SingleEpisodeBuffer 22 | 23 | 24 | Non-Episodic Memories 25 | --------------------- 26 | BalancedExperienceReplay 27 | ++++++++++++++++++++++++ 28 | .. autoclass:: rl_coach.memories.non_episodic.BalancedExperienceReplay 29 | 30 | QDND 31 | ++++ 32 | .. autoclass:: rl_coach.memories.non_episodic.QDND 33 | 34 | ExperienceReplay 35 | ++++++++++++++++ 36 | .. autoclass:: rl_coach.memories.non_episodic.ExperienceReplay 37 | 38 | PrioritizedExperienceReplay 39 | +++++++++++++++++++++++++++ 40 | .. autoclass:: rl_coach.memories.non_episodic.PrioritizedExperienceReplay 41 | 42 | TransitionCollection 43 | ++++++++++++++++++++ 44 | .. autoclass:: rl_coach.memories.non_episodic.TransitionCollection 45 | -------------------------------------------------------------------------------- /docs_raw/source/components/memory_backends/index.rst: -------------------------------------------------------------------------------- 1 | Memory Backends 2 | =============== 3 | 4 | RedisPubSubBackend 5 | ------------------ 6 | .. autoclass:: rl_coach.memories.backend.redis.RedisPubSubBackend 7 | -------------------------------------------------------------------------------- /docs_raw/source/components/orchestrators/index.rst: -------------------------------------------------------------------------------- 1 | Orchestrators 2 | ============= 3 | 4 | 5 | Kubernetes 6 | ---------- 7 | .. autoclass:: rl_coach.orchestrators.kubernetes_orchestrator.Kubernetes 8 | -------------------------------------------------------------------------------- /docs_raw/source/components/spaces.rst: -------------------------------------------------------------------------------- 1 | Spaces 2 | ====== 3 | 4 | Space 5 | ----- 6 | .. autoclass:: rl_coach.spaces.Space 7 | :members: 8 | :inherited-members: 9 | 10 | 11 | 12 | Observation Spaces 13 | ------------------ 14 | .. autoclass:: rl_coach.spaces.ObservationSpace 15 | :members: 16 | :inherited-members: 17 | 18 | VectorObservationSpace 19 | ++++++++++++++++++++++ 20 | .. autoclass:: rl_coach.spaces.VectorObservationSpace 21 | 22 | PlanarMapsObservationSpace 23 | ++++++++++++++++++++++++++ 24 | .. autoclass:: rl_coach.spaces.PlanarMapsObservationSpace 25 | 26 | ImageObservationSpace 27 | +++++++++++++++++++++ 28 | .. autoclass:: rl_coach.spaces.ImageObservationSpace 29 | 30 | 31 | 32 | Action Spaces 33 | ------------- 34 | .. 
autoclass:: rl_coach.spaces.ActionSpace 35 | :members: 36 | :inherited-members: 37 | 38 | AttentionActionSpace 39 | ++++++++++++++++++++ 40 | .. autoclass:: rl_coach.spaces.AttentionActionSpace 41 | 42 | BoxActionSpace 43 | ++++++++++++++ 44 | .. autoclass:: rl_coach.spaces.BoxActionSpace 45 | 46 | DiscreteActionSpace 47 | ++++++++++++++++++++ 48 | .. autoclass:: rl_coach.spaces.DiscreteActionSpace 49 | 50 | MultiSelectActionSpace 51 | ++++++++++++++++++++++ 52 | .. autoclass:: rl_coach.spaces.MultiSelectActionSpace 53 | 54 | CompoundActionSpace 55 | +++++++++++++++++++ 56 | .. autoclass:: rl_coach.spaces.CompoundActionSpace 57 | 58 | 59 | 60 | Goal Spaces 61 | ----------- 62 | .. autoclass:: rl_coach.spaces.GoalsSpace 63 | :members: 64 | :inherited-members: 65 | -------------------------------------------------------------------------------- /docs_raw/source/features/algorithms.rst: -------------------------------------------------------------------------------- 1 | Algorithms 2 | ========== 3 | 4 | Coach supports many state-of-the-art reinforcement learning algorithms, which are separated into three main classes - 5 | value optimization, policy optimization and imitation learning. 6 | A detailed description of those algorithms may be found in the `agents <../components/agents/index.html>`_ section. 7 | 8 | .. image:: /_static/img/algorithms.png 9 | :width: 600px 10 | :align: center -------------------------------------------------------------------------------- /docs_raw/source/features/batch_rl.rst: -------------------------------------------------------------------------------- 1 | Batch Reinforcement Learning 2 | ============================ 3 | 4 | Coach supports Batch Reinforcement Learning, where learning is based solely on a (fixed) batch of data. 5 | In Batch RL, we are given a dataset of experience, which was collected using some (one or more) deployed policies, and we would 6 | like to use it to learn a better policy than what was used to collect the dataset. 7 | There is no simulator to interact with, and so we cannot collect any new data, meaning we often cannot explore the MDP any further. 8 | To make things even harder, we would also like to use the dataset in order to evaluate the newly learned policy 9 | (using off-policy evaluation), since we do not have a simulator which we can use to evaluate the policy on. 10 | Batch RL is also often beneficial in cases where we just want to separate the inference (data collection) from the 11 | training process of a new policy. This is often the case where we have a system on which we could quite easily deploy a policy 12 | and collect experience data, but cannot easily use that system's setup to online train a new policy (as is often the 13 | case with more standard RL algorithms). 14 | 15 | Coach supports (almost) all of the integrated off-policy algorithms with Batch RL. 16 | 17 | A lot more details and example usage can be found in the 18 | `tutorial `_. -------------------------------------------------------------------------------- /docs_raw/source/features/benchmarks.rst: -------------------------------------------------------------------------------- 1 | Benchmarks 2 | ========== 3 | 4 | Reinforcement learning is a developing field, and so far it has been particularly difficult to reproduce some of the 5 | results published in the original papers. Some reasons for this are: 6 | 7 | * Reinforcement learning algorithms are notoriously known as having an unstable learning process. 
8 | The data the neural networks trains on is dynamic, and depends on the random seed defined for the environment. 9 | 10 | * Reinforcement learning algorithms have many moving parts. For some environments and agents, there are many 11 | "tricks" which are needed to get the exact behavior the paper authors had seen. Also, there are **a lot** of 12 | hyper-parameters to set. 13 | 14 | In order for a reinforcement learning implementation to be useful for research or for data science, it must be 15 | shown that it achieves the expected behavior. For this reason, we collected a set of benchmark results from most 16 | of the algorithms implemented in Coach. The algorithms were tested on a subset of the same environments that were 17 | used in the original papers, and with multiple seed for each environment. 18 | Additionally, Coach uses some strict testing mechanisms to try and make sure the results we show for these 19 | benchmarks stay intact as Coach continues to develop. 20 | 21 | To see the benchmark results, please visit the 22 | `following GitHub page `_. -------------------------------------------------------------------------------- /docs_raw/source/features/index.rst: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | :caption: Features 7 | 8 | algorithms 9 | environments 10 | benchmarks 11 | batch_rl -------------------------------------------------------------------------------- /docs_raw/source/test.rst: -------------------------------------------------------------------------------- 1 | test 2 | ---- 3 | 4 | .. important:: Its a note! in markdown! 5 | 6 | .. autoclass:: rl_coach.agents.dqn_agent.DQNAgent 7 | :members: 8 | :inherited-members: -------------------------------------------------------------------------------- /img/ant.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/ant.gif -------------------------------------------------------------------------------- /img/carla.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/carla.gif -------------------------------------------------------------------------------- /img/coach_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/coach_logo.png -------------------------------------------------------------------------------- /img/dashboard.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/dashboard.gif -------------------------------------------------------------------------------- /img/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/dashboard.png -------------------------------------------------------------------------------- /img/doom_deathmatch.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/doom_deathmatch.gif 
-------------------------------------------------------------------------------- /img/doom_health.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/doom_health.gif -------------------------------------------------------------------------------- /img/fetch_slide.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/fetch_slide.gif -------------------------------------------------------------------------------- /img/minitaur.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/minitaur.gif -------------------------------------------------------------------------------- /img/montezuma.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/montezuma.gif -------------------------------------------------------------------------------- /img/pendulum.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/pendulum.gif -------------------------------------------------------------------------------- /img/starcraft.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/img/starcraft.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annoy>=1.8.3 2 | Pillow>=9.0.1 3 | matplotlib>=2.0.2 4 | numpy>=1.14.5 5 | pandas>=0.22.0 6 | pygame>=1.9.3 7 | PyOpenGL>=3.1.0 8 | scipy>=0.19.0 9 | scikit-image>=0.13.0 10 | gym==0.12.5 11 | bokeh==1.0.4 12 | kubernetes>=8.0.0b1,<=8.0.1 13 | redis>=2.10.6 14 | minio>=4.0.5 15 | pytest>=3.8.2 16 | psutil>=5.5.0 17 | joblib>=0.17.0 18 | -------------------------------------------------------------------------------- /rl_coach/__init__.py: -------------------------------------------------------------------------------- 1 | # This gets rid of NumPy FutureWarnings that occur at TF import 2 | import warnings 3 | warnings.filterwarnings('ignore',category=FutureWarning) 4 | 5 | # This gets rid of TF 2.0 related deprecation warnings 6 | import tensorflow as tf 7 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 8 | -------------------------------------------------------------------------------- /rl_coach/agents/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/architectures/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/architectures/mxnet_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_embedder import ImageEmbedder 2 | from .tensor_embedder import TensorEmbedder 3 | from .vector_embedder import VectorEmbedder 4 | 5 | __all__ = ['ImageEmbedder', 6 | 'TensorEmbedder', 7 | 'VectorEmbedder'] 8 | -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .head import Head, HeadLoss 2 | from .q_head import QHead 3 | from .ppo_head import PPOHead 4 | from .ppo_v_head import PPOVHead 5 | from .v_head import VHead 6 | 7 | __all__ = [ 8 | 'Head', 9 | 'HeadLoss', 10 | 'QHead', 11 | 'PPOHead', 12 | 'PPOVHead', 13 | 'VHead' 14 | ] 15 | -------------------------------------------------------------------------------- /rl_coach/architectures/mxnet_components/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .fc_middleware import FCMiddleware 2 | from .lstm_middleware import LSTMMiddleware 3 | 4 | __all__ = ["FCMiddleware", "LSTMMiddleware"] -------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/architectures/tensorflow_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_embedder import ImageEmbedder 2 | from .vector_embedder import VectorEmbedder 3 | from .tensor_embedder import TensorEmbedder 4 | 5 | __all__ = ['ImageEmbedder', 'VectorEmbedder', 'TensorEmbedder'] 6 | 
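# ----------------------------------------------------------------------------
# Illustrative sketch only (not a file in this repository): the MXNet and
# TensorFlow backends both re-export ImageEmbedder, VectorEmbedder and
# TensorEmbedder from their respective `embedders` packages (see the two
# __init__.py files above), so the matching implementation can be resolved at
# runtime from a framework name. The helper name below is hypothetical.
import importlib


def load_embedder_classes(framework_package: str):
    """framework_package is assumed to be 'mxnet_components' or 'tensorflow_components'."""
    embedders = importlib.import_module(
        'rl_coach.architectures.{}.embedders'.format(framework_package))
    return embedders.ImageEmbedder, embedders.VectorEmbedder, embedders.TensorEmbedder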
-------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .q_head import QHead 2 | from .categorical_q_head import CategoricalQHead 3 | from .ddpg_actor_head import DDPGActor 4 | from .dnd_q_head import DNDQHead 5 | from .dueling_q_head import DuelingQHead 6 | from .measurements_prediction_head import MeasurementsPredictionHead 7 | from .naf_head import NAFHead 8 | from .policy_head import PolicyHead 9 | from .ppo_head import PPOHead 10 | from .ppo_v_head import PPOVHead 11 | from .quantile_regression_q_head import QuantileRegressionQHead 12 | from .rainbow_q_head import RainbowQHead 13 | from .v_head import VHead 14 | from .acer_policy_head import ACERPolicyHead 15 | from .sac_head import SACPolicyHead 16 | from .sac_q_head import SACQHead 17 | from .classification_head import ClassificationHead 18 | from .cil_head import RegressionHead 19 | from .td3_v_head import TD3VHead 20 | from .ddpg_v_head import DDPGVHead 21 | from .wolpertinger_actor_head import WolpertingerActorHead 22 | from .RND_head import RNDHead 23 | 24 | __all__ = [ 25 | 'CategoricalQHead', 26 | 'DDPGActor', 27 | 'DNDQHead', 28 | 'DuelingQHead', 29 | 'MeasurementsPredictionHead', 30 | 'NAFHead', 31 | 'PolicyHead', 32 | 'PPOHead', 33 | 'PPOVHead', 34 | 'QHead', 35 | 'QuantileRegressionQHead', 36 | 'RainbowQHead', 37 | 'VHead', 38 | 'ACERPolicyHead', 39 | 'SACPolicyHead', 40 | 'SACQHead', 41 | 'ClassificationHead', 42 | 'RegressionHead', 43 | 'TD3VHead', 44 | 'DDPGVHead', 45 | 'WolpertingerActorHead', 46 | 'RNDHead' 47 | ] 48 | -------------------------------------------------------------------------------- /rl_coach/architectures/tensorflow_components/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .fc_middleware import FCMiddleware 2 | from .lstm_middleware import LSTMMiddleware 3 | 4 | __all__ = ["FCMiddleware", "LSTMMiddleware"] 5 | -------------------------------------------------------------------------------- /rl_coach/dashboard_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/dashboard_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/dashboard_components/boards.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | from bokeh.layouts import column 19 | from bokeh.models.widgets import Panel, Tabs 20 | from rl_coach.dashboard_components.experiment_board import experiment_board_layout 21 | from rl_coach.dashboard_components.episodic_board import episodic_board_layout 22 | from rl_coach.dashboard_components.globals import spinner, layouts 23 | from bokeh.models.widgets import Div 24 | 25 | # ---------------- Build Website Layout ------------------- 26 | 27 | # title 28 | title = Div(text="""

Coach Dashboard

""") 29 | center = Div(text="""""") 30 | tab1 = Panel(child=experiment_board_layout, title='experiment board') 31 | # tab2 = Panel(child=episodic_board_layout, title='episodic board') 32 | # tabs = Tabs(tabs=[tab1, tab2]) 33 | tabs = Tabs(tabs=[tab1]) 34 | 35 | layout = column(title, center, tabs) 36 | layout = column(layout, spinner) 37 | 38 | layouts['boards'] = layout 39 | -------------------------------------------------------------------------------- /rl_coach/data_stores/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/environments/README.md: -------------------------------------------------------------------------------- 1 | A custom environment implementation should look like this: 2 | 3 | ```bash 4 | from coach.filters.input_filter import InputFilter 5 | 6 | class CustomFilter(InputFilter): 7 | def __init__(self): 8 | ... 9 | def _filter(self, env_response: EnvResponse) -> EnvResponse: 10 | ... 11 | def _get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace: 12 | ... 13 | def _get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace: 14 | ... 15 | def _validate_input_observation_space(self, input_observation_space: ObservationSpace): 16 | ... 17 | def _reset(self): 18 | ... 19 | ``` 20 | -------------------------------------------------------------------------------- /rl_coach/environments/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | -------------------------------------------------------------------------------- /rl_coach/environments/doom/D2_navigation.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = D2_navigation.wad 6 | doom_map = map01 7 | 8 | # Rewards 9 | 10 | # Each step is good for you! 
11 | living_reward = 1 12 | # And death is not! 13 | death_penalty = 0 14 | 15 | # Rendering options 16 | screen_resolution = RES_160X120 17 | screen_format = GRAY8 18 | render_hud = false 19 | render_crosshair = false 20 | render_weapon = false 21 | render_decals = false 22 | render_particles = false 23 | window_visible = false 24 | 25 | # make episodes finish after 2100 actions (tics) 26 | episode_timeout = 2100 27 | 28 | # Available buttons 29 | available_buttons = 30 | { 31 | TURN_LEFT 32 | TURN_RIGHT 33 | MOVE_FORWARD 34 | } 35 | 36 | # Game variables that will be in the state 37 | available_game_variables = { HEALTH } 38 | 39 | mode = PLAYER 40 | -------------------------------------------------------------------------------- /rl_coach/environments/doom/D2_navigation.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/doom/D2_navigation.wad -------------------------------------------------------------------------------- /rl_coach/environments/doom/D3_battle.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | # modifty these to point to your vizdoom binary and freedoom2.wad 6 | doom_scenario_path = D3_battle.wad 7 | doom_map = map01 8 | 9 | # Rewards 10 | 11 | living_reward = 0 12 | death_penalty = 0 13 | 14 | # Rendering options 15 | screen_resolution = RES_320X240 16 | screen_format = CRCGCB 17 | render_hud = false 18 | render_crosshair = true 19 | render_weapon = true 20 | render_decals = false 21 | render_particles = false 22 | window_visible = false 23 | 24 | # make episodes finish after 2100 actions (tics) 25 | episode_timeout = 2100 26 | 27 | # Available buttons 28 | available_buttons = 29 | { 30 | MOVE_FORWARD 31 | MOVE_BACKWARD 32 | MOVE_RIGHT 33 | MOVE_LEFT 34 | TURN_LEFT 35 | TURN_RIGHT 36 | ATTACK 37 | SPEED 38 | } 39 | 40 | # Game variables that will be in the state 41 | available_game_variables = {AMMO2 HEALTH USER2} 42 | 43 | mode = PLAYER 44 | doom_skill = 2 45 | -------------------------------------------------------------------------------- /rl_coach/environments/doom/D3_battle.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/doom/D3_battle.wad -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/mujoco/__init__.py -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The dm_control Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================ 15 | 16 | """Functions to manage the common assets for domains.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import os 23 | from dm_control.utils import resources 24 | 25 | _SUITE_DIR = os.path.dirname(os.path.dirname(__file__)) 26 | _FILENAMES = [ 27 | "common/materials.xml", 28 | "common/skybox.xml", 29 | "common/visual.xml", 30 | ] 31 | 32 | ASSETS = {filename: resources.GetResource(os.path.join(_SUITE_DIR, filename)) 33 | for filename in _FILENAMES} 34 | 35 | 36 | def read_model(model_filename): 37 | """Reads a model XML file and returns its contents as a string.""" 38 | return resources.GetResource(os.path.join(_SUITE_DIR, model_filename)) 39 | -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/materials.xml: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/skybox.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /rl_coach/environments/mujoco/common/visual.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /rl_coach/environments/robosuite/osc_pose.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "OSC_POSE", 3 | "input_max": 1, 4 | "input_min": -1, 5 | "output_max": [0.125, 0.125, 0.125, 0.5, 0.5, 0.5], 6 | "output_min": [-0.125, -0.125, -0.125, -0.5, -0.5, -0.5], 7 | "kp": 150, 8 | "damping_ratio": 1, 9 | "impedance_mode": "fixed", 10 | "kp_limits": [0, 300], 11 | "damping_ratio_limits": [0, 10], 12 | "position_limits": [[-0.22, -0.35, 0.82], [0.22, 0.35, 1.3]], 13 | "orientation_limits": null, 14 | "uncouple_pos_ori": true, 15 | "control_delta": true, 16 | "interpolation": null, 17 | "ramp_ratio": 0.2 18 | } -------------------------------------------------------------------------------- /rl_coach/environments/toy_problems/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/environments/toy_problems/__init__.py -------------------------------------------------------------------------------- /rl_coach/exploration_policies/README.md: -------------------------------------------------------------------------------- 1 | # Exploration Policy 2 | 3 | An exploration policy is a module that is responsible for choosing the action according to the action values, the 4 | current phase, its internal 
state and the specific exploration policy algorithm. 5 | 6 | A custom exploration policy should implement both the exploration policy class and the exploration policy parameters 7 | class, which defines the parameters and the location of the exploration policy module. 8 | The parameters of the exploration policy class should match the parameters in the exploration policy parameters class. 9 | 10 | Exploration policies typically have some control parameter that defines its current exploration state, and 11 | a schedule for this parameter. This schedule can be defined using the Schedule class which is defined in 12 | exploration_policy.py. 13 | 14 | A custom implementation should look as follows: 15 | 16 | ``` 17 | class CustomExplorationParameters(ExplorationParameters): 18 | def __init__(self): 19 | super().__init__() 20 | ... 21 | 22 | @property 23 | def path(self): 24 | return 'module_path:class_name' 25 | 26 | 27 | class CustomExplorationPolicy(ExplorationPolicy): 28 | def __init__(self, action_space: ActionSpace, ...): 29 | super().__init__(action_space) 30 | 31 | def reset(self): 32 | ... 33 | 34 | def get_action(self, action_values: List[ActionType]) -> ActionType: 35 | ... 36 | 37 | def change_phase(self, phase): 38 | ... 39 | 40 | def get_control_param(self): 41 | ... 42 | ``` -------------------------------------------------------------------------------- /rl_coach/exploration_policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/exploration_policies/__init__.py -------------------------------------------------------------------------------- /rl_coach/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/filters/__init__.py -------------------------------------------------------------------------------- /rl_coach/filters/action/__init__.py: -------------------------------------------------------------------------------- 1 | from .attention_discretization import AttentionDiscretization 2 | from .box_discretization import BoxDiscretization 3 | from .box_masking import BoxMasking 4 | from .full_discrete_action_space_map import FullDiscreteActionSpaceMap 5 | from .linear_box_to_box_map import LinearBoxToBoxMap 6 | from .partial_discrete_action_space_map import PartialDiscreteActionSpaceMap 7 | __all__ = [ 8 | 'AttentionDiscretization', 9 | 'BoxDiscretization', 10 | 'BoxMasking', 11 | 'FullDiscreteActionSpaceMap', 12 | 'LinearBoxToBoxMap', 13 | 'PartialDiscreteActionSpaceMap' 14 | ] -------------------------------------------------------------------------------- /rl_coach/filters/action/full_discrete_action_space_map.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from rl_coach.filters.action.partial_discrete_action_space_map import PartialDiscreteActionSpaceMap 18 | from rl_coach.spaces import ActionSpace, DiscreteActionSpace 19 | 20 | 21 | class FullDiscreteActionSpaceMap(PartialDiscreteActionSpaceMap): 22 | """ 23 | Full map of two countable action spaces. This works in a similar way to the 24 | PartialDiscreteActionSpaceMap, but maps the entire source action space into the entire target action space, without 25 | masking any actions. 26 | For example, if there are 10 multiselect actions in the output space, the actions 0-9 will be mapped to those 27 | multiselect actions. 28 | """ 29 | def __init__(self): 30 | super().__init__() 31 | 32 | def get_unfiltered_action_space(self, output_action_space: ActionSpace) -> DiscreteActionSpace: 33 | self.target_actions = output_action_space.actions 34 | return super().get_unfiltered_action_space(output_action_space) 35 | -------------------------------------------------------------------------------- /rl_coach/filters/observation/__init__.py: -------------------------------------------------------------------------------- 1 | from .observation_clipping_filter import ObservationClippingFilter 2 | from .observation_crop_filter import ObservationCropFilter 3 | from .observation_move_axis_filter import ObservationMoveAxisFilter 4 | from .observation_normalization_filter import ObservationNormalizationFilter 5 | from .observation_reduction_by_sub_parts_name_filter import ObservationReductionBySubPartsNameFilter 6 | from .observation_rescale_size_by_factor_filter import ObservationRescaleSizeByFactorFilter 7 | from .observation_rescale_to_size_filter import ObservationRescaleToSizeFilter 8 | from .observation_rgb_to_y_filter import ObservationRGBToYFilter 9 | from .observation_squeeze_filter import ObservationSqueezeFilter 10 | from .observation_stacking_filter import ObservationStackingFilter 11 | from .observation_to_uint8_filter import ObservationToUInt8Filter 12 | 13 | __all__ = [ 14 | 'ObservationClippingFilter', 15 | 'ObservationCropFilter', 16 | 'ObservationMoveAxisFilter', 17 | 'ObservationNormalizationFilter', 18 | 'ObservationReductionBySubPartsNameFilter', 19 | 'ObservationRescaleSizeByFactorFilter', 20 | 'ObservationRescaleToSizeFilter', 21 | 'ObservationRGBToYFilter', 22 | 'ObservationSqueezeFilter', 23 | 'ObservationStackingFilter', 24 | 'ObservationToUInt8Filter' 25 | ] -------------------------------------------------------------------------------- /rl_coach/filters/observation/observation_filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from rl_coach.filters.filter import Filter 18 | from rl_coach.spaces import ObservationSpace 19 | 20 | 21 | class ObservationFilter(Filter): 22 | def __init__(self): 23 | super().__init__() 24 | self.supports_batching = False 25 | 26 | def get_filtered_observation_space(self, input_observation_space: ObservationSpace) -> ObservationSpace: 27 | """ 28 | This function should contain the logic for getting the filtered observation space 29 | :param input_observation_space: the input observation space 30 | :return: the filtered observation space 31 | """ 32 | return input_observation_space 33 | 34 | def validate_input_observation_space(self, input_observation_space: ObservationSpace): 35 | """ 36 | A function that implements validation of the input observation space 37 | :param input_observation_space: the input observation space 38 | :return: None 39 | """ 40 | pass -------------------------------------------------------------------------------- /rl_coach/filters/reward/__init__.py: -------------------------------------------------------------------------------- 1 | from .reward_rescale_filter import RewardRescaleFilter 2 | from .reward_clipping_filter import RewardClippingFilter 3 | from .reward_normalization_filter import RewardNormalizationFilter 4 | from .reward_ewma_normalization_filter import RewardEwmaNormalizationFilter 5 | 6 | __all__ = [ 7 | 'RewardRescaleFilter', 8 | 'RewardClippingFilter', 9 | 'RewardNormalizationFilter', 10 | 'RewardEwmaNormalizationFilter' 11 | ] -------------------------------------------------------------------------------- /rl_coach/filters/reward/reward_filter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | from rl_coach.filters.filter import Filter 18 | from rl_coach.spaces import RewardSpace 19 | 20 | 21 | class RewardFilter(Filter): 22 | def __init__(self): 23 | super().__init__() 24 | self.supports_batching = False 25 | 26 | def get_filtered_reward_space(self, input_reward_space: RewardSpace) -> RewardSpace: 27 | """ 28 | This function should contain the logic for getting the filtered reward space 29 | :param input_reward_space: the input reward space 30 | :return: the filtered reward space 31 | """ 32 | return input_reward_space -------------------------------------------------------------------------------- /rl_coach/graph_managers/README.md: -------------------------------------------------------------------------------- 1 | # Block Factory 2 | 3 | The block factory is a class which creates a block that fits into a specific RL scheme. 4 | Example RL schemes are: self play, multi agent, HRL, basic RL, etc. 5 | The block factory should create all the components of the block and return the block scheduler. 6 | The block factory will then be used to create different combinations of components. 
7 | For example, an HRL factory can be later instantiated with: 8 | * env = Atari Breakout 9 | * master (top hierarchy level) agent = DDPG 10 | * slave (bottom hierarchy level) agent = DQN 11 | 12 | A custom block factory implementation should look as follows: 13 | 14 | ``` 15 | class CustomFactory(BlockFactory): 16 | def __init__(self, custom_params): 17 | super().__init__() 18 | 19 | def _create_block(self, task_index: int, device=None) -> BlockScheduler: 20 | """ 21 | Create all the block modules and the block scheduler 22 | :param task_index: the index of the process on which the worker will be run 23 | :return: the initialized block scheduler 24 | """ 25 | 26 | # Create env 27 | # Create composite agents 28 | # Create level managers 29 | # Create block scheduler 30 | 31 | return block_scheduler 32 | ``` -------------------------------------------------------------------------------- /rl_coach/graph_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/graph_managers/__init__.py -------------------------------------------------------------------------------- /rl_coach/memories/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/memories/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/memories/backend/__init__.py -------------------------------------------------------------------------------- /rl_coach/memories/backend/memory_impl.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | from rl_coach.memories.backend.memory import MemoryBackendParameters 19 | from rl_coach.memories.backend.redis import RedisPubSubBackend, RedisPubSubMemoryBackendParameters 20 | 21 | 22 | def get_memory_backend(params: MemoryBackendParameters): 23 | 24 | backend = None 25 | if type(params) == RedisPubSubMemoryBackendParameters: 26 | backend = RedisPubSubBackend(params) 27 | 28 | return backend 29 | 30 | 31 | def construct_memory_params(json: dict): 32 | 33 | if json['store_type'] == 'redispubsub': 34 | memory_params = RedisPubSubMemoryBackendParameters( 35 | json['redis_address'], json['redis_port'], channel=json.get('channel', ''), run_type=json['run_type'] 36 | ) 37 | return memory_params 38 | -------------------------------------------------------------------------------- /rl_coach/memories/episodic/__init__.py: -------------------------------------------------------------------------------- 1 | from .episodic_experience_replay import EpisodicExperienceReplayParameters, EpisodicExperienceReplay 2 | from .episodic_hindsight_experience_replay import EpisodicHindsightExperienceReplayParameters, EpisodicHindsightExperienceReplay 3 | from .episodic_hrl_hindsight_experience_replay import EpisodicHRLHindsightExperienceReplayParameters, EpisodicHRLHindsightExperienceReplay 4 | from .single_episode_buffer import SingleEpisodeBufferParameters, SingleEpisodeBuffer 5 | __all__ = [ 6 | 'EpisodicExperienceReplayParameters', 7 | 'EpisodicHindsightExperienceReplayParameters', 8 | 'EpisodicHRLHindsightExperienceReplayParameters', 9 | 'SingleEpisodeBufferParameters', 10 | 'EpisodicExperienceReplay', 11 | 'EpisodicHindsightExperienceReplay', 12 | 'EpisodicHRLHindsightExperienceReplay', 13 | 'SingleEpisodeBuffer' 14 | ] 15 | -------------------------------------------------------------------------------- /rl_coach/memories/episodic/single_episode_buffer.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from rl_coach.memories.episodic.episodic_experience_replay import EpisodicExperienceReplay 18 | from rl_coach.memories.memory import MemoryGranularity, MemoryParameters 19 | 20 | 21 | class SingleEpisodeBufferParameters(MemoryParameters): 22 | def __init__(self): 23 | super().__init__() 24 | del self.max_size 25 | 26 | @property 27 | def path(self): 28 | return 'rl_coach.memories.episodic.single_episode_buffer:SingleEpisodeBuffer' 29 | 30 | 31 | class SingleEpisodeBuffer(EpisodicExperienceReplay): 32 | def __init__(self): 33 | super().__init__((MemoryGranularity.Episodes, 1)) 34 | -------------------------------------------------------------------------------- /rl_coach/memories/non_episodic/__init__.py: -------------------------------------------------------------------------------- 1 | from .balanced_experience_replay import BalancedExperienceReplayParameters, BalancedExperienceReplay 2 | from .differentiable_neural_dictionary import QDND 3 | from .experience_replay import ExperienceReplayParameters, ExperienceReplay 4 | from .prioritized_experience_replay import PrioritizedExperienceReplayParameters, PrioritizedExperienceReplay 5 | from .transition_collection import TransitionCollection 6 | __all__ = [ 7 | 'BalancedExperienceReplayParameters', 8 | 'BalancedExperienceReplay', 9 | 'QDND', 10 | 'ExperienceReplay', 11 | 'PrioritizedExperienceReplay', 12 | 'TransitionCollection' 13 | ] 14 | -------------------------------------------------------------------------------- /rl_coach/off_policy_evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/off_policy_evaluators/bandits/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/off_policy_evaluators/rl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/off_policy_evaluators/rl/__init__.py -------------------------------------------------------------------------------- /rl_coach/orchestrators/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | -------------------------------------------------------------------------------- /rl_coach/orchestrators/deploy.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2017 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | 19 | class DeployParameters(object): 20 | 21 | def __init__(self): 22 | pass 23 | 24 | 25 | class Deploy(object): 26 | 27 | def __init__(self, deploy_parameters): 28 | self.deploy_parameters = deploy_parameters 29 | 30 | def setup(self) -> bool: 31 | pass 32 | 33 | def deploy(self) -> bool: 34 | pass 35 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_C51.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.categorical_dqn_agent import CategoricalDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = CategoricalDQNAgentParameters() 11 | agent_params.network_wrappers['main'].learning_rate = 0.00025 12 | 13 | ############### 14 | # Environment # 15 | ############### 16 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 17 | 18 | ######## 19 | # Test # 20 | ######## 21 | preset_validation_params = PresetValidationParameters() 22 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 23 | 24 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 25 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 26 | preset_validation_params=preset_validation_params) 27 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DDQN.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.ddqn_agent import DDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = DDQNAgentParameters() 11 | agent_params.network_wrappers['main'].learning_rate = 0.00025 12 | 13 | ############### 14 | # Environment # 15 | ############### 16 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 17 | 18 | ######## 19 | # Test # 20 | ######## 21 | preset_validation_params = PresetValidationParameters() 22 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 23 | 24 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 25 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 26 | preset_validation_params=preset_validation_params) -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DDQN_with_PER.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.ddqn_agent import DDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | 
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters 7 | from rl_coach.schedules import LinearSchedule 8 | 9 | ######### 10 | # Agent # 11 | ######### 12 | agent_params = DDQNAgentParameters() 13 | agent_params.network_wrappers['main'].learning_rate = 0.00025/4 14 | agent_params.memory = PrioritizedExperienceReplayParameters() 15 | agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000) # 12.5M training iterations = 50M steps = 200M frames 16 | 17 | ############### 18 | # Environment # 19 | ############### 20 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 21 | 22 | ######## 23 | # Test # 24 | ######## 25 | preset_validation_params = PresetValidationParameters() 26 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 27 | 28 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 29 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 30 | preset_validation_params=preset_validation_params) 31 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DQN.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.dqn_agent import DQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = DQNAgentParameters() 11 | # since we are using Adam instead of RMSProp, we adjust the learning rate as well 12 | agent_params.network_wrappers['main'].learning_rate = 0.0001 13 | 14 | ############### 15 | # Environment # 16 | ############### 17 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 18 | 19 | ######## 20 | # Test # 21 | ######## 22 | preset_validation_params = PresetValidationParameters() 23 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 24 | 25 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 26 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 27 | preset_validation_params=preset_validation_params) 28 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_DQN_with_PER.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.dqn_agent import DQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters 7 | from rl_coach.schedules import LinearSchedule 8 | 9 | 10 | ######### 11 | # Agent # 12 | ######### 13 | agent_params = DQNAgentParameters() 14 | agent_params.network_wrappers['main'].learning_rate = 0.00025 15 | agent_params.memory = 
PrioritizedExperienceReplayParameters() 16 | agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000) # 12.5M training iterations = 50M steps = 200M frames 17 | 18 | ############### 19 | # Environment # 20 | ############### 21 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 22 | 23 | ######## 24 | # Test # 25 | ######## 26 | preset_validation_params = PresetValidationParameters() 27 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 28 | 29 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 30 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 31 | preset_validation_params=preset_validation_params) 32 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_QR_DQN.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.qr_dqn_agent import QuantileRegressionDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 6 | 7 | ######### 8 | # Agent # 9 | ######### 10 | agent_params = QuantileRegressionDQNAgentParameters() 11 | agent_params.network_wrappers['main'].learning_rate = 0.00005 # called alpha in the paper 12 | agent_params.algorithm.huber_loss_interval = 1 # k = 0 for strict quantile loss, k = 1 for Huber quantile loss 13 | 14 | ############### 15 | # Environment # 16 | ############### 17 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 18 | 19 | ######## 20 | # Test # 21 | ######## 22 | preset_validation_params = PresetValidationParameters() 23 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 24 | 25 | graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 26 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 27 | preset_validation_params=preset_validation_params) 28 | -------------------------------------------------------------------------------- /rl_coach/presets/Atari_UCB_with_Q_Ensembles.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters 2 | from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters 3 | from rl_coach.environments.environment import SingleLevelSelection 4 | from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule 5 | from rl_coach.exploration_policies.ucb import UCBParameters 6 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 7 | 8 | ######### 9 | # Agent # 10 | ######### 11 | agent_params = BootstrappedDQNAgentParameters() 12 | agent_params.network_wrappers['main'].learning_rate = 0.00025 13 | agent_params.exploration = UCBParameters() 14 | 15 | ############### 16 | # Environment # 17 | ############### 18 | env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4)) 19 | 20 | ######## 21 | # Test # 22 | ######## 23 | preset_validation_params = PresetValidationParameters() 24 | preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders'] 25 | 26 | graph_manager = 
BasicRLGraphManager(agent_params=agent_params, env_params=env_params, 27 | schedule_params=atari_schedule, vis_params=VisualizationParameters(), 28 | preset_validation_params=preset_validation_params) 29 | -------------------------------------------------------------------------------- /rl_coach/presets/README.md: -------------------------------------------------------------------------------- 1 | # Defining Presets 2 | 3 | In Coach, we use the Preset mechanism to define reproducible experiments. 4 | A Preset defines all the parameters of an experiment in a single file, and can be executed from the command 5 | line using the file name. 6 | A preset can be very simple, relying on the default parameters of the algorithm and the environment. 7 | It can also be fully explicit and define every parameter, so that no logic is hidden. 8 | The outcome of a preset is a GraphManager. 9 | 10 | 11 | Let's start with the simplest preset possible. 12 | We will define a preset for training the CartPole environment using Clipped PPO. 13 | At a minimum, every preset must define three things: the agent, the environment and a schedule. 14 | 15 | ``` 16 | from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters 17 | from rl_coach.environments.gym_environment import GymVectorEnvironment 18 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 19 | from rl_coach.graph_managers.graph_manager import SimpleSchedule 20 | 21 | graph_manager = BasicRLGraphManager( 22 | agent_params=ClippedPPOAgentParameters(), 23 | env_params=GymVectorEnvironment(level='CartPole-v0'), 24 | schedule_params=SimpleSchedule() 25 | ) 26 | ``` 27 | 28 | Most presets in Coach are far more explicit than this one. The motivation is to be as transparent as 29 | possible about every change made relative to the base parameters defined in the algorithm's paper.
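A preset file placed under `rl_coach/presets` can then be launched by name from the command line. As a short usage sketch, with the `-p` and `-lvl` flags taken from the main Coach README (treat the exact invocation as an assumption rather than part of this file):

```
coach -p CartPole_ClippedPPO
coach -p Atari_DQN -lvl breakout
```

The second form selects a specific level for presets that use SingleLevelSelection, such as the Atari presets shown above.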
-------------------------------------------------------------------------------- /rl_coach/presets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/presets/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/agents/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/agents/test_agent_external_communication.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from rl_coach.base_parameters import TaskParameters, Frameworks 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 7 | import tensorflow as tf 8 | from tensorflow import logging 9 | import pytest 10 | logging.set_verbosity(logging.INFO) 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_get_QActionStateValue_predictions(): 15 | tf.reset_default_graph() 16 | from rl_coach.presets.CartPole_DQN import graph_manager as cartpole_dqn_graph_manager 17 | assert cartpole_dqn_graph_manager 18 | cartpole_dqn_graph_manager.create_graph(task_parameters= 19 | TaskParameters(framework_type=Frameworks.tensorflow, 20 | experiment_path="./experiments/test")) 21 | cartpole_dqn_graph_manager.improve_steps.num_steps = 1 22 | cartpole_dqn_graph_manager.steps_between_evaluation_periods.num_steps = 5 23 | 24 | # graph_manager.improve() 25 | # 26 | # agent = graph_manager.level_managers[0].composite_agents['simple_rl_agent'].agents['simple_rl_agent/agent'] 27 | # some_state = agent.memory.sample(1)[0].state 28 | # cartpole_dqn_predictions = agent.get_predictions(states=some_state, prediction_type=QActionStateValue) 29 | # assert cartpole_dqn_predictions.shape == (1, 2) 30 | 31 | 32 | if __name__ == '__main__': 33 | test_get_QActionStateValue_predictions() 34 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/embedders/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/embedders/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/embedders/test_image_embedder.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.base_parameters import EmbedderScheme 9 | from rl_coach.architectures.embedder_parameters import InputEmbedderParameters 10 | from rl_coach.architectures.mxnet_components.embedders.image_embedder import ImageEmbedder 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_image_embedder(): 15 | params = InputEmbedderParameters(scheme=EmbedderScheme.Medium) 16 | emb = ImageEmbedder(params=params) 17 | emb.initialize() 18 | # input is NHWC, and not MXNet default NCHW 19 | input_data = mx.nd.random.uniform(low=0, high=1, shape=(10, 244, 244, 3)) 20 | output = emb(input_data) 21 | assert len(output.shape) == 2 # since last block was flatten 22 | assert output.shape[0] == 10 # since batch_size is 10 23 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/embedders/test_vector_embedder.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.architectures.embedder_parameters import InputEmbedderParameters 9 | from rl_coach.architectures.mxnet_components.embedders.vector_embedder import VectorEmbedder 10 | from rl_coach.base_parameters import EmbedderScheme 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_vector_embedder(): 15 | params = InputEmbedderParameters(scheme=EmbedderScheme.Medium) 16 | emb = VectorEmbedder(params=params) 17 | emb.initialize() 18 | input_data = mx.nd.random.uniform(low=0, high=255, shape=(10, 100)) 19 | output = emb(input_data) 20 | assert len(output.shape) == 2 # since last block was flatten 21 | assert output.shape[0] == 10 # since batch_size is 10 22 | assert output.shape[1] == 256 # since last dense layer has 256 units 23 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/heads/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/heads/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/heads/test_head.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import numpy as np 3 | import os 4 | import pytest 5 | import sys 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 7 | 8 | 9 | from rl_coach.architectures.mxnet_components.heads.head import NormalizedRSSInitializer 10 | 11 | 12 | @pytest.mark.unit_test 13 | def test_normalized_rss_initializer(): 14 | target_rss = 0.5 15 | units = 10 16 | dense = mx.gluon.nn.Dense(units=units, weight_initializer=NormalizedRSSInitializer(target_rss)) 17 | 
dense.initialize() 18 | 19 | input_data = mx.random.uniform(shape=(25, 5)) 20 | output_data = dense(input_data) 21 | 22 | weights = dense.weight.data() 23 | assert weights.shape == (10, 5) 24 | rss = weights.square().sum(axis=1).sqrt() 25 | np.testing.assert_almost_equal(rss.asnumpy(), np.tile(target_rss, units)) 26 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/middlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/mxnet_components/middlewares/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/middlewares/test_fc_middleware.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.base_parameters import MiddlewareScheme 9 | from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters 10 | from rl_coach.architectures.mxnet_components.middlewares.fc_middleware import FCMiddleware 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_fc_middleware(): 15 | params = FCMiddlewareParameters(scheme=MiddlewareScheme.Medium) 16 | mid = FCMiddleware(params=params) 17 | mid.initialize() 18 | embedded_data = mx.nd.random.uniform(low=0, high=1, shape=(10, 100)) 19 | output = mid(embedded_data) 20 | assert output.ndim == 2 # since last block was flatten 21 | assert output.shape[0] == 10 # since batch_size is 10 22 | assert output.shape[1] == 512 # since last layer of middleware (middle scheme) had 512 units 23 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/mxnet_components/middlewares/test_lstm_middleware.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | import os 3 | import pytest 4 | import sys 5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 6 | 7 | 8 | from rl_coach.base_parameters import MiddlewareScheme 9 | from rl_coach.architectures.middleware_parameters import LSTMMiddlewareParameters 10 | from rl_coach.architectures.mxnet_components.middlewares.lstm_middleware import LSTMMiddleware 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_lstm_middleware(): 15 | params = LSTMMiddlewareParameters(number_of_lstm_cells=25, scheme=MiddlewareScheme.Medium) 16 | mid = LSTMMiddleware(params=params) 17 | mid.initialize() 18 | # NTC 19 | embedded_data = mx.nd.random.uniform(low=0, high=1, shape=(10, 15, 20)) 20 | # NTC -> TNC 21 | output = mid(embedded_data) 22 | assert output.ndim == 3 # since last block was flatten 23 | assert output.shape[0] == 15 # since t is 15 24 | assert output.shape[1] == 10 # since batch_size is 10 25 | assert output.shape[2] == 25 # since number_of_lstm_cells is 25 26 | -------------------------------------------------------------------------------- /rl_coach/tests/architectures/tensorflow_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/tensorflow_components/__init__.py 
-------------------------------------------------------------------------------- /rl_coach/tests/architectures/tensorflow_components/embedders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/architectures/tensorflow_components/embedders/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/architectures/tensorflow_components/embedders/test_identity_embedder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from rl_coach.base_parameters import EmbedderScheme 5 | 6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 7 | 8 | import pytest 9 | import numpy as np 10 | from rl_coach.architectures.tensorflow_components.embedders.vector_embedder import VectorEmbedder 11 | import tensorflow as tf 12 | from tensorflow import logging 13 | 14 | logging.set_verbosity(logging.INFO) 15 | 16 | @pytest.fixture 17 | def reset(): 18 | tf.reset_default_graph() 19 | 20 | 21 | @pytest.mark.unit_test 22 | def test_embedder(reset): 23 | embedder = VectorEmbedder(np.array([10, 10]), name="test", scheme=EmbedderScheme.Empty) 24 | 25 | # make sure the ops where not created yet 26 | assert len(tf.get_default_graph().get_operations()) == 0 27 | 28 | # call the embedder 29 | input_ph, output_ph = embedder() 30 | 31 | # make sure that now the ops were created 32 | assert len(tf.get_default_graph().get_operations()) > 0 33 | 34 | # try feeding a batch of one example # TODO: consider auto converting to batch 35 | input = np.random.rand(1, 10, 10) 36 | sess = tf.Session() 37 | output = sess.run(embedder.output, {embedder.input: input}) 38 | assert output.shape == (1, 100) # should have flattened the input 39 | 40 | # now make sure the returned placeholders behave the same 41 | output = sess.run(output_ph, {input_ph: input}) 42 | assert output.shape == (1, 100) # should have flattened the input 43 | 44 | # make sure the naming is correct 45 | assert embedder.get_name() == "test" 46 | -------------------------------------------------------------------------------- /rl_coach/tests/environments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/environments/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/exploration_policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/exploration_policies/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/exploration_policies/test_additive_noise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | 7 | from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace 8 | from rl_coach.exploration_policies.additive_noise import AdditiveNoise 9 | from rl_coach.schedules import LinearSchedule 10 | import numpy as np 11 | 12 | 13 | @pytest.mark.unit_test 14 | def test_init(): 15 | # discrete control 16 | action_space = 
DiscreteActionSpace(3) 17 | noise_schedule = LinearSchedule(1.0, 1.0, 1000) 18 | 19 | # additive noise requires a bounded range for the actions 20 | action_space = BoxActionSpace(np.array([10])) 21 | with pytest.raises(ValueError): 22 | policy = AdditiveNoise(action_space, noise_schedule, 0) 23 | 24 | 25 | @pytest.mark.unit_test 26 | def test_get_action(): 27 | # make sure noise is in range 28 | action_space = BoxActionSpace(np.array([10]), -1, 1) 29 | noise_schedule = LinearSchedule(1.0, 1.0, 1000) 30 | policy = AdditiveNoise(action_space, noise_schedule, 0) 31 | 32 | # the action range is 2, so there is a ~0.1% chance that the noise will be larger than 3*std=3*2=6 33 | for i in range(1000): 34 | action = policy.get_action(np.zeros([10])) 35 | assert np.all(action < 10) 36 | # make sure there is no clipping of the action since it should be the environment that clips actions 37 | assert np.all(action != 1.0) 38 | assert np.all(action != -1.0) 39 | # make sure that each action element has a different value 40 | assert np.all(action[0] != action[1:]) 41 | -------------------------------------------------------------------------------- /rl_coach/tests/exploration_policies/test_greedy.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | 7 | from rl_coach.spaces import DiscreteActionSpace, BoxActionSpace 8 | from rl_coach.exploration_policies.greedy import Greedy 9 | import numpy as np 10 | 11 | 12 | @pytest.mark.unit_test 13 | def test_get_action(): 14 | # discrete control 15 | action_space = DiscreteActionSpace(3) 16 | policy = Greedy(action_space) 17 | 18 | best_action, _ = policy.get_action(np.array([10, 20, 30])) 19 | assert best_action == 2 20 | 21 | # continuous control 22 | action_space = BoxActionSpace(np.array([10])) 23 | policy = Greedy(action_space) 24 | 25 | best_action = policy.get_action(np.array([1, 1, 1])) 26 | assert np.all(best_action == np.array([1, 1, 1])) 27 | 28 | 29 | @pytest.mark.unit_test 30 | def test_get_control_param(): 31 | action_space = DiscreteActionSpace(3) 32 | policy = Greedy(action_space) 33 | assert policy.get_control_param() == 0 34 | 35 | -------------------------------------------------------------------------------- /rl_coach/tests/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/filters/action/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/action/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/filters/action/test_box_masking.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | from rl_coach.filters.action.box_masking import BoxMasking 7 | from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace 8 | import numpy as np 9 | 10 | 11 | @pytest.mark.unit_test 12 | def test_filter(): 13 | filter = BoxMasking(10, 20) 14 | 15 | # 
passing an output space that is wrong 16 | with pytest.raises(ValueError): 17 | filter.validate_output_action_space(DiscreteActionSpace(10)) 18 | 19 | # 1 dimensional box 20 | output_space = BoxActionSpace(1, 5, 30) 21 | input_space = filter.get_unfiltered_action_space(output_space) 22 | 23 | action = np.array([2]) 24 | result = filter.filter(action) 25 | assert result == np.array([12]) 26 | assert output_space.contains(result) 27 | 28 | -------------------------------------------------------------------------------- /rl_coach/tests/filters/action/test_linear_box_to_box_map.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) 4 | 5 | import pytest 6 | from rl_coach.filters.action.linear_box_to_box_map import LinearBoxToBoxMap 7 | from rl_coach.spaces import BoxActionSpace, DiscreteActionSpace 8 | import numpy as np 9 | 10 | 11 | @pytest.mark.unit_test 12 | def test_filter(): 13 | filter = LinearBoxToBoxMap(10, 20) 14 | 15 | # passing an output space that is wrong 16 | with pytest.raises(ValueError): 17 | filter.validate_output_action_space(DiscreteActionSpace(10)) 18 | 19 | # 1 dimensional box 20 | output_space = BoxActionSpace(1, 5, 35) 21 | input_space = filter.get_unfiltered_action_space(output_space) 22 | 23 | action = np.array([2]) 24 | 25 | action = np.array([12]) 26 | result = filter.filter(action) 27 | assert result == np.array([11]) 28 | assert output_space.contains(result) 29 | 30 | -------------------------------------------------------------------------------- /rl_coach/tests/filters/observation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/observation/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/filters/reward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/filters/reward/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/graph_managers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/graph_managers/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/memories/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/memories/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/presets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/presets/__init__.py -------------------------------------------------------------------------------- /rl_coach/tests/pytest.ini: -------------------------------------------------------------------------------- 1 | # content of pytest.ini 2 | [pytest] 3 | markers = 4 | unit_test: short test that checks that a module is acting correctly 
5 | integration_test: long test that checks that the complete framework is running correctly 6 | filterwarnings = 7 | ignore::DeprecationWarning 8 | norecursedirs = 9 | *mxnet* 10 | -------------------------------------------------------------------------------- /rl_coach/tests/test_saver.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rl_coach.saver import Saver, SaverCollection 4 | 5 | 6 | @pytest.mark.unit_test 7 | def test_checkpoint_collection(): 8 | class SaverTest(Saver): 9 | def __init__(self, path): 10 | self._path = path 11 | self._count = 1 12 | 13 | @property 14 | def path(self): 15 | return self._path 16 | 17 | def merge(self, other: 'Saver'): 18 | assert isinstance(other, SaverTest) 19 | assert self.path == other.path 20 | self._count += other._count 21 | 22 | # test add 23 | savers = SaverCollection(SaverTest('123')) 24 | savers.add(SaverTest('123')) 25 | savers.add(SaverTest('456')) 26 | 27 | def check_collection(mul): 28 | paths = ['123', '456'] 29 | for c in savers: 30 | paths.remove(c.path) 31 | if c.path == '123': 32 | assert c._count == 2 * mul 33 | elif c.path == '456': 34 | assert c._count == 1 * mul 35 | else: 36 | assert False, "invalid path" 37 | 38 | check_collection(1) 39 | 40 | # test update 41 | savers.update(savers) 42 | check_collection(2) 43 | -------------------------------------------------------------------------------- /rl_coach/tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/tests/utils/__init__.py -------------------------------------------------------------------------------- /rl_coach/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/rl_coach/utilities/__init__.py -------------------------------------------------------------------------------- /tutorials/Resources/exploration.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from typing import List 4 | from rl_coach.core_types import ActionType 5 | from rl_coach.spaces import ActionSpace 6 | from rl_coach.exploration_policies.exploration_policy import ExplorationPolicy, ExplorationParameters 7 | 8 | 9 | class MyExplorationPolicy(ExplorationPolicy): 10 | """ 11 | An exploration policy takes the predicted actions or action values from the agent, and selects the action to 12 | actually apply to the environment using some predefined algorithm. 
13 | """ 14 | def __init__(self, action_space: ActionSpace): 15 | #self.phase = RunPhase.HEATUP 16 | self.action_space = action_space 17 | super().__init__(action_space) 18 | 19 | def get_action(self, action_values: List[ActionType]) -> ActionType: 20 | if (np.random.rand() < 0.5): 21 | chosen_action = self.action_space.sample() 22 | else: 23 | chosen_action = np.argmax(action_values) 24 | probabilities = np.zeros(len(self.action_space.actions)) 25 | probabilities[chosen_action] = 1 26 | return chosen_action, probabilities 27 | 28 | def get_control_param(self): 29 | return 0 30 | 31 | 32 | 33 | class MyExplorationParameters(ExplorationParameters): 34 | def __init__(self): 35 | super().__init__() 36 | 37 | @property 38 | def path(self): 39 | return 'exploration:MyExplorationPolicy' 40 | -------------------------------------------------------------------------------- /tutorials/Resources/img/dr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/tutorials/Resources/img/dr.png -------------------------------------------------------------------------------- /tutorials/Resources/img/model_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/tutorials/Resources/img/model_selection.png -------------------------------------------------------------------------------- /tutorials/Resources/img/wis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IntelLabs/coach/2c60cb5acd8cd3c9c381a5066c208e69fc273c7b/tutorials/Resources/img/wis.png -------------------------------------------------------------------------------- /tutorials/python_invocation_example.py: -------------------------------------------------------------------------------- 1 | from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters 2 | from rl_coach.core_types import EnvironmentSteps 3 | from rl_coach.environments.gym_environment import GymVectorEnvironment 4 | from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager 5 | from rl_coach.graph_managers.graph_manager import SimpleSchedule 6 | 7 | graph_manager = BasicRLGraphManager( 8 | agent_params=ClippedPPOAgentParameters(), 9 | env_params=GymVectorEnvironment(level='CartPole-v0'), 10 | schedule_params=SimpleSchedule() 11 | ) 12 | 13 | graph_manager.heatup(EnvironmentSteps(100)) 14 | graph_manager.train_and_act(EnvironmentSteps(100)) --------------------------------------------------------------------------------