├── .gitignore ├── Chapter02 ├── .gitignore ├── 01_agent_anatomy.py ├── 02_cartpole_random.py ├── 03_random_action_wrapper.py └── 04_cartpole_random_monitor.py ├── Chapter03 ├── 01_modules.py ├── 02_tensorboard.py ├── 03_atari_gan.py └── 04_atari_gan_ignite.py ├── Chapter04 ├── 01_cartpole.py ├── 02_frozenlake_naive.py ├── 03_frozenlake_tweaked.py └── 04_frozenlake_nonslippery.py ├── Chapter05 ├── 01_frozenlake_v_iteration.py └── 02_frozenlake_q_iteration.py ├── Chapter06 ├── 01_frozenlake_q_learning.py ├── 02_dqn_pong.py ├── 03_dqn_play.py └── lib │ ├── __init__.py │ ├── dqn_model.py │ └── wrappers.py ├── Chapter07 ├── 01_actions.py ├── 02_agents.py ├── 03_exp_sources.py ├── 04_replay_buf.py ├── 05_target_net.py ├── 06_cartpole.py └── lib.py ├── Chapter08 ├── 01_dqn_basic.py ├── 02_dqn_n_steps.py ├── 03_dqn_double.py ├── 04_dqn_noisy_net.py ├── 05_dqn_prio_replay.py ├── 06_dqn_dueling.py ├── 07_dqn_distrib.py ├── 08_dqn_rainbow.py ├── adhoc │ ├── commute.py │ └── distr_test.py ├── bench │ ├── prio_buffer_bench.py │ └── simple_buffer_bench.py └── lib │ ├── __init__.py │ ├── common.py │ ├── dqn_extra.py │ └── dqn_model.py ├── Chapter09 ├── .gitignore ├── 00_slow_grads.py ├── 01_baseline.py ├── 02_n_envs.py ├── 03_parallel.py ├── 04_wrappers_n_env.py ├── 04_wrappers_parallel.py ├── attic │ ├── 02_env_steps.py │ └── 03_parallel_orig.py ├── img │ ├── 01_orig_tb.png │ ├── 02_steps-tb.png │ ├── 03-serial-blocks.png │ ├── 03_serial.png │ └── 04_parallel.png └── lib │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── common.cpython-311.pyc │ └── dqn_model.cpython-311.pyc │ ├── atari_wrappers.py │ ├── common.py │ └── dqn_model.py ├── Chapter10 ├── .gitignore ├── conftest.py ├── data │ ├── ch10-small-quotes.tgz │ └── unpack_data.sh ├── lib │ ├── __init__.py │ ├── common.py │ ├── data.py │ ├── environ.py │ ├── models.py │ └── validation.py ├── run_model.py ├── tests │ ├── test_data.py │ └── test_environ.py ├── train_model.py └── train_model_conv.py ├── Chapter11 ├── .gitignore ├── 01_cartpole_dqn.py ├── 02_cartpole_reinforce.py ├── 03_cartpole_reinforce_baseline.py ├── 04_cartpole_pg.py ├── 05_pong_pg.py ├── 05_pong_pg_tune.py └── lib │ ├── __init__.py │ └── common.py ├── Chapter12 ├── .gitignore ├── 01_cartpole_pg.py ├── 02_pong_a2c.py ├── 02_pong_a2c_tune.py ├── 03_a3c_grad.py └── lib │ ├── __init__.py │ └── common.py ├── Chapter13 ├── adhoc │ ├── hf_t1.py │ ├── hf_t2.py │ ├── lc_t1.py │ └── openai_check.py ├── chatgpt_auto.py ├── chatgpt_interactive.py ├── conftest.py ├── games │ ├── .gitignore │ └── make_games.sh ├── lib │ ├── __init__.py │ ├── common.py │ ├── model.py │ └── preproc.py ├── pytest.ini ├── requirements.txt ├── tests │ ├── __init__.py │ └── test_preproc.py ├── train_basic.py ├── train_preproc.py └── train_tr.py ├── Chapter14 ├── .gitignore ├── adhoc │ ├── 01_wob_create.py │ ├── 02_act_clicks.py │ ├── 03_clicker.py │ ├── 04_load_demo.py │ ├── 05_join_obs.py │ ├── 06_save_traj.py │ └── 06_save_traj_vec.py ├── demos │ ├── click-dialog │ │ ├── click-dialog_0421165244.json │ │ ├── click-dialog_0421165247.json │ │ ├── click-dialog_0421165250.json │ │ ├── click-dialog_0421165253.json │ │ ├── click-dialog_0421165255.json │ │ ├── click-dialog_0421165258.json │ │ ├── click-dialog_0421165300.json │ │ ├── click-dialog_0421165303.json │ │ ├── click-dialog_0421165306.json │ │ ├── click-dialog_0421165308.json │ │ ├── click-dialog_0421165311.json │ │ ├── click-dialog_0421165313.json │ │ ├── click-dialog_0421165316.json │ │ ├── click-dialog_0421165318.json │ │ ├── 
click-dialog_0421165320.json │ │ └── click-dialog_0421165323.json │ ├── click-tab │ │ ├── click-tab_0426161308.json │ │ ├── click-tab_0426161312.json │ │ ├── click-tab_0426161315.json │ │ ├── click-tab_0426161318.json │ │ ├── click-tab_0426161321.json │ │ ├── click-tab_0426161324.json │ │ ├── click-tab_0426161327.json │ │ ├── click-tab_0426161330.json │ │ ├── click-tab_0426161334.json │ │ ├── click-tab_0426161338.json │ │ ├── click-tab_0426161341.json │ │ ├── click-tab_0426161345.json │ │ ├── click-tab_0426161348.json │ │ ├── click-tab_0426161350.json │ │ └── click-tab_0426161353.json │ ├── count-sides │ │ ├── count-sides_0423161944.json │ │ ├── count-sides_0423161949.json │ │ ├── count-sides_0423161952.json │ │ ├── count-sides_0423161955.json │ │ ├── count-sides_0423161958.json │ │ ├── count-sides_0423162002.json │ │ ├── count-sides_0423162006.json │ │ ├── count-sides_0423162010.json │ │ ├── count-sides_0423162012.json │ │ ├── count-sides_0423162017.json │ │ ├── count-sides_0423162021.json │ │ ├── count-sides_0423162025.json │ │ ├── count-sides_0423162029.json │ │ ├── count-sides_0423162032.json │ │ ├── count-sides_0423162034.json │ │ ├── count-sides_0423162036.json │ │ ├── count-sides_0423162041.json │ │ ├── count-sides_0423162045.json │ │ ├── count-sides_0423162049.json │ │ ├── count-sides_0423162053.json │ │ ├── count-sides_0423162057.json │ │ ├── count-sides_0423162100.json │ │ ├── count-sides_0423162103.json │ │ ├── count-sides_0423162107.json │ │ └── count-sides_0423162110.json │ └── tic-tac-toe │ │ ├── tic-tac-toe_0423220647.json │ │ ├── tic-tac-toe_0423220653.json │ │ ├── tic-tac-toe_0423220658.json │ │ ├── tic-tac-toe_0423220704.json │ │ ├── tic-tac-toe_0423220710.json │ │ ├── tic-tac-toe_0423220718.json │ │ ├── tic-tac-toe_0423220724.json │ │ ├── tic-tac-toe_0423220730.json │ │ ├── tic-tac-toe_0423220737.json │ │ ├── tic-tac-toe_0423220744.json │ │ ├── tic-tac-toe_0423220749.json │ │ ├── tic-tac-toe_0423220754.json │ │ ├── tic-tac-toe_0423220801.json │ │ ├── tic-tac-toe_0423220805.json │ │ ├── tic-tac-toe_0423220812.json │ │ ├── tic-tac-toe_0423220818.json │ │ ├── tic-tac-toe_0423220823.json │ │ ├── tic-tac-toe_0423220828.json │ │ ├── tic-tac-toe_0423220835.json │ │ ├── tic-tac-toe_0423220839.json │ │ ├── tic-tac-toe_0423220843.json │ │ ├── tic-tac-toe_0423220847.json │ │ ├── tic-tac-toe_0423220852.json │ │ ├── tic-tac-toe_0423220856.json │ │ ├── tic-tac-toe_0423220902.json │ │ ├── tic-tac-toe_0423220905.json │ │ └── tic-tac-toe_0423220912.json ├── lib │ ├── common.py │ ├── demos.py │ ├── model.py │ └── wob.py ├── record_demo.py ├── requirements.txt ├── wob_click_mm_play.py ├── wob_click_mm_train.py ├── wob_click_play.py └── wob_click_train.py ├── Chapter15 ├── 01_check_env.py ├── 02_train_a2c.py ├── 03_play_a2c.py ├── 04_train_ddpg.py ├── 05_play_ddpg.py ├── 06_train_d4pg.py ├── lib │ ├── __init__.py │ ├── common.py │ └── model.py └── requirements.txt ├── Chapter16 ├── 01_train_a2c.py ├── 02_play.py ├── 03_train_ppo.py ├── 04_train_trpo.py ├── 05_train_acktr.py ├── 06_train_sac.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── kfac.py │ ├── model.py │ └── trpo.py └── requirements.txt ├── Chapter17 ├── .gitignore ├── 01_cartpole_es.py ├── 02_cheetah_es.py ├── 03_cartpole_ga.py ├── 04_cheetah_ga.py └── lib │ ├── __init__.py │ └── common.py ├── Chapter18 ├── atari_dqn.py ├── atari_ppo.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── dqn_extra.py │ └── ppo.py ├── mcar_dqn.py ├── mcar_ppo.py ├── riverswim.py └── tests │ ├── __init__.py │ └── test_ppo.py ├── Chapter19 ├── 
.gitignore ├── 01_a2c.py ├── 01_play.py ├── 02_label_ui.py ├── 03_reward_train.py ├── adhoc │ ├── obs_to_gif.py │ └── rw_model.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── rlhf.py │ └── ui_tools.py └── requirements.txt ├── Chapter20 ├── .gitignore ├── lib │ ├── __init__.py │ ├── game.py │ ├── mcts.py │ ├── model.py │ └── muzero.py ├── play-mu.py ├── play.py ├── telegram-bot.py ├── tests │ ├── __init__.py │ ├── test_game.py │ ├── test_model.py │ └── test_muzero.py ├── tournament │ ├── 2ed │ │ ├── charts.ipynb │ │ ├── final-short.png │ │ ├── final.csv │ │ ├── final.svg │ │ ├── final.txt │ │ ├── final_plot.ipynb │ │ ├── semi-0.txt │ │ ├── semi-1.txt │ │ ├── semi-2.txt │ │ ├── semi-3.txt │ │ ├── semi-4.txt │ │ ├── semi-5.txt │ │ ├── semi-6.txt │ │ ├── semi-7.txt │ │ ├── semi-8.txt │ │ ├── semi-9.txt │ │ ├── semi-common.png │ │ └── semi-scores.png │ └── 3ed │ │ ├── .ipynb_checkpoints │ │ ├── charts-checkpoint.ipynb │ │ └── charts-mu-checkpoint.ipynb │ │ ├── charts-mu.ipynb │ │ ├── charts.ipynb │ │ ├── final.sh │ │ ├── final_plot.ipynb │ │ ├── mu-t5-6-res2.txt │ │ ├── mu-v1-wins.csv │ │ ├── semi-v1.txt │ │ ├── semi-v2.txt │ │ ├── v1-wins.csv │ │ └── v2-wins.csv ├── train-mu.py └── train.py ├── Chapter21 ├── .gitignore ├── csvs │ ├── 2ed │ │ ├── README.md │ │ ├── c2x2-paper-d200-t1-v2.csv │ │ ├── c2x2-paper-d200-t1.csv │ │ ├── c2x2-zero-goal-d200-t1-v2.csv │ │ ├── c2x2-zero-goal-d200-t1.csv │ │ ├── c3x3-paper-d200-t1-v2.csv │ │ ├── c3x3-paper-d200-t1.csv │ │ ├── c3x3-zero-goal-d200-no-decay-v2.csv │ │ ├── c3x3-zero-goal-d200-no-decay.csv │ │ ├── c3x3-zero-goal-d200-t1-v2.csv │ │ ├── c3x3-zero-goal-d200-t1.csv │ │ ├── c3x3 │ │ │ ├── c3-paper-d20-1.93e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=5.501e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=5.61e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=6.43e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=7.29e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=chp100k.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=chp64k.csv │ │ │ ├── c3-zg-d20-noweight.csv │ │ │ └── c3-zg-d20.csv │ │ ├── t3-c2x2-mcts-c=0.01.csv │ │ ├── t3-c2x2-mcts-c=0.1.csv │ │ ├── t3-c2x2-mcts-c=1.csv │ │ ├── t3-c2x2-mcts-c=10.csv │ │ ├── t3-c2x2-mcts-c=100.csv │ │ ├── t3-c2x2-mcts-c=1000.csv │ │ ├── t3-c2x2-mcts-c=100000.csv │ │ ├── t3.1-c2x2-mcts-c=1.csv │ │ ├── t3.1-c2x2-mcts-c=10.csv │ │ ├── t3.1-c2x2-mcts-c=100-steps=100k.csv │ │ ├── t3.1-c2x2-mcts-c=100-steps=60k.csv │ │ ├── t3.1-c2x2-mcts-c=100.csv │ │ ├── t3.1-c2x2-mcts-c=1000.csv │ │ ├── t3.1-c2x2-mcts-c=10000.csv │ │ ├── t4-c2x2-mcts-c=10-steps=100k.csv │ │ ├── t4-c2x2-mcts-c=10-steps=200k.csv │ │ ├── t4-c2x2-mcts-c=10-steps=500k.csv │ │ ├── t4-c2x2-mcts-c=100-steps=100k-b10.csv │ │ ├── t4-c2x2-mcts-c=100-steps=100k-b100.csv │ │ ├── t4-c2x2-mcts-c=100-steps=100k.csv │ │ ├── t5-c2x2-1.0366e-01.csv │ │ ├── t5-c2x2-3.0742e-02.csv │ │ ├── t5-c2x2-6.0737e-02.csv │ │ ├── t6-c2x2-nu=1.csv │ │ ├── t6-c2x2-nu=10.csv │ │ ├── t6-c2x2-nu=1000.csv │ │ └── t7-best-paper-1.8184e-1.csv │ └── 3ed │ │ ├── 2x2-paper.csv │ │ ├── 2x2-zg-chpt-17k.csv │ │ ├── 3x3-paper.csv │ │ └── 3x3-zg-chpt-26k.csv ├── cubes_tests │ ├── 2ed │ │ ├── cube2x2_d3.txt │ │ ├── cube2x2_d4.txt │ │ ├── cube2x2_d5.txt │ │ ├── cube2x2_d6.txt │ │ ├── cube3x3_d10.txt │ │ ├── cube3x3_d1000.txt │ │ ├── cube3x3_d15.txt │ │ ├── cube3x3_d3.txt │ │ └── cube3x3_d3_norepeat.txt │ └── 3ed │ │ ├── 2x2-d1-50.txt │ │ └── 3x3-d1-50.txt ├── docs │ └── Notes.md ├── gen_cubes.py ├── ini │ ├── README.md │ ├── cube2x2-paper-d200.ini │ ├── cube2x2-zero-goal-d200.ini │ ├── cube3x3-paper-d20.ini │ ├── 
cube3x3-paper-d200.ini │ ├── cube3x3-zero-goal-d20-noweight.ini │ ├── cube3x3-zero-goal-d20.ini │ ├── cube3x3-zero-goal-d200-slow-decay.ini │ └── cube3x3-zero-goal-d200.ini ├── libcube │ ├── conf.py │ ├── cubes │ │ ├── __init__.py │ │ ├── _common.py │ │ ├── _env.py │ │ ├── cube2x2.py │ │ └── cube3x3.py │ ├── mcts.py │ └── model.py ├── models │ ├── .gitattributes │ └── 3ed │ │ ├── 2x2-paper │ │ └── best_3.2572e-02.dat │ │ ├── 2x2-zg │ │ └── chpt_017000.dat │ │ ├── 3x3-paper │ │ └── best_3.1818e-02.dat │ │ └── 3x3-zg │ │ └── chpt_026400.dat ├── nbs │ ├── 2ed │ │ ├── 01_paper-vs-zero_goal.ipynb │ │ ├── 02_fix_steps_limit.ipynb │ │ ├── 03_mcts_tuning.ipynb │ │ ├── 04_mcts_C-extra-data.ipynb │ │ ├── 05_batch_search.ipynb │ │ ├── 06_compare_models.ipynb │ │ └── 07_article_figs.ipynb │ └── 3ed │ │ └── 07_article_figs.ipynb ├── requirements.txt ├── run_tests.sh ├── solver.py ├── tests │ ├── __init__.py │ └── libcube │ │ ├── __init__.py │ │ └── cubes │ │ ├── __init__.py │ │ ├── test_cube2x2.py │ │ └── test_cube3x3.py ├── train.py └── train_debug.py ├── Chapter22 ├── .gitignore ├── battle_dqn.py ├── battle_play.py ├── forest_both_dqn.py ├── forest_both_play.py ├── forest_random.py ├── forest_tigers_dqn.py ├── forest_tigers_play.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── data.py │ └── model.py └── requirements.txt ├── LICENSE ├── README.md ├── requirements.txt └── tools ├── avg_csv.py ├── ch12 ├── norm_dist.py └── norm_dist.svg └── plot.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | runs 4 | saves 5 | video 6 | __pycache__ 7 | .ipynb_checkpoints/ 8 | -------------------------------------------------------------------------------- /Chapter02/.gitignore: -------------------------------------------------------------------------------- 1 | video -------------------------------------------------------------------------------- /Chapter02/01_agent_anatomy.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List 3 | 4 | 5 | class Environment: 6 | def __init__(self): 7 | self.steps_left = 10 8 | 9 | def get_observation(self) -> List[float]: 10 | return [0.0, 0.0, 0.0] 11 | 12 | def get_actions(self) -> List[int]: 13 | return [0, 1] 14 | 15 | def is_done(self) -> bool: 16 | return self.steps_left == 0 17 | 18 | def action(self, action: int) -> float: 19 | if self.is_done(): 20 | raise Exception("Game is over") 21 | self.steps_left -= 1 22 | return random.random() 23 | 24 | 25 | class Agent: 26 | def __init__(self): 27 | self.total_reward = 0.0 28 | 29 | def step(self, env: Environment): 30 | current_obs = env.get_observation() 31 | actions = env.get_actions() 32 | reward = env.action(random.choice(actions)) 33 | self.total_reward += reward 34 | 35 | 36 | if __name__ == "__main__": 37 | env = Environment() 38 | agent = Agent() 39 | 40 | while not env.is_done(): 41 | agent.step(env) 42 | 43 | print("Total reward got: %.4f" % agent.total_reward) 44 | -------------------------------------------------------------------------------- /Chapter02/02_cartpole_random.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | 4 | if __name__ == "__main__": 5 | env = gym.make("CartPole-v1") 6 | total_reward = 0.0 7 | total_steps = 0 8 | obs, _ = env.reset() 9 | 10 | while True: 11 | action = env.action_space.sample() 12 | obs, reward, is_done, is_trunc, _ = env.step(action) 13 | total_reward += reward 14 
| total_steps += 1 15 | if is_done: 16 | break 17 | 18 | print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward)) 19 | -------------------------------------------------------------------------------- /Chapter02/03_random_action_wrapper.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import random 3 | 4 | 5 | class RandomActionWrapper(gym.ActionWrapper): 6 | def __init__(self, env: gym.Env, epsilon: float = 0.1): 7 | super(RandomActionWrapper, self).__init__(env) 8 | self.epsilon = epsilon 9 | 10 | def action(self, action: gym.core.WrapperActType) -> gym.core.WrapperActType: 11 | if random.random() < self.epsilon: 12 | action = self.env.action_space.sample() 13 | print(f"Random action {action}") 14 | return action 15 | return action 16 | 17 | 18 | if __name__ == "__main__": 19 | env = RandomActionWrapper(gym.make("CartPole-v1")) 20 | 21 | obs = env.reset() 22 | total_reward = 0.0 23 | 24 | while True: 25 | obs, reward, done, _, _ = env.step(0) 26 | total_reward += reward 27 | if done: 28 | break 29 | 30 | print(f"Reward got: {total_reward:.2f}") 31 | -------------------------------------------------------------------------------- /Chapter02/04_cartpole_random_monitor.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | 4 | if __name__ == "__main__": 5 | env = gym.make("CartPole-v1", render_mode="rgb_array") 6 | env = gym.wrappers.HumanRendering(env) 7 | # env = gym.wrappers.RecordVideo(env, video_folder="video") 8 | 9 | total_reward = 0.0 10 | total_steps = 0 11 | obs = env.reset() 12 | 13 | while True: 14 | action = env.action_space.sample() 15 | obs, reward, done, _, _ = env.step(action) 16 | total_reward += reward 17 | total_steps += 1 18 | if done: 19 | break 20 | 21 | print(f"Episode done in {total_steps} steps, total reward {total_reward:.2f}") 22 | env.close() 23 | -------------------------------------------------------------------------------- /Chapter03/01_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class OurModule(nn.Module): 6 | def __init__(self, num_inputs, num_classes, dropout_prob=0.3): 7 | super(OurModule, self).__init__() 8 | self.pipe = nn.Sequential( 9 | nn.Linear(num_inputs, 5), 10 | nn.ReLU(), 11 | nn.Linear(5, 20), 12 | nn.ReLU(), 13 | nn.Linear(20, num_classes), 14 | nn.Dropout(p=dropout_prob), 15 | nn.Softmax(dim=1) 16 | ) 17 | 18 | def forward(self, x): 19 | return self.pipe(x) 20 | 21 | 22 | if __name__ == "__main__": 23 | net = OurModule(num_inputs=2, num_classes=3) 24 | print(net) 25 | v = torch.FloatTensor([[2, 3]]) 26 | out = net(v) 27 | print(out) 28 | print("Cuda's availability is %s" % torch.cuda.is_available()) 29 | if torch.cuda.is_available(): 30 | print("Data from cuda: %s" % out.to('cuda')) 31 | -------------------------------------------------------------------------------- /Chapter03/02_tensorboard.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch.utils.tensorboard.writer import SummaryWriter 3 | 4 | 5 | if __name__ == "__main__": 6 | writer = SummaryWriter() 7 | 8 | funcs = {"sin": math.sin, "cos": math.cos, "tan": math.tan} 9 | 10 | for angle in range(-360, 360): 11 | angle_rad = angle * math.pi / 180 12 | for name, fun in funcs.items(): 13 | val = fun(angle_rad) 14 | writer.add_scalar(name, val, angle) 15 | 16 | 
writer.close() 17 | -------------------------------------------------------------------------------- /Chapter06/01_frozenlake_q_learning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import typing as tt 3 | import gymnasium as gym 4 | from collections import defaultdict 5 | from torch.utils.tensorboard.writer import SummaryWriter 6 | 7 | ENV_NAME = "FrozenLake-v1" 8 | #ENV_NAME = "FrozenLake8x8-v1" # uncomment for larger version 9 | GAMMA = 0.9 10 | ALPHA = 0.2 11 | TEST_EPISODES = 20 12 | 13 | State = int 14 | Action = int 15 | ValuesKey = tt.Tuple[State, Action] 16 | 17 | class Agent: 18 | def __init__(self): 19 | self.env = gym.make(ENV_NAME) 20 | self.state, _ = self.env.reset() 21 | self.values: tt.Dict[ValuesKey, float] = defaultdict(float) 22 | 23 | def sample_env(self) -> tt.Tuple[State, Action, float, State]: 24 | action = self.env.action_space.sample() 25 | old_state = self.state 26 | new_state, reward, is_done, is_tr, _ = self.env.step(action) 27 | if is_done or is_tr: 28 | self.state, _ = self.env.reset() 29 | else: 30 | self.state = new_state 31 | return old_state, action, float(reward), new_state 32 | 33 | def best_value_and_action(self, state: State) -> tt.Tuple[float, Action]: 34 | best_value, best_action = None, None 35 | for action in range(self.env.action_space.n): 36 | action_value = self.values[(state, action)] 37 | if best_value is None or best_value < action_value: 38 | best_value = action_value 39 | best_action = action 40 | return best_value, best_action 41 | 42 | def value_update(self, state: State, action: Action, reward: float, next_state: State): 43 | best_val, _ = self.best_value_and_action(next_state) 44 | new_val = reward + GAMMA * best_val 45 | old_val = self.values[(state, action)] 46 | key = (state, action) 47 | self.values[key] = old_val * (1-ALPHA) + new_val * ALPHA 48 | 49 | def play_episode(self, env: gym.Env) -> float: 50 | total_reward = 0.0 51 | state, _ = env.reset() 52 | while True: 53 | _, action = self.best_value_and_action(state) 54 | new_state, reward, is_done, is_tr, _ = env.step(action) 55 | total_reward += reward 56 | if is_done or is_tr: 57 | break 58 | state = new_state 59 | return total_reward 60 | 61 | 62 | if __name__ == "__main__": 63 | test_env = gym.make(ENV_NAME) 64 | agent = Agent() 65 | writer = SummaryWriter(comment="-q-learning") 66 | 67 | iter_no = 0 68 | best_reward = 0.0 69 | while True: 70 | iter_no += 1 71 | state, action, reward, next_state = agent.sample_env() 72 | agent.value_update(state, action, reward, next_state) 73 | 74 | test_reward = 0.0 75 | for _ in range(TEST_EPISODES): 76 | test_reward += agent.play_episode(test_env) 77 | test_reward /= TEST_EPISODES 78 | writer.add_scalar("reward", test_reward, iter_no) 79 | if test_reward > best_reward: 80 | print("%d: Best test reward updated %.3f -> %.3f" % (iter_no, best_reward, test_reward)) 81 | best_reward = test_reward 82 | if test_reward > 0.80: 83 | print("Solved in %d iterations!"
% iter_no) 84 | break 85 | writer.close() 86 | -------------------------------------------------------------------------------- /Chapter06/03_dqn_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import argparse 4 | import numpy as np 5 | import typing as tt 6 | 7 | import torch 8 | 9 | from lib import wrappers 10 | from lib import dqn_model 11 | 12 | import collections 13 | 14 | DEFAULT_ENV_NAME = "PongNoFrameskip-v4" 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 20 | parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME, 21 | help="Environment name to use, default=" + DEFAULT_ENV_NAME) 22 | parser.add_argument("-r", "--record", required=True, help="Directory for video") 23 | args = parser.parse_args() 24 | 25 | env = wrappers.make_env(args.env, render_mode="rgb_array") 26 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 27 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n) 28 | state = torch.load(args.model, map_location=lambda stg, _: stg, weights_only=True) 29 | net.load_state_dict(state) 30 | 31 | state, _ = env.reset() 32 | total_reward = 0.0 33 | c: tt.Dict[int, int] = collections.Counter() 34 | 35 | while True: 36 | state_v = torch.tensor(np.expand_dims(state, 0)) 37 | q_vals = net(state_v).data.numpy()[0] 38 | action = int(np.argmax(q_vals)) 39 | c[action] += 1 40 | state, reward, is_done, is_trunc, _ = env.step(action) 41 | total_reward += reward 42 | if is_done or is_trunc: 43 | break 44 | print("Total reward: %.2f" % total_reward) 45 | print("Action counts:", c) 46 | env.close() 47 | 48 | -------------------------------------------------------------------------------- /Chapter06/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter06/lib/__init__.py -------------------------------------------------------------------------------- /Chapter06/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DQN(nn.Module): 6 | def __init__(self, input_shape, n_actions): 7 | super(DQN, self).__init__() 8 | 9 | self.conv = nn.Sequential( 10 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 11 | nn.ReLU(), 12 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 13 | nn.ReLU(), 14 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 15 | nn.ReLU(), 16 | nn.Flatten(), 17 | ) 18 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 19 | self.fc = nn.Sequential( 20 | nn.Linear(size, 512), 21 | nn.ReLU(), 22 | nn.Linear(512, n_actions) 23 | ) 24 | 25 | def forward(self, x: torch.ByteTensor): 26 | # scale on GPU 27 | xx = x / 255.0 28 | return self.fc(self.conv(xx)) 29 | -------------------------------------------------------------------------------- /Chapter06/lib/wrappers.py: -------------------------------------------------------------------------------- 1 | import typing as tt 2 | import gymnasium as gym 3 | from gymnasium import spaces 4 | import collections 5 | import numpy as np 6 | from stable_baselines3.common import atari_wrappers 7 | 8 | 9 | class ImageToPyTorch(gym.ObservationWrapper): 10 | def __init__(self, env): 11 | 
super(ImageToPyTorch, self).__init__(env) 12 | obs = self.observation_space 13 | assert isinstance(obs, gym.spaces.Box) 14 | assert len(obs.shape) == 3 15 | new_shape = (obs.shape[-1], obs.shape[0], obs.shape[1]) 16 | self.observation_space = gym.spaces.Box( 17 | low=obs.low.min(), high=obs.high.max(), 18 | shape=new_shape, dtype=obs.dtype) 19 | 20 | def observation(self, observation): 21 | return np.moveaxis(observation, 2, 0) 22 | 23 | 24 | class BufferWrapper(gym.ObservationWrapper): 25 | def __init__(self, env, n_steps): 26 | super(BufferWrapper, self).__init__(env) 27 | obs = env.observation_space 28 | assert isinstance(obs, spaces.Box) 29 | new_obs = gym.spaces.Box( 30 | obs.low.repeat(n_steps, axis=0), obs.high.repeat(n_steps, axis=0), 31 | dtype=obs.dtype) 32 | self.observation_space = new_obs 33 | self.buffer = collections.deque(maxlen=n_steps) 34 | 35 | def reset(self, *, seed: tt.Optional[int] = None, options: tt.Optional[dict[str, tt.Any]] = None): 36 | for _ in range(self.buffer.maxlen-1): 37 | self.buffer.append(self.env.observation_space.low) 38 | obs, extra = self.env.reset() 39 | return self.observation(obs), extra 40 | 41 | def observation(self, observation: np.ndarray) -> np.ndarray: 42 | self.buffer.append(observation) 43 | return np.concatenate(self.buffer) 44 | 45 | 46 | def make_env(env_name: str, **kwargs): 47 | env = gym.make(env_name, **kwargs) 48 | env = atari_wrappers.AtariWrapper(env, clip_reward=False, noop_max=0) 49 | env = ImageToPyTorch(env) 50 | env = BufferWrapper(env, n_steps=4) 51 | return env 52 | -------------------------------------------------------------------------------- /Chapter07/01_actions.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import numpy as np 3 | 4 | 5 | if __name__ == "__main__": 6 | q_vals = np.array([[1, 2, 3], [1, -1, 0]]) 7 | print("q_vals") 8 | print(q_vals) 9 | 10 | selector = ptan.actions.ArgmaxActionSelector() 11 | print("argmax:", selector(q_vals)) 12 | 13 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0) 14 | print("epsilon=0.0:", selector(q_vals)) 15 | 16 | selector.epsilon = 1.0 17 | print("epsilon=1.0:", selector(q_vals)) 18 | 19 | selector.epsilon = 0.5 20 | print("epsilon=0.5:", selector(q_vals)) 21 | selector.epsilon = 0.1 22 | print("epsilon=0.1:", selector(q_vals)) 23 | 24 | selector = ptan.actions.ProbabilityActionSelector() 25 | print("Actions sampled from three prob distributions:") 26 | for _ in range(10): 27 | acts = selector(np.array([ 28 | [0.1, 0.8, 0.1], 29 | [0.0, 0.0, 1.0], 30 | [0.5, 0.5, 0.0] 31 | ])) 32 | print(acts) 33 | -------------------------------------------------------------------------------- /Chapter07/02_agents.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | 6 | 7 | class DQNNet(nn.Module): 8 | def __init__(self, actions: int): 9 | super(DQNNet, self).__init__() 10 | self.actions = actions 11 | 12 | def forward(self, x): 13 | # we always produce diagonal tensor of shape 14 | # (batch_size, actions) 15 | return torch.eye(x.size()[0], self.actions) 16 | 17 | 18 | class PolicyNet(nn.Module): 19 | def __init__(self, actions: int): 20 | super(PolicyNet, self).__init__() 21 | self.actions = actions 22 | 23 | def forward(self, x): 24 | # Now we produce the tensor with first two actions 25 | # having the same logit scores 26 | shape = (x.size()[0], self.actions) 27 | res = torch.zeros(shape, 
dtype=torch.float32) 28 | res[:, 0] = 1 29 | res[:, 1] = 1 30 | return res 31 | 32 | 33 | if __name__ == "__main__": 34 | net = DQNNet(actions=3) 35 | net_out = net(torch.zeros(2, 10)) 36 | print("dqn_net:") 37 | print(net_out) 38 | 39 | selector = ptan.actions.ArgmaxActionSelector() 40 | agent = ptan.agent.DQNAgent(model=net, action_selector=selector) 41 | ag_out = agent(np.zeros(shape=(2, 5))) 42 | print("Argmax:", ag_out) 43 | 44 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0) 45 | agent = ptan.agent.DQNAgent(model=net, action_selector=selector) 46 | ag_out = agent(torch.zeros(10, 5))[0] 47 | print("eps=1.0:", ag_out) 48 | 49 | selector.epsilon = 0.5 50 | ag_out = agent(torch.zeros(10, 5))[0] 51 | print("eps=0.5:", ag_out) 52 | 53 | selector.epsilon = 0.1 54 | ag_out = agent(torch.zeros(10, 5))[0] 55 | print("eps=0.1:", ag_out) 56 | 57 | net = PolicyNet(actions=5) 58 | net_out = net(torch.zeros(6, 10)) 59 | print("policy_net:") 60 | print(net_out) 61 | 62 | selector = ptan.actions.ProbabilityActionSelector() 63 | agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True) 64 | ag_out = agent(torch.zeros(6, 5))[0] 65 | print(ag_out) 66 | -------------------------------------------------------------------------------- /Chapter07/03_exp_sources.py: -------------------------------------------------------------------------------- 1 | from lib import * 2 | 3 | 4 | if __name__ == "__main__": 5 | env = ToyEnv() 6 | s, _ = env.reset() 7 | print(f"env.reset() -> {s}") 8 | s = env.step(1) 9 | print(f"env.step(1) -> {s}") 10 | s = env.step(2) 11 | print(f"env.step(2) -> {s}") 12 | 13 | for _ in range(10): 14 | r = env.step(0) 15 | print(r) 16 | 17 | agent = DullAgent(action=1) 18 | print("agent:", agent([1, 2])[0]) 19 | 20 | env = ToyEnv() 21 | agent = DullAgent(action=1) 22 | exp_source = ptan.experience.ExperienceSource( 23 | env=env, agent=agent, steps_count=2) 24 | for idx, exp in enumerate(exp_source): 25 | if idx > 15: 26 | break 27 | print(exp) 28 | 29 | exp_source = ptan.experience.ExperienceSource( 30 | env=env, agent=agent, steps_count=4) 31 | print(next(iter(exp_source))) 32 | 33 | exp_source = ptan.experience.ExperienceSource( 34 | env=[ToyEnv(), ToyEnv()], agent=agent, steps_count=2) 35 | for idx, exp in enumerate(exp_source): 36 | if idx > 4: 37 | break 38 | print(exp) 39 | 40 | print("ExperienceSourceFirstLast") 41 | exp_source = ptan.experience.ExperienceSourceFirstLast( 42 | env, agent, gamma=1.0, steps_count=1) 43 | for idx, exp in enumerate(exp_source): 44 | print(exp) 45 | if idx > 10: 46 | break 47 | -------------------------------------------------------------------------------- /Chapter07/04_replay_buf.py: -------------------------------------------------------------------------------- 1 | from lib import * 2 | 3 | 4 | if __name__ == "__main__": 5 | env = ToyEnv() 6 | agent = DullAgent(action=1) 7 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1.0, steps_count=1) 8 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=100) 9 | 10 | for step in range(6): 11 | buffer.populate(1) 12 | # if buffer is small enough, do nothing 13 | if len(buffer) < 5: 14 | continue 15 | batch = buffer.sample(4) 16 | print("Train time, %d batch samples:" % len(batch)) 17 | for s in batch: 18 | print(s) 19 | -------------------------------------------------------------------------------- /Chapter07/05_target_net.py: -------------------------------------------------------------------------------- 1 | from 
lib import * 2 | 3 | 4 | if __name__ == "__main__": 5 | net = DQNNet() 6 | print(net) 7 | tgt_net = ptan.agent.TargetNet(net) 8 | print("Main net:", net.ff.weight) 9 | print("Target net:", tgt_net.target_model.ff.weight) 10 | net.ff.weight.data += 1.0 11 | print("After update") 12 | print("Main net:", net.ff.weight) 13 | print("Target net:", tgt_net.target_model.ff.weight) 14 | tgt_net.sync() 15 | print("After sync") 16 | print("Main net:", net.ff.weight) 17 | print("Target net:", tgt_net.target_model.ff.weight) 18 | -------------------------------------------------------------------------------- /Chapter07/06_cartpole.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | from ptan.experience import ExperienceFirstLast, ExperienceSourceFirstLast 3 | import ptan 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | import typing as tt 10 | 11 | 12 | HIDDEN_SIZE = 128 13 | BATCH_SIZE = 16 14 | TGT_NET_SYNC = 10 15 | GAMMA = 0.9 16 | REPLAY_SIZE = 1000 17 | LR = 1e-3 18 | EPS_DECAY = 0.99 19 | 20 | 21 | class Net(nn.Module): 22 | def __init__(self, obs_size: int, hidden_size: int, n_actions: int): 23 | super(Net, self).__init__() 24 | self.net = nn.Sequential( 25 | nn.Linear(obs_size, hidden_size), 26 | nn.ReLU(), 27 | nn.Linear(hidden_size, n_actions) 28 | ) 29 | 30 | def forward(self, x): 31 | return self.net(x.float()) 32 | 33 | 34 | @torch.no_grad() 35 | def unpack_batch(batch: tt.List[ExperienceFirstLast], net: Net, gamma: float): 36 | states = [] 37 | actions = [] 38 | rewards = [] 39 | done_masks = [] 40 | last_states = [] 41 | for exp in batch: 42 | states.append(exp.state) 43 | actions.append(exp.action) 44 | rewards.append(exp.reward) 45 | done_masks.append(exp.last_state is None) 46 | if exp.last_state is None: 47 | last_states.append(exp.state) 48 | else: 49 | last_states.append(exp.last_state) 50 | 51 | states_v = torch.as_tensor(np.stack(states)) 52 | actions_v = torch.tensor(actions) 53 | rewards_v = torch.tensor(rewards) 54 | last_states_v = torch.as_tensor(np.stack(last_states)) 55 | last_state_q_v = net(last_states_v) 56 | best_last_q_v = torch.max(last_state_q_v, dim=1)[0] 57 | best_last_q_v[done_masks] = 0.0 58 | return states_v, actions_v, best_last_q_v * gamma + rewards_v 59 | 60 | 61 | if __name__ == "__main__": 62 | env = gym.make("CartPole-v1") 63 | obs_size = env.observation_space.shape[0] 64 | n_actions = env.action_space.n 65 | 66 | net = Net(obs_size, HIDDEN_SIZE, n_actions) 67 | tgt_net = ptan.agent.TargetNet(net) 68 | selector = ptan.actions.ArgmaxActionSelector() 69 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1, selector=selector) 70 | agent = ptan.agent.DQNAgent(net, selector) 71 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA) 72 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE) 73 | optimizer = optim.Adam(net.parameters(), LR) 74 | 75 | step = 0 76 | episode = 0 77 | solved = False 78 | 79 | while True: 80 | step += 1 81 | buffer.populate(1) 82 | 83 | for reward, steps in exp_source.pop_rewards_steps(): 84 | episode += 1 85 | print(f"{step}: episode {episode} done, reward={reward:.2f}, " 86 | f"epsilon={selector.epsilon:.2f}") 87 | solved = reward > 150 88 | if solved: 89 | print("Whee!") 90 | break 91 | if len(buffer) < 2*BATCH_SIZE: 92 | continue 93 | batch = buffer.sample(BATCH_SIZE) 94 | states_v, actions_v, tgt_q_v = 
unpack_batch(batch, tgt_net.target_model, GAMMA) 95 | optimizer.zero_grad() 96 | q_v = net(states_v) 97 | q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 98 | loss_v = F.mse_loss(q_v, tgt_q_v) 99 | loss_v.backward() 100 | optimizer.step() 101 | selector.epsilon *= EPS_DECAY 102 | 103 | if step % TGT_NET_SYNC == 0: 104 | tgt_net.sync() 105 | -------------------------------------------------------------------------------- /Chapter07/lib.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import ptan 3 | import typing as tt 4 | import torch.nn as nn 5 | 6 | 7 | class ToyEnv(gym.Env): 8 | """ 9 | Environment with observation 0..4 and actions 0..2 10 | Observations are rotated sequentialy mod 5, reward is equal to given action. 11 | Episodes are having fixed length of 10 12 | """ 13 | 14 | def __init__(self): 15 | super(ToyEnv, self).__init__() 16 | self.observation_space = gym.spaces.Discrete(n=5) 17 | self.action_space = gym.spaces.Discrete(n=3) 18 | self.step_index = 0 19 | 20 | def reset(self): 21 | self.step_index = 0 22 | return self.step_index, {} 23 | 24 | def step(self, action: int): 25 | is_done = self.step_index == 10 26 | if is_done: 27 | return self.step_index % self.observation_space.n, 0.0, is_done, False, {} 28 | self.step_index += 1 29 | return self.step_index % self.observation_space.n, float(action), \ 30 | self.step_index == 10, False, {} 31 | 32 | 33 | class DullAgent(ptan.agent.BaseAgent): 34 | """ 35 | Agent always returns the fixed action 36 | """ 37 | def __init__(self, action: int): 38 | self.action = action 39 | 40 | def __call__(self, observations: tt.List[int], state: tt.Optional[list] = None) -> \ 41 | tt.Tuple[tt.List[int], tt.Optional[list]]: 42 | return [self.action for _ in observations], state 43 | 44 | 45 | class DQNNet(nn.Module): 46 | def __init__(self): 47 | super(DQNNet, self).__init__() 48 | self.ff = nn.Linear(5, 3) 49 | 50 | def forward(self, x): 51 | return self.ff(x) -------------------------------------------------------------------------------- /Chapter08/01_dqn_basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | import typing as tt 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from ignite.engine import Engine 10 | 11 | from lib import dqn_model, common 12 | 13 | NAME = "01_baseline" 14 | 15 | BEST_PONG = common.Hyperparams( 16 | env_name="PongNoFrameskip-v4", 17 | stop_reward=18.0, 18 | run_name="pong", 19 | replay_size=100_000, 20 | replay_initial=10_000, 21 | target_net_sync=1000, 22 | epsilon_frames=100_000, 23 | epsilon_final=0.02, 24 | learning_rate=9.932831968547505e-05, 25 | gamma=0.98, 26 | episodes_to_solve=340, 27 | ) 28 | 29 | 30 | def train(params: common.Hyperparams, 31 | device: torch.device, _: dict) -> tt.Optional[int]: 32 | env = gym.make(params.env_name) 33 | env = ptan.common.wrappers.wrap_dqn(env) 34 | 35 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 36 | tgt_net = ptan.agent.TargetNet(net) 37 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start) 38 | epsilon_tracker = common.EpsilonTracker(selector, params) 39 | agent = ptan.agent.DQNAgent(net, selector, device=device) 40 | 41 | exp_source = ptan.experience.ExperienceSourceFirstLast( 42 | env, agent, gamma=params.gamma, env_seed=common.SEED) 43 | buffer = ptan.experience.ExperienceReplayBuffer( 44 | exp_source, 
buffer_size=params.replay_size) 45 | optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) 46 | 47 | def process_batch(engine, batch): 48 | optimizer.zero_grad() 49 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, 50 | gamma=params.gamma, device=device) 51 | loss_v.backward() 52 | optimizer.step() 53 | epsilon_tracker.frame(engine.state.iteration) 54 | if engine.state.iteration % params.target_net_sync == 0: 55 | tgt_net.sync() 56 | return { 57 | "loss": loss_v.item(), 58 | "epsilon": selector.epsilon, 59 | } 60 | 61 | engine = Engine(process_batch) 62 | common.setup_ignite(engine, params, exp_source, NAME) 63 | r = engine.run(common.batch_generator(buffer, params.replay_initial, params.batch_size)) 64 | if r.solved: 65 | return r.episode 66 | 67 | 68 | if __name__ == "__main__": 69 | args = common.argparser().parse_args() 70 | common.train_or_tune(args, train, BEST_PONG) 71 | -------------------------------------------------------------------------------- /Chapter08/04_dqn_noisy_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | import typing as tt 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from ignite.engine import Engine 10 | 11 | from lib import common, dqn_extra 12 | 13 | NAME = "04_noisy" 14 | NOISY_SNR_EVERY_ITERS = 100 15 | 16 | BEST_PONG = common.Hyperparams( 17 | env_name="PongNoFrameskip-v4", 18 | stop_reward=18.0, 19 | run_name="pong", 20 | replay_size=100_000, 21 | replay_initial=10_000, 22 | target_net_sync=1000, 23 | epsilon_frames=100_000, 24 | epsilon_final=0.02, 25 | learning_rate=7.142520950425814e-05, 26 | gamma=0.99, 27 | episodes_to_solve=273, 28 | ) 29 | 30 | 31 | 32 | def train(params: common.Hyperparams, 33 | device: torch.device, extra: dict) -> tt.Optional[int]: 34 | env = gym.make(params.env_name) 35 | env = ptan.common.wrappers.wrap_dqn(env) 36 | 37 | net = dqn_extra.NoisyDQN( 38 | env.observation_space.shape, 39 | env.action_space.n).to(device) 40 | 41 | tgt_net = ptan.agent.TargetNet(net) 42 | selector = ptan.actions.ArgmaxActionSelector() 43 | agent = ptan.agent.DQNAgent(net, selector, device=device) 44 | 45 | exp_source = ptan.experience.ExperienceSourceFirstLast( 46 | env, agent, gamma=params.gamma, env_seed=common.SEED) 47 | buffer = ptan.experience.ExperienceReplayBuffer( 48 | exp_source, buffer_size=params.replay_size) 49 | optimizer = optim.Adam(net.parameters(), 50 | lr=params.learning_rate) 51 | 52 | def process_batch(engine, batch): 53 | optimizer.zero_grad() 54 | loss_v = common.calc_loss_dqn( 55 | batch, net, tgt_net.target_model, 56 | gamma=params.gamma, device=device) 57 | loss_v.backward() 58 | optimizer.step() 59 | net.reset_noise() 60 | if engine.state.iteration % params.target_net_sync == 0: 61 | tgt_net.sync() 62 | if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0: 63 | for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()): 64 | engine.state.metrics[f'snr_{layer_idx+1}'] = sigma_l2 65 | return { 66 | "loss": loss_v.item(), 67 | } 68 | 69 | engine = Engine(process_batch) 70 | common.setup_ignite(engine, params, exp_source, NAME, extra_metrics=('snr_1', 'snr_2')) 71 | r = engine.run(common.batch_generator(buffer, params.replay_initial, 72 | params.batch_size)) 73 | if r.solved: 74 | return r.episode 75 | 76 | 77 | if __name__ == "__main__": 78 | args = common.argparser().parse_args() 79 | common.train_or_tune(args, train, BEST_PONG) 80 | 
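Note: 04_dqn_noisy_net.py relies on dqn_extra.NoisyDQN, whose noisy linear layers (the objects behind net.reset_noise() and net.noisy_layers_sigma_snr()) live in lib/dqn_extra.py and are not included in this excerpt. As a rough orientation only, a minimal independent-Gaussian noisy layer could be sketched as below; the class and method names are illustrative assumptions, not the repository's actual implementation.

import torch
import torch.nn as nn

class NoisyLinear(nn.Linear):
    # Linear layer with learnable Gaussian noise on weights and bias
    # (NoisyNet, Fortunato et al., 2017) -- illustrative sketch only.
    def __init__(self, in_features: int, out_features: int, sigma_init: float = 0.017):
        super().__init__(in_features, out_features)
        # learnable noise scale per weight / bias element
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
        # noise samples kept in buffers, resampled between optimization steps
        self.register_buffer("eps_weight", torch.zeros(out_features, in_features))
        self.register_buffer("eps_bias", torch.zeros(out_features))

    def reset_noise(self):
        # resample the noise; net.reset_noise() in the training loop above
        # would delegate to each such layer
        self.eps_weight.normal_()
        self.eps_bias.normal_()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.weight + self.sigma_weight * self.eps_weight
        b = self.bias + self.sigma_bias * self.eps_bias
        return nn.functional.linear(x, w, b)

    def sigma_snr(self) -> float:
        # signal-to-noise ratio of the kind logged as snr_1 / snr_2 above
        return (self.weight.norm() / self.sigma_weight.norm()).item()

Because exploration comes from this parametric noise, the script above can use a plain ArgmaxActionSelector instead of the epsilon-greedy selector used by 01_dqn_basic.py.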
-------------------------------------------------------------------------------- /Chapter08/06_dqn_dueling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | import typing as tt 5 | 6 | import torch 7 | from torch import nn 8 | from torch import optim 9 | import numpy as np 10 | 11 | from ignite.engine import Engine 12 | 13 | from lib import dqn_extra, common 14 | 15 | NAME = "06_dueling" 16 | 17 | STATES_TO_EVALUATE = 1000 18 | EVAL_EVERY_FRAME = 100 19 | 20 | BEST_PONG = common.GAME_PARAMS['pong'] 21 | 22 | 23 | @torch.no_grad() 24 | def evaluate_states(states: np.ndarray, net: nn.Module, 25 | device: torch.device, engine: Engine): 26 | s_v = torch.as_tensor(states).to(device) 27 | adv, val = net.adv_val(s_v) 28 | engine.state.metrics['adv'] = adv.mean().item() 29 | engine.state.metrics['val'] = val.mean().item() 30 | 31 | 32 | def train(params: common.Hyperparams, 33 | device: torch.device, extra: dict) -> tt.Optional[int]: 34 | env = gym.make(params.env_name) 35 | env = ptan.common.wrappers.wrap_dqn(env) 36 | 37 | net = dqn_extra.DuelingDQN(env.observation_space.shape, 38 | env.action_space.n).to(device) 39 | 40 | tgt_net = ptan.agent.TargetNet(net) 41 | selector = ptan.actions.EpsilonGreedyActionSelector( 42 | epsilon=params.epsilon_start) 43 | epsilon_tracker = common.EpsilonTracker(selector, params) 44 | agent = ptan.agent.DQNAgent(net, selector, device=device) 45 | 46 | exp_source = ptan.experience.ExperienceSourceFirstLast( 47 | env, agent, gamma=params.gamma, env_seed=common.SEED) 48 | buffer = ptan.experience.ExperienceReplayBuffer( 49 | exp_source, buffer_size=params.replay_size) 50 | optimizer = optim.Adam(net.parameters(), 51 | lr=params.learning_rate) 52 | 53 | def process_batch(engine, batch): 54 | optimizer.zero_grad() 55 | loss_v = common.calc_loss_dqn( 56 | batch, net, tgt_net.target_model, 57 | gamma=params.gamma, device=device) 58 | loss_v.backward() 59 | optimizer.step() 60 | epsilon_tracker.frame(engine.state.iteration) 61 | if engine.state.iteration % params.target_net_sync == 0: 62 | tgt_net.sync() 63 | if engine.state.iteration % EVAL_EVERY_FRAME == 0: 64 | eval_states = getattr(engine.state, "eval_states", None) 65 | if eval_states is None: 66 | eval_states = buffer.sample(STATES_TO_EVALUATE) 67 | eval_states = [ 68 | np.asarray(transition.state) 69 | for transition in eval_states 70 | ] 71 | eval_states = np.asarray(eval_states) 72 | engine.state.eval_states = eval_states 73 | evaluate_states(eval_states, net, device, engine) 74 | return { 75 | "loss": loss_v.item(), 76 | "epsilon": selector.epsilon, 77 | } 78 | 79 | engine = Engine(process_batch) 80 | common.setup_ignite(engine, params, exp_source, NAME, extra_metrics=('adv', 'val')) 81 | r = engine.run(common.batch_generator( 82 | buffer, params.replay_initial, params.batch_size)) 83 | if r.solved: 84 | return r.episode 85 | 86 | 87 | if __name__ == "__main__": 88 | args = common.argparser().parse_args() 89 | common.train_or_tune(args, train, BEST_PONG) -------------------------------------------------------------------------------- /Chapter08/adhoc/commute.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | mpl.use("Agg") 4 | import matplotlib.pyplot as plt 5 | 6 | if __name__ == "__main__": 7 | plt.clf() 8 | v1 = np.random.normal(30, 2.0, size=2000) 9 | v2 = np.random.normal(90, 4.0, size=200) 10 | v = 
np.concatenate((v1, v2)) 11 | mean_time = v.mean() 12 | plt.hist(v, density=True, bins=100) 13 | plt.title("Car commute time distribution\nmean=%.2f mins" % mean_time) 14 | plt.xlabel("Time, minutes") 15 | plt.ylabel("Probability") 16 | plt.savefig("commute-car.png") 17 | 18 | plt.clf() 19 | v1 = np.random.normal(40, 2.0, size=2000) 20 | v2 = np.random.normal(60, 1.0, size=50) 21 | v = np.concatenate((v1, v2)) 22 | mean_time = v.mean() 23 | plt.hist(v, density=True, bins=100) 24 | plt.title("Train commute time distribution\nmean=%.2f mins" % mean_time) 25 | plt.xlabel("Time, minutes") 26 | plt.ylabel("Probability") 27 | plt.savefig("commute-train.png") 28 | -------------------------------------------------------------------------------- /Chapter08/adhoc/distr_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | import lib.dqn_extra 5 | 6 | sys.path.append("./") 7 | 8 | from lib import common 9 | 10 | import matplotlib as mpl 11 | mpl.use("Agg") 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | Vmax = 10 16 | Vmin = -10 17 | N_ATOMS = 51 18 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 19 | 20 | 21 | def save_distr(src, proj, name): 22 | plt.clf() 23 | p = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 24 | plt.subplot(2, 1, 1) 25 | plt.bar(p, src, width=0.5) 26 | plt.title("Source") 27 | plt.subplot(2, 1, 2) 28 | plt.bar(p, proj, width=0.5) 29 | plt.title("Projected") 30 | plt.savefig(name + ".png") 31 | 32 | 33 | if __name__ == "__main__": 34 | np.random.seed(123) 35 | atoms = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 36 | 37 | # single peak distribution 38 | src_hist = np.zeros(shape=(1, N_ATOMS), dtype=np.float32) 39 | src_hist[0, N_ATOMS//2+1] = 1.0 40 | proj_hist = lib.dqn_extra.distr_projection(src_hist, np.array([2], dtype=np.float32), np.array([False]), 41 | Vmin, Vmax, N_ATOMS, gamma=0.9) 42 | save_distr(src_hist[0], proj_hist[0], "peak-r=2") 43 | 44 | # normal distribution 45 | data = np.random.normal(size=1000, scale=3) 46 | hist = np.histogram(data, density=True, bins=np.arange(Vmin - DELTA_Z/2, Vmax + DELTA_Z*3/2, DELTA_Z)) 47 | 48 | src_hist = hist[0] 49 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([False]), 50 | Vmin, Vmax, N_ATOMS, gamma=0.9) 51 | save_distr(hist[0], proj_hist[0], "normal-r=2") 52 | 53 | # normal distribution, but done episode 54 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([True]), 55 | Vmin, Vmax, N_ATOMS, gamma=0.9) 56 | save_distr(hist[0], proj_hist[0], "normal-done-r=2") 57 | 58 | # clipping for out-of-range distribution 59 | proj_dist = lib.dqn_extra.distr_projection(np.array([src_hist]), np.array([10], dtype=np.float32), np.array([False]), 60 | Vmin, Vmax, N_ATOMS, gamma=0.9) 61 | save_distr(hist[0], proj_dist[0], "normal-r=10") 62 | 63 | # test both done and not done, unclipped 64 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist, src_hist]), np.array([2, 2], dtype=np.float32), 65 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 66 | save_distr(src_hist, proj_hist[0], "both_not_clip-01-incomplete") 67 | save_distr(src_hist, proj_hist[1], "both_not_clip-02-complete") 68 | 69 | # test both done and not done, clipped right 70 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist, src_hist]), np.array([10, 10], dtype=np.float32), 71 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 72 | save_distr(src_hist,
proj_hist[0], "both_clip-right-01-incomplete") 73 | save_distr(src_hist, proj_hist[1], "both_clip-right-02-complete") 74 | 75 | # test both done and not done, clipped left 76 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist, src_hist]), np.array([-10, -10], dtype=np.float32), 77 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 78 | save_distr(src_hist, proj_hist[0], "both_clip-left-01-incomplete") 79 | save_distr(src_hist, proj_hist[1], "both_clip-left-02-complete") 80 | 81 | pass 82 | -------------------------------------------------------------------------------- /Chapter08/bench/simple_buffer_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Benchmark various Replay Buffer variants 4 | """ 5 | import timeit 6 | import numpy as np 7 | import collections 8 | 9 | 10 | SIZES = [10**n for n in (3, 4, 5)] 11 | DATA_SHAPE = (84, 84, 4) 12 | REPEAT_NUMBER = 10 13 | 14 | 15 | class ExperienceBufferDeque: 16 | def __init__(self, capacity): 17 | self.buffer = collections.deque(maxlen=capacity) 18 | 19 | def __len__(self): 20 | return len(self.buffer) 21 | 22 | def append(self, experience): 23 | self.buffer.append(experience) 24 | 25 | def sample(self, batch_size): 26 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 27 | return [self.buffer[idx] for idx in indices] 28 | 29 | 30 | class ExperienceBufferCircularList: 31 | def __init__(self, capacity): 32 | self.buffer = list() 33 | self.capacity = capacity 34 | self.pos = 0 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | def append(self, experience): 40 | if len(self.buffer) < self.capacity: 41 | self.buffer.append(experience) 42 | else: 43 | self.buffer[self.pos] = experience 44 | self.pos = (self.pos + 1) % self.capacity 45 | 46 | def sample(self, batch_size): 47 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 48 | return [self.buffer[idx] for idx in indices] 49 | 50 | 51 | 52 | def fill_buf(buf, size): 53 | for _ in range(size): 54 | buf.append(np.zeros(DATA_SHAPE, dtype=np.uint8)) 55 | 56 | 57 | def bench_buffer(buf_class): 58 | print("Benchmarking %s" % buf_class.__name__) 59 | 60 | for size in SIZES: 61 | print(" Test size %d" % size) 62 | ns = globals() 63 | ns.update(locals()) 64 | t = timeit.timeit('fill_buf(buf, size)', setup='buf = buf_class(size)', number=REPEAT_NUMBER, globals=ns) 65 | print(" * Initial fill:\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 66 | buf = buf_class(size) 67 | fill_buf(buf, size) 68 | ns.update(locals()) 69 | t = timeit.timeit('fill_buf(buf, size)', number=REPEAT_NUMBER, globals=ns) 70 | print(" * Append:\t\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 71 | t = timeit.timeit('buf.sample(4)', number=REPEAT_NUMBER*100, globals=ns) 72 | print(" * Sample 4:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 73 | t = timeit.timeit('buf.sample(8)', number=REPEAT_NUMBER*100, globals=ns) 74 | print(" * Sample 8:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 75 | t = timeit.timeit('buf.sample(16)', number=REPEAT_NUMBER*100, globals=ns) 76 | print(" * Sample 16:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 77 | t = timeit.timeit('buf.sample(32)', number=REPEAT_NUMBER*100, globals=ns) 78 | print(" * Sample 32:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 79 | 80 | 81 | 82 | if __name__ == "__main__": 83 | bench_buffer(ExperienceBufferCircularList) 84 | bench_buffer(ExperienceBufferDeque) 85 | pass 86 | 
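For a quick look at the wrap-around behaviour that the benchmark above exercises, the circular-list buffer can be driven by hand. A small usage sketch (an illustration, not part of the benchmark script; it assumes it is run from Chapter08/bench so that simple_buffer_bench is importable):

import numpy as np
from simple_buffer_bench import ExperienceBufferCircularList

buf = ExperienceBufferCircularList(capacity=5)
for i in range(8):                           # push more items than the buffer can hold
    buf.append(np.full((2, 2), i, dtype=np.uint8))
print(len(buf))                              # 5 -- items 0, 1 and 2 were overwritten
batch = buf.sample(3)                        # uniform sampling with replacement
print([int(item[0, 0]) for item in batch])   # values drawn from 3..7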
-------------------------------------------------------------------------------- /Chapter08/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter08/lib/__init__.py -------------------------------------------------------------------------------- /Chapter08/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import typing as tt 4 | 5 | 6 | class DQN(nn.Module): 7 | def __init__(self, input_shape: tt.Tuple[int, ...], 8 | n_actions: int): 9 | super(DQN, self).__init__() 10 | 11 | self.conv = nn.Sequential( 12 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 13 | nn.ReLU(), 14 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 15 | nn.ReLU(), 16 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 17 | nn.ReLU(), 18 | nn.Flatten(), 19 | ) 20 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 21 | self.fc = nn.Sequential( 22 | nn.Linear(size, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, n_actions) 25 | ) 26 | 27 | def forward(self, x: torch.ByteTensor): 28 | xx = x / 255.0 29 | return self.fc(self.conv(xx)) 30 | -------------------------------------------------------------------------------- /Chapter09/.gitignore: -------------------------------------------------------------------------------- 1 | runs 2 | res 3 | -------------------------------------------------------------------------------- /Chapter09/attic/03_parallel_orig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | 6 | import torch 7 | import torch.optim as optim 8 | import torch.multiprocessing as mp 9 | 10 | from tensorboardX import SummaryWriter 11 | 12 | from lib import dqn_model, common 13 | 14 | PLAY_STEPS = 4 15 | 16 | 17 | def play_func(params, net, cuda, exp_queue): 18 | env = gym.make(params.env_name) 19 | env = ptan.common.wrappers.wrap_dqn(env) 20 | device = torch.device("cuda" if cuda else "cpu") 21 | 22 | writer = SummaryWriter(comment="-" + params.run_name + "-03_parallel") 23 | 24 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start) 25 | epsilon_tracker = common.EpsilonTracker(selector, params) 26 | agent = ptan.agent.DQNAgent(net, selector, device=device) 27 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma, steps_count=1) 28 | exp_source_iter = iter(exp_source) 29 | 30 | frame_idx = 0 31 | 32 | with common.RewardTracker(writer, params.stop_reward) as reward_tracker: 33 | while True: 34 | frame_idx += 1 35 | exp = next(exp_source_iter) 36 | exp_queue.put(exp) 37 | 38 | epsilon_tracker.frame(frame_idx) 39 | 40 | new_rewards = exp_source.pop_total_rewards() 41 | if new_rewards: 42 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 43 | break 44 | 45 | exp_queue.put(None) 46 | 47 | 48 | if __name__ == "__main__": 49 | mp.set_start_method('spawn') 50 | params = common.HYPERPARAMS['pong'] 51 | params.batch_size *= PLAY_STEPS 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 54 | args = parser.parse_args() 55 | device = torch.device("cuda" if args.cuda else "cpu") 56 | 57 | env = gym.make(params.env_name) 58 | env = ptan.common.wrappers.wrap_dqn(env) 59 | 
60 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 61 | tgt_net = ptan.agent.TargetNet(net) 62 | 63 | buffer = ptan.experience.ExperienceReplayBuffer(experience_source=None, buffer_size=params.replay_size) 64 | optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) 65 | 66 | exp_queue = mp.Queue(maxsize=PLAY_STEPS * 2) 67 | play_proc = mp.Process(target=play_func, args=(params, net, args.cuda, exp_queue)) 68 | play_proc.start() 69 | 70 | frame_idx = 0 71 | 72 | while play_proc.is_alive(): 73 | # frame_idx += PLAY_STEPS 74 | #for _ in range(PLAY_STEPS): 75 | while exp_queue.qsize() > 1: 76 | exp = exp_queue.get() 77 | if exp is None: 78 | play_proc.join() 79 | break 80 | buffer._add(exp) 81 | frame_idx += 1 82 | if frame_idx % params.target_net_sync == 0: 83 | tgt_net.sync() 84 | 85 | if len(buffer) < params.replay_initial: 86 | continue 87 | optimizer.zero_grad() 88 | batch = buffer.sample(params.batch_size) 89 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params.gamma, device=device) 90 | loss_v.backward() 91 | optimizer.step() 92 | 93 | -------------------------------------------------------------------------------- /Chapter09/img/01_orig_tb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/01_orig_tb.png -------------------------------------------------------------------------------- /Chapter09/img/02_steps-tb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/02_steps-tb.png -------------------------------------------------------------------------------- /Chapter09/img/03-serial-blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/03-serial-blocks.png -------------------------------------------------------------------------------- /Chapter09/img/03_serial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/03_serial.png -------------------------------------------------------------------------------- /Chapter09/img/04_parallel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/04_parallel.png -------------------------------------------------------------------------------- /Chapter09/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__init__.py -------------------------------------------------------------------------------- /Chapter09/lib/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /Chapter09/lib/__pycache__/common.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__pycache__/common.cpython-311.pyc -------------------------------------------------------------------------------- /Chapter09/lib/__pycache__/dqn_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__pycache__/dqn_model.cpython-311.pyc -------------------------------------------------------------------------------- /Chapter09/lib/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import numpy as np 3 | 4 | from ptan.common.wrappers import ImageToPyTorch, BufferWrapper 5 | from stable_baselines3.common.atari_wrappers import ( 6 | StickyActionEnv, NoopResetEnv, EpisodicLifeEnv, 7 | FireResetEnv, WarpFrame, ClipRewardEnv 8 | ) 9 | from stable_baselines3.common.type_aliases import AtariStepReturn 10 | 11 | 12 | class JustSkipEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): 13 | """ 14 | Return only every ``skip``-th frame (frameskipping) 15 | 16 | :param env: Environment to wrap 17 | :param skip: Number of frames to skip 18 | The same action will be taken ``skip`` times. 19 | """ 20 | 21 | def __init__(self, env: gym.Env, skip: int = 4) -> None: 22 | super().__init__(env) 23 | self._skip = skip 24 | 25 | def step(self, action: int) -> AtariStepReturn: 26 | """ 27 | Step the environment with the given action 28 | Repeat the action, sum the rewards, and return the last observation.
29 | 30 | :param action: the action 31 | :return: observation, reward, terminated, truncated, information 32 | """ 33 | total_reward = 0.0 34 | info = {} 35 | obs = None 36 | terminated = truncated = False 37 | for i in range(self._skip): 38 | obs, reward, terminated, truncated, info = self.env.step(action) 39 | done = terminated or truncated 40 | total_reward += float(reward) 41 | if done: 42 | break 43 | return obs, total_reward, terminated, truncated, info 44 | 45 | 46 | class AtariWrapper(gym.Wrapper[np.ndarray, int, np.ndarray, int]): 47 | def __init__( 48 | self, 49 | env: gym.Env, 50 | noop_max: int = 30, 51 | frame_skip: int = 4, 52 | screen_size: int = 84, 53 | terminal_on_life_loss: bool = True, 54 | clip_reward: bool = True, 55 | action_repeat_probability: float = 0.0, 56 | ) -> None: 57 | if action_repeat_probability > 0.0: 58 | env = StickyActionEnv(env, action_repeat_probability) 59 | if noop_max > 0: 60 | env = NoopResetEnv(env, noop_max=noop_max) 61 | # frame_skip=1 is the same as no frame-skip (action repeat) 62 | if frame_skip > 1: 63 | env = JustSkipEnv(env, skip=frame_skip) 64 | if terminal_on_life_loss: 65 | env = EpisodicLifeEnv(env) 66 | if "FIRE" in env.unwrapped.get_action_meanings(): # type: ignore[attr-defined] 67 | env = FireResetEnv(env) 68 | env = WarpFrame(env, width=screen_size, height=screen_size) 69 | if clip_reward: 70 | env = ClipRewardEnv(env) 71 | 72 | super().__init__(env) 73 | 74 | 75 | def wrap_dqn(env: gym.Env, stack_frames: int = 4, 76 | episodic_life: bool = True, clip_reward: bool = True, 77 | noop_max: int = 0) -> gym.Env: 78 | """ 79 | Apply a common set of wrappers for Atari games. 80 | :param env: Environment to wrap 81 | :param stack_frames: count of frames to stack, default=4 82 | :param episodic_life: convert life to end of episode 83 | :param clip_reward: reward clipping 84 | :param noop_max: how many NOOP actions to execute 85 | :return: wrapped environment 86 | """ 87 | assert 'NoFrameskip' in env.spec.id 88 | env = AtariWrapper( 89 | env, clip_reward=clip_reward, noop_max=noop_max, 90 | terminal_on_life_loss=episodic_life 91 | ) 92 | env = ImageToPyTorch(env) 93 | if stack_frames > 1: 94 | env = BufferWrapper(env, stack_frames) 95 | return env -------------------------------------------------------------------------------- /Chapter09/lib/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import dataclasses 5 | import typing as tt 6 | 7 | from ptan.actions import EpsilonGreedyActionSelector 8 | from ptan.experience import ExperienceFirstLast, \ 9 | ExperienceReplayBuffer 10 | 11 | SEED = 123 12 | 13 | 14 | @dataclasses.dataclass 15 | class Hyperparams: 16 | env_name: str 17 | stop_reward: float 18 | run_name: str 19 | replay_size: int 20 | replay_initial: int 21 | target_net_sync: int 22 | epsilon_frames: int 23 | 24 | learning_rate: float = 0.0001 25 | batch_size: int = 32 26 | gamma: float = 0.99 27 | epsilon_start: float = 1.0 28 | epsilon_final: float = 0.02 29 | 30 | 31 | GAME_PARAMS = { 32 | 'pong': Hyperparams( 33 | env_name="PongNoFrameskip-v4", 34 | stop_reward=18.0, 35 | run_name="pong", 36 | replay_size=100_000, 37 | replay_initial=10_000, 38 | target_net_sync=1000, 39 | epsilon_frames=100_000, 40 | ), 41 | } 42 | 43 | 44 | def unpack_batch(batch: tt.List[ExperienceFirstLast]): 45 | states, actions, rewards, dones, last_states = [],[],[],[],[] 46 | for exp in batch: 47 | states.append(exp.state) 48 | 
actions.append(exp.action) 49 | rewards.append(exp.reward) 50 | dones.append(exp.last_state is None) 51 | if exp.last_state is None: 52 | lstate = exp.state # the result will be masked anyway 53 | else: 54 | lstate = exp.last_state 55 | last_states.append(lstate) 56 | return np.asarray(states), \ 57 | np.array(actions), \ 58 | np.array(rewards, dtype=np.float32), \ 59 | np.array(dones, dtype=bool), \ 60 | np.asarray(last_states) 61 | 62 | 63 | def calc_loss_dqn( 64 | batch: tt.List[ExperienceFirstLast], 65 | net: nn.Module, tgt_net: nn.Module, 66 | gamma: float, device: torch.device) -> torch.Tensor: 67 | states, actions, rewards, dones, next_states = \ 68 | unpack_batch(batch) 69 | 70 | states_v = torch.as_tensor(states).to(device) 71 | next_states_v = torch.as_tensor(next_states).to(device) 72 | actions_v = torch.LongTensor(actions).to(device) 73 | rewards_v = torch.FloatTensor(rewards).to(device) 74 | done_mask = torch.BoolTensor(dones).to(device) 75 | 76 | actions_v = actions_v.unsqueeze(-1) 77 | state_action_vals = net(states_v).gather(1, actions_v) 78 | state_action_vals = state_action_vals.squeeze(-1) 79 | with torch.no_grad(): 80 | next_state_vals = tgt_net(next_states_v).max(1)[0] 81 | next_state_vals[done_mask] = 0.0 82 | 83 | bellman_vals = next_state_vals.detach() * gamma + rewards_v 84 | return nn.MSELoss()(state_action_vals, bellman_vals) 85 | 86 | 87 | class EpsilonTracker: 88 | def __init__(self, selector: EpsilonGreedyActionSelector, 89 | params: Hyperparams): 90 | self.selector = selector 91 | self.params = params 92 | self.frame(0) 93 | 94 | def frame(self, frame_idx: int): 95 | eps = self.params.epsilon_start - \ 96 | frame_idx / self.params.epsilon_frames 97 | self.selector.epsilon = max(self.params.epsilon_final, eps) 98 | 99 | 100 | def batch_generator(buffer: ExperienceReplayBuffer, 101 | initial: int, batch_size: int) -> \ 102 | tt.Generator[tt.List[ExperienceFirstLast], None, None]: 103 | buffer.populate(initial) 104 | while True: 105 | buffer.populate(1) 106 | yield buffer.sample(batch_size) 107 | 108 | -------------------------------------------------------------------------------- /Chapter09/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import typing as tt 4 | 5 | 6 | class DQN(nn.Module): 7 | def __init__(self, input_shape: tt.Tuple[int, ...], 8 | n_actions: int): 9 | super(DQN, self).__init__() 10 | 11 | self.conv = nn.Sequential( 12 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 13 | nn.ReLU(), 14 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 15 | nn.ReLU(), 16 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 17 | nn.ReLU(), 18 | nn.Flatten(), 19 | ) 20 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 21 | self.fc = nn.Sequential( 22 | nn.Linear(size, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, n_actions) 25 | ) 26 | 27 | def forward(self, x: torch.ByteTensor): 28 | xx = x / 255.0 29 | return self.fc(self.conv(xx)) 30 | -------------------------------------------------------------------------------- /Chapter10/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | saves 3 | res 4 | -------------------------------------------------------------------------------- /Chapter10/conftest.py: -------------------------------------------------------------------------------- 1 | # this file adds current dir to the pytest path for modules import 
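The Chapter10 conftest.py above contains only that comment: placing a conftest.py at the chapter root makes pytest put this directory on sys.path during test collection, which is what lets the tests further down do "from lib import data". A rough explicit equivalent of that behaviour, shown only as a sketch (the committed file deliberately stays empty apart from the comment):

import os
import sys

# hypothetical explicit form: put the directory containing this conftest.py on
# sys.path so that `from lib import ...` resolves when pytest runs the tests
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))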
-------------------------------------------------------------------------------- /Chapter10/data/ch10-small-quotes.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter10/data/ch10-small-quotes.tgz -------------------------------------------------------------------------------- /Chapter10/data/unpack_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | tar xvf ch10-small-quotes.tgz 3 | -------------------------------------------------------------------------------- /Chapter10/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter10/lib/__init__.py -------------------------------------------------------------------------------- /Chapter10/lib/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import glob 4 | import pathlib 5 | import numpy as np 6 | import typing as tt 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class Prices: 12 | open: np.ndarray 13 | high: np.ndarray 14 | low: np.ndarray 15 | close: np.ndarray 16 | volume: np.ndarray 17 | 18 | 19 | def read_csv(file_path: pathlib.Path, sep: str = ',', 20 | filter_data: bool = True, 21 | fix_open_price: bool = False) -> Prices: 22 | print("Reading", file_path) 23 | with file_path.open('rt', encoding='utf-8') as fd: 24 | reader = csv.reader(fd, delimiter=sep) 25 | h = next(reader) 26 | if '<OPEN>' not in h and sep == ',': 27 | return read_csv(file_path, ';') 28 | indices = [ 29 | h.index(s) 30 | for s in ('<OPEN>', '<HIGH>', '<LOW>', 31 | '<CLOSE>', '<VOL>') 32 | ] 33 | o, h, l, c, v = [], [], [], [], [] 34 | count_out = 0 35 | count_filter = 0 36 | count_fixed = 0 37 | prev_vals = None 38 | filter_func = lambda v: abs(v-vals[0]) < 1e-8 39 | for row in reader: 40 | vals = list(map(float, [row[idx] for idx in indices])) 41 | if filter_data and all(map(filter_func, vals[:-1])): 42 | count_filter += 1 43 | continue 44 | 45 | po, ph, pl, pc, pv = vals 46 | 47 | # fix open price for current bar to match close price for the previous bar 48 | if fix_open_price and prev_vals is not None: 49 | ppo, pph, ppl, ppc, ppv = prev_vals 50 | if abs(po - ppc) > 1e-8: 51 | count_fixed += 1 52 | po = ppc 53 | pl = min(pl, po) 54 | ph = max(ph, po) 55 | count_out += 1 56 | o.append(po) 57 | c.append(pc) 58 | h.append(ph) 59 | l.append(pl) 60 | v.append(pv) 61 | prev_vals = vals 62 | print(f"Read done, got {count_filter + count_out} rows, " 63 | f"{count_filter} filtered, " 64 | f"{count_fixed} open prices adjusted") 65 | return Prices(open=np.array(o, dtype=np.float32), 66 | high=np.array(h, dtype=np.float32), 67 | low=np.array(l, dtype=np.float32), 68 | close=np.array(c, dtype=np.float32), 69 | volume=np.array(v, dtype=np.float32)) 70 | 71 | 72 | def prices_to_relative(prices: Prices): 73 | """ 74 | Convert prices to relative with respect to the open price 75 | :param prices: Prices instance with absolute values 76 | :return: Prices instance with relative high, low and close (open and volume unchanged) 77 | """ 78 | rh = (prices.high - prices.open) / prices.open 79 | rl = (prices.low - prices.open) / prices.open 80 | rc = (prices.close - prices.open) / prices.open 81 | return Prices(open=prices.open, high=rh,
low=rl, 82 | close=rc, volume=prices.volume) 83 | 84 | 85 | def load_relative(csv_path: pathlib.Path | str) -> Prices: 86 | if isinstance(csv_path, str): 87 | csv_path = pathlib.Path(csv_path) 88 | return prices_to_relative(read_csv(csv_path)) 89 | 90 | 91 | def price_files(dir_name: str) -> tt.List[pathlib.Path]: 92 | result = [] 93 | for path in glob.glob(os.path.join(dir_name, "*.csv")): 94 | result.append(pathlib.Path(path)) 95 | return result 96 | 97 | 98 | def load_year_data( 99 | year: int, basedir: str = 'data' 100 | ) -> tt.Dict[str, Prices]: 101 | y = str(year)[-2:] 102 | result = {} 103 | for path in glob.glob(os.path.join(basedir, "*_%s*.csv" % y)): 104 | result[path] = load_relative(pathlib.Path(path)) 105 | return result 106 | -------------------------------------------------------------------------------- /Chapter10/lib/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import typing as tt 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class SimpleFFDQN(nn.Module): 9 | def __init__(self, obs_len: int, actions_n: int): 10 | super(SimpleFFDQN, self).__init__() 11 | 12 | self.fc_val = nn.Sequential( 13 | nn.Linear(obs_len, 512), 14 | nn.ReLU(), 15 | nn.Linear(512, 512), 16 | nn.ReLU(), 17 | nn.Linear(512, 1) 18 | ) 19 | 20 | self.fc_adv = nn.Sequential( 21 | nn.Linear(obs_len, 512), 22 | nn.ReLU(), 23 | nn.Linear(512, 512), 24 | nn.ReLU(), 25 | nn.Linear(512, actions_n) 26 | ) 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | val = self.fc_val(x) 30 | adv = self.fc_adv(x) 31 | return val + (adv - adv.mean(dim=1, keepdim=True)) 32 | 33 | 34 | class DQNConv1D(nn.Module): 35 | def __init__(self, shape: tt.Tuple[int, ...], actions_n: int): 36 | super(DQNConv1D, self).__init__() 37 | 38 | self.conv = nn.Sequential( 39 | nn.Conv1d(shape[0], 128, 5), 40 | nn.ReLU(), 41 | nn.Conv1d(128, 128, 5), 42 | nn.ReLU(), 43 | nn.Flatten(), 44 | ) 45 | size = self.conv(torch.zeros(1, *shape)).size()[-1] 46 | 47 | self.fc_val = nn.Sequential( 48 | nn.Linear(size, 512), 49 | nn.ReLU(), 50 | nn.Linear(512, 1) 51 | ) 52 | 53 | self.fc_adv = nn.Sequential( 54 | nn.Linear(size, 512), 55 | nn.ReLU(), 56 | nn.Linear(512, actions_n) 57 | ) 58 | 59 | 60 | def forward(self, x: torch.Tensor) -> torch.Tensor: 61 | conv_out = self.conv(x) 62 | val = self.fc_val(conv_out) 63 | adv = self.fc_adv(conv_out) 64 | return val + (adv - adv.mean(dim=1, keepdim=True)) 65 | 66 | 67 | class DQNConv1DLarge(nn.Module): 68 | def __init__(self, shape, actions_n): 69 | super(DQNConv1DLarge, self).__init__() 70 | 71 | self.conv = nn.Sequential( 72 | nn.Conv1d(shape[0], 32, 3), 73 | nn.MaxPool1d(3, 2), 74 | nn.ReLU(), 75 | nn.Conv1d(32, 32, 3), 76 | nn.MaxPool1d(3, 2), 77 | nn.ReLU(), 78 | nn.Conv1d(32, 32, 3), 79 | nn.MaxPool1d(3, 2), 80 | nn.ReLU(), 81 | nn.Conv1d(32, 32, 3), 82 | nn.MaxPool1d(3, 2), 83 | nn.ReLU(), 84 | nn.Conv1d(32, 32, 3), 85 | nn.ReLU(), 86 | nn.Conv1d(32, 32, 3), 87 | nn.ReLU(), 88 | nn.Flatten(), 89 | ) 90 | size = self.conv(torch.zeros(1, *shape)).size()[-1] 91 | 92 | self.fc_val = nn.Sequential( 93 | nn.Linear(size, 512), 94 | nn.ReLU(), 95 | nn.Linear(512, 1) 96 | ) 97 | 98 | self.fc_adv = nn.Sequential( 99 | nn.Linear(size, 512), 100 | nn.ReLU(), 101 | nn.Linear(512, actions_n) 102 | ) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | conv_out = self.conv(x) 106 | val = self.fc_val(conv_out) 107 | adv = self.fc_adv(conv_out) 108 | return val + (adv - adv.mean(dim=1, keepdim=True)) 109 
| -------------------------------------------------------------------------------- /Chapter10/lib/validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | 5 | from lib import environ 6 | 7 | METRICS = ( 8 | 'episode_reward', 9 | 'episode_steps', 10 | 'order_profits', 11 | 'order_steps', 12 | ) 13 | 14 | 15 | def validation_run(env, net, episodes=100, device="cpu", epsilon=0.02, comission=0.1): 16 | stats = { metric: [] for metric in METRICS } 17 | 18 | for episode in range(episodes): 19 | obs, _ = env.reset() 20 | 21 | total_reward = 0.0 22 | position = None 23 | position_steps = None 24 | episode_steps = 0 25 | 26 | while True: 27 | obs_v = torch.tensor([obs]).to(device) 28 | out_v = net(obs_v) 29 | 30 | action_idx = out_v.max(dim=1)[1].item() 31 | if np.random.random() < epsilon: 32 | action_idx = env.action_space.sample() 33 | action = environ.Actions(action_idx) 34 | 35 | close_price = env._state._cur_close() 36 | 37 | if action == environ.Actions.Buy and position is None: 38 | position = close_price 39 | position_steps = 0 40 | elif action == environ.Actions.Close and position is not None: 41 | profit = close_price - position - (close_price + position) * comission / 100 42 | profit = 100.0 * profit / position 43 | stats['order_profits'].append(profit) 44 | stats['order_steps'].append(position_steps) 45 | position = None 46 | position_steps = None 47 | 48 | obs, reward, done, _, _ = env.step(action_idx) 49 | total_reward += reward 50 | episode_steps += 1 51 | if position_steps is not None: 52 | position_steps += 1 53 | if done: 54 | if position is not None: 55 | profit = close_price - position - (close_price + position) * comission / 100 56 | profit = 100.0 * profit / position 57 | stats['order_profits'].append(profit) 58 | stats['order_steps'].append(position_steps) 59 | break 60 | 61 | stats['episode_reward'].append(total_reward) 62 | stats['episode_steps'].append(episode_steps) 63 | 64 | return { key: np.mean(vals) for key, vals in stats.items() } 65 | -------------------------------------------------------------------------------- /Chapter10/run_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | from lib import environ, data, models 6 | 7 | import torch 8 | 9 | import matplotlib as mpl 10 | mpl.use("Agg") 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | EPSILON = 0.02 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-d", "--data", required=True, help="CSV file with quotes to run the model") 20 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 21 | parser.add_argument("-b", "--bars", type=int, default=50, help="Count of bars to feed into the model") 22 | parser.add_argument("-n", "--name", required=True, help="Name to use in output images") 23 | parser.add_argument("--commission", type=float, default=0.1, help="Commission size in percent, default=0.1") 24 | parser.add_argument("--conv", default=False, action="store_true", help="Use convolution model instead of FF") 25 | args = parser.parse_args() 26 | 27 | prices = data.load_relative(args.data) 28 | env = environ.StocksEnv({"TEST": prices}, bars_count=args.bars, reset_on_close=False, commission=args.commission, 29 | state_1d=args.conv, random_ofs_on_reset=False, reward_on_close=False, volumes=False) 30 | if args.conv: 31 | net = 
models.DQNConv1D(env.observation_space.shape, env.action_space.n) 32 | else: 33 | net = models.SimpleFFDQN(env.observation_space.shape[0], env.action_space.n) 34 | 35 | net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage, weights_only=True)) 36 | 37 | obs, _ = env.reset() 38 | start_price = env._state._cur_close() 39 | 40 | total_reward = 0.0 41 | step_idx = 0 42 | rewards = [] 43 | 44 | while True: 45 | step_idx += 1 46 | obs_v = torch.tensor([obs]) 47 | out_v = net(obs_v) 48 | action_idx = out_v.max(dim=1)[1].item() 49 | if np.random.random() < EPSILON: 50 | action_idx = env.action_space.sample() 51 | action = environ.Actions(action_idx) 52 | 53 | obs, reward, done, _, _ = env.step(action_idx) 54 | total_reward += reward 55 | rewards.append(total_reward) 56 | if step_idx % 100 == 0: 57 | print("%d: reward=%.3f" % (step_idx, total_reward)) 58 | if done: 59 | break 60 | 61 | plt.clf() 62 | plt.plot(rewards) 63 | plt.title("Total reward, data=%s" % args.name) 64 | plt.ylabel("Reward, %") 65 | plt.savefig("rewards-%s.png" % args.name) 66 | -------------------------------------------------------------------------------- /Chapter10/tests/test_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pathlib 3 | from lib import data 4 | 5 | 6 | def test_read_csv(): 7 | prices = data.read_csv(pathlib.Path("data/YNDX_160101_161231.csv")) 8 | assert isinstance(prices, data.Prices) 9 | 10 | 11 | def test_prices_to_relative(): 12 | t = data.Prices(open=np.array([1.0]), 13 | high=np.array([3.0]), 14 | low=np.array([0.5]), 15 | close=np.array([2.0]), 16 | volume=np.array([10])) 17 | rel = data.prices_to_relative(t) 18 | np.testing.assert_equal(rel.open, t.open) 19 | np.testing.assert_equal(rel.volume, t.volume) 20 | np.testing.assert_equal(rel.high, np.array([2.0])) # 200% growth 21 | np.testing.assert_equal(rel.low, np.array([-.5])) # 50% fall 22 | np.testing.assert_equal(rel.close, np.array([1.0])) # 100% growth 23 | 24 | 25 | def test_price_files(): 26 | files = data.price_files("data") 27 | assert len(files) > 0 28 | 29 | -------------------------------------------------------------------------------- /Chapter11/.gitignore: -------------------------------------------------------------------------------- 1 | res 2 | -------------------------------------------------------------------------------- /Chapter11/02_cartpole_reinforce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | from ptan.experience import ExperienceSourceFirstLast 5 | import numpy as np 6 | import typing as tt 7 | from torch.utils.tensorboard.writer import SummaryWriter 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | 14 | GAMMA = 0.99 15 | LEARNING_RATE = 0.01 16 | EPISODES_TO_TRAIN = 4 17 | 18 | 19 | class PGN(nn.Module): 20 | def __init__(self, input_size: int, n_actions: int): 21 | super(PGN, self).__init__() 22 | 23 | self.net = nn.Sequential( 24 | nn.Linear(input_size, 128), 25 | nn.ReLU(), 26 | nn.Linear(128, n_actions) 27 | ) 28 | 29 | def forward(self, x: torch.Tensor) -> torch.Tensor: 30 | return self.net(x) 31 | 32 | 33 | def calc_qvals(rewards: tt.List[float]) -> tt.List[float]: 34 | res = [] 35 | sum_r = 0.0 36 | for r in reversed(rewards): 37 | sum_r *= GAMMA 38 | sum_r += r 39 | res.append(sum_r) 40 | return list(reversed(res)) 41 | 42 | 43 
| if __name__ == "__main__": 44 | env = gym.make("CartPole-v1") 45 | writer = SummaryWriter(comment="-cartpole-reinforce") 46 | 47 | net = PGN(env.observation_space.shape[0], env.action_space.n) 48 | print(net) 49 | 50 | agent = ptan.agent.PolicyAgent( 51 | net, preprocessor=ptan.agent.float32_preprocessor, apply_softmax=True) 52 | exp_source = ExperienceSourceFirstLast(env, agent, gamma=GAMMA) 53 | 54 | optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) 55 | 56 | total_rewards = [] 57 | done_episodes = 0 58 | 59 | batch_episodes = 0 60 | batch_states, batch_actions, batch_qvals = [], [], [] 61 | cur_rewards = [] 62 | 63 | for step_idx, exp in enumerate(exp_source): 64 | batch_states.append(exp.state) 65 | batch_actions.append(int(exp.action)) 66 | cur_rewards.append(exp.reward) 67 | 68 | if exp.last_state is None: 69 | batch_qvals.extend(calc_qvals(cur_rewards)) 70 | cur_rewards.clear() 71 | batch_episodes += 1 72 | 73 | # handle new rewards 74 | new_rewards = exp_source.pop_total_rewards() 75 | if new_rewards: 76 | done_episodes += 1 77 | reward = new_rewards[0] 78 | total_rewards.append(reward) 79 | mean_rewards = float(np.mean(total_rewards[-100:])) 80 | print(f"{step_idx}: reward: {reward:6.2f}, mean_100: {mean_rewards:6.2f}, " 81 | f"episodes: {done_episodes}") 82 | writer.add_scalar("reward", reward, step_idx) 83 | writer.add_scalar("reward_100", mean_rewards, step_idx) 84 | writer.add_scalar("episodes", done_episodes, step_idx) 85 | if mean_rewards > 450: 86 | print(f"Solved in {step_idx} steps and {done_episodes} episodes!") 87 | break 88 | 89 | if batch_episodes < EPISODES_TO_TRAIN: 90 | continue 91 | 92 | optimizer.zero_grad() 93 | states_t = torch.as_tensor(np.asarray(batch_states)) 94 | batch_actions_t = torch.as_tensor(np.asarray(batch_actions)) 95 | batch_qvals_t = torch.as_tensor(np.asarray(batch_qvals)) 96 | 97 | logits_t = net(states_t) 98 | log_prob_t = F.log_softmax(logits_t, dim=1) 99 | batch_idx = range(len(batch_states)) 100 | act_probs_t = log_prob_t[batch_idx, batch_actions_t] 101 | log_prob_actions_v = batch_qvals_t * act_probs_t 102 | loss_t = -log_prob_actions_v.mean() 103 | 104 | loss_t.backward() 105 | optimizer.step() 106 | 107 | batch_episodes = 0 108 | batch_states.clear() 109 | batch_actions.clear() 110 | batch_qvals.clear() 111 | 112 | writer.close() 113 | -------------------------------------------------------------------------------- /Chapter11/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter11/lib/__init__.py -------------------------------------------------------------------------------- /Chapter11/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import typing as tt 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class RewardTracker: 11 | def __init__(self, writer, stop_reward): 12 | self.writer = writer 13 | self.stop_reward = stop_reward 14 | 15 | def __enter__(self): 16 | self.ts = time.time() 17 | self.ts_frame = 0 18 | self.total_rewards = [] 19 | return self 20 | 21 | def __exit__(self, *args): 22 | self.writer.close() 23 | 24 | def reward(self, reward, frame, epsilon=None): 25 | self.total_rewards.append(reward) 26 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 27 | self.ts_frame = frame 28 | self.ts = time.time() 
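# speed is frames per second since the previous reward() call; the mean over the
# last 100 episodes computed below is what the stop_reward check compares against
# to decide that training is done.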
29 | mean_reward = np.mean(self.total_rewards[-100:]) 30 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 31 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 32 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 33 | )) 34 | sys.stdout.flush() 35 | if epsilon is not None: 36 | self.writer.add_scalar("epsilon", epsilon, frame) 37 | self.writer.add_scalar("speed", speed, frame) 38 | self.writer.add_scalar("reward_100", mean_reward, frame) 39 | self.writer.add_scalar("reward", reward, frame) 40 | if mean_reward > self.stop_reward: 41 | print("Solved in %d frames!" % frame) 42 | return True 43 | return False 44 | 45 | 46 | class AtariPGN(nn.Module): 47 | def __init__(self, input_shape: tt.Tuple[int, ...], n_actions: int): 48 | super(AtariPGN, self).__init__() 49 | 50 | self.conv = nn.Sequential( 51 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 52 | nn.ReLU(), 53 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 54 | nn.ReLU(), 55 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 56 | nn.ReLU(), 57 | nn.Flatten(), 58 | ) 59 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 60 | 61 | self.fc = nn.Sequential( 62 | nn.Linear(size, 512), 63 | nn.ReLU(), 64 | nn.Linear(512, n_actions) 65 | ) 66 | 67 | def forward(self, x: torch.ByteTensor) -> torch.Tensor: 68 | xx = x / 255.0 69 | return self.fc(self.conv(xx)) 70 | 71 | -------------------------------------------------------------------------------- /Chapter12/.gitignore: -------------------------------------------------------------------------------- 1 | runs_arch 2 | -------------------------------------------------------------------------------- /Chapter12/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter12/lib/__init__.py -------------------------------------------------------------------------------- /Chapter13/adhoc/hf_t1.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import numpy as np 3 | 4 | if __name__ == "__main__": 5 | c = pipeline("feature-extraction") 6 | r = c(["I'm disappointed by delivery service", "Test sentence"]) 7 | for rr in r: 8 | a = np.array(rr) 9 | print(a.shape) -------------------------------------------------------------------------------- /Chapter13/adhoc/hf_t2.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | import numpy as np 3 | 4 | if __name__ == "__main__": 5 | c = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") 6 | print(c.get_sentence_embedding_dimension()) 7 | r = c.encode(["I'm disappointed by delivery service", "Test sentence"], convert_to_tensor=True) 8 | print(r.shape) -------------------------------------------------------------------------------- /Chapter13/adhoc/lc_t1.py: -------------------------------------------------------------------------------- 1 | from langchain_openai import ChatOpenAI 2 | 3 | if __name__ == "__main__": 4 | llm = ChatOpenAI() 5 | 6 | r = llm.invoke("What do you know about TextWorld games?") 7 | print(r) -------------------------------------------------------------------------------- /Chapter13/adhoc/openai_check.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | 4 
| OPENAI_KEY = os.environ["OPENAI_API_KEY"] 5 | 6 | def check_openai_api_key(api_key): 7 | client = openai.OpenAI(api_key=api_key) 8 | try: 9 | client.models.list() 10 | except openai.AuthenticationError as e: 11 | print(e) 12 | return False 13 | else: 14 | return True 15 | 16 | # Check the validity of the API key 17 | api_key_valid = check_openai_api_key(OPENAI_KEY) 18 | print("API key is valid:", api_key_valid) -------------------------------------------------------------------------------- /Chapter13/chatgpt_auto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from textworld import gym, EnvInfos 4 | from textworld.gym import register_game 5 | from langchain_openai import ChatOpenAI 6 | from langchain_core.output_parsers import StrOutputParser 7 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder 8 | 9 | 10 | def play_game(env, max_steps: int = 20) -> bool: 11 | prompt_init = ChatPromptTemplate.from_messages([ 12 | ("system", "You're playing the interactive fiction game. " 13 | "Reply with just a command in lowercase and nothing else"), 14 | ("system", "Game objective: {objective}"), 15 | ("user", "Room description: {description}"), 16 | ("user", "What command you want to execute next?"), 17 | ]) 18 | llm = ChatOpenAI() 19 | output_parser = StrOutputParser() 20 | 21 | commands = [] 22 | 23 | obs, info = env.reset() 24 | init_msg = prompt_init.invoke({ 25 | "objective": info['objective'], 26 | "description": info['description'], 27 | }) 28 | 29 | context = init_msg.to_messages() 30 | ai_msg = llm.invoke(init_msg) 31 | context.append(ai_msg) 32 | cmd = output_parser.invoke(ai_msg) 33 | 34 | prompt_next = ChatPromptTemplate.from_messages([ 35 | MessagesPlaceholder(variable_name="chat_history"), 36 | ("user", "Last command result: {result}"), 37 | ("user", "Room description: {description}"), 38 | ("user", "What command you want to execute next?"), 39 | ]) 40 | 41 | for _ in range(max_steps): 42 | commands.append(cmd) 43 | print(">>>", cmd) 44 | obs, r, is_done, info = env.step(cmd) 45 | if is_done: 46 | print(f"I won in {len(commands)} steps!") 47 | return True 48 | 49 | user_msgs = prompt_next.invoke({ 50 | "chat_history": context, 51 | "result": obs.strip(), 52 | "description": info['description'], 53 | }) 54 | context = user_msgs.to_messages() 55 | ai_msg = llm.invoke(user_msgs) 56 | context.append(ai_msg) 57 | cmd = output_parser.invoke(ai_msg) 58 | 59 | print(f"Wasn't able to solve after {max_steps} steps, commands: {commands}") 60 | return False 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("-g", "--game", default="simple", 66 | help="Game prefix to be used during training, default=simple") 67 | parser.add_argument("indices", nargs='+', type=int, default=[1], help="Game indices to test on, default=1") 68 | args = parser.parse_args() 69 | 70 | count_games, count_won = 0, 0 71 | for index in args.indices: 72 | print(f"Starting game {index}\n") 73 | env_id = register_game( 74 | gamefile=f"games/{args.game}{index}.ulx", 75 | request_infos=EnvInfos( 76 | description=True, 77 | objective=True, 78 | ), 79 | ) 80 | env = gym.make(env_id) 81 | count_games += 1 82 | if play_game(env): 83 | count_won += 1 84 | print(f"Played {count_games}, won {count_won}") 85 | -------------------------------------------------------------------------------- /Chapter13/chatgpt_interactive.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import textwrap 4 | from textworld import gym, EnvInfos 5 | from textworld.gym import register_game 6 | 7 | 8 | def play_game(env, max_steps: int = 20) -> bool: 9 | commands = [] 10 | 11 | obs, info = env.reset() 12 | 13 | print(textwrap.dedent("""\ 14 | You're playing the interactive fiction game. 15 | Here is the game objective: %s 16 | 17 | Here is the room description: %s 18 | 19 | What command do you want to execute next? Reply with 20 | just a command in lowercase and nothing else. 21 | """) % (info['objective'], info['description'])) 22 | 23 | print("=== Send this to chat.openai.com and type the reply...") 24 | 25 | while len(commands) < max_steps: 26 | cmd = input(">>> ") 27 | commands.append(cmd) 28 | obs, r, is_done, info = env.step(cmd) 29 | if is_done: 30 | print(f"You won in {len(commands)} steps! " 31 | f"Don't forget to congratulate ChatGPT!") 32 | return True 33 | 34 | print(textwrap.dedent("""\ 35 | Last command result: %s 36 | Room description: %s 37 | 38 | What's the next command? 39 | """) % (obs, info['description'])) 40 | print("=== Send this to chat.openai.com and type the reply...") 41 | 42 | print(f"Wasn't able to solve after {max_steps} steps, commands: {commands}") 43 | return False 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument("-g", "--game", default="simple", 49 | help="Game prefix to be used during training, default=simple") 50 | parser.add_argument("indices", nargs='+', type=int, default=[1], help="Game indices to test on, default=1") 51 | args = parser.parse_args() 52 | 53 | count_games, count_won = 0, 0 54 | for index in args.indices: 55 | env_id = register_game( 56 | gamefile=f"games/{args.game}{index}.ulx", 57 | request_infos=EnvInfos(description=True, objective=True), 58 | ) 59 | env = gym.make(env_id) 60 | count_games += 1 61 | print(f"Starting game {index}\n") 62 | if play_game(env): 63 | count_won += 1 64 | print(f"Played {count_games}, won {count_won}") -------------------------------------------------------------------------------- /Chapter13/conftest.py: -------------------------------------------------------------------------------- 1 | # this file adds current dir to the pytest path for modules import -------------------------------------------------------------------------------- /Chapter13/games/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.ni 3 | *.ulx 4 | *.z8 -------------------------------------------------------------------------------- /Chapter13/games/make_games.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | tw-make custom --world-size 5 --nb-objects 10 --quest-length 5 --quest-breadth 1 --seed 0 --output simple-val.ulx 3 | 4 | # change the range to generate more games 5 | for i in `seq 1 20`; do 6 | tw-make custom --world-size 5 --nb-objects 10 --quest-length 5 --quest-breadth 1 --seed $i --output simple$i.ulx 7 | done 8 | -------------------------------------------------------------------------------- /Chapter13/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter13/lib/__init__.py 
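The loop in chatgpt_auto.py above is a plain LangChain request/parse cycle: a ChatPromptTemplate is filled with the game objective and the current room description, the resulting messages are sent to ChatOpenAI, and StrOutputParser turns the reply into the next text command. A minimal sketch of a single round, assuming OPENAI_API_KEY is set; the objective and description strings below are made-up placeholders rather than output of a real TextWorld game:

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You're playing the interactive fiction game. "
               "Reply with just a command in lowercase and nothing else"),
    ("system", "Game objective: {objective}"),
    ("user", "Room description: {description}"),
    ("user", "What command you want to execute next?"),
])
llm = ChatOpenAI()
parser = StrOutputParser()

# placeholder game state, normally taken from env.reset() / env.step()
msgs = prompt.invoke({
    "objective": "Find the key and unlock the wooden door",
    "description": "You are in a small kitchen. A key lies on the table.",
})
ai_msg = llm.invoke(msgs)      # one chat completion request
cmd = parser.invoke(ai_msg)    # e.g. "take key"
print(cmd)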
-------------------------------------------------------------------------------- /Chapter13/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /Chapter13/requirements.txt: -------------------------------------------------------------------------------- 1 | textworld==1.6.1 2 | transformers==4.46.0 3 | sentence-transformers==3.2.1 4 | langchain-openai==0.3.4 5 | langchain==0.2.3 -------------------------------------------------------------------------------- /Chapter13/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter13/tests/__init__.py -------------------------------------------------------------------------------- /Chapter13/tests/test_preproc.py: -------------------------------------------------------------------------------- 1 | from pytest import mark 2 | 3 | from lib.preproc import RelativeDirectionWrapper 4 | 5 | 6 | @mark.parametrize("abs_act, dir_name, exp_rel_act", [ 7 | ("go north", "north", "go forward"), 8 | ("go south", "south", "go forward"), 9 | ("go east", "east", "go forward"), 10 | ("go east", "north", "go right"), 11 | ("go west", "north", "go left"), 12 | ("go south", "north", "go back"), 13 | ("go west", "east", "go back"), 14 | ("go west", "south", "go right"), 15 | ("go east", "south", "go left"), 16 | ("go north", "south", "go back"), 17 | ("go south", "east", "go right"), 18 | ("go south", "west", "go left"), 19 | ]) 20 | def test_abs_to_rel(abs_act, dir_name, exp_rel_act): 21 | dir_idx = RelativeDirectionWrapper.ABSOLUTE_DIRS.index(dir_name) 22 | rel_act = RelativeDirectionWrapper.abs_to_rel(abs_act, dir_idx) 23 | assert isinstance(rel_act, str) 24 | assert rel_act == exp_rel_act 25 | 26 | 27 | @mark.parametrize("rel_act, dir_name, exp_abs_act", [ 28 | ("go forward", "north", "go north"), 29 | ("go right", "north", "go east"), 30 | ("go back", "north", "go south"), 31 | ("go left", "north", "go west"), 32 | 33 | ("go forward", "east", "go east"), 34 | ("go right", "east", "go south"), 35 | ("go back", "east", "go west"), 36 | ("go left", "east", "go north"), 37 | ]) 38 | def test_rel_to_abs(rel_act, dir_name, exp_abs_act): 39 | dir_idx = RelativeDirectionWrapper.ABSOLUTE_DIRS.index(dir_name) 40 | abs_act = RelativeDirectionWrapper.rel_to_abs(rel_act, dir_idx) 41 | assert isinstance(abs_act, str) 42 | assert abs_act == exp_abs_act 43 | 44 | 45 | @mark.parametrize("rel_act, dir_name, exp_new_dir", [ 46 | ("go forward", "north", "north"), 47 | ("go right", "north", "east"), 48 | ("go left", "north", "west"), 49 | ("go back", "north", "south"), 50 | 51 | ("go forward", "west", "west"), 52 | ("go right", "west", "north"), 53 | ("go left", "west", "south"), 54 | ("go back", "west", "east"), 55 | ]) 56 | def test_rel_execute(rel_act, dir_name, exp_new_dir): 57 | dir_idx = RelativeDirectionWrapper.ABSOLUTE_DIRS.index(dir_name) 58 | new_dir = RelativeDirectionWrapper.rel_execute(rel_act, dir_idx) 59 | assert isinstance(new_dir, int) 60 | new_dir_name = RelativeDirectionWrapper.ABSOLUTE_DIRS[new_dir] 61 | assert new_dir_name == exp_new_dir 62 | 63 | 64 | def test_update_vocabs(): 65 | v, v_r = {}, {} 66 | RelativeDirectionWrapper.update_vocabs(v, v_r) 67 | assert len(v) == 4 68 | assert 
len(v_r) == 4 69 | assert v == {0: "right", 1: "forward", 2: "left", 3: "back"} 70 | assert v_r == {"right": 0, "forward": 1, "left": 2, "back": 3} 71 | 72 | v, v_r = {0: "word", 1: "left"}, {"word": 0, "left": 1} 73 | RelativeDirectionWrapper.update_vocabs(v, v_r) 74 | assert len(v) == 5 75 | assert len(v_r) == 5 76 | assert v == {0: "word", 1: "left", 2: "right", 3: "forward", 4: "back"} 77 | assert v_r == {"word": 0, "left": 1, "right": 2, "forward": 3, "back": 4} 78 | -------------------------------------------------------------------------------- /Chapter14/.gitignore: -------------------------------------------------------------------------------- 1 | out -------------------------------------------------------------------------------- /Chapter14/adhoc/01_wob_create.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time 3 | import gymnasium as gym 4 | import miniwob 5 | from miniwob.action import ActionTypes 6 | 7 | RENDER_ENV = False 8 | 9 | 10 | if __name__ == "__main__": 11 | gym.register_envs(miniwob) 12 | 13 | env = gym.make('miniwob/click-test-2-v1', render_mode='human' if RENDER_ENV else None) 14 | print(env) 15 | try: 16 | # Start a new episode. 17 | obs, info = env.reset() 18 | print("Obs keys:", list(obs.keys())) 19 | print("Info dict:", info) 20 | assert obs["utterance"] == "Click button ONE." 21 | assert obs["fields"] == (("target", "ONE"),) 22 | print("Screenshot shape:", obs['screenshot'].shape) 23 | if RENDER_ENV: 24 | # to let you look at the environment. 25 | time.sleep(2) 26 | 27 | # Find the HTML element with text "ONE". 28 | target_elems = [e for e in obs['dom_elements'] if e['text'] == "ONE"] 29 | assert target_elems 30 | print("Target elem:", target_elems[0]) 31 | 32 | # Click on the element. 
33 | action = env.unwrapped.create_action( 34 | ActionTypes.CLICK_ELEMENT, ref=target_elems[0]["ref"]) 35 | obs, reward, terminated, truncated, info = env.step(action) 36 | print(reward, terminated, info) 37 | finally: 38 | env.close() 39 | -------------------------------------------------------------------------------- /Chapter14/adhoc/02_act_clicks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time 3 | 4 | import numpy as np 5 | import gymnasium 6 | import miniwob 7 | import typing as tt 8 | from miniwob.action import ActionTypes, ActionSpaceConfig 9 | 10 | RENDER_ENV = True 11 | 12 | BIN_DX = 10 13 | BIN_DY = 10 14 | SIZE_Y = 210 15 | SIZE_X = 160 16 | BINS_X = SIZE_X // BIN_DX 17 | BINS_Y = SIZE_Y // BIN_DY 18 | 19 | 20 | 21 | def close_bins(elems: tt.Tuple[dict, ...]) -> tt.Tuple[int, int]: 22 | elem_ids = {e['ref']: e for e in elems} 23 | close_elem = None 24 | for e in elems: 25 | if e['text'] == 'Close': 26 | close_elem = e 27 | break 28 | # need to roll back while ref is negative 29 | while close_elem['ref'] < 0: 30 | close_elem = elem_ids[close_elem['parent']] 31 | x = close_elem['left'][0] + close_elem['width'][0] / 2.0 32 | y = close_elem['top'][0] + close_elem['height'][0] / 2.0 33 | return x // BIN_DX, y // BIN_DY 34 | 35 | 36 | 37 | if __name__ == "__main__": 38 | gymnasium.register_envs(miniwob) 39 | 40 | act_cfg = ActionSpaceConfig( 41 | action_types=(ActionTypes.CLICK_COORDS, ), 42 | coord_bins=(BINS_X, BINS_Y), 43 | ) 44 | env = gymnasium.make( 45 | 'miniwob/click-dialog-v1', 46 | render_mode='human' if RENDER_ENV else None, 47 | action_space_config=act_cfg, 48 | ) 49 | print(env) 50 | print(env.action_space) 51 | try: 52 | # Start a new episode. 53 | obs, info = env.reset() 54 | print("Obs keys:", list(obs.keys())) 55 | print("Info dict:", info) 56 | print("Screenshot shape:", obs['screenshot'].shape) 57 | coords = close_bins(obs['dom_elements']) 58 | 59 | action = { 60 | "action_type": 0, 61 | "coords": np.array(coords, dtype=np.int8) 62 | } 63 | print("action", action) 64 | if RENDER_ENV: 65 | time.sleep(3) 66 | obs, reward, is_done, is_trunc, info = env.step(action) 67 | print(reward, is_done, info) 68 | 69 | # Brute force to check that our action is correct (comment step() call above) 70 | if False: 71 | is_done = False 72 | for y in range(BINS_Y): 73 | for x in range(BINS_X): 74 | action = { 75 | "action_type": 0, 76 | "coords": np.array((x, y), dtype=np.int8) 77 | } 78 | obs, reward, is_done, is_trunc, info = env.step(action) 79 | if is_done: 80 | print("Episode done:", action) 81 | print(reward, is_done, info) 82 | break 83 | if is_done: 84 | break 85 | if RENDER_ENV: 86 | input() 87 | finally: 88 | env.close() 89 | -------------------------------------------------------------------------------- /Chapter14/adhoc/03_clicker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import time 4 | 5 | sys.path.append(".") 6 | import typing as tt 7 | from lib import wob 8 | 9 | RENDER_ENV = True 10 | 11 | 12 | def close_bins(elems: tt.Tuple[dict, ...]) -> int: 13 | elem_ids = {e['ref']: e for e in elems} 14 | close_elem = None 15 | for e in elems: 16 | if e['text'] == 'Close': 17 | close_elem = e 18 | break 19 | # need to roll back while ref is negative 20 | while close_elem['ref'] < 0: 21 | close_elem = elem_ids[close_elem['parent']] 22 | print(close_elem) 23 | x = close_elem['left'][0] #+ close_elem['width'][0] / 2.0 
24 | y = close_elem['top'][0] #+ close_elem['height'][0] / 2.0 25 | i = int(x // wob.BIN_SIZE) 26 | j = int((y - wob.Y_OFS) // wob.BIN_SIZE) - 1 27 | print(f"found elem x={x}, y={y} -> i={i}, j={j} = {i} + {j*16}") 28 | return i + 16*j 29 | 30 | if __name__ == "__main__": 31 | env = wob.MiniWoBClickWrapper.create( 32 | 'miniwob/click-dialog-v1', keep_obs=True, 33 | render_mode='human' if RENDER_ENV else None 34 | ) 35 | print(env) 36 | print(env.action_space) 37 | print(env.observation_space) 38 | try: 39 | # Start a new episode. 40 | obs, info = env.reset() 41 | orig_obs = info.pop(wob.MiniWoBClickWrapper.FULL_OBS_KEY) 42 | print("Obs shape:", obs.shape) 43 | print("Info dict:", info) 44 | action = close_bins(orig_obs['dom_elements']) 45 | print("action", action) 46 | 47 | # switch between detected close action and brute force mode 48 | if False: 49 | obs, reward, is_done, is_trunc, info = env.step(action) 50 | info.pop(wob.MiniWoBClickWrapper.FULL_OBS_KEY) 51 | print(reward, is_done, info) 52 | else: 53 | is_done = False 54 | for action in range(env.action_space.n): 55 | time.sleep(0.001) 56 | obs, reward, is_done, is_trunc, info = env.step(action) 57 | info.pop(wob.MiniWoBClickWrapper.FULL_OBS_KEY) 58 | print(action, "=>", reward, is_done, info) 59 | if is_done: 60 | print("Episode done:", action) 61 | break 62 | finally: 63 | env.close() 64 | -------------------------------------------------------------------------------- /Chapter14/adhoc/04_load_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | sys.path.append(".") 4 | import argparse 5 | import pathlib 6 | 7 | from lib import demos 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-s", "--save", help="If given, save observations to this image prefix") 13 | parser.add_argument("-i", "--input", help="Input file to parse", 14 | default="demos/click-dialog/click-dialog_0421123844.json") 15 | args = parser.parse_args() 16 | p = pathlib.Path(args.input) 17 | res = demos.load_demo_file(p, gamma=0.99, steps=2) 18 | for idx, e in enumerate(res): 19 | print(f"obs={e.state.shape}, act={e.action}, r={e.reward}, last={e.last_state is None}") 20 | if args.save is not None: 21 | name = f"{args.save}_{idx:04d}_a={e.action}.png" 22 | demos.save_obs_image(e.state, e.action, name) 23 | print("Saved to", name) 24 | -------------------------------------------------------------------------------- /Chapter14/adhoc/05_join_obs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | sys.path.append(".") 4 | import argparse 5 | import pathlib 6 | import pickle 7 | import json 8 | from lib import demos 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--dat", required=True, help="Data file in json format") 14 | parser.add_argument("--obs", required=True, help="Observations in pickle format") 15 | parser.add_argument("--save", default=False, action="store_true", help="Save images from observations") 16 | args = parser.parse_args() 17 | 18 | p = pathlib.Path(args.obs) 19 | rel_obs = pickle.loads(p.read_bytes()) 20 | p = pathlib.Path(args.dat) 21 | data = json.loads(p.read_text()) 22 | 23 | if args.save: 24 | for k in sorted(rel_obs.keys()): 25 | f = f"{k:05d}.png" 26 | demos.save_obs_image(rel_obs[k]['screenshot'], action=None, file_name=f, transpose=False) 27 | new_data = demos.join_obs(data, 
rel_obs) 28 | pass -------------------------------------------------------------------------------- /Chapter14/adhoc/06_save_traj.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool saves trajectories from several games using the given model 4 | """ 5 | import sys 6 | sys.path.append(".") 7 | import pathlib 8 | import argparse 9 | import torch 10 | import torch.nn.functional as F 11 | import numpy as np 12 | 13 | from lib import model, wob, demos 14 | 15 | ENV_NAME = 'miniwob/count-sides-v1' 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-m", "--model", required=True, help="Model file") 20 | parser.add_argument("-o", "--output", required=True, help="Dir to save screenshots") 21 | args = parser.parse_args() 22 | 23 | env = wob.MiniWoBClickWrapper.create(ENV_NAME) 24 | 25 | net = model.Model(input_shape=wob.WOB_SHAPE, n_actions=env.action_space.n) 26 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 27 | out_dir = pathlib.Path(args.output) 28 | out_dir.mkdir(parents=True, exist_ok=True) 29 | 30 | obs, info = env.reset() 31 | step_idx = 0 32 | 33 | while True: 34 | obs_v = torch.tensor(np.expand_dims(obs, axis=0)) 35 | logits_v = net(obs_v)[0] 36 | policy = F.softmax(logits_v, dim=1).data.numpy()[0] 37 | action = np.random.choice(len(policy), p=policy) 38 | 39 | new_obs, reward, done, is_tr, info = env.step(action) 40 | print(f"{step_idx}: act={action}, r={reward}, done={done}, tr={is_tr}: {info}") 41 | 42 | p = out_dir / f"scr_{step_idx:03d}_act={action}_r={reward:.2f}_d={done:d}_tr={is_tr:d}.png" 43 | demos.save_obs_image(obs, action, str(p)) 44 | obs = new_obs 45 | step_idx += 1 46 | if is_tr or done: 47 | break 48 | p = out_dir / f"scr_{step_idx:03d}.png" 49 | demos.save_obs_image(obs, action=None, file_name=str(p)) 50 | 51 | env.close() 52 | -------------------------------------------------------------------------------- /Chapter14/adhoc/06_save_traj_vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool saves trajectories from several games using the given model, vectorized version 4 | """ 5 | import sys 6 | sys.path.append(".") 7 | import pathlib 8 | import argparse 9 | import torch 10 | import torch.nn.functional as F 11 | import numpy as np 12 | import gymnasium as gym 13 | 14 | from lib import model, wob, demos 15 | 16 | ENV_NAME = 'miniwob/count-sides-v1' 17 | N_ENVS = 4 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("-m", "--model", required=True, help="Model file") 22 | parser.add_argument("-o", "--output", required=True, help="Dir to save screenshots") 23 | parser.add_argument("-a", type=int, help="If given, this action will be executed") 24 | args = parser.parse_args() 25 | 26 | envs = [ 27 | lambda: wob.MiniWoBClickWrapper.create(ENV_NAME) 28 | for _ in range(N_ENVS) 29 | ] 30 | env = gym.vector.AsyncVectorEnv(envs) 31 | 32 | net = model.Model(input_shape=wob.WOB_SHAPE, n_actions=env.single_action_space.n) 33 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 34 | out_dir = pathlib.Path(args.output) 35 | for i in range(N_ENVS): 36 | (out_dir / str(i)).mkdir(parents=True, exist_ok=True) 37 | 38 | obs, info = env.reset() 39 | step_idx = 0 40 | done_envs = set() 41 | 42 | while len(done_envs) < N_ENVS: 43 | obs_v = 
torch.tensor(obs) 44 | logits_v = net(obs_v)[0] 45 | policy = F.softmax(logits_v, dim=1).data.numpy() 46 | actions = [ 47 | np.random.choice(len(policy[i]), p=policy[i]) if args.a is None else args.a 48 | for i in range(N_ENVS) 49 | ] 50 | 51 | new_obs, rewards, dones, is_trs, infos = env.step(actions) 52 | for i, (action, reward, done, is_tr) in enumerate(zip(actions, rewards, dones, is_trs)): 53 | b_x, b_y = wob.action_to_bins(action) 54 | print(f"{step_idx}-{i}: act={action}, b={b_x}_{b_y}, r={reward}, done={done}, tr={is_tr}") 55 | p = out_dir / str(i) / f"scr_{step_idx:03d}_act={action}_b={b_x}-{b_y}_r={reward:.2f}_d={done:d}_tr={is_tr:d}.png" 56 | demos.save_obs_image(obs[i], action, str(p)) 57 | if is_tr or done: 58 | done_envs.add(i) 59 | obs = new_obs 60 | step_idx += 1 61 | 62 | env.close() 63 | -------------------------------------------------------------------------------- /Chapter14/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import typing as tt 4 | 5 | import numpy as np 6 | import torch 7 | from torch.utils.tensorboard.writer import SummaryWriter 8 | 9 | import ptan 10 | 11 | class RewardTracker: 12 | def __init__(self, writer: SummaryWriter): 13 | self.writer = writer 14 | 15 | def __enter__(self): 16 | self.ts = time.time() 17 | self.ts_frame = 0 18 | self.total_rewards = [] 19 | return self 20 | 21 | def __exit__(self, *args): 22 | self.writer.close() 23 | 24 | def reward(self, reward: float, frame: int, 25 | epsilon: tt.Optional[float] = None): 26 | self.total_rewards.append(reward) 27 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 28 | self.ts_frame = frame 29 | self.ts = time.time() 30 | mean_reward = np.mean(self.total_rewards[-100:]) 31 | epsilon_str = "" 32 | if epsilon is not None: 33 | epsilon_str = f", eps {epsilon:.2f}" 34 | print(f"{frame}: done {len(self.total_rewards)} games, " 35 | f"mean reward {mean_reward:.3f}, " 36 | f"speed {speed:.2f} f/s{epsilon_str}") 37 | sys.stdout.flush() 38 | if epsilon is not None: 39 | self.writer.add_scalar("epsilon", epsilon, frame) 40 | self.writer.add_scalar("speed", speed, frame) 41 | self.writer.add_scalar("reward_100", mean_reward, frame) 42 | self.writer.add_scalar("reward", reward, frame) 43 | return mean_reward if len(self.total_rewards) > 30 else None 44 | 45 | 46 | def unpack_batch(batch, net, last_val_gamma, device="cpu", states_preprocessor=ptan.agent.default_states_preprocessor): 47 | """ 48 | Convert batch into training tensors 49 | :param batch: 50 | :param net: 51 | :return: states variable, actions tensor, reference values variable 52 | """ 53 | states = [] 54 | actions = [] 55 | rewards = [] 56 | not_done_idx = [] 57 | last_states = [] 58 | for idx, exp in enumerate(batch): 59 | states.append(exp.state) 60 | actions.append(int(exp.action)) 61 | rewards.append(exp.reward) 62 | if exp.last_state is not None: 63 | not_done_idx.append(idx) 64 | last_states.append(exp.last_state) 65 | states_v = states_preprocessor(states) 66 | if torch.is_tensor(states_v): 67 | states_v = states_v.to(device) 68 | actions_t = torch.LongTensor(actions).to(device) 69 | 70 | # handle rewards 71 | rewards_np = np.array(rewards, dtype=np.float32) 72 | if not_done_idx: 73 | last_states_v = states_preprocessor(last_states) 74 | if torch.is_tensor(last_states_v): 75 | last_states_v = last_states_v.to(device) 76 | last_vals_v = net(last_states_v)[1] 77 | last_vals_np = last_vals_v.data.cpu().numpy()[:, 0] 78 | rewards_np[not_done_idx] += 
last_val_gamma * last_vals_np 79 | 80 | ref_vals_v = torch.FloatTensor(rewards_np).to(device) 81 | return states_v, actions_t, ref_vals_v 82 | -------------------------------------------------------------------------------- /Chapter14/requirements.txt: -------------------------------------------------------------------------------- 1 | miniwob==1.0 2 | nltk==3.8.1 3 | bottle==0.12.25 4 | -------------------------------------------------------------------------------- /Chapter14/wob_click_mm_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gym 4 | import universe 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | from lib import wob_vnc, model_vnc 11 | 12 | 13 | ENV_NAME = "wob.mini.ClickTab-v0" 14 | REMOTE_ADDR = 'vnc://gpu:5910+15910' 15 | 16 | # docker run -d -p 5910:5900 -p 15910:15900 --privileged --ipc host --cap-add SYS_ADMIN 92756d1f08ac 17 | 18 | 19 | def step_env(env, action): 20 | idle_count = 0 21 | while True: 22 | obs, reward, is_done, info = env.step([action]) 23 | if obs[0] is None: 24 | idle_count += 1 25 | continue 26 | break 27 | return obs[0], reward[0], is_done[0], info, idle_count 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-m", "--model", help="Model file to load") 33 | parser.add_argument("-n", "--name", required=True, help="Prefix to save screenshots") 34 | parser.add_argument("--count", type=int, default=1, help="Count of runs to play, default=1") 35 | parser.add_argument("--env", default=ENV_NAME, help="Environment name to solve, default=" + ENV_NAME) 36 | args = parser.parse_args() 37 | 38 | env_name = args.env 39 | if not env_name.startswith('wob.mini.'): 40 | env_name = "wob.mini." 
+ env_name 41 | 42 | env = gym.make(env_name) 43 | env = universe.wrappers.experimental.SoftmaxClickMouse(env) 44 | env = wob_vnc.MiniWoBCropper(env, keep_text=True) 45 | wob_vnc.configure(env, REMOTE_ADDR) 46 | 47 | net = model_vnc.ModelMultimodal(input_shape=wob_vnc.WOB_SHAPE, n_actions=env.action_space.n) 48 | if args.model: 49 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 50 | preprocessor = model_vnc.MultimodalPreprocessor.load(args.model[:-4] + ".pre") 51 | else: 52 | preprocessor = model_vnc.MultimodalPreprocessor() 53 | env.reset() 54 | 55 | for round_idx in range(args.count): 56 | action = env.action_space.sample() 57 | step_idx = 0 58 | while True: 59 | obs, reward, done, info, idle_count = step_env(env, action) 60 | print(step_idx, reward, done, idle_count) 61 | img_name = "%s_r%02d_s%04d_%.3f_i%02d_d%d.png" % ( 62 | args.name, round_idx, step_idx, reward, idle_count, int(done)) 63 | obs_v = preprocessor([obs]) 64 | logits_v = net(obs_v)[0] 65 | policy = F.softmax(logits_v, dim=1).data.numpy()[0] 66 | action = np.random.choice(len(policy), p=policy) 67 | wob_vnc.save_obs(obs[0], img_name, action=action) 68 | step_idx += 1 69 | if done or reward != 0: 70 | print("Round %d done" % round_idx) 71 | break 72 | pass 73 | -------------------------------------------------------------------------------- /Chapter14/wob_click_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from lib import wob, model 9 | 10 | 11 | ENV_NAME = 'miniwob/click-dialog-v1' 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-m", "--model", help="Model file to load") 17 | parser.add_argument("--count", type=int, default=1, help="Count of episodes to play, default=1") 18 | parser.add_argument("--env", default=ENV_NAME, help="Environment name to solve, default=" + ENV_NAME) 19 | parser.add_argument("--verbose", default=False, action='store_true', help="Display every step") 20 | parser.add_argument("--render", default=False, action='store_true', help="Show browser window") 21 | args = parser.parse_args() 22 | 23 | env_name = args.env 24 | if not env_name.startswith('miniwob/'): 25 | env_name = "miniwob/" + env_name 26 | 27 | render_mode = 'human' if args.render else None 28 | env = wob.MiniWoBClickWrapper.create(env_name, render_mode=render_mode) 29 | 30 | net = model.Model(input_shape=wob.WOB_SHAPE, n_actions=env.action_space.n) 31 | if args.model: 32 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 33 | 34 | steps_count = 0 35 | reward_sum = 0 36 | 37 | for round_idx in range(args.count): 38 | step_idx = 0 39 | obs, info = env.reset() 40 | while True: 41 | obs_v = torch.tensor(np.expand_dims(obs, axis=0)) 42 | logits_v = net(obs_v)[0] 43 | policy = F.softmax(logits_v, dim=1).data.numpy()[0] 44 | action = np.random.choice(len(policy), p=policy) 45 | 46 | obs, reward, done, is_tr, info = env.step(action) 47 | if args.verbose: 48 | print(step_idx, reward, done, info) 49 | 50 | step_idx += 1 51 | reward_sum += reward 52 | steps_count += 1 53 | if done: 54 | print("Round %d done" % round_idx) 55 | break 56 | print("Done %d rounds, mean steps %.2f, mean reward %.3f" % ( 57 | args.count, steps_count / args.count, reward_sum / args.count 58 | )) 59 | 60 | if args.render: 61 | input("Press 
enter to close the browser >>> ") 62 | env.close() 63 | 64 | pass 65 | -------------------------------------------------------------------------------- /Chapter15/01_check_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | 4 | ENV_ID = "MinitaurBulletEnv-v0" 5 | ENTRY = "pybullet_envs.bullet.minitaur_gym_env:MinitaurBulletEnv" 6 | RENDER = True 7 | 8 | 9 | if __name__ == "__main__": 10 | gym.register(ENV_ID, entry_point=ENTRY, max_episode_steps=1000, 11 | reward_threshold=15.0, disable_env_checker=True) 12 | env = gym.make(ENV_ID, render=RENDER) 13 | 14 | print("Observation space:", env.observation_space) 15 | print("Action space:", env.action_space) 16 | print(env) 17 | print(env.reset()) 18 | input("Press any key to exit\n") 19 | env.close() 20 | -------------------------------------------------------------------------------- /Chapter15/03_play_a2c.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gymnasium as gym 4 | 5 | from lib import model, common 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 14 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 15 | args = parser.parse_args() 16 | 17 | common.register_env() 18 | env = gym.make(common.ENV_ID, render_mode='rgb_array') 19 | if args.record is not None: 20 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 21 | 22 | net = model.ModelA2C(env.observation_space.shape[0], env.action_space.shape[0]) 23 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 24 | 25 | obs, _ = env.reset() 26 | total_reward = 0.0 27 | total_steps = 0 28 | while True: 29 | obs_v = torch.FloatTensor(np.expand_dims(obs, 0)) 30 | mu_v, var_v, val_v = net(obs_v) 31 | action = mu_v.squeeze(dim=0).data.numpy() 32 | action = np.clip(action, -1, 1) 33 | obs, reward, done, is_tr, _ = env.step(action) 34 | total_reward += reward 35 | total_steps += 1 36 | if done or is_tr: 37 | break 38 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 39 | env.close() 40 | -------------------------------------------------------------------------------- /Chapter15/05_play_ddpg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gymnasium as gym 4 | 5 | from lib import model, common 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 14 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 15 | args = parser.parse_args() 16 | 17 | common.register_env() 18 | env = gym.make(common.ENV_ID, render_mode='rgb_array') 19 | if args.record is not None: 20 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 21 | 22 | net = model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]) 23 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 24 | 25 | obs, _ = env.reset() 26 | total_reward = 0.0 27 | total_steps = 0 28 | while True: 29 | obs_v = 
torch.FloatTensor(np.expand_dims(obs, 0)) 30 | mu_v = net(obs_v) 31 | action = mu_v.squeeze(dim=0).data.numpy() 32 | action = np.clip(action, -1, 1) 33 | obs, reward, done, is_tr, _ = env.step(action) 34 | total_reward += reward 35 | total_steps += 1 36 | if done or is_tr: 37 | break 38 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 39 | env.close() -------------------------------------------------------------------------------- /Chapter15/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter15/lib/__init__.py -------------------------------------------------------------------------------- /Chapter15/lib/common.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import numpy as np 3 | import torch 4 | import ptan 5 | 6 | 7 | ENV_ID = "MinitaurBulletEnv-v0" 8 | ENTRY = "pybullet_envs.bullet.minitaur_gym_env:MinitaurBulletEnv" 9 | 10 | 11 | 12 | def register_env(): 13 | # Small hack to fix render_modes metadata 14 | from pybullet_envs.bullet.minitaur_gym_env import MinitaurBulletEnv 15 | MinitaurBulletEnv.metadata['render_modes'] = MinitaurBulletEnv.metadata.pop('render.modes') 16 | 17 | # register environment in gymnasium registry, not gym's 18 | gym.register( 19 | ENV_ID, entry_point=ENTRY, 20 | max_episode_steps=1000, reward_threshold=15.0, 21 | apply_api_compatibility=True, 22 | disable_env_checker=True, 23 | ) 24 | 25 | 26 | def unpack_batch_a2c(batch, net, last_val_gamma, device="cpu"): 27 | """ 28 | Convert batch into training tensors 29 | :param batch: 30 | :param net: 31 | :return: states variable, actions tensor, reference values variable 32 | """ 33 | states = [] 34 | actions = [] 35 | rewards = [] 36 | not_done_idx = [] 37 | last_states = [] 38 | for idx, exp in enumerate(batch): 39 | states.append(exp.state) 40 | actions.append(exp.action) 41 | rewards.append(exp.reward) 42 | if exp.last_state is not None: 43 | not_done_idx.append(idx) 44 | last_states.append(exp.last_state) 45 | states_v = ptan.agent.float32_preprocessor(states).to(device) 46 | actions_v = torch.FloatTensor(np.asarray(actions)).to(device) 47 | 48 | # handle rewards 49 | rewards_np = np.array(rewards, dtype=np.float32) 50 | if not_done_idx: 51 | last_states_v = ptan.agent.float32_preprocessor(last_states).to(device) 52 | last_vals_v = net(last_states_v)[2] 53 | last_vals_np = last_vals_v.data.cpu().numpy()[:, 0] 54 | rewards_np[not_done_idx] += last_val_gamma * last_vals_np 55 | 56 | ref_vals_v = torch.FloatTensor(rewards_np).to(device) 57 | return states_v, actions_v, ref_vals_v 58 | 59 | 60 | def unpack_batch_ddqn(batch, device="cpu"): 61 | states, actions, rewards, dones, last_states = [], [], [], [], [] 62 | for exp in batch: 63 | states.append(exp.state) 64 | actions.append(exp.action) 65 | rewards.append(exp.reward) 66 | dones.append(exp.last_state is None) 67 | if exp.last_state is None: 68 | last_states.append(exp.state) 69 | else: 70 | last_states.append(exp.last_state) 71 | states_v = ptan.agent.float32_preprocessor(states).to(device) 72 | actions_v = ptan.agent.float32_preprocessor(actions).to(device) 73 | rewards_v = ptan.agent.float32_preprocessor(rewards).to(device) 74 | last_states_v = ptan.agent.float32_preprocessor(last_states).to(device) 75 | dones_t = torch.BoolTensor(dones).to(device) 76 | return states_v, 
actions_v, rewards_v, dones_t, last_states_v 77 | -------------------------------------------------------------------------------- /Chapter15/requirements.txt: -------------------------------------------------------------------------------- 1 | pybullet==3.2.6 2 | gym==0.25.1 3 | numpy<2 -------------------------------------------------------------------------------- /Chapter16/02_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gymnasium as gym 4 | 5 | from lib import common, model, kfac 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 14 | parser.add_argument("-e", "--env", choices=list(common.ENV_PARAMS.keys()), 15 | default='cheetah', help="Environment name to use, default=cheehah") 16 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 17 | parser.add_argument("--acktr", default=False, action='store_true', help="Enable Acktr-specific tweaks") 18 | parser.add_argument("--mujoco", default=False, action='store_true', help="Enable MuJoCo, default=PyBullet") 19 | args = parser.parse_args() 20 | 21 | env_id = common.register_env(args.env, args.mujoco) 22 | env = gym.make(env_id, render_mode='rgb_array') 23 | if args.record is not None: 24 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 25 | 26 | net = model.ModelActor(env.observation_space.shape[0], env.action_space.shape[0]) 27 | if args.acktr: 28 | opt = kfac.KFACOptimizer(net) 29 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 30 | 31 | obs, _ = env.reset() 32 | total_reward = 0.0 33 | total_steps = 0 34 | while True: 35 | obs_v = torch.FloatTensor(obs) 36 | mu_v = net(obs_v) 37 | action = mu_v.squeeze(dim=0).data.numpy() 38 | action = np.clip(action, -1, 1) 39 | if np.isscalar(action): 40 | action = [action] 41 | obs, reward, done, is_tr, _ = env.step(action) 42 | total_reward += reward 43 | total_steps += 1 44 | if done or is_tr: 45 | break 46 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 47 | -------------------------------------------------------------------------------- /Chapter16/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter16/lib/__init__.py -------------------------------------------------------------------------------- /Chapter16/lib/trpo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | 5 | 6 | def get_flat_params_from(model): 7 | params = [] 8 | for param in model.parameters(): 9 | params.append(param.data.view(-1)) 10 | 11 | flat_params = torch.cat(params) 12 | return flat_params 13 | 14 | 15 | def set_flat_params_to(model, flat_params): 16 | prev_ind = 0 17 | for param in model.parameters(): 18 | flat_size = int(np.prod(list(param.size()))) 19 | param.data.copy_( 20 | flat_params[prev_ind:prev_ind + flat_size].view(param.size())) 21 | prev_ind += flat_size 22 | 23 | 24 | def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10, device="cpu"): 25 | x = torch.zeros(b.size()).to(device) 26 | r = b.clone() 27 | p = b.clone() 28 | rdotr = torch.dot(r, 
r) 29 | for i in range(nsteps): 30 | _Avp = Avp(p) 31 | alpha = rdotr / torch.dot(p, _Avp) 32 | x += alpha * p 33 | r -= alpha * _Avp 34 | new_rdotr = torch.dot(r, r) 35 | betta = new_rdotr / rdotr 36 | p = r + betta * p 37 | rdotr = new_rdotr 38 | if rdotr < residual_tol: 39 | break 40 | return x 41 | 42 | 43 | def linesearch(model, 44 | f, 45 | x, 46 | fullstep, 47 | expected_improve_rate, 48 | max_backtracks=10, 49 | accept_ratio=.1): 50 | fval = f().data 51 | for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)): 52 | xnew = x + fullstep * stepfrac 53 | set_flat_params_to(model, xnew) 54 | newfval = f().data 55 | actual_improve = fval - newfval 56 | expected_improve = expected_improve_rate * stepfrac 57 | ratio = actual_improve / expected_improve 58 | 59 | if ratio.item() > accept_ratio and actual_improve.item() > 0: 60 | return True, xnew 61 | return False, x 62 | 63 | 64 | def trpo_step(model, get_loss, get_kl, max_kl, damping, device="cpu"): 65 | loss = get_loss() 66 | grads = torch.autograd.grad(loss, model.parameters()) 67 | loss_grad = torch.cat([grad.view(-1) for grad in grads]).data 68 | 69 | def Fvp(v): 70 | kl = get_kl() 71 | kl = kl.mean() 72 | 73 | grads = torch.autograd.grad(kl, model.parameters(), create_graph=True) 74 | flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) 75 | 76 | v_v = v.clone().detach().to(device) 77 | kl_v = (flat_grad_kl * v_v).sum() 78 | grads = torch.autograd.grad(kl_v, model.parameters()) 79 | flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]).data 80 | 81 | return flat_grad_grad_kl + v * damping 82 | 83 | stepdir = conjugate_gradients(Fvp, -loss_grad, 10, device=device) 84 | 85 | shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True) 86 | 87 | lm = torch.sqrt(shs / max_kl) 88 | fullstep = stepdir / lm[0] 89 | 90 | neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True) 91 | 92 | prev_params = get_flat_params_from(model) 93 | success, new_params = linesearch(model, get_loss, prev_params, fullstep, 94 | neggdotstepdir / lm[0]) 95 | set_flat_params_to(model, new_params) 96 | 97 | return loss 98 | -------------------------------------------------------------------------------- /Chapter16/requirements.txt: -------------------------------------------------------------------------------- 1 | gymnasium[mujoco]==0.29.0 -------------------------------------------------------------------------------- /Chapter17/.gitignore: -------------------------------------------------------------------------------- 1 | res 2 | -------------------------------------------------------------------------------- /Chapter17/01_cartpole_es.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import time 4 | import numpy as np 5 | import typing as tt 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from torch.utils.tensorboard.writer import SummaryWriter 11 | 12 | from lib import common 13 | 14 | 15 | MAX_BATCH_EPISODES = 100 16 | MAX_BATCH_STEPS = 10000 17 | NOISE_STD = 0.001 18 | LEARNING_RATE = 0.001 19 | 20 | 21 | 22 | class Net(nn.Module): 23 | def __init__(self, obs_size: int, action_size: int): 24 | super(Net, self).__init__() 25 | self.net = nn.Sequential( 26 | nn.Linear(obs_size, 32), 27 | nn.ReLU(), 28 | nn.Linear(32, action_size), 29 | nn.Softmax(dim=1) 30 | ) 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | return self.net(x) 34 | 35 | 36 | def train_step(net: Net, batch_noise: 
tt.List[common.TNoise], batch_reward: tt.List[float], 37 | writer: SummaryWriter, step_idx: int): 38 | weighted_noise = None 39 | norm_reward = np.array(batch_reward) 40 | norm_reward -= np.mean(norm_reward) 41 | s = np.std(norm_reward) 42 | if abs(s) > 1e-6: 43 | norm_reward /= s 44 | 45 | for noise, reward in zip(batch_noise, norm_reward): 46 | if weighted_noise is None: 47 | weighted_noise = [reward * p_n for p_n in noise] 48 | else: 49 | for w_n, p_n in zip(weighted_noise, noise): 50 | w_n += reward * p_n 51 | m_updates = [] 52 | for p, p_update in zip(net.parameters(), weighted_noise): 53 | update = p_update / (len(batch_reward) * NOISE_STD) 54 | p.data += LEARNING_RATE * update 55 | m_updates.append(torch.norm(update)) 56 | writer.add_scalar("update_l2", np.mean(m_updates), step_idx) 57 | 58 | 59 | if __name__ == "__main__": 60 | writer = SummaryWriter(comment="-cartpole-es") 61 | env = gym.make("CartPole-v1") 62 | 63 | net = Net(env.observation_space.shape[0], env.action_space.n) 64 | print(net) 65 | 66 | step_idx = 0 67 | while True: 68 | t_start = time.time() 69 | batch_noise = [] 70 | batch_reward = [] 71 | batch_steps = 0 72 | for _ in range(MAX_BATCH_EPISODES): 73 | noise, neg_noise = common.sample_noise(net) 74 | batch_noise.append(noise) 75 | batch_noise.append(neg_noise) 76 | reward, steps = common.eval_with_noise( 77 | env, net, noise, NOISE_STD) 78 | batch_reward.append(reward) 79 | batch_steps += steps 80 | reward, steps = common.eval_with_noise( 81 | env, net, neg_noise, NOISE_STD) 82 | batch_reward.append(reward) 83 | batch_steps += steps 84 | if batch_steps > MAX_BATCH_STEPS: 85 | break 86 | 87 | step_idx += 1 88 | m_reward = float(np.mean(batch_reward)) 89 | if m_reward > 199: 90 | print("Solved in %d steps" % step_idx) 91 | break 92 | 93 | train_step(net, batch_noise, batch_reward, writer, step_idx) 94 | writer.add_scalar("reward_mean", m_reward, step_idx) 95 | writer.add_scalar("reward_std", np.std(batch_reward), step_idx) 96 | writer.add_scalar("reward_max", np.max(batch_reward), step_idx) 97 | writer.add_scalar("batch_episodes", len(batch_reward), step_idx) 98 | writer.add_scalar("batch_steps", batch_steps, step_idx) 99 | speed = batch_steps / (time.time() - t_start) 100 | writer.add_scalar("speed", speed, step_idx) 101 | print("%d: reward=%.2f, speed=%.2f f/s" % ( 102 | step_idx, m_reward, speed)) 103 | -------------------------------------------------------------------------------- /Chapter17/03_cartpole_ga.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import copy 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from torch.utils.tensorboard.writer import SummaryWriter 10 | from lib import common 11 | 12 | 13 | NOISE_STD = 0.01 14 | POPULATION_SIZE = 50 15 | PARENTS_COUNT = 10 16 | 17 | 18 | class Net(nn.Module): 19 | def __init__(self, obs_size: int, action_size: int): 20 | super(Net, self).__init__() 21 | self.net = nn.Sequential( 22 | nn.Linear(obs_size, 32), 23 | nn.ReLU(), 24 | nn.Linear(32, action_size), 25 | nn.Softmax(dim=1) 26 | ) 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | return self.net(x) 30 | 31 | 32 | def mutate_parent(net: Net) -> Net: 33 | new_net = copy.deepcopy(net) 34 | for p in new_net.parameters(): 35 | noise = np.random.normal(size=p.data.size()) 36 | noise_t = torch.FloatTensor(noise) 37 | p.data += NOISE_STD * noise_t 38 | return new_net 39 | 40 | 41 | if __name__ == "__main__": 42 | env = 
gym.make("CartPole-v1") 43 | writer = SummaryWriter(comment="-cartpole-ga") 44 | 45 | gen_idx = 0 46 | nets = [ 47 | Net(env.observation_space.shape[0], env.action_space.n) 48 | for _ in range(POPULATION_SIZE) 49 | ] 50 | population = [ 51 | (net, common.evaluate(env, net)) 52 | for net in nets 53 | ] 54 | while True: 55 | population.sort(key=lambda p: p[1], reverse=True) 56 | rewards = [p[1] for p in population[:PARENTS_COUNT]] 57 | reward_mean = np.mean(rewards) 58 | reward_max = np.max(rewards) 59 | reward_std = np.std(rewards) 60 | 61 | writer.add_scalar("reward_mean", reward_mean, gen_idx) 62 | writer.add_scalar("reward_std", reward_std, gen_idx) 63 | writer.add_scalar("reward_max", reward_max, gen_idx) 64 | print("%d: reward_mean=%.2f, reward_max=%.2f, reward_std=%.2f" % ( 65 | gen_idx, reward_mean, reward_max, reward_std)) 66 | if reward_mean > 199: 67 | print("Solved in %d steps" % gen_idx) 68 | break 69 | 70 | # generate next population 71 | prev_population = population 72 | population = [population[0]] 73 | for _ in range(POPULATION_SIZE-1): 74 | parent_idx = np.random.randint(0, PARENTS_COUNT) 75 | parent = prev_population[parent_idx][0] 76 | net = mutate_parent(parent) 77 | fitness = common.evaluate(env, net) 78 | population.append((net, fitness)) 79 | gen_idx += 1 80 | writer.close() 81 | -------------------------------------------------------------------------------- /Chapter17/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter17/lib/__init__.py -------------------------------------------------------------------------------- /Chapter17/lib/common.py: -------------------------------------------------------------------------------- 1 | import typing as tt 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | import gymnasium as gym 6 | 7 | 8 | TNoise = tt.List[torch.Tensor] 9 | 10 | 11 | def sample_noise( 12 | net: nn.Module, 13 | device: torch.device = torch.device('cpu') 14 | ) -> tt.Tuple[TNoise, TNoise]: 15 | pos = [] 16 | neg = [] 17 | for p in net.parameters(): 18 | noise = np.random.normal(size=p.data.size()) 19 | pos.append(torch.FloatTensor(noise).to(device)) 20 | neg.append(torch.FloatTensor(-noise).to(device)) 21 | return pos, neg 22 | 23 | 24 | def evaluate(env: gym.Env, net: nn.Module, get_max_action: bool = True, 25 | device: torch.device = torch.device('cpu')) -> tt.Tuple[float, int]: 26 | obs, _ = env.reset() 27 | reward = 0.0 28 | steps = 0 29 | while True: 30 | obs_v = torch.FloatTensor(np.expand_dims(obs, 0)).to(device) 31 | act_v = net(obs_v) 32 | if get_max_action: 33 | act = act_v.max(dim=1)[1].data.numpy()[0] 34 | else: 35 | act = act_v.data.cpu().numpy()[0] 36 | obs, r, done, is_tr, _ = env.step(act) 37 | reward += r 38 | steps += 1 39 | if done or is_tr: 40 | break 41 | return reward, steps 42 | 43 | 44 | def eval_with_noise(env: gym.Env, net: nn.Module, noise: TNoise, noise_std: float, 45 | get_max_action: bool = True, device: torch.device = torch.device("cpu") 46 | ) -> tt.Tuple[float, int]: 47 | old_params = net.state_dict() 48 | for p, p_n in zip(net.parameters(), noise): 49 | p.data += noise_std * p_n 50 | r, s = evaluate(env, net, get_max_action=get_max_action, device=device) 51 | net.load_state_dict(old_params) 52 | return r, s 53 | -------------------------------------------------------------------------------- 
/Chapter18/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter18/lib/__init__.py -------------------------------------------------------------------------------- /Chapter18/riverswim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import random 3 | import argparse 4 | import collections 5 | import typing as tt 6 | 7 | SEED = 2 8 | 9 | 10 | def get_action(state: int, total_states: int) -> int: 11 | """ 12 | Return action from the given state. Actions are selected randomly 13 | :param state: state we're currently in 14 | :return: 0 means left, 1 is right 15 | """ 16 | if state == 1: 17 | return 1 18 | if state == total_states: 19 | return 0 20 | return random.choice([0, 1]) 21 | 22 | 23 | def do_action(state: int, action: int) -> int: 24 | """ 25 | Simulate the action from the given state 26 | """ 27 | # left action always succeeds and brings us to the left 28 | if action == 0: 29 | return state-1 30 | 31 | if state == 1: 32 | return random.choices([1, 2], weights=[0.4, 0.6])[0] 33 | # the rest of states are the same 34 | delta = random.choices([-1, 0, 1], weights=[0.05, 0.6, 0.35])[0] 35 | return state + delta 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("-n", "--steps", type=int, default=100, help="Amount of steps to simulate, default=100") 41 | parser.add_argument("--episode-length", type=int, default=10, help="Limit of one episode, default=10") 42 | parser.add_argument("--seed", type=int, default=SEED, help="Seed to use, default=%d" % SEED) 43 | parser.add_argument("--env-len", type=int, default=6, help="Amount of states in the environment, default=6") 44 | args = parser.parse_args() 45 | random.seed(args.seed) 46 | 47 | states_count: tt.Counter[int] = collections.Counter() 48 | state = 1 49 | episode_step = 0 50 | 51 | for _ in range(args.steps): 52 | action = get_action(state, args.env_len) 53 | state = do_action(state, action) 54 | states_count[state] += 1 55 | episode_step += 1 56 | if episode_step == args.episode_length: 57 | state = 1 58 | episode_step = 0 59 | 60 | for state in range(1, args.env_len+1): 61 | print("%d:\t%d" % (state, states_count[state])) 62 | -------------------------------------------------------------------------------- /Chapter18/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter18/tests/__init__.py -------------------------------------------------------------------------------- /Chapter18/tests/test_ppo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from numpy import testing 3 | 4 | from lib import ppo 5 | 6 | 7 | class TestPPO(unittest.TestCase): 8 | def test_adv_ref(self): 9 | vals = [0, 0, 0, 0, 0] 10 | dones = [False, False, True, False, False] 11 | rewards = [1, 1, 1, 1, 1] 12 | 13 | adv_t, ref_t = ppo.calc_adv_ref(vals, dones, rewards, 1.0, 1.0) 14 | adv = adv_t.detach().numpy() 15 | ref = ref_t.detach().numpy() 16 | 17 | testing.assert_array_equal(ref, [3, 2, 1, 1]) 18 | testing.assert_array_equal(ref, adv) 19 | 20 | adv_t, ref_t = ppo.calc_adv_ref(vals, dones, rewards, 0.9, 
1.0) 21 | adv = adv_t.detach().numpy() 22 | ref = ref_t.detach().numpy() 23 | 24 | testing.assert_array_almost_equal(ref, [2.71, 1.9, 1., 1.]) 25 | testing.assert_array_almost_equal(ref, adv) 26 | 27 | 28 | pass 29 | 30 | -------------------------------------------------------------------------------- /Chapter19/.gitignore: -------------------------------------------------------------------------------- 1 | db* 2 | *.dat 3 | rec* 4 | -------------------------------------------------------------------------------- /Chapter19/01_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import pathlib 4 | 5 | import gymnasium as gym 6 | 7 | from lib import common, rlhf 8 | import ptan 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 18 | parser.add_argument("-e", "--env", default="SeaquestNoFrameskip-v4", 19 | help="Environment name to use, default=SeaquestNoFrameskip-v4") 20 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 21 | parser.add_argument("-n", type=int, help="Count of experiments to run") 22 | parser.add_argument("--reward", help="Path to reward model, if not given - use env reward") 23 | args = parser.parse_args() 24 | 25 | rounds = args.n if args.n is not None else 1 26 | logs = [] 27 | 28 | for round in range(rounds): 29 | video_folder = args.record 30 | if args.n is not None: 31 | video_folder += "-" + str(round) 32 | env = gym.make(args.env, render_mode='rgb_array') 33 | if args.record is not None: 34 | env = gym.wrappers.RecordVideo(env, video_folder=video_folder) 35 | if args.reward is not None: 36 | p = pathlib.Path(args.reward) 37 | env = rlhf.RewardModelWrapper(env, p, dev=torch.device("cpu")) 38 | env = ptan.common.wrappers.wrap_dqn(env, clip_reward=False) 39 | print(env) 40 | 41 | net = common.AtariA2C(env.observation_space.shape, env.action_space.n) 42 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 43 | 44 | obs, _ = env.reset() 45 | total_reward = 0.0 46 | total_steps = 0 47 | while True: 48 | obs_v = torch.FloatTensor(obs).unsqueeze(0) 49 | policy_v = net(obs_v)[0] 50 | policy_v = F.softmax(policy_v, dim=1) 51 | probs = policy_v[0].detach().cpu().numpy() 52 | action = np.random.choice(len(probs), p=probs) 53 | obs, reward, done, is_tr, _ = env.step(action) 54 | total_reward += reward 55 | total_steps += 1 56 | if done or is_tr: 57 | break 58 | if total_steps > 100000: 59 | break 60 | logs.append("%d: %d steps we got %.3f reward" % (round, total_steps, total_reward)) 61 | env.close() 62 | print("\n".join(logs)) 63 | -------------------------------------------------------------------------------- /Chapter19/02_label_ui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Web interface to label stored data 4 | """ 5 | import argparse 6 | 7 | from nicegui import ui 8 | import typing as tt 9 | 10 | from lib import ui_tools, rlhf 11 | 12 | db: tt.Optional[rlhf.Database] = None 13 | to_label: tt.List[rlhf.HumanLabel] = [] 14 | 15 | 16 | 17 | def label_ui(): 18 | with ui.splitter().classes("w-full") as splitter: 19 | with splitter.before: 20 | ui.label("List with data samples") 21 | with splitter.after: 22 | 
ui.label("Interface with gif") 23 | 24 | 25 | @ui.page(ui_tools.URL_ROOT, title="RLHF db overview") 26 | def view_root(): 27 | ui_tools.drawers(ui_tools.URL_ROOT) 28 | ui.label(f"DB path: {db.db_root}") 29 | ui.label(f"Trajectories: {len(db.paths)}") 30 | ui.label(f"Human Labels: {len(db.labels)}") 31 | 32 | 33 | @ui.page(ui_tools.URL_LABEL, title="RLHF label data") 34 | def view_label(): 35 | ui_tools.drawers(ui_tools.URL_LABEL) 36 | ui_tools.label_list_view(db, to_label) 37 | 38 | 39 | @ui.page(ui_tools.URL_DATA, title="RLHF existing data") 40 | def view_label(): 41 | ui_tools.drawers(ui_tools.URL_DATA) 42 | # make a copy, just in case 43 | labels_list = list(db.labels) 44 | ui_tools.label_list_view(db, labels_list, show_resample_list=False) 45 | 46 | 47 | if __name__ in {"__main__", "__mp_main__"}: 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("-d", "--db", required=True, help="DB path to label") 50 | args = parser.parse_args() 51 | 52 | db = rlhf.load_db(args.db) 53 | to_label = rlhf.sample_to_label(db) 54 | 55 | ui.run(host='0.0.0.0', port=8080, show=False) 56 | -------------------------------------------------------------------------------- /Chapter19/adhoc/obs_to_gif.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pickle 3 | import argparse 4 | import pathlib 5 | from PIL import Image 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-i", "--input", required=True, help="Input file name") 11 | parser.add_argument("-o", "--output", required=True, help="Output file name") 12 | args = parser.parse_args() 13 | 14 | dat = pathlib.Path(args.input).read_bytes() 15 | steps = pickle.loads(dat) 16 | print(len(steps)) 17 | sh = steps[0].obs.shape 18 | im = Image.new("RGB", (sh[1], sh[0]), (0, 0, 0)) 19 | images = [ 20 | Image.fromarray(step.obs) 21 | for step in steps 22 | ] 23 | im.save(args.output, save_all=True, append_images=images, 24 | duration=300, loop=0) 25 | -------------------------------------------------------------------------------- /Chapter19/adhoc/rw_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | sys.path.append(".") 4 | import gymnasium as gym 5 | import pathlib 6 | import torch 7 | import argparse 8 | 9 | from lib import rlhf 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-r", "--reward", required=True, 15 | help="Path to reward model file") 16 | parser.add_argument("-d", "--dev", default="cuda") 17 | args = parser.parse_args() 18 | dev = torch.device(args.dev) 19 | 20 | e = gym.make("SeaquestNoFrameskip-v4") 21 | p = pathlib.Path(args.reward) 22 | e = rlhf.RewardModelWrapper(e, p, dev) 23 | r, _ = e.reset() 24 | obs, r, is_done, is_tr, extra = e.step(0) 25 | print(obs.shape) 26 | print(r) 27 | -------------------------------------------------------------------------------- /Chapter19/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter19/lib/__init__.py -------------------------------------------------------------------------------- /Chapter19/requirements.txt: -------------------------------------------------------------------------------- 1 | nicegui==1.4.26 2 | 
-------------------------------------------------------------------------------- /Chapter20/.gitignore: -------------------------------------------------------------------------------- 1 | res 2 | -------------------------------------------------------------------------------- /Chapter20/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/lib/__init__.py -------------------------------------------------------------------------------- /Chapter20/play-mu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import time 4 | import argparse 5 | 6 | from lib import game 7 | from lib import muzero as mu 8 | 9 | import torch 10 | 11 | 12 | MCTS_SEARCHES = 10 13 | MCTS_BATCH_SIZE = 8 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("models", nargs='+', help="The list of models (at least 2) to play against each other") 19 | parser.add_argument("-r", "--rounds", type=int, default=2, help="Count of rounds to perform for every pair, default=2") 20 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable CUDA") 21 | args = parser.parse_args() 22 | device = torch.device("cuda" if args.cuda else "cpu") 23 | params = mu.MuZeroParams(dev=device) 24 | 25 | nets = [] 26 | for fname in args.models: 27 | net = mu.MuZeroModels(mu.OBS_SHAPE, game.GAME_COLS) 28 | d = torch.load(fname, map_location=lambda storage, loc: storage, weights_only=True) 29 | net.set_state_dict(d) 30 | net.to(device) 31 | nets.append((fname, net)) 32 | 33 | total_agent = {} 34 | total_pairs = {} 35 | 36 | for idx1, n1 in enumerate(nets): 37 | for idx2, n2 in enumerate(nets): 38 | if idx1 == idx2: 39 | continue 40 | wins, losses, draws = 0, 0, 0 41 | ts = time.time() 42 | for _ in range(args.rounds): 43 | r, _ = mu.play_game(n1[1], n2[1], params, temperature=0) 44 | if r > 0.5: 45 | wins += 1 46 | elif r < -0.5: 47 | losses += 1 48 | else: 49 | draws += 1 50 | speed_games = args.rounds / (time.time() - ts) 51 | name_1, name_2 = n1[0], n2[0] 52 | print("%s vs %s -> w=%d, l=%d, d=%d" % (name_1, name_2, wins, losses, draws)) 53 | sys.stderr.write("Speed %.2f games/s\n" % speed_games) 54 | sys.stdout.flush() 55 | game.update_counts(total_agent, name_1, (wins, losses, draws)) 56 | game.update_counts(total_agent, name_2, (losses, wins, draws)) 57 | game.update_counts(total_pairs, (name_1, name_2), (wins, losses, draws)) 58 | 59 | # leaderboard by total wins 60 | total_leaders = list(total_agent.items()) 61 | total_leaders.sort(reverse=True, key=lambda p: p[1][0]) 62 | 63 | print("Leaderboard:") 64 | for name, (wins, losses, draws) in total_leaders: 65 | print("%s: \t w=%d, l=%d, d=%d" % (name, wins, losses, draws)) 66 | -------------------------------------------------------------------------------- /Chapter20/play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import time 4 | import argparse 5 | 6 | from lib import game, model 7 | 8 | import torch 9 | 10 | 11 | MCTS_SEARCHES = 10 12 | MCTS_BATCH_SIZE = 8 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("models", nargs='+', help="The list of models (at least 2) to play against each other") 18 | 
parser.add_argument("-r", "--rounds", type=int, default=2, help="Count of rounds to perform for every pair") 19 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable CUDA") 20 | args = parser.parse_args() 21 | device = torch.device("cuda" if args.cuda else "cpu") 22 | 23 | nets = [] 24 | for fname in args.models: 25 | net = model.Net(model.OBS_SHAPE, game.GAME_COLS) 26 | net.load_state_dict(torch.load(fname, map_location=lambda storage, loc: storage, weights_only=True)) 27 | net = net.to(device) 28 | nets.append((fname, net)) 29 | 30 | total_agent = {} 31 | total_pairs = {} 32 | 33 | for idx1, n1 in enumerate(nets): 34 | for idx2, n2 in enumerate(nets): 35 | if idx1 == idx2: 36 | continue 37 | wins, losses, draws = 0, 0, 0 38 | ts = time.time() 39 | for _ in range(args.rounds): 40 | r, _ = model.play_game(mcts_stores=None, replay_buffer=None, net1=n1[1], net2=n2[1], steps_before_tau_0=0, 41 | mcts_searches=MCTS_SEARCHES, mcts_batch_size=MCTS_BATCH_SIZE, device=device) 42 | if r > 0.5: 43 | wins += 1 44 | elif r < -0.5: 45 | losses += 1 46 | else: 47 | draws += 1 48 | speed_games = args.rounds / (time.time() - ts) 49 | name_1, name_2 = n1[0], n2[0] 50 | print("%s vs %s -> w=%d, l=%d, d=%d" % (name_1, name_2, wins, losses, draws)) 51 | sys.stderr.write("Speed %.2f games/s\n" % speed_games) 52 | sys.stdout.flush() 53 | game.update_counts(total_agent, name_1, (wins, losses, draws)) 54 | game.update_counts(total_agent, name_2, (losses, wins, draws)) 55 | game.update_counts(total_pairs, (name_1, name_2), (wins, losses, draws)) 56 | 57 | # leaderboard by total wins 58 | total_leaders = list(total_agent.items()) 59 | total_leaders.sort(reverse=True, key=lambda p: p[1][0]) 60 | 61 | print("Leaderboard:") 62 | for name, (wins, losses, draws) in total_leaders: 63 | print("%s: \t w=%d, l=%d, d=%d" % (name, wins, losses, draws)) 64 | -------------------------------------------------------------------------------- /Chapter20/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tests/__init__.py -------------------------------------------------------------------------------- /Chapter20/tests/test_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from lib import game, model 5 | 6 | 7 | class TestEncoding(unittest.TestCase): 8 | def test_encoding(self): 9 | s = [[0, 1, 0], [0], [1, 1, 1], [], [1], [], []] 10 | batch_v = model.state_lists_to_batch([s, s], [game.PLAYER_BLACK, game.PLAYER_WHITE]) 11 | batch = batch_v.data.numpy() 12 | np.testing.assert_equal(batch, [ 13 | # black player's view 14 | [ 15 | # player 16 | [ 17 | [0, 0, 0, 0, 0, 0, 0], 18 | [0, 0, 0, 0, 0, 0, 0], 19 | [0, 0, 0, 0, 0, 0, 0], 20 | [0, 0, 1, 0, 0, 0, 0], 21 | [1, 0, 1, 0, 0, 0, 0], 22 | [0, 0, 1, 0, 1, 0, 0], 23 | ], 24 | # opponent 25 | [ 26 | [0, 0, 0, 0, 0, 0, 0], 27 | [0, 0, 0, 0, 0, 0, 0], 28 | [0, 0, 0, 0, 0, 0, 0], 29 | [1, 0, 0, 0, 0, 0, 0], 30 | [0, 0, 0, 0, 0, 0, 0], 31 | [1, 1, 0, 0, 0, 0, 0], 32 | ] 33 | ], 34 | # white player's view 35 | [ 36 | # player 37 | [ 38 | [0, 0, 0, 0, 0, 0, 0], 39 | [0, 0, 0, 0, 0, 0, 0], 40 | [0, 0, 0, 0, 0, 0, 0], 41 | [1, 0, 0, 0, 0, 0, 0], 42 | [0, 0, 0, 0, 0, 0, 0], 43 | [1, 1, 0, 0, 0, 0, 0], 44 | ], 45 | # opponent 46 | [ 47 | [0, 0, 0, 0, 0, 0, 0], 48 | [0, 0, 0, 0, 0, 0, 
0], 49 | [0, 0, 0, 0, 0, 0, 0], 50 | [0, 0, 1, 0, 0, 0, 0], 51 | [1, 0, 1, 0, 0, 0, 0], 52 | [0, 0, 1, 0, 1, 0, 0], 53 | ] 54 | ], 55 | ]) 56 | 57 | 58 | pass 59 | -------------------------------------------------------------------------------- /Chapter20/tests/test_muzero.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib import muzero as mu 3 | from lib import game 4 | 5 | 6 | def test_node(): 7 | n = mu.MCTSNode(0.5, first_plays=True) 8 | assert not n.is_expanded 9 | assert n.value == 0 10 | 11 | 12 | def test_mcts(): 13 | params = mu.MuZeroParams() 14 | models = mu.MuZeroModels(mu.OBS_SHAPE, game.GAME_COLS) 15 | min_max = mu.MinMaxStats() 16 | root = mu.run_mcts(0, game.INITIAL_STATE, params, models, 17 | search_rounds=10, min_max=min_max) 18 | assert root.is_expanded 19 | assert len(root.children) == game.GAME_COLS 20 | assert root.visit_count == 11 21 | 22 | 23 | def test_action_selection(): 24 | params = mu.MuZeroParams() 25 | root = mu.MCTSNode(0.5, first_plays=True) 26 | np.random.seed(10) 27 | v = root.select_action(1, params) 28 | assert v == 1 29 | for a in range(params.actions_count): 30 | root.children[a] = mu.MCTSNode(0.1, first_plays=False) 31 | root.children[0].visit_count = 100 32 | v = root.select_action(0.0000001, params) 33 | assert v == 0 34 | v = root.select_action(0.1, params) 35 | assert v == 0 36 | 37 | 38 | def test_play_game(): 39 | params = mu.MuZeroParams() 40 | models = mu.MuZeroModels(mu.OBS_SHAPE, game.GAME_COLS) 41 | reward, episode = mu.play_game( 42 | models, models, params, temperature=0, 43 | init_state=8516337133269602564 44 | ) 45 | assert episode -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/final-short.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tournament/2ed/final-short.png -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/final.csv: -------------------------------------------------------------------------------- 1 | model_index,wins 2 | 6,0.5321052631578947 3 | 8,0.5236842105263158 4 | 10,0.4268421052631579 5 | 18,0.3813157894736842 6 | 20,0.4676315789473684 7 | 25,0.40131578947368424 8 | 37,0.4713157894736842 9 | 38,0.44105263157894736 10 | 46,0.47578947368421054 11 | 49,0.4886842105263158 12 | 50,0.5081578947368421 13 | 57,0.48210526315789476 14 | 66,0.5565789473684211 15 | 68,0.5344736842105263 16 | 72,0.5613157894736842 17 | 78,0.44552631578947366 18 | 87,0.5673684210526316 19 | 88,0.5671052631578948 20 | 91,0.5931578947368421 21 | 105,0.5692105263157895 22 | -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/final_plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv(\"final.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | 
"metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "fig = plt.figure()\n", 35 | "ax1 = fig.add_subplot(111)\n", 36 | "\n", 37 | "ax1.plot(df.model_index, df.wins, color='black', linewidth=.8, linestyle='-')\n", 38 | "ax1.grid(True, axis='both')\n", 39 | "ax1.set_xlabel(\"Model index\")\n", 40 | "ax1.set_xlim(0, max(df.model_index))\n", 41 | "ax1.set_ylabel(\"Win ratio\")\n", 42 | "plt.savefig(\"final.svg\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python [default]", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.5.2" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/semi-common.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tournament/2ed/semi-common.png -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/semi-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tournament/2ed/semi-scores.png -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/final.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./play.py --cuda -r 10 saves/t1/best_088_39300.dat saves/t1/best_025_09900.dat saves/t1/best_022_08200.dat \ 4 | saves/t1/best_021_08100.dat saves/t1/best_009_03400.dat saves/t1/best_014_04700.dat saves/t1/best_008_02700.dat \ 5 | saves/t1/best_010_03500.dat saves/t1/best_029_11800.dat saves/t1/best_007_02300.dat \ 6 | saves/t2/best_069_41500.dat saves/t2/best_070_42200.dat saves/t2/best_066_38900.dat saves/t2/best_071_42600.dat \ 7 | saves/t2/best_059_33700.dat saves/t2/best_049_27500.dat saves/t2/best_068_41300.dat saves/t2/best_048_26700.dat \ 8 | saves/t2/best_058_32100.dat saves/t2/best_076_45200.dat > final.txt 9 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/final_plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv(\"final.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | 
"source": [ 34 | "fig = plt.figure()\n", 35 | "ax1 = fig.add_subplot(111)\n", 36 | "\n", 37 | "ax1.plot(df.model_index, df.wins, color='black', linewidth=.8, linestyle='-')\n", 38 | "ax1.grid(True, axis='both')\n", 39 | "ax1.set_xlabel(\"Model index\")\n", 40 | "ax1.set_xlim(0, max(df.model_index))\n", 41 | "ax1.set_ylabel(\"Win ratio\")\n", 42 | "plt.savefig(\"final.svg\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python [default]", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.5.2" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/mu-v1-wins.csv: -------------------------------------------------------------------------------- 1 | Wall time,Step,Value 2 | 0,5,0.6578947368421053 3 | 0,10,0.8921052631578947 4 | 0,15,0.7842105263157895 5 | 0,20,0.4263157894736842 6 | 0,25,0.41578947368421054 7 | 0,30,0.3868421052631579 8 | 0,35,0.4421052631578947 9 | 0,40,0.5368421052631579 10 | 0,45,0.55 11 | 0,50,0.5078947368421053 12 | 0,55,0.41578947368421054 13 | 0,60,0.42894736842105263 14 | 0,65,0.33157894736842103 15 | 0,70,0.37105263157894736 16 | 0,75,0.29210526315789476 17 | 0,80,0.32894736842105265 18 | 0,85,0.37105263157894736 19 | 0,90,0.3815789473684211 20 | 0,95,0.4131578947368421 21 | 0,100,0.48947368421052634 22 | 0,105,0.45 23 | 0,110,0.46842105263157896 24 | 0,115,0.46578947368421053 25 | 0,120,0.4394736842105263 26 | 0,125,0.48157894736842105 27 | 0,130,0.5184210526315789 28 | 0,135,0.5789473684210527 29 | 0,140,0.7026315789473684 30 | 0,145,0.7 31 | 0,150,0.718421052631579 32 | 0,155,0.7552631578947369 33 | 0,160,0.6210526315789474 34 | 0,165,0.6657894736842105 35 | 0,170,0.5421052631578948 36 | 0,175,0.4473684210526316 37 | 0,180,0.3447368421052632 38 | 0,185,0.29736842105263156 39 | 0,190,0.3815789473684211 40 | 0,195,0.48157894736842105 41 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/v1-wins.csv: -------------------------------------------------------------------------------- 1 | Wall time,Step,Value 2 | 0,1,0.45738636363636365 3 | 0,2,0.475 4 | 0,3,0.4727272727272727 5 | 0,4,0.5022727272727273 6 | 0,5,0.5136363636363637 7 | 0,6,0.5107954545454545 8 | 0,7,0.5568181818181818 9 | 0,8,0.5670454545454545 10 | 0,9,0.5738636363636364 11 | 0,10,0.5664772727272728 12 | 0,11,0.5522727272727272 13 | 0,12,0.5272727272727272 14 | 0,13,0.5375 15 | 0,14,0.5698863636363637 16 | 0,15,0.49772727272727274 17 | 0,16,0.55625 18 | 0,17,0.5261363636363636 19 | 0,18,0.5426136363636364 20 | 0,19,0.5443181818181818 21 | 0,20,0.5113636363636364 22 | 0,21,0.5778409090909091 23 | 0,22,0.58125 24 | 0,23,0.5380681818181818 25 | 0,24,0.5477272727272727 26 | 0,25,0.5818181818181818 27 | 0,26,0.5210227272727272 28 | 0,27,0.4875 29 | 0,28,0.5107954545454545 30 | 0,29,0.5630681818181819 31 | 0,30,0.5289772727272727 32 | 0,31,0.5102272727272728 33 | 0,32,0.5022727272727273 34 | 0,33,0.47556818181818183 35 | 0,34,0.46761363636363634 36 | 
0,35,0.48522727272727273 37 | 0,36,0.4630681818181818 38 | 0,37,0.44488636363636364 39 | 0,38,0.4653409090909091 40 | 0,39,0.4636363636363636 41 | 0,40,0.4596590909090909 42 | 0,41,0.45738636363636365 43 | 0,42,0.46988636363636366 44 | 0,43,0.4414772727272727 45 | 0,44,0.45340909090909093 46 | 0,45,0.48125 47 | 0,46,0.44829545454545455 48 | 0,47,0.48125 49 | 0,48,0.49318181818181817 50 | 0,49,0.4659090909090909 51 | 0,50,0.4511363636363636 52 | 0,51,0.48863636363636365 53 | 0,52,0.45340909090909093 54 | 0,53,0.4642045454545455 55 | 0,54,0.45454545454545453 56 | 0,55,0.46193181818181817 57 | 0,56,0.45397727272727273 58 | 0,57,0.4630681818181818 59 | 0,58,0.49886363636363634 60 | 0,59,0.5113636363636364 61 | 0,60,0.5051136363636364 62 | 0,61,0.49261363636363636 63 | 0,62,0.46704545454545454 64 | 0,63,0.49261363636363636 65 | 0,64,0.4732954545454545 66 | 0,65,0.4596590909090909 67 | 0,66,0.4744318181818182 68 | 0,67,0.4335227272727273 69 | 0,68,0.5119318181818182 70 | 0,69,0.4903409090909091 71 | 0,70,0.5164772727272727 72 | 0,71,0.48806818181818185 73 | 0,72,0.4653409090909091 74 | 0,73,0.49829545454545454 75 | 0,74,0.46136363636363636 76 | 0,75,0.4732954545454545 77 | 0,76,0.4903409090909091 78 | 0,77,0.4948863636363636 79 | 0,78,0.5426136363636364 80 | 0,79,0.5085227272727273 81 | 0,80,0.4965909090909091 82 | 0,81,0.5045454545454545 83 | 0,82,0.4778409090909091 84 | 0,83,0.48465909090909093 85 | 0,84,0.48295454545454547 86 | 0,85,0.48920454545454545 87 | 0,86,0.5551136363636363 88 | 0,87,0.5380681818181818 89 | 0,88,0.5835227272727272 90 | 0,89,0.4909090909090909 91 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/v2-wins.csv: -------------------------------------------------------------------------------- 1 | Wall time,Step,Value 2 | 0,1,0.4404494382022472 3 | 0,2,0.3938202247191011 4 | 0,3,0.40224719101123596 5 | 0,4,0.4449438202247191 6 | 0,5,0.3938202247191011 7 | 0,6,0.3921348314606742 8 | 0,7,0.4174157303370786 9 | 0,8,0.44719101123595506 10 | 0,9,0.47752808988764045 11 | 0,10,0.48707865168539327 12 | 0,11,0.4584269662921348 13 | 0,12,0.46348314606741575 14 | 0,13,0.46629213483146065 15 | 0,14,0.48707865168539327 16 | 0,15,0.4398876404494382 17 | 0,16,0.48820224719101124 18 | 0,17,0.48764044943820223 19 | 0,18,0.4696629213483146 20 | 0,19,0.4657303370786517 21 | 0,20,0.48089887640449436 22 | 0,21,0.4410112359550562 23 | 0,22,0.49887640449438203 24 | 0,23,0.4926966292134832 25 | 0,24,0.4820224719101124 26 | 0,25,0.5022471910112359 27 | 0,26,0.5056179775280899 28 | 0,27,0.48820224719101124 29 | 0,28,0.451123595505618 30 | 0,29,0.5112359550561798 31 | 0,30,0.5168539325842697 32 | 0,31,0.4348314606741573 33 | 0,32,0.5151685393258427 34 | 0,33,0.5337078651685393 35 | 0,34,0.5483146067415731 36 | 0,35,0.551123595505618 37 | 0,36,0.5275280898876404 38 | 0,37,0.4966292134831461 39 | 0,38,0.48820224719101124 40 | 0,39,0.4910112359550562 41 | 0,40,0.5280898876404494 42 | 0,41,0.5415730337078651 43 | 0,42,0.5157303370786517 44 | 0,43,0.4646067415730337 45 | 0,44,0.500561797752809 46 | 0,45,0.48707865168539327 47 | 0,46,0.48707865168539327 48 | 0,47,0.5028089887640449 49 | 0,48,0.552247191011236 50 | 0,49,0.5561797752808989 51 | 0,50,0.49382022471910114 52 | 0,51,0.4691011235955056 53 | 0,52,0.5162921348314606 54 | 0,53,0.48820224719101124 55 | 0,54,0.5123595505617977 56 | 0,55,0.451123595505618 57 | 0,56,0.5022471910112359 58 | 0,57,0.5292134831460674 59 | 0,58,0.5516853932584269 60 | 0,59,0.5612359550561797 61 | 
0,60,0.5044943820224719 62 | 0,61,0.4853932584269663 63 | 0,62,0.5073033707865169 64 | 0,63,0.4926966292134832 65 | 0,64,0.5404494382022472 66 | 0,65,0.5297752808988764 67 | 0,66,0.5646067415730337 68 | 0,67,0.5382022471910113 69 | 0,68,0.5561797752808989 70 | 0,69,0.5747191011235955 71 | 0,70,0.5707865168539326 72 | 0,71,0.5634831460674158 73 | 0,72,0.5376404494382022 74 | 0,73,0.4589887640449438 75 | 0,74,0.5162921348314606 76 | 0,75,0.5432584269662921 77 | 0,76,0.5516853932584269 78 | 0,77,0.501123595505618 79 | 0,78,0.5320224719101123 80 | 0,79,0.5129213483146068 81 | 0,80,0.5235955056179775 82 | 0,81,0.5455056179775281 83 | 0,82,0.548876404494382 84 | 0,83,0.4960674157303371 85 | 0,84,0.5140449438202247 86 | 0,85,0.547752808988764 87 | 0,86,0.47191011235955055 88 | 0,87,0.5134831460674157 89 | 0,88,0.5078651685393258 90 | 0,89,0.499438202247191 91 | 0,90,0.5348314606741573 92 | -------------------------------------------------------------------------------- /Chapter21/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | runs 3 | saves 4 | *.png 5 | .ipynb_checkpoints 6 | *.log 7 | -------------------------------------------------------------------------------- /Chapter21/csvs/2ed/README.md: -------------------------------------------------------------------------------- 1 | Description of produced test results 2 | 3 | # First results 4 | 5 | Test results from first models (paper versus zero-goal method). Solve tool run for 30k MCTS searches 6 | (but due to bug, actual amount of steps in some tests was much lower). 7 | 8 | ```` 9 | c2x2-paper-d200-t1.csv 10 | c2x2-zero-goal-d200-t1.csv 11 | c3x3-paper-d200-t1.csv 12 | c3x3-zero-goal-d200-no-decay.csv 13 | c3x3-zero-goal-d200-t1.csv 14 | ```` 15 | 16 | Analysis of the results are in notebook 17 | https://github.com/Shmuma/rl/blob/master/articles/01_rubic/nbs/01_paper-vs-zero_goal.ipynb 18 | 19 | # Fix of wrong steps 20 | 21 | Fixed with https://github.com/Shmuma/rl/commit/793aebc81b7bf323a8db930e8224521700383af5#diff-b9a7f0478383b0f6ad54ae87c8769b03 22 | 23 | ```` 24 | c2x2-paper-d200-t1-v2.csv 25 | c2x2-zero-goal-d200-t1-v2.csv 26 | c3x3-paper-d200-t1-v2.csv 27 | c3x3-zero-goal-d200-no-decay-v2.csv 28 | c3x3-zero-goal-d200-t1-v2.csv 29 | ```` 30 | 31 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube2x2_d3.txt: -------------------------------------------------------------------------------- 1 | 10,1,0 2 | 11,4,3 3 | 3,2,11 4 | 1,10,11 5 | 8,1,9 6 | 6,1,3 7 | 3,8,9 8 | 0,8,3 9 | 11,10,11 10 | 8,6,3 11 | 7,9,4 12 | 0,2,11 13 | 6,5,4 14 | 2,3,5 15 | 1,1,6 16 | 1,5,5 17 | 9,4,0 18 | 11,7,8 19 | 1,6,1 20 | 8,4,9 21 | 5,9,11 22 | 1,0,10 23 | 3,4,1 24 | 3,1,6 25 | 4,7,10 26 | 5,2,5 27 | 5,3,10 28 | 4,11,10 29 | 10,1,9 30 | 10,2,11 31 | 3,2,7 32 | 6,4,11 33 | 8,3,10 34 | 5,0,3 35 | 0,5,6 36 | 4,1,3 37 | 9,11,3 38 | 10,7,6 39 | 10,7,2 40 | 4,2,3 41 | 11,8,8 42 | 4,11,9 43 | 6,9,6 44 | 5,3,2 45 | 8,7,0 46 | 1,2,10 47 | 2,10,6 48 | 9,1,6 49 | 6,9,7 50 | 8,4,8 51 | 0,10,11 52 | 1,10,8 53 | 4,5,1 54 | 4,6,2 55 | 7,0,11 56 | 11,4,8 57 | 2,1,10 58 | 4,8,9 59 | 3,2,5 60 | 2,0,9 61 | 5,7,0 62 | 1,5,4 63 | 3,0,3 64 | 9,1,1 65 | 11,7,8 66 | 2,2,10 67 | 7,8,4 68 | 8,9,6 69 | 3,8,11 70 | 11,3,11 71 | 4,6,10 72 | 10,5,7 73 | 8,7,3 74 | 3,1,5 75 | 0,9,8 76 | 3,3,0 77 | 1,11,10 78 | 0,3,1 79 | 0,5,1 80 | 8,3,4 81 | 10,7,3 82 | 8,11,9 83 | 9,7,3 84 | 7,6,3 85 | 1,1,10 86 | 6,5,6 87 | 6,7,11 88 | 0,10,10 89 | 10,1,0 90 | 6,11,1 91 | 3,3,3 92 | 
8,7,2 93 | 6,2,4 94 | 7,3,1 95 | 7,8,1 96 | 0,10,8 97 | 0,1,3 98 | 2,6,7 99 | 7,3,6 100 | 0,2,6 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube2x2_d4.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11 2 | 4,3,3,2 3 | 11,1,10,11 4 | 8,1,9,6 5 | 0,0,1,3 6 | 3,8,9,0 7 | 8,3,11,10 8 | 11,8,6,3 9 | 7,9,4,0 10 | 2,11,6,5 11 | 4,2,3,5 12 | 1,1,6,1 13 | 5,5,9,4 14 | 0,11,7,8 15 | 1,6,1,8 16 | 4,9,5,9 17 | 3,11,1,0 18 | 10,3,4,1 19 | 3,1,6,4 20 | 7,10,5,2 21 | 5,5,3,10 22 | 4,11,10,10 23 | 1,9,10,2 24 | 8,11,3,2 25 | 7,6,4,11 26 | 8,3,10,5 27 | 0,3,0,5 28 | 6,4,1,3 29 | 9,11,3,10 30 | 7,6,10,7 31 | 2,4,2,3 32 | 11,8,8,4 33 | 11,9,6,9 34 | 6,5,3,2 35 | 8,7,0,1 36 | 2,10,2,10 37 | 6,9,1,6 38 | 6,9,7,8 39 | 4,8,0,10 40 | 11,1,10,8 41 | 4,5,1,4 42 | 6,2,7,0 43 | 11,11,4,8 44 | 2,1,10,10 45 | 8,9,2,5 46 | 2,0,9,5 47 | 7,0,1,5 48 | 4,3,0,3 49 | 9,1,1,11 50 | 7,8,10,7 51 | 8,4,8,9 52 | 6,3,8,11 53 | 11,3,11,4 54 | 6,10,10,5 55 | 7,8,7,3 56 | 3,1,5,0 57 | 9,8,3,3 58 | 0,1,11,10 59 | 0,3,1,0 60 | 5,1,8,3 61 | 4,7,3,8 62 | 2,11,9,9 63 | 7,3,7,6 64 | 3,1,1,10 65 | 6,5,6,6 66 | 7,11,0,10 67 | 10,10,1,0 68 | 6,11,1,3 69 | 3,3,8,7 70 | 2,6,2,4 71 | 7,3,1,8 72 | 1,0,10,8 73 | 0,1,3,2 74 | 6,7,7,3 75 | 6,2,6,6 76 | 4,7,4,6 77 | 11,11,8,10 78 | 11,7,2,3 79 | 4,3,0,9 80 | 11,8,0,11 81 | 5,0,0,9 82 | 7,8,8,0 83 | 8,1,2,1 84 | 9,1,10,3 85 | 6,1,9,9 86 | 9,0,9,1 87 | 6,10,9,9 88 | 8,5,4,3 89 | 10,11,3,4 90 | 6,2,10,10 91 | 4,7,5,1 92 | 0,7,9,9 93 | 1,1,8,3 94 | 8,4,2,5 95 | 1,3,5,4 96 | 2,7,8,11 97 | 4,9,10,8 98 | 0,10,8,4 99 | 10,1,2,4 100 | 1,1,11,8 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube2x2_d5.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11,4 2 | 3,3,2,11,1 3 | 10,11,8,1,9 4 | 6,1,3,3,8 5 | 9,0,8,3,11 6 | 10,11,8,6,3 7 | 7,9,4,0,2 8 | 11,6,5,4,2 9 | 3,5,1,1,6 10 | 1,5,5,9,4 11 | 0,11,7,8,1 12 | 6,1,8,4,9 13 | 5,9,11,1,0 14 | 10,3,4,1,3 15 | 1,6,4,7,10 16 | 5,2,5,5,3 17 | 10,11,10,10,1 18 | 9,10,2,11,3 19 | 2,7,6,4,11 20 | 8,3,10,5,0 21 | 3,0,5,6,4 22 | 1,3,11,3,10 23 | 7,6,10,7,2 24 | 4,2,3,11,8 25 | 8,4,11,9,6 26 | 9,6,5,3,2 27 | 8,7,0,1,2 28 | 10,2,10,6,9 29 | 1,6,6,9,7 30 | 8,4,8,0,10 31 | 11,1,10,8,4 32 | 10,5,1,4,6 33 | 2,7,0,11,11 34 | 4,8,8,1,10 35 | 4,8,9,2,5 36 | 2,0,9,5,7 37 | 0,1,5,4,3 38 | 0,3,1,1,11 39 | 7,8,10,7,8 40 | 2,4,8,9,6 41 | 3,8,11,11,3 42 | 11,4,6,10,10 43 | 5,7,8,7,3 44 | 3,1,5,0,9 45 | 8,3,3,0,1 46 | 11,10,0,3,1 47 | 0,5,1,8,3 48 | 4,7,3,8,11 49 | 9,9,7,3,7 50 | 6,3,1,1,10 51 | 6,5,6,6,7 52 | 11,0,10,10,10 53 | 1,0,11,1,3 54 | 3,3,8,7,2 55 | 6,2,4,7,3 56 | 1,8,1,0,10 57 | 8,0,1,3,2 58 | 6,7,7,3,6 59 | 0,2,6,6,4 60 | 7,4,6,11,11 61 | 8,10,11,7,2 62 | 3,4,3,0,9 63 | 11,8,0,11,0 64 | 0,9,7,8,8 65 | 2,0,8,1,2 66 | 1,9,1,10,3 67 | 6,1,9,9,9 68 | 0,9,1,6,10 69 | 9,9,8,5,4 70 | 3,10,11,3,4 71 | 6,2,10,10,7 72 | 5,1,0,7,9 73 | 9,1,1,8,3 74 | 8,4,2,5,1 75 | 3,5,4,2,7 76 | 8,11,4,9,10 77 | 8,0,10,8,4 78 | 10,1,2,4,1 79 | 1,11,8,4,4 80 | 9,11,3,10,10 81 | 4,8,7,4,0 82 | 1,10,6,4,0 83 | 0,5,2,10,2 84 | 11,7,8,11,6 85 | 8,0,1,1,11 86 | 2,0,5,9,8 87 | 2,6,2,0,4 88 | 5,0,5,3,10 89 | 3,10,1,5,8 90 | 6,9,11,2,3 91 | 2,2,6,2,11 92 | 5,6,10,11,3 93 | 4,2,11,1,6 94 | 0,7,3,3,7 95 | 5,4,3,3,0 96 | 10,3,6,5,4 97 | 1,4,5,10,8 98 | 6,10,8,5,0 99 | 1,4,2,9,4 100 | 0,1,9,6,5 101 | -------------------------------------------------------------------------------- 
/Chapter21/cubes_tests/2ed/cube2x2_d6.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11,4,3 2 | 3,2,11,1,10,11 3 | 8,1,9,6,1,3 4 | 3,8,9,0,8,3 5 | 11,10,11,8,6,3 6 | 7,9,4,0,2,11 7 | 6,5,4,2,3,5 8 | 1,1,6,1,5,5 9 | 9,4,0,11,7,8 10 | 1,6,1,8,4,9 11 | 5,9,11,1,0,10 12 | 3,4,1,3,1,6 13 | 4,7,10,5,2,5 14 | 5,3,10,11,10,10 15 | 1,9,10,2,11,3 16 | 2,7,6,4,11,8 17 | 3,10,5,0,3,0 18 | 5,6,4,1,3,11 19 | 5,3,10,7,6,10 20 | 7,2,4,2,3,11 21 | 8,8,4,11,9,6 22 | 9,6,5,3,2,7 23 | 1,0,1,2,10,2 24 | 10,6,9,1,6,6 25 | 9,7,8,4,8,0 26 | 10,11,1,10,8,4 27 | 10,5,1,4,6,2 28 | 7,0,11,11,4,8 29 | 2,1,10,10,8,9 30 | 3,2,5,2,0,9 31 | 5,7,0,1,5,4 32 | 3,0,3,1,1,11 33 | 7,8,10,7,8,4 34 | 8,9,6,3,8,11 35 | 11,3,11,4,6,10 36 | 10,5,7,8,7,3 37 | 3,1,5,0,9,8 38 | 3,3,0,1,11,10 39 | 0,3,1,0,5,1 40 | 8,3,4,7,3,8 41 | 2,11,9,9,7,3 42 | 7,6,3,1,1,10 43 | 6,5,6,6,7,11 44 | 0,10,10,10,1,0 45 | 6,11,1,3,3,3 46 | 8,7,2,6,2,4 47 | 7,3,1,8,1,0 48 | 10,8,0,1,3,2 49 | 6,7,7,3,6,2 50 | 6,6,4,7,4,6 51 | 11,11,8,10,11,7 52 | 2,3,4,3,0,9 53 | 11,8,0,11,0,0 54 | 9,7,8,8,0,8 55 | 1,2,1,9,1,10 56 | 3,6,1,9,9,9 57 | 0,9,1,6,10,9 58 | 9,8,5,4,3,10 59 | 11,3,4,6,2,10 60 | 10,7,5,1,0,7 61 | 9,9,1,1,8,3 62 | 8,4,2,5,1,3 63 | 5,4,2,7,8,11 64 | 4,9,10,8,0,10 65 | 8,4,1,2,4,1 66 | 1,11,8,4,4,9 67 | 3,11,3,10,10,8 68 | 7,4,0,1,10,6 69 | 4,0,0,5,2,10 70 | 4,2,11,7,8,11 71 | 6,8,0,1,1,11 72 | 2,0,5,9,8,6 73 | 2,0,4,5,0,5 74 | 3,10,3,10,1,5 75 | 8,6,9,11,2,3 76 | 2,2,6,2,11,6 77 | 10,11,3,4,2,11 78 | 1,6,7,3,3,7 79 | 5,4,3,3,0,10 80 | 3,6,5,4,1,4 81 | 5,10,8,6,10,8 82 | 5,0,1,4,2,9 83 | 4,0,1,9,6,5 84 | 11,6,9,8,1,6 85 | 9,4,0,11,6,8 86 | 8,10,11,11,11,10 87 | 3,5,6,1,10,5 88 | 9,5,10,1,11,4 89 | 8,4,6,5,6,11 90 | 4,8,3,6,10,6 91 | 10,11,2,9,9,4 92 | 6,8,0,4,4,3 93 | 6,9,9,10,5,7 94 | 7,7,10,3,8,7 95 | 11,2,10,1,4,8 96 | 10,10,9,5,1,3 97 | 10,3,3,2,0,0 98 | 3,7,9,1,6,10 99 | 9,11,11,6,7,6 100 | 3,2,10,11,0,1 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube3x3_d10.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11,4,3,3,2,11,1 2 | 10,11,8,1,9,6,0,0,1,3 3 | 3,8,9,0,8,3,11,10,11,8 4 | 6,3,7,9,4,0,2,11,6,5 5 | 4,2,3,5,1,1,6,1,5,5 6 | 9,4,0,11,7,8,1,6,1,8 7 | 4,10,9,5,9,3,11,1,0,10 8 | 3,4,1,3,1,6,4,7,10,5 9 | 2,5,5,3,10,4,11,10,10,1 10 | 9,10,2,8,11,3,2,7,6,4 11 | 10,11,8,3,10,5,0,3,0,5 12 | 6,4,1,3,9,11,5,3,10,7 13 | 6,10,7,2,4,2,3,11,8,8 14 | 4,11,9,6,9,6,5,3,2,8 15 | 7,1,0,1,2,10,2,10,6,9 16 | 1,6,6,9,7,8,4,8,0,10 17 | 11,1,10,8,4,10,5,1,4,6 18 | 2,7,0,11,11,4,8,2,8,1 19 | 10,4,10,8,9,3,2,5,2,8 20 | 8,0,9,5,7,0,1,5,4,3 21 | 0,3,9,1,1,11,7,1,8,2 22 | 2,10,7,8,2,4,8,9,6,3 23 | 8,11,11,3,11,4,6,10,10,5 24 | 7,8,7,1,3,3,1,5,0,9 25 | 8,3,9,3,0,1,11,10,0,3 26 | 1,0,5,1,8,3,4,10,7,3 27 | 8,2,11,9,9,7,3,7,6,3 28 | 1,1,10,6,5,6,6,7,11,0 29 | 10,10,10,1,0,6,11,5,1,3 30 | 3,3,8,7,2,6,2,4,7,3 31 | 1,7,8,1,0,10,8,0,1,3 32 | 2,6,7,7,3,6,0,2,6,0 33 | 6,4,7,4,6,11,11,8,10,11 34 | 7,2,3,4,3,0,9,11,8,0 35 | 11,5,0,0,9,7,8,8,2,0 36 | 8,1,2,1,9,1,10,3,6,1 37 | 9,3,9,9,0,9,1,6,10,9 38 | 9,8,5,4,3,10,11,5,3,4 39 | 6,2,10,10,4,7,5,1,0,7 40 | 9,9,1,1,8,3,8,4,2,5 41 | 1,3,5,4,2,7,8,11,4,9 42 | 10,8,0,10,8,4,10,1,2,4 43 | 1,1,11,8,2,4,4,9,3,11 44 | 5,3,10,10,4,8,7,4,0,1 45 | 10,6,4,0,0,5,2,10,4,2 46 | 11,7,8,11,6,8,0,1,1,11 47 | 2,8,0,5,9,8,2,6,2,0 48 | 4,5,0,5,3,10,3,10,1,5 49 | 8,6,9,11,2,3,2,2,6,0 50 | 2,11,5,6,10,11,3,4,2,11 51 | 1,6,0,7,3,3,7,5,4,3 52 | 3,0,10,3,6,5,4,1,4,5 53 | 10,8,6,10,8,5,0,1,4,2 54 | 
9,4,0,1,9,6,5,11,5,6 55 | 9,8,1,6,9,3,4,0,11,6 56 | 0,8,8,10,11,11,11,10,3,5 57 | 6,1,10,5,9,5,10,1,11,4 58 | 8,4,10,6,5,6,11,4,8,2 59 | 3,6,10,6,10,11,2,9,9,4 60 | 6,8,0,4,4,3,6,9,9,10 61 | 5,7,7,7,10,3,8,7,11,2 62 | 10,1,4,8,10,10,9,5,1,3 63 | 10,4,3,3,2,0,0,3,7,9 64 | 1,7,6,10,9,3,11,11,6,7 65 | 6,3,2,10,11,0,1,6,3,2 66 | 11,8,7,0,8,3,1,7,2,7 67 | 10,8,8,9,5,7,9,11,8,6 68 | 8,7,2,11,7,7,4,3,10,4 69 | 8,7,10,3,4,7,1,11,4,3 70 | 4,5,5,8,1,2,2,3,6,11 71 | 2,11,3,1,6,6,5,8,7,6 72 | 0,3,6,6,9,11,0,9,6,7 73 | 0,5,4,6,6,8,11,11,8,9 74 | 3,7,3,4,6,7,0,6,5,10 75 | 10,6,11,2,7,2,9,8,0,6 76 | 9,9,10,0,1,10,6,2,7,2 77 | 0,4,6,5,3,7,5,5,6,4 78 | 6,4,1,7,0,11,8,0,5,3 79 | 10,1,10,0,0,3,3,0,9,2 80 | 3,2,7,10,1,9,3,7,11,4 81 | 5,2,9,9,11,11,1,2,4,1 82 | 9,0,4,9,10,6,6,11,3,1 83 | 9,11,10,3,1,11,4,10,9,1 84 | 9,0,5,8,6,10,5,1,8,10 85 | 5,0,6,7,1,6,5,10,7,11 86 | 2,6,2,11,8,10,4,9,8,7 87 | 7,6,11,9,4,5,3,1,4,7 88 | 3,7,9,9,10,6,5,0,7,5 89 | 2,7,3,5,4,5,4,9,11,4 90 | 8,0,8,3,1,3,11,6,7,8 91 | 3,11,7,10,11,7,7,0,1,4 92 | 3,6,11,3,4,10,9,5,7,8 93 | 8,5,6,11,8,5,5,11,7,4 94 | 4,4,3,1,11,3,5,1,11,8 95 | 11,2,3,3,11,7,4,11,9,8 96 | 9,4,1,3,4,3,5,2,4,0 97 | 11,8,2,4,0,0,8,4,11,2 98 | 10,7,1,0,9,4,7,7,7,5 99 | 2,0,4,7,1,1,6,7,1,9 100 | 10,10,0,2,2,9,4,1,3,1 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube3x3_d3.txt: -------------------------------------------------------------------------------- 1 | 10,1,0 2 | 11,4,3 3 | 3,2,11 4 | 1,10,11 5 | 8,1,9 6 | 6,0,0 7 | 1,3,3 8 | 8,9,0 9 | 8,3,11 10 | 10,11,8 11 | 6,3,7 12 | 9,4,0 13 | 2,11,6 14 | 5,4,2 15 | 3,5,1 16 | 1,6,1 17 | 5,5,9 18 | 4,0,11 19 | 7,8,1 20 | 6,1,8 21 | 4,10,9 22 | 5,9,3 23 | 11,1,0 24 | 10,3,4 25 | 1,3,1 26 | 6,4,7 27 | 10,5,2 28 | 5,5,3 29 | 10,4,11 30 | 10,10,1 31 | 9,10,2 32 | 8,11,3 33 | 2,7,6 34 | 4,10,11 35 | 8,3,10 36 | 5,0,3 37 | 0,5,6 38 | 4,1,3 39 | 9,11,5 40 | 3,10,7 41 | 6,10,7 42 | 2,4,2 43 | 3,11,8 44 | 8,4,11 45 | 9,6,9 46 | 6,5,3 47 | 2,8,7 48 | 1,0,1 49 | 2,10,2 50 | 10,6,9 51 | 1,6,6 52 | 9,7,8 53 | 4,8,0 54 | 10,11,1 55 | 10,8,4 56 | 10,5,1 57 | 4,6,2 58 | 7,0,11 59 | 11,4,8 60 | 2,8,1 61 | 10,4,10 62 | 8,9,3 63 | 2,5,2 64 | 8,8,0 65 | 9,5,7 66 | 0,1,5 67 | 4,3,0 68 | 3,9,1 69 | 1,11,7 70 | 1,8,2 71 | 2,10,7 72 | 8,2,4 73 | 8,9,6 74 | 3,8,11 75 | 11,3,11 76 | 4,6,10 77 | 10,5,7 78 | 8,7,1 79 | 3,3,1 80 | 5,0,9 81 | 8,3,9 82 | 3,0,1 83 | 11,10,0 84 | 3,1,0 85 | 5,1,8 86 | 3,4,10 87 | 7,3,8 88 | 2,11,9 89 | 9,7,3 90 | 7,6,3 91 | 1,1,10 92 | 6,5,6 93 | 6,7,11 94 | 0,10,10 95 | 10,1,0 96 | 6,11,5 97 | 1,3,3 98 | 3,8,7 99 | 2,6,2 100 | 4,7,3 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube3x3_d3_norepeat.txt: -------------------------------------------------------------------------------- 1 | 10,1,0 2 | 11,4,3 3 | 3,2,11 4 | 1,10,11 5 | 8,1,9 6 | 6,1,3 7 | 3,8,9 8 | 0,8,3 9 | 11,10,11 10 | 8,6,3 11 | 7,9,4 12 | 0,2,11 13 | 6,5,4 14 | 2,3,5 15 | 1,1,6 16 | 1,5,5 17 | 9,4,0 18 | 11,7,8 19 | 1,6,1 20 | 8,4,9 21 | 5,9,11 22 | 1,0,10 23 | 3,4,1 24 | 3,1,6 25 | 4,7,10 26 | 5,2,5 27 | 5,3,10 28 | 4,11,10 29 | 10,1,9 30 | 10,2,11 31 | 3,2,7 32 | 6,4,11 33 | 8,3,10 34 | 5,0,3 35 | 0,5,6 36 | 4,1,3 37 | 9,11,3 38 | 10,7,6 39 | 10,7,2 40 | 4,2,3 41 | 11,8,8 42 | 4,11,9 43 | 6,9,6 44 | 5,3,2 45 | 8,7,0 46 | 1,2,10 47 | 2,10,6 48 | 9,1,6 49 | 6,9,7 50 | 8,4,8 51 | 0,10,11 52 | 1,10,8 53 | 4,5,1 54 | 4,6,2 55 | 7,0,11 56 | 11,4,8 57 | 2,1,10 58 | 4,8,9 59 | 3,2,5 60 | 2,0,9 61 | 5,7,0 62 | 1,5,4 63 | 3,0,3 64 
| 9,1,1 65 | 11,7,8 66 | 2,2,10 67 | 7,8,4 68 | 8,9,6 69 | 3,8,11 70 | 11,3,11 71 | 4,6,10 72 | 10,5,7 73 | 8,7,3 74 | 3,1,5 75 | 0,9,8 76 | 3,3,0 77 | 1,11,10 78 | 0,3,1 79 | 0,5,1 80 | 8,3,4 81 | 10,7,3 82 | 8,11,9 83 | 9,7,3 84 | 7,6,3 85 | 1,1,10 86 | 6,5,6 87 | 6,7,11 88 | 0,10,10 89 | 10,1,0 90 | 6,11,1 91 | 3,3,3 92 | 8,7,2 93 | 6,2,4 94 | 7,3,1 95 | 7,8,1 96 | 0,10,8 97 | 0,1,3 98 | 2,6,7 99 | 7,3,6 100 | 0,2,6 101 | -------------------------------------------------------------------------------- /Chapter21/gen_cubes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool to generate test set for solver 4 | """ 5 | import argparse 6 | import random 7 | 8 | from libcube import cubes 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-e", "--env", required=True, help="Type of env to train, supported types=%s" % cubes.names()) 14 | parser.add_argument("-n", "--number", type=int, default=10, help="Amount of scramble rounds, default=10") 15 | parser.add_argument("-d", "--depth", type=int, default=100, help="Scramble depth, default=10") 16 | parser.add_argument("--seed", type=int, default=42, help="Seed to use, if zero, no seed used. default=42") 17 | parser.add_argument("-o", "--output", required=True, help="Output file to produce") 18 | args = parser.parse_args() 19 | 20 | if args.seed: 21 | random.seed(args.seed) 22 | 23 | cube_env = cubes.get(args.env) 24 | assert isinstance(cube_env, cubes.CubeEnv) 25 | 26 | with open(args.output, "w+t", encoding="utf-8") as fd_out: 27 | for _ in range(args.number): 28 | s = cube_env.initial_state 29 | path = [] 30 | prev_a = None 31 | for _ in range(args.depth): 32 | a = cube_env.sample_action(prev_action=prev_a) 33 | path.append(a.value) 34 | s = cube_env.transform(s, a) 35 | prev_a = a 36 | fd_out.write(",".join(map(str, path)) + "\n") 37 | -------------------------------------------------------------------------------- /Chapter21/ini/README.md: -------------------------------------------------------------------------------- 1 | Configuration files with training/testing settings. 2 | 3 | # cube2x2-paper-d200 4 | Method from the paper applied to 2x2 cube with scrambling depth 200 during the training. 5 | 6 | Best policy is achieved after 8k batches (3.5 hours on 1080Ti), after 10k batches training diverges. 
7 | 8 | # cube2x2-zero-goal-d200 9 | The same as in paper, but target value for goal states set to zero, which helps convergence a lot 10 | -------------------------------------------------------------------------------- /Chapter21/ini/cube2x2-paper-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube2x2 3 | run_name=paper 4 | 5 | [train] 6 | cuda=True 7 | lr=1e-5 8 | batch_size=10000 9 | scramble_depth=200 10 | report_batches=10 11 | checkpoint_batches=100 12 | lr_decay=True 13 | lr_decay_gamma=0.95 14 | lr_decay_batches=1000 15 | -------------------------------------------------------------------------------- /Chapter21/ini/cube2x2-zero-goal-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube2x2 3 | run_name=zero-goal 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=4000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=5e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.99 26 | ; interval between decays 27 | lr_decay_batches=100 28 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-paper-d20.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=paper 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=paper 8 | ; limit of batches to train (train iterations) 9 | max_batches=100000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; batches to keep in scramble buffer 17 | scramble_buffer_batches=10 18 | ; after how many iterations push fresh batch into the scramble buffer 19 | push_scramble_buffer_iters=100 20 | ; how deeply to scramble cube 21 | scramble_depth=20 22 | ; how frequently to report training progress 23 | report_batches=10 24 | ; how frequently to save model (if commented out, won't be saved) 25 | ;checkpoint_batches=100 26 | ; enables LR decay 27 | lr_decay=True 28 | ; LR decay gamma (if enabled) 29 | lr_decay_gamma=0.95 30 | ; interval between decays 31 | lr_decay_batches=1000 32 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-paper-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=paper 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=paper 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=4000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | ;checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 
24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.95 26 | ; interval between decays 27 | lr_decay_batches=1000 28 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d20-noweight.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal-noweight 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | max_batches=100000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; batches to keep in scramble buffer 17 | scramble_buffer_batches=10 18 | ; after how many iterations push fresh batch into the scramble buffer 19 | push_scramble_buffer_iters=100 20 | ; how deeply to scramble cube 21 | scramble_depth=20 22 | ; how frequently to report training progress 23 | report_batches=10 24 | ; how frequently to save model (if commented out, won't be saved) 25 | checkpoint_batches=1000 26 | ; enables LR decay 27 | lr_decay=False 28 | ; LR decay gamma (if enabled) 29 | lr_decay_gamma=0.95 30 | ; interval between decays 31 | lr_decay_batches=1000 32 | ; perform weighting of training samples inverse by scramble depth, default=True 33 | weight_samples=False 34 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d20.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | max_batches=100000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-4 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; batches to keep in scramble buffer 17 | scramble_buffer_batches=10 18 | ; after how many iterations push fresh batch into the scramble buffer 19 | push_scramble_buffer_iters=100 20 | ; how deeply to scramble cube 21 | scramble_depth=20 22 | ; how frequently to report training progress 23 | report_batches=10 24 | ; how frequently to save model (if commented out, won't be saved) 25 | ;checkpoint_batches=100 26 | ; enables LR decay 27 | lr_decay=True 28 | ; LR decay gamma (if enabled) 29 | lr_decay_gamma=0.95 30 | ; interval between decays 31 | lr_decay_batches=1000 32 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d200-slow-decay.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal-slow-decay 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=40000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=5e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.99 26 | ; interval between decays 27 | 
lr_decay_batches=200 28 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=40000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=5e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.95 26 | ; interval between decays 27 | lr_decay_batches=200 28 | -------------------------------------------------------------------------------- /Chapter21/libcube/conf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import configparser 3 | 4 | 5 | class Config: 6 | """ 7 | Configuration for train/test/solve 8 | """ 9 | log = logging.getLogger("Config") 10 | 11 | def __init__(self, file_name): 12 | self.data = configparser.ConfigParser() 13 | self.log.info("Reading config file %s", file_name) 14 | if not self.data.read(file_name): 15 | raise ValueError("Config file %s not found" % file_name) 16 | 17 | # sections acessors 18 | @property 19 | def sect_general(self): 20 | return self.data['general'] 21 | 22 | @property 23 | def sect_train(self): 24 | return self.data['train'] 25 | 26 | # general section 27 | @property 28 | def cube_type(self): 29 | return self.sect_general['cube_type'] 30 | 31 | @property 32 | def run_name(self): 33 | return self.sect_general['run_name'] 34 | 35 | # train section 36 | @property 37 | def train_scramble_depth(self): 38 | return self.sect_train.getint('scramble_depth') 39 | 40 | @property 41 | def train_cuda(self): 42 | return self.sect_train.getboolean('cuda', fallback=False) 43 | 44 | @property 45 | def train_learning_rate(self): 46 | return self.sect_train.getfloat('lr') 47 | 48 | @property 49 | def train_batch_size(self): 50 | return self.sect_train.getint('batch_size') 51 | 52 | @property 53 | def train_report_batches(self): 54 | return self.sect_train.getint('report_batches') 55 | 56 | @property 57 | def train_checkpoint_batches(self): 58 | return self.sect_train.getint('checkpoint_batches') 59 | 60 | @property 61 | def train_lr_decay_enabled(self): 62 | return self.sect_train.getboolean('lr_decay', fallback=False) 63 | 64 | @property 65 | def train_lr_decay_batches(self): 66 | return self.sect_train.getint('lr_decay_batches') 67 | 68 | @property 69 | def train_lr_decay_gamma(self): 70 | return self.sect_train.getfloat('lr_decay_gamma', fallback=1.0) 71 | 72 | @property 73 | def train_value_targets_method(self): 74 | return self.sect_train.get('value_targets_method', fallback='paper') 75 | 76 | @property 77 | def train_max_batches(self): 78 | return self.sect_train.getint('max_batches') 79 | 80 | @property 81 | def scramble_buffer_batches(self): 82 | return self.sect_train.getint("scramble_buffer_batches", 10) 83 | 84 | @property 85 | def push_scramble_buffer_iters(self): 86 | return self.sect_train.getint('push_scramble_buffer_iters', 100) 87 | 88 | @property 89 | def 
weight_samples(self): 90 | return self.sect_train.getboolean('weight_samples', True) 91 | 92 | # higher-level functions 93 | def train_name(self, suffix=None): 94 | res = "%s-%s-d%d" % (self.cube_type, self.run_name, self.train_scramble_depth) 95 | if suffix is not None: 96 | res += "-" + suffix 97 | return res 98 | -------------------------------------------------------------------------------- /Chapter21/libcube/cubes/__init__.py: -------------------------------------------------------------------------------- 1 | from ._env import CubeEnv, get, names 2 | from . import cube3x3 3 | from . import cube2x2 4 | 5 | __all__ = ('CubeEnv', 'get', 'names') 6 | -------------------------------------------------------------------------------- /Chapter21/libcube/cubes/_common.py: -------------------------------------------------------------------------------- 1 | def _permute(t, m, is_inv=False): 2 | """ 3 | Perform permutation of tuple according to mapping m 4 | """ 5 | r = list(t) 6 | for from_idx, to_idx in m: 7 | if is_inv: 8 | r[from_idx] = t[to_idx] 9 | else: 10 | r[to_idx] = t[from_idx] 11 | return r 12 | 13 | 14 | def _rotate(corner_ort, corners): 15 | """ 16 | Rotate given corners 120 degrees 17 | """ 18 | r = list(corner_ort) 19 | for c, angle in corners: 20 | r[c] = (r[c] + angle) % 3 21 | return r 22 | 23 | 24 | # orient corner cubelet 25 | def _map_orient(cols, orient_id): 26 | if orient_id == 0: 27 | return cols 28 | elif orient_id == 1: 29 | return cols[2], cols[0], cols[1] 30 | else: 31 | return cols[1], cols[2], cols[0] 32 | 33 | -------------------------------------------------------------------------------- /Chapter21/models/.gitattributes: -------------------------------------------------------------------------------- 1 | *.dat filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/2x2-paper/best_3.2572e-02.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b8d40fbf6fb3d9c4d0e06ed3dc45f1fb80f2da32a44b8c406fc8167d198482b1 3 | size 45148722 4 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/2x2-zg/chpt_017000.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2b011478a90bee5e5139196a3b189e246e32776e62dff10d5e46d2ace937a034 3 | size 45148658 4 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/3x3-paper/best_3.1818e-02.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a6e324dc71d107e223392594fc584c8c4b29ece2a1e4c4050da82c3df6988287 3 | size 49867314 4 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/3x3-zg/chpt_026400.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a11ea7ab683162e94805731e20db958d1defcef1e59c5e4fd9f017a40635903c 3 | size 49867250 4 | -------------------------------------------------------------------------------- /Chapter21/requirements.txt: -------------------------------------------------------------------------------- 1 | nose 2 | seaborn 3 | torch 4 | numpy 5 | tqdm 6 | tensorboard-pytorch 7 | 
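The .ini files listed above are read through the `Config` wrapper defined in `Chapter21/libcube/conf.py` above. A minimal usage sketch; the path is one of the chapter's own configuration files, and the commented values follow directly from that file and from `Config.train_name`:

````
from libcube.conf import Config

config = Config("ini/cube2x2-zero-goal-d200.ini")

print(config.cube_type)                   # cube2x2
print(config.run_name)                    # zero-goal
print(config.train_scramble_depth)        # 200
print(config.train_value_targets_method)  # zero_goal_value
# used to build run/save directory names:
print(config.train_name(suffix="t1"))     # cube2x2-zero-goal-d200-t1
````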
-------------------------------------------------------------------------------- /Chapter21/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ./solver.py -e cube2x2 -m saves/cube2x2-paper-d200-2x2-d200-t1/best_3.2572e-02.dat --max-steps 30000 --cuda -o c2x2-paper-d200-t1.csv & 4 | ./solver.py -e cube2x2 -m saves/cube2x2-zero-goal-d200-2x2-d200-zg-t2/best_1.3816e-02.dat --max-steps 30000 --cuda -o c2x2-zero-goal-d200-t1.csv & 5 | ./solver.py -e cube3x3 --cuda --max-steps 30000 -m saves/cube3x3-paper-d200-3x3-paper-d200-t1/best_3.1818e-02.dat -o c3x3-paper-d200-t1.csv & 6 | ./solver.py -e cube3x3 --cuda --max-steps 30000 -m saves/cube3x3-zero-goal-d200-3x3-zg-d200-t1/best_2.0891e-02.dat -o c3x3-zero-goal-d200-t1.csv & 7 | #./solver.py -e cube3x3 --cuda --max-steps 30000 -m saves/cube3x3-zero-goal-d200-no-decay/best_2.1798e-02.dat -o c3x3-zero-goal-d200-no-decay-v2.csv & 8 | -------------------------------------------------------------------------------- /Chapter21/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter21/tests/__init__.py -------------------------------------------------------------------------------- /Chapter21/tests/libcube/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter21/tests/libcube/__init__.py -------------------------------------------------------------------------------- /Chapter21/tests/libcube/cubes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter21/tests/libcube/cubes/__init__.py -------------------------------------------------------------------------------- /Chapter21/train_debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Ad-hoc utility to analyze trained model and various training process details 4 | """ 5 | import argparse 6 | import logging 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | from libcube import cubes 15 | from libcube import model 16 | 17 | 18 | log = logging.getLogger("train_debug") 19 | 20 | 21 | # How many data to generate for plots 22 | MAX_DEPTH = 10 23 | ROUND_COUNTS = 100 24 | # debug params 25 | #MAX_DEPTH = 5 26 | #ROUND_COUNTS = 2 27 | 28 | 29 | def gen_states(cube_env, max_depth, round_counts): 30 | """ 31 | Generate random states of various scramble depth 32 | :param cube_env: CubeEnv instance 33 | :return: list of list of (state, correct_action_index) pairs 34 | """ 35 | assert isinstance(cube_env, cubes.CubeEnv) 36 | 37 | result = [[] for _ in range(max_depth)] 38 | for _ in range(round_counts): 39 | data = cube_env.scramble_cube(max_depth, return_inverse=True) 40 | for depth, state, inv_action in data: 41 | result[depth-1].append((state, inv_action.value)) 42 | return result 43 | 44 | 45 | if __name__ == "__main__": 46 | sns.set() 47 | 48 | logging.basicConfig(format="%(asctime)-15s %(levelname)s 
%(message)s", level=logging.INFO) 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("-e", "--env", required=True, help="Type of env to train, supported types=%s" % cubes.names()) 51 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 52 | parser.add_argument("-o", "--output", required=True, help="Output prefix for plots") 53 | args = parser.parse_args() 54 | 55 | cube_env = cubes.get(args.env) 56 | log.info("Selected cube: %s", cube_env) 57 | net = model.Net(cube_env.encoded_shape, len(cube_env.action_enum)) 58 | net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage, weights_only=True)) 59 | net.eval() 60 | log.info("Network loaded from %s", args.model) 61 | 62 | # model.make_train_data(cube_env, net, device='cpu', batch_size=10, scramble_depth=2, shuffle=False) 63 | 64 | states_by_depth = gen_states(cube_env, max_depth=MAX_DEPTH, round_counts=ROUND_COUNTS) 65 | # for idx, states in enumerate(states_by_depth): 66 | # log.info("%d: %s", idx, states) 67 | 68 | # flatten returned data 69 | data = [] 70 | for depth, states in enumerate(states_by_depth): 71 | for s, inv_action in states: 72 | data.append((depth+1, s, inv_action)) 73 | depths, states, inv_actions = map(list, zip(*data)) 74 | 75 | # process states with net 76 | enc_states = model.encode_states(cube_env, states) 77 | enc_states_t = torch.tensor(enc_states) 78 | policy_t, value_t = net(enc_states_t) 79 | value_t = value_t.squeeze(-1) 80 | value = value_t.cpu().detach().numpy() 81 | policy = F.softmax(policy_t, dim=1).cpu().detach().numpy() 82 | 83 | # plot value per depth of scramble 84 | plot = sns.lineplot(depths, value) 85 | plot.set_title("Values per depths") 86 | plot.get_figure().savefig(args.output + "-vals_vs_depths.png") 87 | 88 | # plot action match 89 | plt.clf() 90 | actions = np.argmax(policy, axis=1) 91 | actions_match = (actions == inv_actions).astype(np.int8) 92 | plot = sns.lineplot(depths, actions_match) 93 | plot.set_title("Actions accuracy per depths") 94 | plot.get_figure().savefig(args.output + "-acts_vs_depths.png") 95 | 96 | pass 97 | -------------------------------------------------------------------------------- /Chapter22/.gitignore: -------------------------------------------------------------------------------- 1 | render 2 | -------------------------------------------------------------------------------- /Chapter22/battle_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import torch 4 | import ptan 5 | from lib import model, data 6 | 7 | from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model", required=True, 13 | help="Model file to load") 14 | parser.add_argument("--map-size", type=int, default=data.MAP_SIZE, 15 | help="Size of the map, default=" + str(data.MAP_SIZE)) 16 | parser.add_argument("--render", default="render/battle.mp4", 17 | help="Name of the video file to render, default=render/battle.mp4") 18 | parser.add_argument("--walls", type=int, default=data.COUNT_WALLS, 19 | help="Count of walls, default=" + str(data.COUNT_WALLS)) 20 | parser.add_argument("--a", type=int, default=data.COUNT_BATTLERS, 21 | help="Count of tigers, default=" + str(data.COUNT_BATTLERS)) 22 | parser.add_argument("--b", type=int, default=data.COUNT_BATTLERS, 23 | help="Count of deer, default=" + str(data.COUNT_BATTLERS)) 
24 | 25 | args = parser.parse_args() 26 | 27 | env = data.BattleEnv( 28 | map_size=args.map_size, 29 | count_walls=args.walls, 30 | count_a=args.a, 31 | count_b=args.b, 32 | render_mode="rgb_array", 33 | ) 34 | recorder = VideoRecorder(env, args.render) 35 | net = model.DQNModel( 36 | env.observation_spaces['a_0'].shape, 37 | env.action_spaces['a_0'].n, 38 | ) 39 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 40 | a_agent = ptan.agent.DQNAgent( 41 | net, ptan.actions.ArgmaxActionSelector()) 42 | b_agent = data.RandomMAgent(env, env.handles[0]) 43 | 44 | obs = env.reset() 45 | recorder.capture_frame() 46 | total_reward = 0.0 47 | total_steps = 0 48 | 49 | while env.agents: 50 | actions = {} 51 | b_obs = [ 52 | obs[agent_id] 53 | for agent_id in env.agents 54 | if agent_id.startswith("a") 55 | ] 56 | a_acts, _ = a_agent(b_obs) 57 | ofs = 0 58 | for agent_id in env.agents: 59 | if agent_id.startswith("a"): 60 | actions[agent_id] = a_acts[ofs] 61 | ofs += 1 62 | 63 | b_obs = [ 64 | obs[agent_id] 65 | for agent_id in env.agents 66 | if agent_id.startswith("b") 67 | ] 68 | b_acts, _ = b_agent(b_obs) 69 | ofs = 0 70 | for agent_id in env.agents: 71 | if agent_id.startswith("b"): 72 | actions[agent_id] = b_acts[ofs] 73 | ofs += 1 74 | 75 | obs, rewards, dones, _, _ = env.step(actions) 76 | recorder.capture_frame() 77 | total_steps += 1 78 | for agent_id, reward in rewards.items(): 79 | if agent_id.startswith("a"): 80 | total_reward += reward 81 | 82 | print("Episode steps: %d" % total_steps) 83 | print("Total reward: %.3f" % total_reward) 84 | print("Mean reward: %.3f" % (total_reward / args.a)) 85 | recorder.close() -------------------------------------------------------------------------------- /Chapter22/forest_random.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder 3 | from lib import data 4 | from PIL import Image 5 | import pathlib 6 | import numpy as np 7 | 8 | RENDER_DIR = "render" 9 | 10 | 11 | def save_render(render: np.ndarray, path: pathlib.Path, step: int): 12 | img = Image.fromarray(render) 13 | p = path / f"render_{step:04d}.png" 14 | img.save(str(p)) 15 | 16 | 17 | if __name__ == "__main__": 18 | env = data.ForestEnv(render_mode="rgb_array") 19 | recorder = VideoRecorder(env, RENDER_DIR + "/forest-random.mp4") 20 | sum_rewards = {agent_id: 0.0 for agent_id in env.agents} 21 | sum_steps = {agent_id: 0 for agent_id in env.agents} 22 | obs = env.reset() 23 | recorder.capture_frame() 24 | assert isinstance(obs, dict) 25 | print(f"tiger_0: obs {obs['tiger_0'].shape}, act: {env.action_space('tiger_0')}") 26 | print(f"deer_0: obs {obs['deer_0'].shape}, act: {env.action_space('deer_0')}\n") 27 | step = 0 28 | save_render(env.render(), pathlib.Path(RENDER_DIR), step) 29 | 30 | while env.agents: 31 | actions = {agent_id: env.action_space(agent_id).sample() for agent_id in env.agents} 32 | all_obs, all_rewards, all_dones, all_trunc, all_info = env.step(actions) 33 | recorder.capture_frame() 34 | for agent_id, r in all_rewards.items(): 35 | sum_rewards[agent_id] += r 36 | sum_steps[agent_id] += 1 37 | step += 1 38 | save_render(env.render(), pathlib.Path(RENDER_DIR), step) 39 | 40 | final_rewards = list(sum_rewards.items()) 41 | final_rewards.sort(key=lambda p: p[1], reverse=True) 42 | for agent_id, r in final_rewards[:20]: 43 | print(f"{agent_id}: got {r:.2f} in {sum_steps[agent_id]} steps") 44 | 
recorder.close() -------------------------------------------------------------------------------- /Chapter22/forest_tigers_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import torch 4 | import ptan 5 | from lib import model, data 6 | 7 | from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model", required=True, 13 | help="Model file to load") 14 | parser.add_argument("--map-size", type=int, default=data.MAP_SIZE, 15 | help="Size of the map, default=" + str(data.MAP_SIZE)) 16 | parser.add_argument("--render", default="render/video.mp4", 17 | help="Name of the video file to render, default=render/video.mp4") 18 | parser.add_argument("--walls", type=int, default=data.COUNT_WALLS, 19 | help="Count of walls, default=" + str(data.COUNT_WALLS)) 20 | parser.add_argument("--tigers", type=int, default=data.COUNT_TIGERS, 21 | help="Count of tigers, default=" + str(data.COUNT_TIGERS)) 22 | parser.add_argument("--deer", type=int, default=data.COUNT_DEER, 23 | help="Count of deer, default=" + str(data.COUNT_DEER)) 24 | parser.add_argument("--mode", default='forest', choices=['forest', 'double_attack'], 25 | help="GridWorld mode, could be 'forest' or 'double_attack', default='forest'") 26 | 27 | args = parser.parse_args() 28 | 29 | if args.mode == 'forest': 30 | env = data.ForestEnv( 31 | map_size=args.map_size, 32 | count_walls=args.walls, 33 | count_tigers=args.tigers, 34 | count_deer=args.deer, 35 | render_mode="rgb_array", 36 | ) 37 | elif args.mode == 'double_attack': 38 | env = data.DoubleAttackEnv( 39 | map_size=args.map_size, 40 | count_walls=args.walls, 41 | count_tigers=args.tigers, 42 | count_deer=args.deer, 43 | render_mode="rgb_array", 44 | ) 45 | else: 46 | raise RuntimeError() 47 | recorder = VideoRecorder(env, args.render) 48 | net = model.DQNModel( 49 | env.observation_spaces['tiger_0'].shape, 50 | env.action_spaces['tiger_0'].n, 51 | ) 52 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 53 | tiger_agent = ptan.agent.DQNAgent( 54 | net, ptan.actions.ArgmaxActionSelector()) 55 | deer_agent = data.RandomMAgent(env, env.handles[0]) 56 | 57 | obs = env.reset() 58 | recorder.capture_frame() 59 | total_reward = 0.0 60 | total_steps = 0 61 | 62 | while env.agents: 63 | actions = {} 64 | tiger_obs = [ 65 | obs[agent_id] 66 | for agent_id in env.agents 67 | if agent_id.startswith("tiger") 68 | ] 69 | tiger_acts, _ = tiger_agent(tiger_obs) 70 | ofs = 0 71 | for agent_id in env.agents: 72 | if agent_id.startswith("tiger"): 73 | actions[agent_id] = tiger_acts[ofs] 74 | ofs += 1 75 | 76 | deer_obs = [ 77 | obs[agent_id] 78 | for agent_id in env.agents 79 | if agent_id.startswith("deer") 80 | ] 81 | deer_acts, _ = deer_agent(deer_obs) 82 | ofs = 0 83 | for agent_id in env.agents: 84 | if agent_id.startswith("deer"): 85 | actions[agent_id] = deer_acts[ofs] 86 | ofs += 1 87 | 88 | obs, rewards, dones, _, _ = env.step(actions) 89 | recorder.capture_frame() 90 | total_steps += 1 91 | for agent_id, reward in rewards.items(): 92 | if agent_id.startswith("tiger"): 93 | total_reward += reward 94 | 95 | print("Episode steps: %d" % total_steps) 96 | print("Total reward: %.3f" % total_reward) 97 | print("Mean reward: %.3f" % (total_reward / args.tigers)) 98 | recorder.close() 
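Both `battle_play.py` and `forest_tigers_play.py` above repeat the same per-group pattern: gather the observations of all agents whose id starts with a given prefix, query the batched ptan agent once, then scatter the returned actions back onto those agent ids. A minimal sketch of that pattern as a standalone helper; the name `group_actions` is illustrative and is not part of the chapter's `lib` package:

````
import typing as tt


def group_actions(obs: tt.Dict[str, tt.Any], agents: tt.List[str],
                  prefix: str, agent) -> tt.Dict[str, tt.Any]:
    # agent ids of one group, in the order they appear in env.agents
    ids = [agent_id for agent_id in agents if agent_id.startswith(prefix)]
    # batch the group's observations and query the (ptan) agent once
    acts, _ = agent([obs[agent_id] for agent_id in ids])
    # map the batched actions back onto the individual agents
    return {agent_id: act for agent_id, act in zip(ids, acts)}
````

With such a helper, the action-building part of the play loop reduces to merging `group_actions(obs, env.agents, "tiger", tiger_agent)` with `group_actions(obs, env.agents, "deer", deer_agent)` before calling `env.step(actions)`.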
-------------------------------------------------------------------------------- /Chapter22/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter22/lib/__init__.py -------------------------------------------------------------------------------- /Chapter22/lib/common.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | from types import SimpleNamespace 3 | from typing import Iterable 4 | import warnings 5 | 6 | import ptan 7 | import ptan.ignite as ptan_ignite 8 | from ignite.engine import Engine 9 | from ignite.metrics import RunningAverage 10 | from ignite.contrib.handlers import tensorboard_logger as tb_logger 11 | 12 | 13 | class EpsilonTracker: 14 | def __init__(self, selector: ptan.actions.EpsilonGreedyActionSelector, 15 | params: SimpleNamespace): 16 | self.selector = selector 17 | self.params = params 18 | self.frame(0) 19 | 20 | def frame(self, frame_idx: int): 21 | eps = self.params.epsilon_start - \ 22 | frame_idx / self.params.epsilon_frames 23 | self.selector.epsilon = max(self.params.epsilon_final, eps) 24 | 25 | 26 | def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer, 27 | initial: int, batch_size: int): 28 | buffer.populate(initial) 29 | while True: 30 | buffer.populate(1) 31 | yield buffer.sample(batch_size) 32 | 33 | 34 | def setup_ignite(engine: Engine, params: SimpleNamespace, 35 | exp_source, run_name: str, 36 | extra_metrics: Iterable[str] = (), 37 | loss_metrics: Iterable[str] = ('loss', )): 38 | warnings.simplefilter("ignore", category=UserWarning) 39 | handler = ptan_ignite.EndOfEpisodeHandler( 40 | exp_source, bound_avg_reward=params.stop_reward) 41 | handler.attach(engine) 42 | ptan_ignite.EpisodeFPSHandler().attach(engine) 43 | 44 | @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED) 45 | def episode_completed(trainer: Engine): 46 | passed = trainer.state.metrics.get('time_passed', 0) 47 | print("Episode %d: reward=%.4f, steps=%s, " 48 | "speed=%.1f f/s, elapsed=%s" % ( 49 | trainer.state.episode, trainer.state.episode_reward, 50 | trainer.state.episode_steps, 51 | trainer.state.metrics.get('avg_fps', 0), 52 | timedelta(seconds=int(passed)))) 53 | 54 | @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED) 55 | def game_solved(trainer: Engine): 56 | passed = trainer.state.metrics['time_passed'] 57 | print("Game solved in %s, after %d episodes " 58 | "and %d iterations!" 
% ( 59 | timedelta(seconds=int(passed)), 60 | trainer.state.episode, trainer.state.iteration)) 61 | trainer.should_terminate = True 62 | 63 | now = datetime.now().isoformat(timespec='minutes') 64 | logdir = f"runs/{now}-{params.run_name}-{run_name}" 65 | tb = tb_logger.TensorboardLogger(log_dir=logdir) 66 | for loss_name in loss_metrics: 67 | run_avg = RunningAverage(output_transform=lambda v: v[loss_name]) 68 | run_avg.attach(engine, "avg_" + loss_name) 69 | 70 | metrics = ['reward', 'steps', 'avg_reward'] 71 | handler = tb_logger.OutputHandler( 72 | tag="episodes", metric_names=metrics) 73 | event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED 74 | tb.attach(engine, log_handler=handler, event_name=event) 75 | 76 | # write to tensorboard every 100 iterations 77 | ptan_ignite.PeriodicEvents().attach(engine) 78 | metrics = ['avg_loss', 'avg_fps'] 79 | metrics.extend(extra_metrics) 80 | handler = tb_logger.OutputHandler( 81 | tag="train", metric_names=metrics, 82 | output_transform=lambda a: a) 83 | event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED 84 | tb.attach(engine, log_handler=handler, event_name=event) 85 | -------------------------------------------------------------------------------- /Chapter22/lib/model.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | from typing import List 6 | 7 | 8 | class DQNModel(nn.Module): 9 | def __init__(self, view_shape, n_actions): 10 | super(DQNModel, self).__init__() 11 | 12 | self.view_conv = nn.Sequential( 13 | nn.Conv2d(view_shape[0], 32, kernel_size=3, padding=0), 14 | nn.ReLU(), 15 | nn.Conv2d(32, 16, kernel_size=2, padding=1), # padding was added for deer model 16 | nn.ReLU(), 17 | nn.Flatten(), 18 | ) 19 | view_out_size = self.view_conv(torch.zeros(1, *view_shape)).size()[-1] 20 | self.fc = nn.Sequential( 21 | nn.Linear(view_out_size, 128), 22 | nn.ReLU(), 23 | nn.Linear(128, n_actions) 24 | ) 25 | 26 | def forward(self, x): 27 | conv_out = self.view_conv(x) 28 | return self.fc(conv_out) 29 | 30 | 31 | def unpack_batch(batch: List[ptan.experience.ExperienceFirstLast]): 32 | states, actions, rewards, dones, last_states = [],[],[],[],[] 33 | for exp in batch: 34 | states.append(exp.state) 35 | actions.append(exp.action) 36 | rewards.append(exp.reward) 37 | dones.append(exp.last_state is None) 38 | if exp.last_state is None: 39 | lstate = exp.state # the result will be masked anyway 40 | else: 41 | lstate = exp.last_state 42 | last_states.append(lstate) 43 | return states, np.array(actions), \ 44 | np.array(rewards, dtype=np.float32), \ 45 | np.array(dones, dtype=bool), \ 46 | last_states 47 | 48 | 49 | def calc_loss_dqn(batch, net, tgt_net, preprocessor, gamma, device="cpu"): 50 | states, actions, rewards, dones, next_states = \ 51 | unpack_batch(batch) 52 | 53 | states = preprocessor(states).to(device) 54 | next_states = preprocessor(next_states).to(device) 55 | 56 | actions_v = torch.tensor(actions).to(device) 57 | rewards_v = torch.tensor(rewards).to(device) 58 | done_mask = torch.BoolTensor(dones).to(device) 59 | 60 | actions_v = actions_v.unsqueeze(-1) 61 | state_action_vals = net(states).gather(1, actions_v) 62 | state_action_vals = state_action_vals.squeeze(-1) 63 | with torch.no_grad(): 64 | next_state_vals = tgt_net(next_states).max(1)[0] 65 | next_state_vals[done_mask] = 0.0 66 | 67 | bellman_vals = next_state_vals.detach() * gamma + rewards_v 68 | return nn.MSELoss()(state_action_vals, bellman_vals) 69 | 
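`calc_loss_dqn` above expects a batch of ptan `ExperienceFirstLast` transitions plus a `preprocessor` that turns a list of raw observations into a tensor. A sketch of a single optimisation step built around it; `net`, `tgt_net`, `buffer` and `preprocessor` are assumed to be constructed elsewhere (the chapter's training scripts do this), and the hyperparameter values are placeholders, so this only illustrates the call sequence:

````
import torch.optim as optim

GAMMA = 0.99        # assumed discount factor
BATCH_SIZE = 32     # assumed batch size

# assumed to already exist:
#   net          - DQNModel defined above
#   tgt_net      - ptan.agent.TargetNet(net)
#   buffer       - a populated ptan experience replay buffer
#   preprocessor - callable turning a list of observations into a tensor
optimizer = optim.Adam(net.parameters(), lr=1e-4)

batch = buffer.sample(BATCH_SIZE)
optimizer.zero_grad()
loss_v = calc_loss_dqn(batch, net, tgt_net.target_model,
                       preprocessor, gamma=GAMMA, device="cpu")
loss_v.backward()
optimizer.step()
# in real training the target network is synced every N steps, not every step
tgt_net.sync()
````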
-------------------------------------------------------------------------------- /Chapter22/requirements.txt: -------------------------------------------------------------------------------- 1 | magent2==0.3.3 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Max Lapan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gymnasium[atari]==0.29.1 2 | gymnasium[classic-control]==0.29.1 3 | gymnasium[accept-rom-license]==0.29.1 4 | moviepy==1.0.3 5 | numpy<2 6 | opencv-python==4.10.0.84 7 | torch==2.5.0 8 | torchvision==0.20.0 9 | pytorch-ignite==0.5.1 10 | tensorboard==2.18.0 11 | mypy==1.8.0 12 | ptan==0.8.1 13 | stable-baselines3==2.3.2 14 | torchrl==0.6.0 15 | ray[tune]==2.37.0 16 | pytest 17 | -------------------------------------------------------------------------------- /tools/avg_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import pathlib 4 | import argparse 5 | import itertools 6 | import typing as tt 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class Series: 12 | start_wall: float 13 | time_deltas: tt.List[float] 14 | steps: tt.List[int] 15 | values: tt.List[float] 16 | 17 | @classmethod 18 | def read(cls, path: pathlib.Path) -> "Series": 19 | start_wall = None 20 | deltas = [] 21 | steps = [] 22 | values = [] 23 | with path.open('rt', encoding='utf-8') as fd: 24 | reader = csv.DictReader(fd) 25 | for r in reader: 26 | assert isinstance(r, dict) 27 | t = float(r["Wall time"]) 28 | if start_wall is None: 29 | start_wall = t 30 | deltas.append(t - start_wall) 31 | steps.append(int(r["Step"])) 32 | values.append(float(r["Value"])) 33 | return Series(start_wall=start_wall, time_deltas=deltas, steps=steps, values=values) 34 | 35 | def write(self, path: pathlib.Path): 36 | with path.open('wt', encoding='utf-8') as fd: 37 | writer = csv.DictWriter(fd, ('Wall time', 'Step', 'Value')) 38 | writer.writeheader() 39 | for dt, s, v in zip(self.time_deltas, self.steps, self.values): 40 | writer.writerow({ 41 | 'Wall time': self.start_wall + dt, 42 | 'Step': s, 43 | 'Value': v, 44 | }) 45 | 46 | def 
__iter__(self) -> tt.Generator[tt.Tuple[float, int, float], None, None]: 47 | yield from zip(self.time_deltas, self.steps, self.values) 48 | 49 | 50 | def mean_max_step(series: tt.List[Series]) -> float: 51 | return sum(map(lambda s: s.steps[-1], series)) / len(series) 52 | 53 | 54 | def avg_entries(entries: tt.Tuple[tt.Optional[tt.Tuple[float, int, float]], ...], 55 | do_sum: bool = False) -> tt.Tuple[float, int, float]: 56 | deltas = [] 57 | steps = [] 58 | values = [] 59 | for entry in entries: 60 | if entry is None: 61 | continue 62 | d, s, v = entry 63 | deltas.append(d) 64 | steps.append(s) 65 | values.append(v) 66 | if do_sum: 67 | return sum(deltas), int(sum(steps)), sum(values) 68 | else: 69 | return sum(deltas) / len(deltas), int(sum(steps) / len(steps)), sum(values) / len(values) 70 | 71 | 72 | def average_series(series: tt.List[Series], do_sum: bool = False) -> Series: 73 | mean_steps = mean_max_step(series) 74 | start_wall = series[0].start_wall 75 | deltas = [] 76 | steps = [] 77 | values = [] 78 | 79 | for vals in itertools.zip_longest(*series): 80 | dt, s, v = avg_entries(vals, do_sum=do_sum) 81 | if s <= mean_steps: 82 | deltas.append(dt) 83 | steps.append(s) 84 | values.append(v) 85 | return Series(start_wall=start_wall, time_deltas=deltas, steps=steps, values=values) 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("-o", "--output", required=True, help="Output csv file to produce") 91 | parser.add_argument("--sum", default=False, action="store_true", help="Perform summation instead of average") 92 | parser.add_argument("files", nargs='+', help="Input csv files") 93 | args = parser.parse_args() 94 | 95 | series = [Series.read(pathlib.Path(n)) for n in args.files] 96 | res = average_series(series, do_sum=args.sum) 97 | res.write(pathlib.Path(args.output)) -------------------------------------------------------------------------------- /tools/ch12/norm_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | MEAN = 10 6 | SAMPLES = 1000000 7 | X_MAX = MEAN * 2 8 | BINS = 100 9 | STEP = X_MAX / BINS 10 | 11 | 12 | if __name__ == "__main__": 13 | c1, c2, c5 = [0] * BINS, [0] * BINS, [0] * BINS 14 | for _ in range(SAMPLES): 15 | v1 = np.random.normal(loc=MEAN, scale=1.0) 16 | v2 = np.random.normal(loc=MEAN, scale=2.0) 17 | v5 = np.random.normal(loc=MEAN, scale=5.0) 18 | if 0 <= v1 <= X_MAX: 19 | b = int(BINS * v1 / X_MAX) 20 | c1[b] += 1 21 | if 0 <= v2 <= X_MAX: 22 | b = int(BINS * v2 / X_MAX) 23 | c2[b] += 1 24 | if 0 <= v5 <= X_MAX: 25 | b = int(BINS * v5 / X_MAX) 26 | c5[b] += 1 27 | x = [STEP * i for i in range(BINS)] 28 | y1 = [c / SAMPLES for c in c1] 29 | y2 = [c / SAMPLES for c in c2] 30 | y5 = [c / SAMPLES for c in c5] 31 | print(x) 32 | print(y1) 33 | 34 | fig = plt.figure() 35 | ax1 = fig.add_subplot(111) 36 | ax1.plot(x, y1, color='black', linewidth=1.2, linestyle='-') 37 | ax1.plot(x, y2, color='black', linewidth=1.2, linestyle=':') 38 | ax1.plot(x, y5, color='black', linewidth=1.2, linestyle='--') 39 | ax1.legend(["Variance = 1.0", "Variance = 2.0", "Variance = 5.0"], 40 | loc='upper right', fancybox=True) 41 | ax1.set_xlim(0, X_MAX) 42 | ax1.grid(True, axis='both') 43 | ax1.set_title("Gaussian Distribution with mean = 10.0") 44 | plt.savefig("norm_dist.svg") 45 | pass --------------------------------------------------------------------------------
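`tools/avg_csv.py` above merges several TensorBoard CSV exports that share the `Wall time,Step,Value` layout of the tournament files earlier in the repository, either via its command line (`python tools/avg_csv.py -o out.csv file1.csv file2.csv ...`) or programmatically. A sketch of the programmatic use; it assumes `Series` and `average_series` from that file are in scope, and the input paths are illustrative ones taken from `Chapter20/tournament/3ed`:

````
import pathlib

# assumes Series and average_series from tools/avg_csv.py are importable here
files = ["Chapter20/tournament/3ed/v1-wins.csv",
         "Chapter20/tournament/3ed/v2-wins.csv"]
series = [Series.read(pathlib.Path(name)) for name in files]
avg = average_series(series)               # element-wise mean over the runs,
                                           # truncated at the mean final step
avg.write(pathlib.Path("v-avg-wins.csv"))  # same three-column CSV layout
````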