├── .gitignore ├── Chapter02 ├── .gitignore ├── 01_agent_anatomy.py ├── 02_cartpole_random.py ├── 03_random_action_wrapper.py └── 04_cartpole_random_monitor.py ├── Chapter03 ├── 01_modules.py ├── 02_tensorboard.py ├── 03_atari_gan.py └── 04_atari_gan_ignite.py ├── Chapter04 ├── 01_cartpole.py ├── 02_frozenlake_naive.py ├── 03_frozenlake_tweaked.py └── 04_frozenlake_nonslippery.py ├── Chapter05 ├── 01_frozenlake_v_iteration.py └── 02_frozenlake_q_iteration.py ├── Chapter06 ├── 01_frozenlake_q_learning.py ├── 02_dqn_pong.py ├── 03_dqn_play.py └── lib │ ├── __init__.py │ ├── dqn_model.py │ └── wrappers.py ├── Chapter07 ├── 01_actions.py ├── 02_agents.py ├── 03_exp_sources.py ├── 04_replay_buf.py ├── 05_target_net.py ├── 06_cartpole.py └── lib.py ├── Chapter08 ├── 01_dqn_basic.py ├── 02_dqn_n_steps.py ├── 03_dqn_double.py ├── 04_dqn_noisy_net.py ├── 05_dqn_prio_replay.py ├── 06_dqn_dueling.py ├── 07_dqn_distrib.py ├── 08_dqn_rainbow.py ├── adhoc │ ├── commute.py │ └── distr_test.py ├── bench │ ├── prio_buffer_bench.py │ └── simple_buffer_bench.py └── lib │ ├── __init__.py │ ├── common.py │ ├── dqn_extra.py │ └── dqn_model.py ├── Chapter09 ├── .gitignore ├── 00_slow_grads.py ├── 01_baseline.py ├── 02_n_envs.py ├── 03_parallel.py ├── 04_wrappers_n_env.py ├── 04_wrappers_parallel.py ├── attic │ ├── 02_env_steps.py │ └── 03_parallel_orig.py ├── img │ ├── 01_orig_tb.png │ ├── 02_steps-tb.png │ ├── 03-serial-blocks.png │ ├── 03_serial.png │ └── 04_parallel.png └── lib │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── common.cpython-311.pyc │ └── dqn_model.cpython-311.pyc │ ├── atari_wrappers.py │ ├── common.py │ └── dqn_model.py ├── Chapter10 ├── .gitignore ├── conftest.py ├── data │ ├── ch10-small-quotes.tgz │ └── unpack_data.sh ├── lib │ ├── __init__.py │ ├── common.py │ ├── data.py │ ├── environ.py │ ├── models.py │ └── validation.py ├── run_model.py ├── tests │ ├── test_data.py │ └── test_environ.py ├── train_model.py └── train_model_conv.py ├── Chapter11 ├── .gitignore ├── 01_cartpole_dqn.py ├── 02_cartpole_reinforce.py ├── 03_cartpole_reinforce_baseline.py ├── 04_cartpole_pg.py ├── 05_pong_pg.py ├── 05_pong_pg_tune.py └── lib │ ├── __init__.py │ └── common.py ├── Chapter12 ├── .gitignore ├── 01_cartpole_pg.py ├── 02_pong_a2c.py ├── 02_pong_a2c_tune.py ├── 03_a3c_grad.py └── lib │ ├── __init__.py │ └── common.py ├── Chapter13 ├── adhoc │ ├── hf_t1.py │ ├── hf_t2.py │ ├── lc_t1.py │ └── openai_check.py ├── chatgpt_auto.py ├── chatgpt_interactive.py ├── conftest.py ├── games │ ├── .gitignore │ └── make_games.sh ├── lib │ ├── __init__.py │ ├── common.py │ ├── model.py │ └── preproc.py ├── pytest.ini ├── requirements.txt ├── tests │ ├── __init__.py │ └── test_preproc.py ├── train_basic.py ├── train_preproc.py └── train_tr.py ├── Chapter14 ├── .gitignore ├── adhoc │ ├── 01_wob_create.py │ ├── 02_act_clicks.py │ ├── 03_clicker.py │ ├── 04_load_demo.py │ ├── 05_join_obs.py │ ├── 06_save_traj.py │ └── 06_save_traj_vec.py ├── demos │ ├── click-dialog │ │ ├── click-dialog_0421165244.json │ │ ├── click-dialog_0421165247.json │ │ ├── click-dialog_0421165250.json │ │ ├── click-dialog_0421165253.json │ │ ├── click-dialog_0421165255.json │ │ ├── click-dialog_0421165258.json │ │ ├── click-dialog_0421165300.json │ │ ├── click-dialog_0421165303.json │ │ ├── click-dialog_0421165306.json │ │ ├── click-dialog_0421165308.json │ │ ├── click-dialog_0421165311.json │ │ ├── click-dialog_0421165313.json │ │ ├── click-dialog_0421165316.json │ │ ├── click-dialog_0421165318.json │ │ ├── 
click-dialog_0421165320.json │ │ └── click-dialog_0421165323.json │ ├── click-tab │ │ ├── click-tab_0426161308.json │ │ ├── click-tab_0426161312.json │ │ ├── click-tab_0426161315.json │ │ ├── click-tab_0426161318.json │ │ ├── click-tab_0426161321.json │ │ ├── click-tab_0426161324.json │ │ ├── click-tab_0426161327.json │ │ ├── click-tab_0426161330.json │ │ ├── click-tab_0426161334.json │ │ ├── click-tab_0426161338.json │ │ ├── click-tab_0426161341.json │ │ ├── click-tab_0426161345.json │ │ ├── click-tab_0426161348.json │ │ ├── click-tab_0426161350.json │ │ └── click-tab_0426161353.json │ ├── count-sides │ │ ├── count-sides_0423161944.json │ │ ├── count-sides_0423161949.json │ │ ├── count-sides_0423161952.json │ │ ├── count-sides_0423161955.json │ │ ├── count-sides_0423161958.json │ │ ├── count-sides_0423162002.json │ │ ├── count-sides_0423162006.json │ │ ├── count-sides_0423162010.json │ │ ├── count-sides_0423162012.json │ │ ├── count-sides_0423162017.json │ │ ├── count-sides_0423162021.json │ │ ├── count-sides_0423162025.json │ │ ├── count-sides_0423162029.json │ │ ├── count-sides_0423162032.json │ │ ├── count-sides_0423162034.json │ │ ├── count-sides_0423162036.json │ │ ├── count-sides_0423162041.json │ │ ├── count-sides_0423162045.json │ │ ├── count-sides_0423162049.json │ │ ├── count-sides_0423162053.json │ │ ├── count-sides_0423162057.json │ │ ├── count-sides_0423162100.json │ │ ├── count-sides_0423162103.json │ │ ├── count-sides_0423162107.json │ │ └── count-sides_0423162110.json │ └── tic-tac-toe │ │ ├── tic-tac-toe_0423220647.json │ │ ├── tic-tac-toe_0423220653.json │ │ ├── tic-tac-toe_0423220658.json │ │ ├── tic-tac-toe_0423220704.json │ │ ├── tic-tac-toe_0423220710.json │ │ ├── tic-tac-toe_0423220718.json │ │ ├── tic-tac-toe_0423220724.json │ │ ├── tic-tac-toe_0423220730.json │ │ ├── tic-tac-toe_0423220737.json │ │ ├── tic-tac-toe_0423220744.json │ │ ├── tic-tac-toe_0423220749.json │ │ ├── tic-tac-toe_0423220754.json │ │ ├── tic-tac-toe_0423220801.json │ │ ├── tic-tac-toe_0423220805.json │ │ ├── tic-tac-toe_0423220812.json │ │ ├── tic-tac-toe_0423220818.json │ │ ├── tic-tac-toe_0423220823.json │ │ ├── tic-tac-toe_0423220828.json │ │ ├── tic-tac-toe_0423220835.json │ │ ├── tic-tac-toe_0423220839.json │ │ ├── tic-tac-toe_0423220843.json │ │ ├── tic-tac-toe_0423220847.json │ │ ├── tic-tac-toe_0423220852.json │ │ ├── tic-tac-toe_0423220856.json │ │ ├── tic-tac-toe_0423220902.json │ │ ├── tic-tac-toe_0423220905.json │ │ └── tic-tac-toe_0423220912.json ├── lib │ ├── common.py │ ├── demos.py │ ├── model.py │ └── wob.py ├── record_demo.py ├── requirements.txt ├── wob_click_mm_play.py ├── wob_click_mm_train.py ├── wob_click_play.py └── wob_click_train.py ├── Chapter15 ├── 01_check_env.py ├── 02_train_a2c.py ├── 03_play_a2c.py ├── 04_train_ddpg.py ├── 05_play_ddpg.py ├── 06_train_d4pg.py ├── lib │ ├── __init__.py │ ├── common.py │ └── model.py └── requirements.txt ├── Chapter16 ├── 01_train_a2c.py ├── 02_play.py ├── 03_train_ppo.py ├── 04_train_trpo.py ├── 05_train_acktr.py ├── 06_train_sac.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── kfac.py │ ├── model.py │ └── trpo.py └── requirements.txt ├── Chapter17 ├── .gitignore ├── 01_cartpole_es.py ├── 02_cheetah_es.py ├── 03_cartpole_ga.py ├── 04_cheetah_ga.py └── lib │ ├── __init__.py │ └── common.py ├── Chapter18 ├── atari_dqn.py ├── atari_ppo.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── dqn_extra.py │ └── ppo.py ├── mcar_dqn.py ├── mcar_ppo.py ├── riverswim.py └── tests │ ├── __init__.py │ └── test_ppo.py ├── Chapter19 ├── 
.gitignore ├── 01_a2c.py ├── 01_play.py ├── 02_label_ui.py ├── 03_reward_train.py ├── adhoc │ ├── obs_to_gif.py │ └── rw_model.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── rlhf.py │ └── ui_tools.py └── requirements.txt ├── Chapter20 ├── .gitignore ├── lib │ ├── __init__.py │ ├── game.py │ ├── mcts.py │ ├── model.py │ └── muzero.py ├── play-mu.py ├── play.py ├── telegram-bot.py ├── tests │ ├── __init__.py │ ├── test_game.py │ ├── test_model.py │ └── test_muzero.py ├── tournament │ ├── 2ed │ │ ├── charts.ipynb │ │ ├── final-short.png │ │ ├── final.csv │ │ ├── final.svg │ │ ├── final.txt │ │ ├── final_plot.ipynb │ │ ├── semi-0.txt │ │ ├── semi-1.txt │ │ ├── semi-2.txt │ │ ├── semi-3.txt │ │ ├── semi-4.txt │ │ ├── semi-5.txt │ │ ├── semi-6.txt │ │ ├── semi-7.txt │ │ ├── semi-8.txt │ │ ├── semi-9.txt │ │ ├── semi-common.png │ │ └── semi-scores.png │ └── 3ed │ │ ├── .ipynb_checkpoints │ │ ├── charts-checkpoint.ipynb │ │ └── charts-mu-checkpoint.ipynb │ │ ├── charts-mu.ipynb │ │ ├── charts.ipynb │ │ ├── final.sh │ │ ├── final_plot.ipynb │ │ ├── mu-t5-6-res2.txt │ │ ├── mu-v1-wins.csv │ │ ├── semi-v1.txt │ │ ├── semi-v2.txt │ │ ├── v1-wins.csv │ │ └── v2-wins.csv ├── train-mu.py └── train.py ├── Chapter21 ├── .gitignore ├── csvs │ ├── 2ed │ │ ├── README.md │ │ ├── c2x2-paper-d200-t1-v2.csv │ │ ├── c2x2-paper-d200-t1.csv │ │ ├── c2x2-zero-goal-d200-t1-v2.csv │ │ ├── c2x2-zero-goal-d200-t1.csv │ │ ├── c3x3-paper-d200-t1-v2.csv │ │ ├── c3x3-paper-d200-t1.csv │ │ ├── c3x3-zero-goal-d200-no-decay-v2.csv │ │ ├── c3x3-zero-goal-d200-no-decay.csv │ │ ├── c3x3-zero-goal-d200-t1-v2.csv │ │ ├── c3x3-zero-goal-d200-t1.csv │ │ ├── c3x3 │ │ │ ├── c3-paper-d20-1.93e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=5.501e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=5.61e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=6.43e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=7.29e-1.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=chp100k.csv │ │ │ ├── c3-zg-d20-noweight-no-decay=chp64k.csv │ │ │ ├── c3-zg-d20-noweight.csv │ │ │ └── c3-zg-d20.csv │ │ ├── t3-c2x2-mcts-c=0.01.csv │ │ ├── t3-c2x2-mcts-c=0.1.csv │ │ ├── t3-c2x2-mcts-c=1.csv │ │ ├── t3-c2x2-mcts-c=10.csv │ │ ├── t3-c2x2-mcts-c=100.csv │ │ ├── t3-c2x2-mcts-c=1000.csv │ │ ├── t3-c2x2-mcts-c=100000.csv │ │ ├── t3.1-c2x2-mcts-c=1.csv │ │ ├── t3.1-c2x2-mcts-c=10.csv │ │ ├── t3.1-c2x2-mcts-c=100-steps=100k.csv │ │ ├── t3.1-c2x2-mcts-c=100-steps=60k.csv │ │ ├── t3.1-c2x2-mcts-c=100.csv │ │ ├── t3.1-c2x2-mcts-c=1000.csv │ │ ├── t3.1-c2x2-mcts-c=10000.csv │ │ ├── t4-c2x2-mcts-c=10-steps=100k.csv │ │ ├── t4-c2x2-mcts-c=10-steps=200k.csv │ │ ├── t4-c2x2-mcts-c=10-steps=500k.csv │ │ ├── t4-c2x2-mcts-c=100-steps=100k-b10.csv │ │ ├── t4-c2x2-mcts-c=100-steps=100k-b100.csv │ │ ├── t4-c2x2-mcts-c=100-steps=100k.csv │ │ ├── t5-c2x2-1.0366e-01.csv │ │ ├── t5-c2x2-3.0742e-02.csv │ │ ├── t5-c2x2-6.0737e-02.csv │ │ ├── t6-c2x2-nu=1.csv │ │ ├── t6-c2x2-nu=10.csv │ │ ├── t6-c2x2-nu=1000.csv │ │ └── t7-best-paper-1.8184e-1.csv │ └── 3ed │ │ ├── 2x2-paper.csv │ │ ├── 2x2-zg-chpt-17k.csv │ │ ├── 3x3-paper.csv │ │ └── 3x3-zg-chpt-26k.csv ├── cubes_tests │ ├── 2ed │ │ ├── cube2x2_d3.txt │ │ ├── cube2x2_d4.txt │ │ ├── cube2x2_d5.txt │ │ ├── cube2x2_d6.txt │ │ ├── cube3x3_d10.txt │ │ ├── cube3x3_d1000.txt │ │ ├── cube3x3_d15.txt │ │ ├── cube3x3_d3.txt │ │ └── cube3x3_d3_norepeat.txt │ └── 3ed │ │ ├── 2x2-d1-50.txt │ │ └── 3x3-d1-50.txt ├── docs │ └── Notes.md ├── gen_cubes.py ├── ini │ ├── README.md │ ├── cube2x2-paper-d200.ini │ ├── cube2x2-zero-goal-d200.ini │ ├── cube3x3-paper-d20.ini │ ├── 
cube3x3-paper-d200.ini │ ├── cube3x3-zero-goal-d20-noweight.ini │ ├── cube3x3-zero-goal-d20.ini │ ├── cube3x3-zero-goal-d200-slow-decay.ini │ └── cube3x3-zero-goal-d200.ini ├── libcube │ ├── conf.py │ ├── cubes │ │ ├── __init__.py │ │ ├── _common.py │ │ ├── _env.py │ │ ├── cube2x2.py │ │ └── cube3x3.py │ ├── mcts.py │ └── model.py ├── models │ ├── .gitattributes │ └── 3ed │ │ ├── 2x2-paper │ │ └── best_3.2572e-02.dat │ │ ├── 2x2-zg │ │ └── chpt_017000.dat │ │ ├── 3x3-paper │ │ └── best_3.1818e-02.dat │ │ └── 3x3-zg │ │ └── chpt_026400.dat ├── nbs │ ├── 2ed │ │ ├── 01_paper-vs-zero_goal.ipynb │ │ ├── 02_fix_steps_limit.ipynb │ │ ├── 03_mcts_tuning.ipynb │ │ ├── 04_mcts_C-extra-data.ipynb │ │ ├── 05_batch_search.ipynb │ │ ├── 06_compare_models.ipynb │ │ └── 07_article_figs.ipynb │ └── 3ed │ │ └── 07_article_figs.ipynb ├── requirements.txt ├── run_tests.sh ├── solver.py ├── tests │ ├── __init__.py │ └── libcube │ │ ├── __init__.py │ │ └── cubes │ │ ├── __init__.py │ │ ├── test_cube2x2.py │ │ └── test_cube3x3.py ├── train.py └── train_debug.py ├── Chapter22 ├── .gitignore ├── battle_dqn.py ├── battle_play.py ├── forest_both_dqn.py ├── forest_both_play.py ├── forest_random.py ├── forest_tigers_dqn.py ├── forest_tigers_play.py ├── lib │ ├── __init__.py │ ├── common.py │ ├── data.py │ └── model.py └── requirements.txt ├── LICENSE ├── README.md ├── requirements.txt └── tools ├── avg_csv.py ├── ch12 ├── norm_dist.py └── norm_dist.svg └── plot.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | runs 4 | saves 5 | video 6 | __pycache__ 7 | .ipynb_checkpoints/ 8 | -------------------------------------------------------------------------------- /Chapter02/.gitignore: -------------------------------------------------------------------------------- 1 | video -------------------------------------------------------------------------------- /Chapter02/01_agent_anatomy.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List 3 | 4 | 5 | class Environment: 6 | def __init__(self): 7 | self.steps_left = 10 8 | 9 | def get_observation(self) -> List[float]: 10 | return [0.0, 0.0, 0.0] 11 | 12 | def get_actions(self) -> List[int]: 13 | return [0, 1] 14 | 15 | def is_done(self) -> bool: 16 | return self.steps_left == 0 17 | 18 | def action(self, action: int) -> float: 19 | if self.is_done(): 20 | raise Exception("Game is over") 21 | self.steps_left -= 1 22 | return random.random() 23 | 24 | 25 | class Agent: 26 | def __init__(self): 27 | self.total_reward = 0.0 28 | 29 | def step(self, env: Environment): 30 | current_obs = env.get_observation() 31 | actions = env.get_actions() 32 | reward = env.action(random.choice(actions)) 33 | self.total_reward += reward 34 | 35 | 36 | if __name__ == "__main__": 37 | env = Environment() 38 | agent = Agent() 39 | 40 | while not env.is_done(): 41 | agent.step(env) 42 | 43 | print("Total reward got: %.4f" % agent.total_reward) 44 | -------------------------------------------------------------------------------- /Chapter02/02_cartpole_random.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | 4 | if __name__ == "__main__": 5 | env = gym.make("CartPole-v1") 6 | total_reward = 0.0 7 | total_steps = 0 8 | obs, _ = env.reset() 9 | 10 | while True: 11 | action = env.action_space.sample() 12 | obs, reward, is_done, is_trunc, _ = env.step(action) 13 | total_reward += reward 14 
| total_steps += 1 15 | if is_done: 16 | break 17 | 18 | print("Episode done in %d steps, total reward %.2f" % (total_steps, total_reward)) 19 | -------------------------------------------------------------------------------- /Chapter02/03_random_action_wrapper.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import random 3 | 4 | 5 | class RandomActionWrapper(gym.ActionWrapper): 6 | def __init__(self, env: gym.Env, epsilon: float = 0.1): 7 | super(RandomActionWrapper, self).__init__(env) 8 | self.epsilon = epsilon 9 | 10 | def action(self, action: gym.core.WrapperActType) -> gym.core.WrapperActType: 11 | if random.random() < self.epsilon: 12 | action = self.env.action_space.sample() 13 | print(f"Random action {action}") 14 | return action 15 | return action 16 | 17 | 18 | if __name__ == "__main__": 19 | env = RandomActionWrapper(gym.make("CartPole-v1")) 20 | 21 | obs = env.reset() 22 | total_reward = 0.0 23 | 24 | while True: 25 | obs, reward, done, _, _ = env.step(0) 26 | total_reward += reward 27 | if done: 28 | break 29 | 30 | print(f"Reward got: {total_reward:.2f}") 31 | -------------------------------------------------------------------------------- /Chapter02/04_cartpole_random_monitor.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | 4 | if __name__ == "__main__": 5 | env = gym.make("CartPole-v1", render_mode="rgb_array") 6 | env = gym.wrappers.HumanRendering(env) 7 | # env = gym.wrappers.RecordVideo(env, video_folder="video") 8 | 9 | total_reward = 0.0 10 | total_steps = 0 11 | obs = env.reset() 12 | 13 | while True: 14 | action = env.action_space.sample() 15 | obs, reward, done, _, _ = env.step(action) 16 | total_reward += reward 17 | total_steps += 1 18 | if done: 19 | break 20 | 21 | print(f"Episode done in {total_steps} steps, total reward {total_reward:.2f}") 22 | env.close() 23 | -------------------------------------------------------------------------------- /Chapter03/01_modules.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class OurModule(nn.Module): 6 | def __init__(self, num_inputs, num_classes, dropout_prob=0.3): 7 | super(OurModule, self).__init__() 8 | self.pipe = nn.Sequential( 9 | nn.Linear(num_inputs, 5), 10 | nn.ReLU(), 11 | nn.Linear(5, 20), 12 | nn.ReLU(), 13 | nn.Linear(20, num_classes), 14 | nn.Dropout(p=dropout_prob), 15 | nn.Softmax(dim=1) 16 | ) 17 | 18 | def forward(self, x): 19 | return self.pipe(x) 20 | 21 | 22 | if __name__ == "__main__": 23 | net = OurModule(num_inputs=2, num_classes=3) 24 | print(net) 25 | v = torch.FloatTensor([[2, 3]]) 26 | out = net(v) 27 | print(out) 28 | print("Cuda's availability is %s" % torch.cuda.is_available()) 29 | if torch.cuda.is_available(): 30 | print("Data from cuda: %s" % out.to('cuda')) 31 | -------------------------------------------------------------------------------- /Chapter03/02_tensorboard.py: -------------------------------------------------------------------------------- 1 | import math 2 | from torch.utils.tensorboard.writer import SummaryWriter 3 | 4 | 5 | if __name__ == "__main__": 6 | writer = SummaryWriter() 7 | 8 | funcs = {"sin": math.sin, "cos": math.cos, "tan": math.tan} 9 | 10 | for angle in range(-360, 360): 11 | angle_rad = angle * math.pi / 180 12 | for name, fun in funcs.items(): 13 | val = fun(angle_rad) 14 | writer.add_scalar(name, val, angle) 15 | 16 | 
writer.close() 17 | -------------------------------------------------------------------------------- /Chapter06/01_frozenlake_q_learning.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import typing as tt 3 | import gymnasium as gym 4 | from collections import defaultdict 5 | from torch.utils.tensorboard.writer import SummaryWriter 6 | 7 | ENV_NAME = "FrozenLake-v1" 8 | #ENV_NAME = "FrozenLake8x8-v1" # uncomment for larger version 9 | GAMMA = 0.9 10 | ALPHA = 0.2 11 | TEST_EPISODES = 20 12 | 13 | State = int 14 | Action = int 15 | ValuesKey = tt.Tuple[State, Action] 16 | 17 | class Agent: 18 | def __init__(self): 19 | self.env = gym.make(ENV_NAME) 20 | self.state, _ = self.env.reset() 21 | self.values: tt.Dict[ValuesKey, float] = defaultdict(float) 22 | 23 | def sample_env(self) -> tt.Tuple[State, Action, float, State]: 24 | action = self.env.action_space.sample() 25 | old_state = self.state 26 | new_state, reward, is_done, is_tr, _ = self.env.step(action) 27 | if is_done or is_tr: 28 | self.state, _ = self.env.reset() 29 | else: 30 | self.state = new_state 31 | return old_state, action, float(reward), new_state 32 | 33 | def best_value_and_action(self, state: State) -> tt.Tuple[float, Action]: 34 | best_value, best_action = None, None 35 | for action in range(self.env.action_space.n): 36 | action_value = self.values[(state, action)] 37 | if best_value is None or best_value < action_value: 38 | best_value = action_value 39 | best_action = action 40 | return best_value, best_action 41 | 42 | def value_update(self, state: State, action: Action, reward: float, next_state: State): 43 | best_val, _ = self.best_value_and_action(next_state) 44 | new_val = reward + GAMMA * best_val 45 | old_val = self.values[(state, action)] 46 | key = (state, action) 47 | self.values[key] = old_val * (1-ALPHA) + new_val * ALPHA 48 | 49 | def play_episode(self, env: gym.Env) -> float: 50 | total_reward = 0.0 51 | state, _ = env.reset() 52 | while True: 53 | _, action = self.best_value_and_action(state) 54 | new_state, reward, is_done, is_tr, _ = env.step(action) 55 | total_reward += reward 56 | if is_done or is_tr: 57 | break 58 | state = new_state 59 | return total_reward 60 | 61 | 62 | if __name__ == "__main__": 63 | test_env = gym.make(ENV_NAME) 64 | agent = Agent() 65 | writer = SummaryWriter(comment="-q-learning") 66 | 67 | iter_no = 0 68 | best_reward = 0.0 69 | while True: 70 | iter_no += 1 71 | state, action, reward, next_state = agent.sample_env() 72 | agent.value_update(state, action, reward, next_state) 73 | 74 | test_reward = 0.0 75 | for _ in range(TEST_EPISODES): 76 | test_reward += agent.play_episode(test_env) 77 | test_reward /= TEST_EPISODES 78 | writer.add_scalar("reward", test_reward, iter_no) 79 | if test_reward > best_reward: 80 | print("%d: Best test reward updated %.3f -> %.3f" % (iter_no, best_reward, test_reward)) 81 | best_reward = test_reward 82 | if test_reward > 0.80: 83 | print("Solved in %d iterations!"
% iter_no) 84 | break 85 | writer.close() 86 | -------------------------------------------------------------------------------- /Chapter06/03_dqn_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import argparse 4 | import numpy as np 5 | import typing as tt 6 | 7 | import torch 8 | 9 | from lib import wrappers 10 | from lib import dqn_model 11 | 12 | import collections 13 | 14 | DEFAULT_ENV_NAME = "PongNoFrameskip-v4" 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 20 | parser.add_argument("-e", "--env", default=DEFAULT_ENV_NAME, 21 | help="Environment name to use, default=" + DEFAULT_ENV_NAME) 22 | parser.add_argument("-r", "--record", required=True, help="Directory for video") 23 | args = parser.parse_args() 24 | 25 | env = wrappers.make_env(args.env, render_mode="rgb_array") 26 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 27 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n) 28 | state = torch.load(args.model, map_location=lambda stg, _: stg, weights_only=True) 29 | net.load_state_dict(state) 30 | 31 | state, _ = env.reset() 32 | total_reward = 0.0 33 | c: tt.Dict[int, int] = collections.Counter() 34 | 35 | while True: 36 | state_v = torch.tensor(np.expand_dims(state, 0)) 37 | q_vals = net(state_v).data.numpy()[0] 38 | action = int(np.argmax(q_vals)) 39 | c[action] += 1 40 | state, reward, is_done, is_trunc, _ = env.step(action) 41 | total_reward += reward 42 | if is_done or is_trunc: 43 | break 44 | print("Total reward: %.2f" % total_reward) 45 | print("Action counts:", c) 46 | env.close() 47 | 48 | -------------------------------------------------------------------------------- /Chapter06/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter06/lib/__init__.py -------------------------------------------------------------------------------- /Chapter06/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class DQN(nn.Module): 6 | def __init__(self, input_shape, n_actions): 7 | super(DQN, self).__init__() 8 | 9 | self.conv = nn.Sequential( 10 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 11 | nn.ReLU(), 12 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 13 | nn.ReLU(), 14 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 15 | nn.ReLU(), 16 | nn.Flatten(), 17 | ) 18 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 19 | self.fc = nn.Sequential( 20 | nn.Linear(size, 512), 21 | nn.ReLU(), 22 | nn.Linear(512, n_actions) 23 | ) 24 | 25 | def forward(self, x: torch.ByteTensor): 26 | # scale on GPU 27 | xx = x / 255.0 28 | return self.fc(self.conv(xx)) 29 | -------------------------------------------------------------------------------- /Chapter06/lib/wrappers.py: -------------------------------------------------------------------------------- 1 | import typing as tt 2 | import gymnasium as gym 3 | from gymnasium import spaces 4 | import collections 5 | import numpy as np 6 | from stable_baselines3.common import atari_wrappers 7 | 8 | 9 | class ImageToPyTorch(gym.ObservationWrapper): 10 | def __init__(self, env): 11 | 
super(ImageToPyTorch, self).__init__(env) 12 | obs = self.observation_space 13 | assert isinstance(obs, gym.spaces.Box) 14 | assert len(obs.shape) == 3 15 | new_shape = (obs.shape[-1], obs.shape[0], obs.shape[1]) 16 | self.observation_space = gym.spaces.Box( 17 | low=obs.low.min(), high=obs.high.max(), 18 | shape=new_shape, dtype=obs.dtype) 19 | 20 | def observation(self, observation): 21 | return np.moveaxis(observation, 2, 0) 22 | 23 | 24 | class BufferWrapper(gym.ObservationWrapper): 25 | def __init__(self, env, n_steps): 26 | super(BufferWrapper, self).__init__(env) 27 | obs = env.observation_space 28 | assert isinstance(obs, spaces.Box) 29 | new_obs = gym.spaces.Box( 30 | obs.low.repeat(n_steps, axis=0), obs.high.repeat(n_steps, axis=0), 31 | dtype=obs.dtype) 32 | self.observation_space = new_obs 33 | self.buffer = collections.deque(maxlen=n_steps) 34 | 35 | def reset(self, *, seed: tt.Optional[int] = None, options: tt.Optional[dict[str, tt.Any]] = None): 36 | for _ in range(self.buffer.maxlen-1): 37 | self.buffer.append(self.env.observation_space.low) 38 | obs, extra = self.env.reset() 39 | return self.observation(obs), extra 40 | 41 | def observation(self, observation: np.ndarray) -> np.ndarray: 42 | self.buffer.append(observation) 43 | return np.concatenate(self.buffer) 44 | 45 | 46 | def make_env(env_name: str, **kwargs): 47 | env = gym.make(env_name, **kwargs) 48 | env = atari_wrappers.AtariWrapper(env, clip_reward=False, noop_max=0) 49 | env = ImageToPyTorch(env) 50 | env = BufferWrapper(env, n_steps=4) 51 | return env 52 | -------------------------------------------------------------------------------- /Chapter07/01_actions.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import numpy as np 3 | 4 | 5 | if __name__ == "__main__": 6 | q_vals = np.array([[1, 2, 3], [1, -1, 0]]) 7 | print("q_vals") 8 | print(q_vals) 9 | 10 | selector = ptan.actions.ArgmaxActionSelector() 11 | print("argmax:", selector(q_vals)) 12 | 13 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0) 14 | print("epsilon=0.0:", selector(q_vals)) 15 | 16 | selector.epsilon = 1.0 17 | print("epsilon=1.0:", selector(q_vals)) 18 | 19 | selector.epsilon = 0.5 20 | print("epsilon=0.5:", selector(q_vals)) 21 | selector.epsilon = 0.1 22 | print("epsilon=0.1:", selector(q_vals)) 23 | 24 | selector = ptan.actions.ProbabilityActionSelector() 25 | print("Actions sampled from three prob distributions:") 26 | for _ in range(10): 27 | acts = selector(np.array([ 28 | [0.1, 0.8, 0.1], 29 | [0.0, 0.0, 1.0], 30 | [0.5, 0.5, 0.0] 31 | ])) 32 | print(acts) 33 | -------------------------------------------------------------------------------- /Chapter07/02_agents.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | 6 | 7 | class DQNNet(nn.Module): 8 | def __init__(self, actions: int): 9 | super(DQNNet, self).__init__() 10 | self.actions = actions 11 | 12 | def forward(self, x): 13 | # we always produce diagonal tensor of shape 14 | # (batch_size, actions) 15 | return torch.eye(x.size()[0], self.actions) 16 | 17 | 18 | class PolicyNet(nn.Module): 19 | def __init__(self, actions: int): 20 | super(PolicyNet, self).__init__() 21 | self.actions = actions 22 | 23 | def forward(self, x): 24 | # Now we produce the tensor with first two actions 25 | # having the same logit scores 26 | shape = (x.size()[0], self.actions) 27 | res = torch.zeros(shape, 
dtype=torch.float32) 28 | res[:, 0] = 1 29 | res[:, 1] = 1 30 | return res 31 | 32 | 33 | if __name__ == "__main__": 34 | net = DQNNet(actions=3) 35 | net_out = net(torch.zeros(2, 10)) 36 | print("dqn_net:") 37 | print(net_out) 38 | 39 | selector = ptan.actions.ArgmaxActionSelector() 40 | agent = ptan.agent.DQNAgent(model=net, action_selector=selector) 41 | ag_out = agent(np.zeros(shape=(2, 5))) 42 | print("Argmax:", ag_out) 43 | 44 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0) 45 | agent = ptan.agent.DQNAgent(model=net, action_selector=selector) 46 | ag_out = agent(torch.zeros(10, 5))[0] 47 | print("eps=1.0:", ag_out) 48 | 49 | selector.epsilon = 0.5 50 | ag_out = agent(torch.zeros(10, 5))[0] 51 | print("eps=0.5:", ag_out) 52 | 53 | selector.epsilon = 0.1 54 | ag_out = agent(torch.zeros(10, 5))[0] 55 | print("eps=0.1:", ag_out) 56 | 57 | net = PolicyNet(actions=5) 58 | net_out = net(torch.zeros(6, 10)) 59 | print("policy_net:") 60 | print(net_out) 61 | 62 | selector = ptan.actions.ProbabilityActionSelector() 63 | agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True) 64 | ag_out = agent(torch.zeros(6, 5))[0] 65 | print(ag_out) 66 | -------------------------------------------------------------------------------- /Chapter07/03_exp_sources.py: -------------------------------------------------------------------------------- 1 | from lib import * 2 | 3 | 4 | if __name__ == "__main__": 5 | env = ToyEnv() 6 | s, _ = env.reset() 7 | print(f"env.reset() -> {s}") 8 | s = env.step(1) 9 | print(f"env.step(1) -> {s}") 10 | s = env.step(2) 11 | print(f"env.step(2) -> {s}") 12 | 13 | for _ in range(10): 14 | r = env.step(0) 15 | print(r) 16 | 17 | agent = DullAgent(action=1) 18 | print("agent:", agent([1, 2])[0]) 19 | 20 | env = ToyEnv() 21 | agent = DullAgent(action=1) 22 | exp_source = ptan.experience.ExperienceSource( 23 | env=env, agent=agent, steps_count=2) 24 | for idx, exp in enumerate(exp_source): 25 | if idx > 15: 26 | break 27 | print(exp) 28 | 29 | exp_source = ptan.experience.ExperienceSource( 30 | env=env, agent=agent, steps_count=4) 31 | print(next(iter(exp_source))) 32 | 33 | exp_source = ptan.experience.ExperienceSource( 34 | env=[ToyEnv(), ToyEnv()], agent=agent, steps_count=2) 35 | for idx, exp in enumerate(exp_source): 36 | if idx > 4: 37 | break 38 | print(exp) 39 | 40 | print("ExperienceSourceFirstLast") 41 | exp_source = ptan.experience.ExperienceSourceFirstLast( 42 | env, agent, gamma=1.0, steps_count=1) 43 | for idx, exp in enumerate(exp_source): 44 | print(exp) 45 | if idx > 10: 46 | break 47 | -------------------------------------------------------------------------------- /Chapter07/04_replay_buf.py: -------------------------------------------------------------------------------- 1 | from lib import * 2 | 3 | 4 | if __name__ == "__main__": 5 | env = ToyEnv() 6 | agent = DullAgent(action=1) 7 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1.0, steps_count=1) 8 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=100) 9 | 10 | for step in range(6): 11 | buffer.populate(1) 12 | # if buffer is small enough, do nothing 13 | if len(buffer) < 5: 14 | continue 15 | batch = buffer.sample(4) 16 | print("Train time, %d batch samples:" % len(batch)) 17 | for s in batch: 18 | print(s) 19 | -------------------------------------------------------------------------------- /Chapter07/05_target_net.py: -------------------------------------------------------------------------------- 1 | from 
lib import * 2 | 3 | 4 | if __name__ == "__main__": 5 | net = DQNNet() 6 | print(net) 7 | tgt_net = ptan.agent.TargetNet(net) 8 | print("Main net:", net.ff.weight) 9 | print("Target net:", tgt_net.target_model.ff.weight) 10 | net.ff.weight.data += 1.0 11 | print("After update") 12 | print("Main net:", net.ff.weight) 13 | print("Target net:", tgt_net.target_model.ff.weight) 14 | tgt_net.sync() 15 | print("After sync") 16 | print("Main net:", net.ff.weight) 17 | print("Target net:", tgt_net.target_model.ff.weight) 18 | -------------------------------------------------------------------------------- /Chapter07/06_cartpole.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | from ptan.experience import ExperienceFirstLast, ExperienceSourceFirstLast 3 | import ptan 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import torch.nn.functional as F 9 | import typing as tt 10 | 11 | 12 | HIDDEN_SIZE = 128 13 | BATCH_SIZE = 16 14 | TGT_NET_SYNC = 10 15 | GAMMA = 0.9 16 | REPLAY_SIZE = 1000 17 | LR = 1e-3 18 | EPS_DECAY = 0.99 19 | 20 | 21 | class Net(nn.Module): 22 | def __init__(self, obs_size: int, hidden_size: int, n_actions: int): 23 | super(Net, self).__init__() 24 | self.net = nn.Sequential( 25 | nn.Linear(obs_size, hidden_size), 26 | nn.ReLU(), 27 | nn.Linear(hidden_size, n_actions) 28 | ) 29 | 30 | def forward(self, x): 31 | return self.net(x.float()) 32 | 33 | 34 | @torch.no_grad() 35 | def unpack_batch(batch: tt.List[ExperienceFirstLast], net: Net, gamma: float): 36 | states = [] 37 | actions = [] 38 | rewards = [] 39 | done_masks = [] 40 | last_states = [] 41 | for exp in batch: 42 | states.append(exp.state) 43 | actions.append(exp.action) 44 | rewards.append(exp.reward) 45 | done_masks.append(exp.last_state is None) 46 | if exp.last_state is None: 47 | last_states.append(exp.state) 48 | else: 49 | last_states.append(exp.last_state) 50 | 51 | states_v = torch.as_tensor(np.stack(states)) 52 | actions_v = torch.tensor(actions) 53 | rewards_v = torch.tensor(rewards) 54 | last_states_v = torch.as_tensor(np.stack(last_states)) 55 | last_state_q_v = net(last_states_v) 56 | best_last_q_v = torch.max(last_state_q_v, dim=1)[0] 57 | best_last_q_v[done_masks] = 0.0 58 | return states_v, actions_v, best_last_q_v * gamma + rewards_v 59 | 60 | 61 | if __name__ == "__main__": 62 | env = gym.make("CartPole-v1") 63 | obs_size = env.observation_space.shape[0] 64 | n_actions = env.action_space.n 65 | 66 | net = Net(obs_size, HIDDEN_SIZE, n_actions) 67 | tgt_net = ptan.agent.TargetNet(net) 68 | selector = ptan.actions.ArgmaxActionSelector() 69 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1, selector=selector) 70 | agent = ptan.agent.DQNAgent(net, selector) 71 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA) 72 | buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE) 73 | optimizer = optim.Adam(net.parameters(), LR) 74 | 75 | step = 0 76 | episode = 0 77 | solved = False 78 | 79 | while True: 80 | step += 1 81 | buffer.populate(1) 82 | 83 | for reward, steps in exp_source.pop_rewards_steps(): 84 | episode += 1 85 | print(f"{step}: episode {episode} done, reward={reward:.2f}, " 86 | f"epsilon={selector.epsilon:.2f}") 87 | solved = reward > 150 88 | if solved: 89 | print("Whee!") 90 | break 91 | if len(buffer) < 2*BATCH_SIZE: 92 | continue 93 | batch = buffer.sample(BATCH_SIZE) 94 | states_v, actions_v, tgt_q_v = 
unpack_batch(batch, tgt_net.target_model, GAMMA) 95 | optimizer.zero_grad() 96 | q_v = net(states_v) 97 | q_v = q_v.gather(1, actions_v.unsqueeze(-1)).squeeze(-1) 98 | loss_v = F.mse_loss(q_v, tgt_q_v) 99 | loss_v.backward() 100 | optimizer.step() 101 | selector.epsilon *= EPS_DECAY 102 | 103 | if step % TGT_NET_SYNC == 0: 104 | tgt_net.sync() 105 | -------------------------------------------------------------------------------- /Chapter07/lib.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import ptan 3 | import typing as tt 4 | import torch.nn as nn 5 | 6 | 7 | class ToyEnv(gym.Env): 8 | """ 9 | Environment with observation 0..4 and actions 0..2 10 | Observations are rotated sequentialy mod 5, reward is equal to given action. 11 | Episodes are having fixed length of 10 12 | """ 13 | 14 | def __init__(self): 15 | super(ToyEnv, self).__init__() 16 | self.observation_space = gym.spaces.Discrete(n=5) 17 | self.action_space = gym.spaces.Discrete(n=3) 18 | self.step_index = 0 19 | 20 | def reset(self): 21 | self.step_index = 0 22 | return self.step_index, {} 23 | 24 | def step(self, action: int): 25 | is_done = self.step_index == 10 26 | if is_done: 27 | return self.step_index % self.observation_space.n, 0.0, is_done, False, {} 28 | self.step_index += 1 29 | return self.step_index % self.observation_space.n, float(action), \ 30 | self.step_index == 10, False, {} 31 | 32 | 33 | class DullAgent(ptan.agent.BaseAgent): 34 | """ 35 | Agent always returns the fixed action 36 | """ 37 | def __init__(self, action: int): 38 | self.action = action 39 | 40 | def __call__(self, observations: tt.List[int], state: tt.Optional[list] = None) -> \ 41 | tt.Tuple[tt.List[int], tt.Optional[list]]: 42 | return [self.action for _ in observations], state 43 | 44 | 45 | class DQNNet(nn.Module): 46 | def __init__(self): 47 | super(DQNNet, self).__init__() 48 | self.ff = nn.Linear(5, 3) 49 | 50 | def forward(self, x): 51 | return self.ff(x) -------------------------------------------------------------------------------- /Chapter08/01_dqn_basic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | import typing as tt 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from ignite.engine import Engine 10 | 11 | from lib import dqn_model, common 12 | 13 | NAME = "01_baseline" 14 | 15 | BEST_PONG = common.Hyperparams( 16 | env_name="PongNoFrameskip-v4", 17 | stop_reward=18.0, 18 | run_name="pong", 19 | replay_size=100_000, 20 | replay_initial=10_000, 21 | target_net_sync=1000, 22 | epsilon_frames=100_000, 23 | epsilon_final=0.02, 24 | learning_rate=9.932831968547505e-05, 25 | gamma=0.98, 26 | episodes_to_solve=340, 27 | ) 28 | 29 | 30 | def train(params: common.Hyperparams, 31 | device: torch.device, _: dict) -> tt.Optional[int]: 32 | env = gym.make(params.env_name) 33 | env = ptan.common.wrappers.wrap_dqn(env) 34 | 35 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 36 | tgt_net = ptan.agent.TargetNet(net) 37 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start) 38 | epsilon_tracker = common.EpsilonTracker(selector, params) 39 | agent = ptan.agent.DQNAgent(net, selector, device=device) 40 | 41 | exp_source = ptan.experience.ExperienceSourceFirstLast( 42 | env, agent, gamma=params.gamma, env_seed=common.SEED) 43 | buffer = ptan.experience.ExperienceReplayBuffer( 44 | exp_source, 
buffer_size=params.replay_size) 45 | optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) 46 | 47 | def process_batch(engine, batch): 48 | optimizer.zero_grad() 49 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, 50 | gamma=params.gamma, device=device) 51 | loss_v.backward() 52 | optimizer.step() 53 | epsilon_tracker.frame(engine.state.iteration) 54 | if engine.state.iteration % params.target_net_sync == 0: 55 | tgt_net.sync() 56 | return { 57 | "loss": loss_v.item(), 58 | "epsilon": selector.epsilon, 59 | } 60 | 61 | engine = Engine(process_batch) 62 | common.setup_ignite(engine, params, exp_source, NAME) 63 | r = engine.run(common.batch_generator(buffer, params.replay_initial, params.batch_size)) 64 | if r.solved: 65 | return r.episode 66 | 67 | 68 | if __name__ == "__main__": 69 | args = common.argparser().parse_args() 70 | common.train_or_tune(args, train, BEST_PONG) 71 | -------------------------------------------------------------------------------- /Chapter08/04_dqn_noisy_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | import typing as tt 5 | 6 | import torch 7 | import torch.optim as optim 8 | 9 | from ignite.engine import Engine 10 | 11 | from lib import common, dqn_extra 12 | 13 | NAME = "04_noisy" 14 | NOISY_SNR_EVERY_ITERS = 100 15 | 16 | BEST_PONG = common.Hyperparams( 17 | env_name="PongNoFrameskip-v4", 18 | stop_reward=18.0, 19 | run_name="pong", 20 | replay_size=100_000, 21 | replay_initial=10_000, 22 | target_net_sync=1000, 23 | epsilon_frames=100_000, 24 | epsilon_final=0.02, 25 | learning_rate=7.142520950425814e-05, 26 | gamma=0.99, 27 | episodes_to_solve=273, 28 | ) 29 | 30 | 31 | 32 | def train(params: common.Hyperparams, 33 | device: torch.device, extra: dict) -> tt.Optional[int]: 34 | env = gym.make(params.env_name) 35 | env = ptan.common.wrappers.wrap_dqn(env) 36 | 37 | net = dqn_extra.NoisyDQN( 38 | env.observation_space.shape, 39 | env.action_space.n).to(device) 40 | 41 | tgt_net = ptan.agent.TargetNet(net) 42 | selector = ptan.actions.ArgmaxActionSelector() 43 | agent = ptan.agent.DQNAgent(net, selector, device=device) 44 | 45 | exp_source = ptan.experience.ExperienceSourceFirstLast( 46 | env, agent, gamma=params.gamma, env_seed=common.SEED) 47 | buffer = ptan.experience.ExperienceReplayBuffer( 48 | exp_source, buffer_size=params.replay_size) 49 | optimizer = optim.Adam(net.parameters(), 50 | lr=params.learning_rate) 51 | 52 | def process_batch(engine, batch): 53 | optimizer.zero_grad() 54 | loss_v = common.calc_loss_dqn( 55 | batch, net, tgt_net.target_model, 56 | gamma=params.gamma, device=device) 57 | loss_v.backward() 58 | optimizer.step() 59 | net.reset_noise() 60 | if engine.state.iteration % params.target_net_sync == 0: 61 | tgt_net.sync() 62 | if engine.state.iteration % NOISY_SNR_EVERY_ITERS == 0: 63 | for layer_idx, sigma_l2 in enumerate(net.noisy_layers_sigma_snr()): 64 | engine.state.metrics[f'snr_{layer_idx+1}'] = sigma_l2 65 | return { 66 | "loss": loss_v.item(), 67 | } 68 | 69 | engine = Engine(process_batch) 70 | common.setup_ignite(engine, params, exp_source, NAME, extra_metrics=('snr_1', 'snr_2')) 71 | r = engine.run(common.batch_generator(buffer, params.replay_initial, 72 | params.batch_size)) 73 | if r.solved: 74 | return r.episode 75 | 76 | 77 | if __name__ == "__main__": 78 | args = common.argparser().parse_args() 79 | common.train_or_tune(args, train, BEST_PONG) 80 | 
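Note: 04_dqn_noisy_net.py relies on dqn_extra.NoisyDQN, whose noisy linear layers (the objects behind net.reset_noise() and net.noisy_layers_sigma_snr()) live in lib/dqn_extra.py and are not included in this excerpt. As a rough orientation only, a minimal independent-Gaussian noisy layer could be sketched as below; the class and method names are illustrative assumptions, not the repository's actual implementation.

import torch
import torch.nn as nn

class NoisyLinear(nn.Linear):
    # Linear layer with learnable Gaussian noise on weights and bias
    # (NoisyNet, Fortunato et al., 2017) -- illustrative sketch only.
    def __init__(self, in_features: int, out_features: int, sigma_init: float = 0.017):
        super().__init__(in_features, out_features)
        # learnable noise scale per weight / bias element
        self.sigma_weight = nn.Parameter(torch.full((out_features, in_features), sigma_init))
        self.sigma_bias = nn.Parameter(torch.full((out_features,), sigma_init))
        # noise samples kept in buffers, resampled between optimization steps
        self.register_buffer("eps_weight", torch.zeros(out_features, in_features))
        self.register_buffer("eps_bias", torch.zeros(out_features))

    def reset_noise(self):
        # resample the noise; net.reset_noise() in the training loop above
        # would delegate to each such layer
        self.eps_weight.normal_()
        self.eps_bias.normal_()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.weight + self.sigma_weight * self.eps_weight
        b = self.bias + self.sigma_bias * self.eps_bias
        return nn.functional.linear(x, w, b)

    def sigma_snr(self) -> float:
        # signal-to-noise ratio of the kind logged as snr_1 / snr_2 above
        return (self.weight.norm() / self.sigma_weight.norm()).item()

Because exploration comes from this parametric noise, the script above can use a plain ArgmaxActionSelector instead of the epsilon-greedy selector used by 01_dqn_basic.py.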
-------------------------------------------------------------------------------- /Chapter08/06_dqn_dueling.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | import typing as tt 5 | 6 | import torch 7 | from torch import nn 8 | from torch import optim 9 | import numpy as np 10 | 11 | from ignite.engine import Engine 12 | 13 | from lib import dqn_extra, common 14 | 15 | NAME = "06_dueling" 16 | 17 | STATES_TO_EVALUATE = 1000 18 | EVAL_EVERY_FRAME = 100 19 | 20 | BEST_PONG = common.GAME_PARAMS['pong'] 21 | 22 | 23 | @torch.no_grad() 24 | def evaluate_states(states: np.ndarray, net: nn.Module, 25 | device: torch.device, engine: Engine): 26 | s_v = torch.as_tensor(states).to(device) 27 | adv, val = net.adv_val(s_v) 28 | engine.state.metrics['adv'] = adv.mean().item() 29 | engine.state.metrics['val'] = val.mean().item() 30 | 31 | 32 | def train(params: common.Hyperparams, 33 | device: torch.device, extra: dict) -> tt.Optional[int]: 34 | env = gym.make(params.env_name) 35 | env = ptan.common.wrappers.wrap_dqn(env) 36 | 37 | net = dqn_extra.DuelingDQN(env.observation_space.shape, 38 | env.action_space.n).to(device) 39 | 40 | tgt_net = ptan.agent.TargetNet(net) 41 | selector = ptan.actions.EpsilonGreedyActionSelector( 42 | epsilon=params.epsilon_start) 43 | epsilon_tracker = common.EpsilonTracker(selector, params) 44 | agent = ptan.agent.DQNAgent(net, selector, device=device) 45 | 46 | exp_source = ptan.experience.ExperienceSourceFirstLast( 47 | env, agent, gamma=params.gamma, env_seed=common.SEED) 48 | buffer = ptan.experience.ExperienceReplayBuffer( 49 | exp_source, buffer_size=params.replay_size) 50 | optimizer = optim.Adam(net.parameters(), 51 | lr=params.learning_rate) 52 | 53 | def process_batch(engine, batch): 54 | optimizer.zero_grad() 55 | loss_v = common.calc_loss_dqn( 56 | batch, net, tgt_net.target_model, 57 | gamma=params.gamma, device=device) 58 | loss_v.backward() 59 | optimizer.step() 60 | epsilon_tracker.frame(engine.state.iteration) 61 | if engine.state.iteration % params.target_net_sync == 0: 62 | tgt_net.sync() 63 | if engine.state.iteration % EVAL_EVERY_FRAME == 0: 64 | eval_states = getattr(engine.state, "eval_states", None) 65 | if eval_states is None: 66 | eval_states = buffer.sample(STATES_TO_EVALUATE) 67 | eval_states = [ 68 | np.asarray(transition.state) 69 | for transition in eval_states 70 | ] 71 | eval_states = np.asarray(eval_states) 72 | engine.state.eval_states = eval_states 73 | evaluate_states(eval_states, net, device, engine) 74 | return { 75 | "loss": loss_v.item(), 76 | "epsilon": selector.epsilon, 77 | } 78 | 79 | engine = Engine(process_batch) 80 | common.setup_ignite(engine, params, exp_source, NAME, extra_metrics=('adv', 'val')) 81 | r = engine.run(common.batch_generator( 82 | buffer, params.replay_initial, params.batch_size)) 83 | if r.solved: 84 | return r.episode 85 | 86 | 87 | if __name__ == "__main__": 88 | args = common.argparser().parse_args() 89 | common.train_or_tune(args, train, BEST_PONG) -------------------------------------------------------------------------------- /Chapter08/adhoc/commute.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as mpl 3 | mpl.use("Agg") 4 | import matplotlib.pyplot as plt 5 | 6 | if __name__ == "__main__": 7 | plt.clf() 8 | v1 = np.random.normal(30, 2.0, size=2000) 9 | v2 = np.random.normal(90, 4.0, size=200) 10 | v = 
np.concatenate((v1, v2)) 11 | mean_time = v.mean() 12 | plt.hist(v, density=True, bins=100) 13 | plt.title("Car commute time distribution\nmean=%.2f mins" % mean_time) 14 | plt.xlabel("Time, minutes") 15 | plt.ylabel("Probability") 16 | plt.savefig("commute-car.png") 17 | 18 | plt.clf() 19 | v1 = np.random.normal(40, 2.0, size=2000) 20 | v2 = np.random.normal(60, 1.0, size=50) 21 | v = np.concatenate((v1, v2)) 22 | mean_time = v.mean() 23 | plt.hist(v, density=True, bins=100) 24 | plt.title("Train commute time distribution\nmean=%.2f mins" % mean_time) 25 | plt.xlabel("Time, minutes") 26 | plt.ylabel("Probability") 27 | plt.savefig("commute-train.png") 28 | -------------------------------------------------------------------------------- /Chapter08/adhoc/distr_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | import lib.dqn_extra 5 | 6 | sys.path.append("./") 7 | 8 | from lib import common 9 | 10 | import matplotlib as mpl 11 | mpl.use("Agg") 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | Vmax = 10 16 | Vmin = -10 17 | N_ATOMS = 51 18 | DELTA_Z = (Vmax - Vmin) / (N_ATOMS - 1) 19 | 20 | 21 | def save_distr(src, proj, name): 22 | plt.clf() 23 | p = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 24 | plt.subplot(2, 1, 1) 25 | plt.bar(p, src, width=0.5) 26 | plt.title("Source") 27 | plt.subplot(2, 1, 2) 28 | plt.bar(p, proj, width=0.5) 29 | plt.title("Projected") 30 | plt.savefig(name + ".png") 31 | 32 | 33 | if __name__ == "__main__": 34 | np.random.seed(123) 35 | atoms = np.arange(Vmin, Vmax+DELTA_Z, DELTA_Z) 36 | 37 | # single peak distribution 38 | src_hist = np.zeros(shape=(1, N_ATOMS), dtype=np.float32) 39 | src_hist[0, N_ATOMS//2+1] = 1.0 40 | proj_hist = lib.dqn_extra.distr_projection(src_hist, np.array([2], dtype=np.float32), np.array([False]), 41 | Vmin, Vmax, N_ATOMS, gamma=0.9) 42 | save_distr(src_hist[0], proj_hist[0], "peak-r=2") 43 | 44 | # normal distribution 45 | data = np.random.normal(size=1000, scale=3) 46 | hist = np.histogram(data, density=True, bins=np.arange(Vmin - DELTA_Z/2, Vmax + DELTA_Z*3/2, DELTA_Z)) 47 | 48 | src_hist = hist[0] 49 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([False]), 50 | Vmin, Vmax, N_ATOMS, gamma=0.9) 51 | save_distr(hist[0], proj_hist[0], "normal-r=2") 52 | 53 | # normal distribution, but done episode 54 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist]), np.array([2], dtype=np.float32), np.array([True]), 55 | Vmin, Vmax, N_ATOMS, gamma=0.9) 56 | save_distr(hist[0], proj_hist[0], "normal-done-r=2") 57 | 58 | # clipping for out-of-range distribution 59 | proj_dist = lib.dqn_extra.distr_projection(np.array([src_hist]), np.array([10], dtype=np.float32), np.array([False]), 60 | Vmin, Vmax, N_ATOMS, gamma=0.9) 61 | save_distr(hist[0], proj_dist[0], "normal-r=10") 62 | 63 | # test both done and not done, unclipped 64 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist, src_hist]), np.array([2, 2], dtype=np.float32), 65 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 66 | save_distr(src_hist, proj_hist[0], "both_not_clip-01-incomplete") 67 | save_distr(src_hist, proj_hist[1], "both_not_clip-02-complete") 68 | 69 | # test both done and not done, clipped right 70 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist, src_hist]), np.array([10, 10], dtype=np.float32), 71 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 72 | save_distr(src_hist,
proj_hist[0], "both_clip-right-01-incomplete") 73 | save_distr(src_hist, proj_hist[1], "both_clip-right-02-complete") 74 | 75 | # test both done and not done, clipped left 76 | proj_hist = lib.dqn_extra.distr_projection(np.array([src_hist, src_hist]), np.array([-10, -10], dtype=np.float32), 77 | np.array([False, True]), Vmin, Vmax, N_ATOMS, gamma=0.9) 78 | save_distr(src_hist, proj_hist[0], "both_clip-left-01-incomplete") 79 | save_distr(src_hist, proj_hist[1], "both_clip-left-02-complete") 80 | 81 | pass 82 | -------------------------------------------------------------------------------- /Chapter08/bench/simple_buffer_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Benchmark various Replay Buffer variants 4 | """ 5 | import timeit 6 | import numpy as np 7 | import collections 8 | 9 | 10 | SIZES = [10**n for n in (3, 4, 5)] 11 | DATA_SHAPE = (84, 84, 4) 12 | REPEAT_NUMBER = 10 13 | 14 | 15 | class ExperienceBufferDeque: 16 | def __init__(self, capacity): 17 | self.buffer = collections.deque(maxlen=capacity) 18 | 19 | def __len__(self): 20 | return len(self.buffer) 21 | 22 | def append(self, experience): 23 | self.buffer.append(experience) 24 | 25 | def sample(self, batch_size): 26 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 27 | return [self.buffer[idx] for idx in indices] 28 | 29 | 30 | class ExperienceBufferCircularList: 31 | def __init__(self, capacity): 32 | self.buffer = list() 33 | self.capacity = capacity 34 | self.pos = 0 35 | 36 | def __len__(self): 37 | return len(self.buffer) 38 | 39 | def append(self, experience): 40 | if len(self.buffer) < self.capacity: 41 | self.buffer.append(experience) 42 | else: 43 | self.buffer[self.pos] = experience 44 | self.pos = (self.pos + 1) % self.capacity 45 | 46 | def sample(self, batch_size): 47 | indices = np.random.choice(len(self.buffer), batch_size, replace=True) 48 | return [self.buffer[idx] for idx in indices] 49 | 50 | 51 | 52 | def fill_buf(buf, size): 53 | for _ in range(size): 54 | buf.append(np.zeros(DATA_SHAPE, dtype=np.uint8)) 55 | 56 | 57 | def bench_buffer(buf_class): 58 | print("Benchmarking %s" % buf_class.__name__) 59 | 60 | for size in SIZES: 61 | print(" Test size %d" % size) 62 | ns = globals() 63 | ns.update(locals()) 64 | t = timeit.timeit('fill_buf(buf, size)', setup='buf = buf_class(size)', number=REPEAT_NUMBER, globals=ns) 65 | print(" * Initial fill:\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 66 | buf = buf_class(size) 67 | fill_buf(buf, size) 68 | ns.update(locals()) 69 | t = timeit.timeit('fill_buf(buf, size)', number=REPEAT_NUMBER, globals=ns) 70 | print(" * Append:\t\t%.2f items/s" % (size*REPEAT_NUMBER / t)) 71 | t = timeit.timeit('buf.sample(4)', number=REPEAT_NUMBER*100, globals=ns) 72 | print(" * Sample 4:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 73 | t = timeit.timeit('buf.sample(8)', number=REPEAT_NUMBER*100, globals=ns) 74 | print(" * Sample 8:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 75 | t = timeit.timeit('buf.sample(16)', number=REPEAT_NUMBER*100, globals=ns) 76 | print(" * Sample 16:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 77 | t = timeit.timeit('buf.sample(32)', number=REPEAT_NUMBER*100, globals=ns) 78 | print(" * Sample 32:\t\t%.2f items/s" % (REPEAT_NUMBER*100 / t)) 79 | 80 | 81 | 82 | if __name__ == "__main__": 83 | bench_buffer(ExperienceBufferCircularList) 84 | bench_buffer(ExperienceBufferDeque) 85 | pass 86 | 
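For a quick look at the wrap-around behaviour that the benchmark above exercises, the circular-list buffer can be driven by hand. A small usage sketch (an illustration, not part of the benchmark script; it assumes it is run from Chapter08/bench so that simple_buffer_bench is importable):

import numpy as np
from simple_buffer_bench import ExperienceBufferCircularList

buf = ExperienceBufferCircularList(capacity=5)
for i in range(8):                           # push more items than the buffer can hold
    buf.append(np.full((2, 2), i, dtype=np.uint8))
print(len(buf))                              # 5 -- items 0, 1 and 2 were overwritten
batch = buf.sample(3)                        # uniform sampling with replacement
print([int(item[0, 0]) for item in batch])   # values drawn from 3..7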
-------------------------------------------------------------------------------- /Chapter08/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter08/lib/__init__.py -------------------------------------------------------------------------------- /Chapter08/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import typing as tt 4 | 5 | 6 | class DQN(nn.Module): 7 | def __init__(self, input_shape: tt.Tuple[int, ...], 8 | n_actions: int): 9 | super(DQN, self).__init__() 10 | 11 | self.conv = nn.Sequential( 12 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 13 | nn.ReLU(), 14 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 15 | nn.ReLU(), 16 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 17 | nn.ReLU(), 18 | nn.Flatten(), 19 | ) 20 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 21 | self.fc = nn.Sequential( 22 | nn.Linear(size, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, n_actions) 25 | ) 26 | 27 | def forward(self, x: torch.ByteTensor): 28 | xx = x / 255.0 29 | return self.fc(self.conv(xx)) 30 | -------------------------------------------------------------------------------- /Chapter09/.gitignore: -------------------------------------------------------------------------------- 1 | runs 2 | res 3 | -------------------------------------------------------------------------------- /Chapter09/attic/03_parallel_orig.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gym 3 | import ptan 4 | import argparse 5 | 6 | import torch 7 | import torch.optim as optim 8 | import torch.multiprocessing as mp 9 | 10 | from tensorboardX import SummaryWriter 11 | 12 | from lib import dqn_model, common 13 | 14 | PLAY_STEPS = 4 15 | 16 | 17 | def play_func(params, net, cuda, exp_queue): 18 | env = gym.make(params.env_name) 19 | env = ptan.common.wrappers.wrap_dqn(env) 20 | device = torch.device("cuda" if cuda else "cpu") 21 | 22 | writer = SummaryWriter(comment="-" + params.run_name + "-03_parallel") 23 | 24 | selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params.epsilon_start) 25 | epsilon_tracker = common.EpsilonTracker(selector, params) 26 | agent = ptan.agent.DQNAgent(net, selector, device=device) 27 | exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=params.gamma, steps_count=1) 28 | exp_source_iter = iter(exp_source) 29 | 30 | frame_idx = 0 31 | 32 | with common.RewardTracker(writer, params.stop_reward) as reward_tracker: 33 | while True: 34 | frame_idx += 1 35 | exp = next(exp_source_iter) 36 | exp_queue.put(exp) 37 | 38 | epsilon_tracker.frame(frame_idx) 39 | 40 | new_rewards = exp_source.pop_total_rewards() 41 | if new_rewards: 42 | if reward_tracker.reward(new_rewards[0], frame_idx, selector.epsilon): 43 | break 44 | 45 | exp_queue.put(None) 46 | 47 | 48 | if __name__ == "__main__": 49 | mp.set_start_method('spawn') 50 | params = common.HYPERPARAMS['pong'] 51 | params.batch_size *= PLAY_STEPS 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable cuda") 54 | args = parser.parse_args() 55 | device = torch.device("cuda" if args.cuda else "cpu") 56 | 57 | env = gym.make(params.env_name) 58 | env = ptan.common.wrappers.wrap_dqn(env) 59 | 
60 | net = dqn_model.DQN(env.observation_space.shape, env.action_space.n).to(device) 61 | tgt_net = ptan.agent.TargetNet(net) 62 | 63 | buffer = ptan.experience.ExperienceReplayBuffer(experience_source=None, buffer_size=params.replay_size) 64 | optimizer = optim.Adam(net.parameters(), lr=params.learning_rate) 65 | 66 | exp_queue = mp.Queue(maxsize=PLAY_STEPS * 2) 67 | play_proc = mp.Process(target=play_func, args=(params, net, args.cuda, exp_queue)) 68 | play_proc.start() 69 | 70 | frame_idx = 0 71 | 72 | while play_proc.is_alive(): 73 | # frame_idx += PLAY_STEPS 74 | #for _ in range(PLAY_STEPS): 75 | while exp_queue.qsize() > 1: 76 | exp = exp_queue.get() 77 | if exp is None: 78 | play_proc.join() 79 | break 80 | buffer._add(exp) 81 | frame_idx += 1 82 | if frame_idx % params.target_net_sync == 0: 83 | tgt_net.sync() 84 | 85 | if len(buffer) < params.replay_initial: 86 | continue 87 | optimizer.zero_grad() 88 | batch = buffer.sample(params.batch_size) 89 | loss_v = common.calc_loss_dqn(batch, net, tgt_net.target_model, gamma=params.gamma, device=device) 90 | loss_v.backward() 91 | optimizer.step() 92 | 93 | -------------------------------------------------------------------------------- /Chapter09/img/01_orig_tb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/01_orig_tb.png -------------------------------------------------------------------------------- /Chapter09/img/02_steps-tb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/02_steps-tb.png -------------------------------------------------------------------------------- /Chapter09/img/03-serial-blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/03-serial-blocks.png -------------------------------------------------------------------------------- /Chapter09/img/03_serial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/03_serial.png -------------------------------------------------------------------------------- /Chapter09/img/04_parallel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/img/04_parallel.png -------------------------------------------------------------------------------- /Chapter09/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__init__.py -------------------------------------------------------------------------------- /Chapter09/lib/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /Chapter09/lib/__pycache__/common.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__pycache__/common.cpython-311.pyc -------------------------------------------------------------------------------- /Chapter09/lib/__pycache__/dqn_model.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter09/lib/__pycache__/dqn_model.cpython-311.pyc -------------------------------------------------------------------------------- /Chapter09/lib/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import numpy as np 3 | 4 | from ptan.common.wrappers import ImageToPyTorch, BufferWrapper 5 | from stable_baselines3.common.atari_wrappers import ( 6 | StickyActionEnv, NoopResetEnv, EpisodicLifeEnv, 7 | FireResetEnv, WarpFrame, ClipRewardEnv 8 | ) 9 | from stable_baselines3.common.type_aliases import AtariStepReturn 10 | 11 | 12 | class JustSkipEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]): 13 | """ 14 | Return only every ``skip``-th frame (frameskipping) 15 | 16 | :param env: Environment to wrap 17 | :param skip: Number of frames to skip 18 | The same action will be taken ``skip`` times. 19 | """ 20 | 21 | def __init__(self, env: gym.Env, skip: int = 4) -> None: 22 | super().__init__(env) 23 | self._skip = skip 24 | 25 | def step(self, action: int) -> AtariStepReturn: 26 | """ 27 | Step the environment with the given action 28 | Repeat the action, sum the rewards, and return the last observation.
29 | 30 | :param action: the action 31 | :return: observation, reward, terminated, truncated, information 32 | """ 33 | total_reward = 0.0 34 | info = {} 35 | obs = None 36 | terminated = truncated = False 37 | for i in range(self._skip): 38 | obs, reward, terminated, truncated, info = self.env.step(action) 39 | done = terminated or truncated 40 | total_reward += float(reward) 41 | if done: 42 | break 43 | return obs, total_reward, terminated, truncated, info 44 | 45 | 46 | class AtariWrapper(gym.Wrapper[np.ndarray, int, np.ndarray, int]): 47 | def __init__( 48 | self, 49 | env: gym.Env, 50 | noop_max: int = 30, 51 | frame_skip: int = 4, 52 | screen_size: int = 84, 53 | terminal_on_life_loss: bool = True, 54 | clip_reward: bool = True, 55 | action_repeat_probability: float = 0.0, 56 | ) -> None: 57 | if action_repeat_probability > 0.0: 58 | env = StickyActionEnv(env, action_repeat_probability) 59 | if noop_max > 0: 60 | env = NoopResetEnv(env, noop_max=noop_max) 61 | # frame_skip=1 is the same as no frame-skip (action repeat) 62 | if frame_skip > 1: 63 | env = JustSkipEnv(env, skip=frame_skip) 64 | if terminal_on_life_loss: 65 | env = EpisodicLifeEnv(env) 66 | if "FIRE" in env.unwrapped.get_action_meanings(): # type: ignore[attr-defined] 67 | env = FireResetEnv(env) 68 | env = WarpFrame(env, width=screen_size, height=screen_size) 69 | if clip_reward: 70 | env = ClipRewardEnv(env) 71 | 72 | super().__init__(env) 73 | 74 | 75 | def wrap_dqn(env: gym.Env, stack_frames: int = 4, 76 | episodic_life: bool = True, clip_reward: bool = True, 77 | noop_max: int = 0) -> gym.Env: 78 | """ 79 | Apply a common set of wrappers for Atari games. 80 | :param env: Environment to wrap 81 | :param stack_frames: count of frames to stack, default=4 82 | :param episodic_life: convert life to end of episode 83 | :param clip_reward: reward clipping 84 | :param noop_max: how many NOOP actions to execute 85 | :return: wrapped environment 86 | """ 87 | assert 'NoFrameskip' in env.spec.id 88 | env = AtariWrapper( 89 | env, clip_reward=clip_reward, noop_max=noop_max, 90 | terminal_on_life_loss=episodic_life 91 | ) 92 | env = ImageToPyTorch(env) 93 | if stack_frames > 1: 94 | env = BufferWrapper(env, stack_frames) 95 | return env -------------------------------------------------------------------------------- /Chapter09/lib/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import dataclasses 5 | import typing as tt 6 | 7 | from ptan.actions import EpsilonGreedyActionSelector 8 | from ptan.experience import ExperienceFirstLast, \ 9 | ExperienceReplayBuffer 10 | 11 | SEED = 123 12 | 13 | 14 | @dataclasses.dataclass 15 | class Hyperparams: 16 | env_name: str 17 | stop_reward: float 18 | run_name: str 19 | replay_size: int 20 | replay_initial: int 21 | target_net_sync: int 22 | epsilon_frames: int 23 | 24 | learning_rate: float = 0.0001 25 | batch_size: int = 32 26 | gamma: float = 0.99 27 | epsilon_start: float = 1.0 28 | epsilon_final: float = 0.02 29 | 30 | 31 | GAME_PARAMS = { 32 | 'pong': Hyperparams( 33 | env_name="PongNoFrameskip-v4", 34 | stop_reward=18.0, 35 | run_name="pong", 36 | replay_size=100_000, 37 | replay_initial=10_000, 38 | target_net_sync=1000, 39 | epsilon_frames=100_000, 40 | ), 41 | } 42 | 43 | 44 | def unpack_batch(batch: tt.List[ExperienceFirstLast]): 45 | states, actions, rewards, dones, last_states = [],[],[],[],[] 46 | for exp in batch: 47 | states.append(exp.state) 48 | 
actions.append(exp.action) 49 | rewards.append(exp.reward) 50 | dones.append(exp.last_state is None) 51 | if exp.last_state is None: 52 | lstate = exp.state # the result will be masked anyway 53 | else: 54 | lstate = exp.last_state 55 | last_states.append(lstate) 56 | return np.asarray(states), \ 57 | np.array(actions), \ 58 | np.array(rewards, dtype=np.float32), \ 59 | np.array(dones, dtype=bool), \ 60 | np.asarray(last_states) 61 | 62 | 63 | def calc_loss_dqn( 64 | batch: tt.List[ExperienceFirstLast], 65 | net: nn.Module, tgt_net: nn.Module, 66 | gamma: float, device: torch.device) -> torch.Tensor: 67 | states, actions, rewards, dones, next_states = \ 68 | unpack_batch(batch) 69 | 70 | states_v = torch.as_tensor(states).to(device) 71 | next_states_v = torch.as_tensor(next_states).to(device) 72 | actions_v = torch.LongTensor(actions).to(device) 73 | rewards_v = torch.FloatTensor(rewards).to(device) 74 | done_mask = torch.BoolTensor(dones).to(device) 75 | 76 | actions_v = actions_v.unsqueeze(-1) 77 | state_action_vals = net(states_v).gather(1, actions_v) 78 | state_action_vals = state_action_vals.squeeze(-1) 79 | with torch.no_grad(): 80 | next_state_vals = tgt_net(next_states_v).max(1)[0] 81 | next_state_vals[done_mask] = 0.0 82 | 83 | bellman_vals = next_state_vals.detach() * gamma + rewards_v 84 | return nn.MSELoss()(state_action_vals, bellman_vals) 85 | 86 | 87 | class EpsilonTracker: 88 | def __init__(self, selector: EpsilonGreedyActionSelector, 89 | params: Hyperparams): 90 | self.selector = selector 91 | self.params = params 92 | self.frame(0) 93 | 94 | def frame(self, frame_idx: int): 95 | eps = self.params.epsilon_start - \ 96 | frame_idx / self.params.epsilon_frames 97 | self.selector.epsilon = max(self.params.epsilon_final, eps) 98 | 99 | 100 | def batch_generator(buffer: ExperienceReplayBuffer, 101 | initial: int, batch_size: int) -> \ 102 | tt.Generator[tt.List[ExperienceFirstLast], None, None]: 103 | buffer.populate(initial) 104 | while True: 105 | buffer.populate(1) 106 | yield buffer.sample(batch_size) 107 | 108 | -------------------------------------------------------------------------------- /Chapter09/lib/dqn_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import typing as tt 4 | 5 | 6 | class DQN(nn.Module): 7 | def __init__(self, input_shape: tt.Tuple[int, ...], 8 | n_actions: int): 9 | super(DQN, self).__init__() 10 | 11 | self.conv = nn.Sequential( 12 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 13 | nn.ReLU(), 14 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 15 | nn.ReLU(), 16 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 17 | nn.ReLU(), 18 | nn.Flatten(), 19 | ) 20 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 21 | self.fc = nn.Sequential( 22 | nn.Linear(size, 512), 23 | nn.ReLU(), 24 | nn.Linear(512, n_actions) 25 | ) 26 | 27 | def forward(self, x: torch.ByteTensor): 28 | xx = x / 255.0 29 | return self.fc(self.conv(xx)) 30 | -------------------------------------------------------------------------------- /Chapter10/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | saves 3 | res 4 | -------------------------------------------------------------------------------- /Chapter10/conftest.py: -------------------------------------------------------------------------------- 1 | # this file adds current dir to the pytest path for modules import 
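The Chapter10 conftest.py above contains only that comment: placing a conftest.py at the chapter root makes pytest put this directory on sys.path during test collection, which is what lets the tests further down do "from lib import data". A rough explicit equivalent of that behaviour, shown only as a sketch (the committed file deliberately stays empty apart from the comment):

import os
import sys

# hypothetical explicit form: put the directory containing this conftest.py on
# sys.path so that `from lib import ...` resolves when pytest runs the tests
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))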
-------------------------------------------------------------------------------- /Chapter10/data/ch10-small-quotes.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter10/data/ch10-small-quotes.tgz -------------------------------------------------------------------------------- /Chapter10/data/unpack_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | tar xvf ch10-small-quotes.tgz 3 | -------------------------------------------------------------------------------- /Chapter10/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter10/lib/__init__.py -------------------------------------------------------------------------------- /Chapter10/lib/data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import glob 4 | import pathlib 5 | import numpy as np 6 | import typing as tt 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class Prices: 12 | open: np.ndarray 13 | high: np.ndarray 14 | low: np.ndarray 15 | close: np.ndarray 16 | volume: np.ndarray 17 | 18 | 19 | def read_csv(file_path: pathlib.Path, sep: str = ',', 20 | filter_data: bool = True, 21 | fix_open_price: bool = False) -> Prices: 22 | print("Reading", file_path) 23 | with file_path.open('rt', encoding='utf-8') as fd: 24 | reader = csv.reader(fd, delimiter=sep) 25 | h = next(reader) 26 | if '<OPEN>' not in h and sep == ',': 27 | return read_csv(file_path, ';') 28 | indices = [ 29 | h.index(s) 30 | for s in ('<OPEN>', '<HIGH>', '<LOW>', 31 | '<CLOSE>', '<VOL>') 32 | ] 33 | o, h, l, c, v = [], [], [], [], [] 34 | count_out = 0 35 | count_filter = 0 36 | count_fixed = 0 37 | prev_vals = None 38 | filter_func = lambda v: abs(v-vals[0]) < 1e-8 39 | for row in reader: 40 | vals = list(map(float, [row[idx] for idx in indices])) 41 | if filter_data and all(map(filter_func, vals[:-1])): 42 | count_filter += 1 43 | continue 44 | 45 | po, ph, pl, pc, pv = vals 46 | 47 | # fix open price for current bar to match close price for the previous bar 48 | if fix_open_price and prev_vals is not None: 49 | ppo, pph, ppl, ppc, ppv = prev_vals 50 | if abs(po - ppc) > 1e-8: 51 | count_fixed += 1 52 | po = ppc 53 | pl = min(pl, po) 54 | ph = max(ph, po) 55 | count_out += 1 56 | o.append(po) 57 | c.append(pc) 58 | h.append(ph) 59 | l.append(pl) 60 | v.append(pv) 61 | prev_vals = vals 62 | print(f"Read done, got {count_filter + count_out} rows, " 63 | f"{count_filter} filtered, " 64 | f"{count_fixed} open prices adjusted") 65 | return Prices(open=np.array(o, dtype=np.float32), 66 | high=np.array(h, dtype=np.float32), 67 | low=np.array(l, dtype=np.float32), 68 | close=np.array(c, dtype=np.float32), 69 | volume=np.array(v, dtype=np.float32)) 70 | 71 | 72 | def prices_to_relative(prices: Prices): 73 | """ 74 | Convert prices to relative with respect to the open price 75 | :param prices: Prices instance with absolute values 76 | :return: Prices instance with relative high, low and close (open and volume unchanged) 77 | """ 78 | rh = (prices.high - prices.open) / prices.open 79 | rl = (prices.low - prices.open) / prices.open 80 | rc = (prices.close - prices.open) / prices.open 81 | return Prices(open=prices.open, high=rh,
low=rl, 82 | close=rc, volume=prices.volume) 83 | 84 | 85 | def load_relative(csv_path: pathlib.Path | str) -> Prices: 86 | if isinstance(csv_path, str): 87 | csv_path = pathlib.Path(csv_path) 88 | return prices_to_relative(read_csv(csv_path)) 89 | 90 | 91 | def price_files(dir_name: str) -> tt.List[pathlib.Path]: 92 | result = [] 93 | for path in glob.glob(os.path.join(dir_name, "*.csv")): 94 | result.append(pathlib.Path(path)) 95 | return result 96 | 97 | 98 | def load_year_data( 99 | year: int, basedir: str = 'data' 100 | ) -> tt.Dict[str, Prices]: 101 | y = str(year)[-2:] 102 | result = {} 103 | for path in glob.glob(os.path.join(basedir, "*_%s*.csv" % y)): 104 | result[path] = load_relative(pathlib.Path(path)) 105 | return result 106 | -------------------------------------------------------------------------------- /Chapter10/lib/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import typing as tt 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | 8 | class SimpleFFDQN(nn.Module): 9 | def __init__(self, obs_len: int, actions_n: int): 10 | super(SimpleFFDQN, self).__init__() 11 | 12 | self.fc_val = nn.Sequential( 13 | nn.Linear(obs_len, 512), 14 | nn.ReLU(), 15 | nn.Linear(512, 512), 16 | nn.ReLU(), 17 | nn.Linear(512, 1) 18 | ) 19 | 20 | self.fc_adv = nn.Sequential( 21 | nn.Linear(obs_len, 512), 22 | nn.ReLU(), 23 | nn.Linear(512, 512), 24 | nn.ReLU(), 25 | nn.Linear(512, actions_n) 26 | ) 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | val = self.fc_val(x) 30 | adv = self.fc_adv(x) 31 | return val + (adv - adv.mean(dim=1, keepdim=True)) 32 | 33 | 34 | class DQNConv1D(nn.Module): 35 | def __init__(self, shape: tt.Tuple[int, ...], actions_n: int): 36 | super(DQNConv1D, self).__init__() 37 | 38 | self.conv = nn.Sequential( 39 | nn.Conv1d(shape[0], 128, 5), 40 | nn.ReLU(), 41 | nn.Conv1d(128, 128, 5), 42 | nn.ReLU(), 43 | nn.Flatten(), 44 | ) 45 | size = self.conv(torch.zeros(1, *shape)).size()[-1] 46 | 47 | self.fc_val = nn.Sequential( 48 | nn.Linear(size, 512), 49 | nn.ReLU(), 50 | nn.Linear(512, 1) 51 | ) 52 | 53 | self.fc_adv = nn.Sequential( 54 | nn.Linear(size, 512), 55 | nn.ReLU(), 56 | nn.Linear(512, actions_n) 57 | ) 58 | 59 | 60 | def forward(self, x: torch.Tensor) -> torch.Tensor: 61 | conv_out = self.conv(x) 62 | val = self.fc_val(conv_out) 63 | adv = self.fc_adv(conv_out) 64 | return val + (adv - adv.mean(dim=1, keepdim=True)) 65 | 66 | 67 | class DQNConv1DLarge(nn.Module): 68 | def __init__(self, shape, actions_n): 69 | super(DQNConv1DLarge, self).__init__() 70 | 71 | self.conv = nn.Sequential( 72 | nn.Conv1d(shape[0], 32, 3), 73 | nn.MaxPool1d(3, 2), 74 | nn.ReLU(), 75 | nn.Conv1d(32, 32, 3), 76 | nn.MaxPool1d(3, 2), 77 | nn.ReLU(), 78 | nn.Conv1d(32, 32, 3), 79 | nn.MaxPool1d(3, 2), 80 | nn.ReLU(), 81 | nn.Conv1d(32, 32, 3), 82 | nn.MaxPool1d(3, 2), 83 | nn.ReLU(), 84 | nn.Conv1d(32, 32, 3), 85 | nn.ReLU(), 86 | nn.Conv1d(32, 32, 3), 87 | nn.ReLU(), 88 | nn.Flatten(), 89 | ) 90 | size = self.conv(torch.zeros(1, *shape)).size()[-1] 91 | 92 | self.fc_val = nn.Sequential( 93 | nn.Linear(size, 512), 94 | nn.ReLU(), 95 | nn.Linear(512, 1) 96 | ) 97 | 98 | self.fc_adv = nn.Sequential( 99 | nn.Linear(size, 512), 100 | nn.ReLU(), 101 | nn.Linear(512, actions_n) 102 | ) 103 | 104 | def forward(self, x: torch.Tensor) -> torch.Tensor: 105 | conv_out = self.conv(x) 106 | val = self.fc_val(conv_out) 107 | adv = self.fc_adv(conv_out) 108 | return val + (adv - adv.mean(dim=1, keepdim=True)) 109 
| -------------------------------------------------------------------------------- /Chapter10/lib/validation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | 5 | from lib import environ 6 | 7 | METRICS = ( 8 | 'episode_reward', 9 | 'episode_steps', 10 | 'order_profits', 11 | 'order_steps', 12 | ) 13 | 14 | 15 | def validation_run(env, net, episodes=100, device="cpu", epsilon=0.02, comission=0.1): 16 | stats = { metric: [] for metric in METRICS } 17 | 18 | for episode in range(episodes): 19 | obs, _ = env.reset() 20 | 21 | total_reward = 0.0 22 | position = None 23 | position_steps = None 24 | episode_steps = 0 25 | 26 | while True: 27 | obs_v = torch.tensor([obs]).to(device) 28 | out_v = net(obs_v) 29 | 30 | action_idx = out_v.max(dim=1)[1].item() 31 | if np.random.random() < epsilon: 32 | action_idx = env.action_space.sample() 33 | action = environ.Actions(action_idx) 34 | 35 | close_price = env._state._cur_close() 36 | 37 | if action == environ.Actions.Buy and position is None: 38 | position = close_price 39 | position_steps = 0 40 | elif action == environ.Actions.Close and position is not None: 41 | profit = close_price - position - (close_price + position) * comission / 100 42 | profit = 100.0 * profit / position 43 | stats['order_profits'].append(profit) 44 | stats['order_steps'].append(position_steps) 45 | position = None 46 | position_steps = None 47 | 48 | obs, reward, done, _, _ = env.step(action_idx) 49 | total_reward += reward 50 | episode_steps += 1 51 | if position_steps is not None: 52 | position_steps += 1 53 | if done: 54 | if position is not None: 55 | profit = close_price - position - (close_price + position) * comission / 100 56 | profit = 100.0 * profit / position 57 | stats['order_profits'].append(profit) 58 | stats['order_steps'].append(position_steps) 59 | break 60 | 61 | stats['episode_reward'].append(total_reward) 62 | stats['episode_steps'].append(episode_steps) 63 | 64 | return { key: np.mean(vals) for key, vals in stats.items() } 65 | -------------------------------------------------------------------------------- /Chapter10/run_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | from lib import environ, data, models 6 | 7 | import torch 8 | 9 | import matplotlib as mpl 10 | mpl.use("Agg") 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | EPSILON = 0.02 15 | 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-d", "--data", required=True, help="CSV file with quotes to run the model") 20 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 21 | parser.add_argument("-b", "--bars", type=int, default=50, help="Count of bars to feed into the model") 22 | parser.add_argument("-n", "--name", required=True, help="Name to use in output images") 23 | parser.add_argument("--commission", type=float, default=0.1, help="Commission size in percent, default=0.1") 24 | parser.add_argument("--conv", default=False, action="store_true", help="Use convolution model instead of FF") 25 | args = parser.parse_args() 26 | 27 | prices = data.load_relative(args.data) 28 | env = environ.StocksEnv({"TEST": prices}, bars_count=args.bars, reset_on_close=False, commission=args.commission, 29 | state_1d=args.conv, random_ofs_on_reset=False, reward_on_close=False, volumes=False) 30 | if args.conv: 31 | net = 
models.DQNConv1D(env.observation_space.shape, env.action_space.n) 32 | else: 33 | net = models.SimpleFFDQN(env.observation_space.shape[0], env.action_space.n) 34 | 35 | net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage, weights_only=True)) 36 | 37 | obs, _ = env.reset() 38 | start_price = env._state._cur_close() 39 | 40 | total_reward = 0.0 41 | step_idx = 0 42 | rewards = [] 43 | 44 | while True: 45 | step_idx += 1 46 | obs_v = torch.tensor([obs]) 47 | out_v = net(obs_v) 48 | action_idx = out_v.max(dim=1)[1].item() 49 | if np.random.random() < EPSILON: 50 | action_idx = env.action_space.sample() 51 | action = environ.Actions(action_idx) 52 | 53 | obs, reward, done, _, _ = env.step(action_idx) 54 | total_reward += reward 55 | rewards.append(total_reward) 56 | if step_idx % 100 == 0: 57 | print("%d: reward=%.3f" % (step_idx, total_reward)) 58 | if done: 59 | break 60 | 61 | plt.clf() 62 | plt.plot(rewards) 63 | plt.title("Total reward, data=%s" % args.name) 64 | plt.ylabel("Reward, %") 65 | plt.savefig("rewards-%s.png" % args.name) 66 | -------------------------------------------------------------------------------- /Chapter10/tests/test_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pathlib 3 | from lib import data 4 | 5 | 6 | def test_read_csv(): 7 | prices = data.read_csv(pathlib.Path("data/YNDX_160101_161231.csv")) 8 | assert isinstance(prices, data.Prices) 9 | 10 | 11 | def test_prices_to_relative(): 12 | t = data.Prices(open=np.array([1.0]), 13 | high=np.array([3.0]), 14 | low=np.array([0.5]), 15 | close=np.array([2.0]), 16 | volume=np.array([10])) 17 | rel = data.prices_to_relative(t) 18 | np.testing.assert_equal(rel.open, t.open) 19 | np.testing.assert_equal(rel.volume, t.volume) 20 | np.testing.assert_equal(rel.high, np.array([2.0])) # 200% growth 21 | np.testing.assert_equal(rel.low, np.array([-.5])) # 50% fall 22 | np.testing.assert_equal(rel.close, np.array([1.0])) # 100% growth 23 | 24 | 25 | def test_price_files(): 26 | files = data.price_files("data") 27 | assert len(files) > 0 28 | 29 | -------------------------------------------------------------------------------- /Chapter11/.gitignore: -------------------------------------------------------------------------------- 1 | res 2 | -------------------------------------------------------------------------------- /Chapter11/02_cartpole_reinforce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import ptan 4 | from ptan.experience import ExperienceSourceFirstLast 5 | import numpy as np 6 | import typing as tt 7 | from torch.utils.tensorboard.writer import SummaryWriter 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torch.optim as optim 13 | 14 | GAMMA = 0.99 15 | LEARNING_RATE = 0.01 16 | EPISODES_TO_TRAIN = 4 17 | 18 | 19 | class PGN(nn.Module): 20 | def __init__(self, input_size: int, n_actions: int): 21 | super(PGN, self).__init__() 22 | 23 | self.net = nn.Sequential( 24 | nn.Linear(input_size, 128), 25 | nn.ReLU(), 26 | nn.Linear(128, n_actions) 27 | ) 28 | 29 | def forward(self, x: torch.Tensor) -> torch.Tensor: 30 | return self.net(x) 31 | 32 | 33 | def calc_qvals(rewards: tt.List[float]) -> tt.List[float]: 34 | res = [] 35 | sum_r = 0.0 36 | for r in reversed(rewards): 37 | sum_r *= GAMMA 38 | sum_r += r 39 | res.append(sum_r) 40 | return list(reversed(res)) 41 | 42 | 43 
| if __name__ == "__main__": 44 | env = gym.make("CartPole-v1") 45 | writer = SummaryWriter(comment="-cartpole-reinforce") 46 | 47 | net = PGN(env.observation_space.shape[0], env.action_space.n) 48 | print(net) 49 | 50 | agent = ptan.agent.PolicyAgent( 51 | net, preprocessor=ptan.agent.float32_preprocessor, apply_softmax=True) 52 | exp_source = ExperienceSourceFirstLast(env, agent, gamma=GAMMA) 53 | 54 | optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE) 55 | 56 | total_rewards = [] 57 | done_episodes = 0 58 | 59 | batch_episodes = 0 60 | batch_states, batch_actions, batch_qvals = [], [], [] 61 | cur_rewards = [] 62 | 63 | for step_idx, exp in enumerate(exp_source): 64 | batch_states.append(exp.state) 65 | batch_actions.append(int(exp.action)) 66 | cur_rewards.append(exp.reward) 67 | 68 | if exp.last_state is None: 69 | batch_qvals.extend(calc_qvals(cur_rewards)) 70 | cur_rewards.clear() 71 | batch_episodes += 1 72 | 73 | # handle new rewards 74 | new_rewards = exp_source.pop_total_rewards() 75 | if new_rewards: 76 | done_episodes += 1 77 | reward = new_rewards[0] 78 | total_rewards.append(reward) 79 | mean_rewards = float(np.mean(total_rewards[-100:])) 80 | print(f"{step_idx}: reward: {reward:6.2f}, mean_100: {mean_rewards:6.2f}, " 81 | f"episodes: {done_episodes}") 82 | writer.add_scalar("reward", reward, step_idx) 83 | writer.add_scalar("reward_100", mean_rewards, step_idx) 84 | writer.add_scalar("episodes", done_episodes, step_idx) 85 | if mean_rewards > 450: 86 | print(f"Solved in {step_idx} steps and {done_episodes} episodes!") 87 | break 88 | 89 | if batch_episodes < EPISODES_TO_TRAIN: 90 | continue 91 | 92 | optimizer.zero_grad() 93 | states_t = torch.as_tensor(np.asarray(batch_states)) 94 | batch_actions_t = torch.as_tensor(np.asarray(batch_actions)) 95 | batch_qvals_t = torch.as_tensor(np.asarray(batch_qvals)) 96 | 97 | logits_t = net(states_t) 98 | log_prob_t = F.log_softmax(logits_t, dim=1) 99 | batch_idx = range(len(batch_states)) 100 | act_probs_t = log_prob_t[batch_idx, batch_actions_t] 101 | log_prob_actions_v = batch_qvals_t * act_probs_t 102 | loss_t = -log_prob_actions_v.mean() 103 | 104 | loss_t.backward() 105 | optimizer.step() 106 | 107 | batch_episodes = 0 108 | batch_states.clear() 109 | batch_actions.clear() 110 | batch_qvals.clear() 111 | 112 | writer.close() 113 | -------------------------------------------------------------------------------- /Chapter11/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter11/lib/__init__.py -------------------------------------------------------------------------------- /Chapter11/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | import typing as tt 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class RewardTracker: 11 | def __init__(self, writer, stop_reward): 12 | self.writer = writer 13 | self.stop_reward = stop_reward 14 | 15 | def __enter__(self): 16 | self.ts = time.time() 17 | self.ts_frame = 0 18 | self.total_rewards = [] 19 | return self 20 | 21 | def __exit__(self, *args): 22 | self.writer.close() 23 | 24 | def reward(self, reward, frame, epsilon=None): 25 | self.total_rewards.append(reward) 26 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 27 | self.ts_frame = frame 28 | self.ts = time.time() 
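# speed is frames per second since the previous reward() call; the mean over the
# last 100 episodes computed below is what the stop_reward check compares against
# to decide that training is done.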
29 | mean_reward = np.mean(self.total_rewards[-100:]) 30 | epsilon_str = "" if epsilon is None else ", eps %.2f" % epsilon 31 | print("%d: done %d games, mean reward %.3f, speed %.2f f/s%s" % ( 32 | frame, len(self.total_rewards), mean_reward, speed, epsilon_str 33 | )) 34 | sys.stdout.flush() 35 | if epsilon is not None: 36 | self.writer.add_scalar("epsilon", epsilon, frame) 37 | self.writer.add_scalar("speed", speed, frame) 38 | self.writer.add_scalar("reward_100", mean_reward, frame) 39 | self.writer.add_scalar("reward", reward, frame) 40 | if mean_reward > self.stop_reward: 41 | print("Solved in %d frames!" % frame) 42 | return True 43 | return False 44 | 45 | 46 | class AtariPGN(nn.Module): 47 | def __init__(self, input_shape: tt.Tuple[int, ...], n_actions: int): 48 | super(AtariPGN, self).__init__() 49 | 50 | self.conv = nn.Sequential( 51 | nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4), 52 | nn.ReLU(), 53 | nn.Conv2d(32, 64, kernel_size=4, stride=2), 54 | nn.ReLU(), 55 | nn.Conv2d(64, 64, kernel_size=3, stride=1), 56 | nn.ReLU(), 57 | nn.Flatten(), 58 | ) 59 | size = self.conv(torch.zeros(1, *input_shape)).size()[-1] 60 | 61 | self.fc = nn.Sequential( 62 | nn.Linear(size, 512), 63 | nn.ReLU(), 64 | nn.Linear(512, n_actions) 65 | ) 66 | 67 | def forward(self, x: torch.ByteTensor) -> torch.Tensor: 68 | xx = x / 255.0 69 | return self.fc(self.conv(xx)) 70 | 71 | -------------------------------------------------------------------------------- /Chapter12/.gitignore: -------------------------------------------------------------------------------- 1 | runs_arch 2 | -------------------------------------------------------------------------------- /Chapter12/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter12/lib/__init__.py -------------------------------------------------------------------------------- /Chapter13/adhoc/hf_t1.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline 2 | import numpy as np 3 | 4 | if __name__ == "__main__": 5 | c = pipeline("feature-extraction") 6 | r = c(["I'm disappointed by delivery service", "Test sentence"]) 7 | for rr in r: 8 | a = np.array(rr) 9 | print(a.shape) -------------------------------------------------------------------------------- /Chapter13/adhoc/hf_t2.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | import numpy as np 3 | 4 | if __name__ == "__main__": 5 | c = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") 6 | print(c.get_sentence_embedding_dimension()) 7 | r = c.encode(["I'm disappointed by delivery service", "Test sentence"], convert_to_tensor=True) 8 | print(r.shape) -------------------------------------------------------------------------------- /Chapter13/adhoc/lc_t1.py: -------------------------------------------------------------------------------- 1 | from langchain_openai import ChatOpenAI 2 | 3 | if __name__ == "__main__": 4 | llm = ChatOpenAI() 5 | 6 | r = llm.invoke("What do you know about TextWorld games?") 7 | print(r) -------------------------------------------------------------------------------- /Chapter13/adhoc/openai_check.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | 4 
| OPENAI_KEY = os.environ["OPENAI_API_KEY"] 5 | 6 | def check_openai_api_key(api_key): 7 | client = openai.OpenAI(api_key=api_key) 8 | try: 9 | client.models.list() 10 | except openai.AuthenticationError as e: 11 | print(e) 12 | return False 13 | else: 14 | return True 15 | 16 | # Check the validity of the API key 17 | api_key_valid = check_openai_api_key(OPENAI_KEY) 18 | print("API key is valid:", api_key_valid) -------------------------------------------------------------------------------- /Chapter13/chatgpt_auto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from textworld import gym, EnvInfos 4 | from textworld.gym import register_game 5 | from langchain_openai import ChatOpenAI 6 | from langchain_core.output_parsers import StrOutputParser 7 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder 8 | 9 | 10 | def play_game(env, max_steps: int = 20) -> bool: 11 | prompt_init = ChatPromptTemplate.from_messages([ 12 | ("system", "You're playing the interactive fiction game. " 13 | "Reply with just a command in lowercase and nothing else"), 14 | ("system", "Game objective: {objective}"), 15 | ("user", "Room description: {description}"), 16 | ("user", "What command you want to execute next?"), 17 | ]) 18 | llm = ChatOpenAI() 19 | output_parser = StrOutputParser() 20 | 21 | commands = [] 22 | 23 | obs, info = env.reset() 24 | init_msg = prompt_init.invoke({ 25 | "objective": info['objective'], 26 | "description": info['description'], 27 | }) 28 | 29 | context = init_msg.to_messages() 30 | ai_msg = llm.invoke(init_msg) 31 | context.append(ai_msg) 32 | cmd = output_parser.invoke(ai_msg) 33 | 34 | prompt_next = ChatPromptTemplate.from_messages([ 35 | MessagesPlaceholder(variable_name="chat_history"), 36 | ("user", "Last command result: {result}"), 37 | ("user", "Room description: {description}"), 38 | ("user", "What command you want to execute next?"), 39 | ]) 40 | 41 | for _ in range(max_steps): 42 | commands.append(cmd) 43 | print(">>>", cmd) 44 | obs, r, is_done, info = env.step(cmd) 45 | if is_done: 46 | print(f"I won in {len(commands)} steps!") 47 | return True 48 | 49 | user_msgs = prompt_next.invoke({ 50 | "chat_history": context, 51 | "result": obs.strip(), 52 | "description": info['description'], 53 | }) 54 | context = user_msgs.to_messages() 55 | ai_msg = llm.invoke(user_msgs) 56 | context.append(ai_msg) 57 | cmd = output_parser.invoke(ai_msg) 58 | 59 | print(f"Wasn't able to solve after {max_steps} steps, commands: {commands}") 60 | return False 61 | 62 | 63 | if __name__ == "__main__": 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("-g", "--game", default="simple", 66 | help="Game prefix to be used during training, default=simple") 67 | parser.add_argument("indices", nargs='+', type=int, default=[1], help="Game indices to test on, default=1") 68 | args = parser.parse_args() 69 | 70 | count_games, count_won = 0, 0 71 | for index in args.indices: 72 | print(f"Starting game {index}\n") 73 | env_id = register_game( 74 | gamefile=f"games/{args.game}{index}.ulx", 75 | request_infos=EnvInfos( 76 | description=True, 77 | objective=True, 78 | ), 79 | ) 80 | env = gym.make(env_id) 81 | count_games += 1 82 | if play_game(env): 83 | count_won += 1 84 | print(f"Played {count_games}, won {count_won}") 85 | -------------------------------------------------------------------------------- /Chapter13/chatgpt_interactive.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import textwrap 4 | from textworld import gym, EnvInfos 5 | from textworld.gym import register_game 6 | 7 | 8 | def play_game(env, max_steps: int = 20) -> bool: 9 | commands = [] 10 | 11 | obs, info = env.reset() 12 | 13 | print(textwrap.dedent("""\ 14 | You're playing the interactive fiction game. 15 | Here is the game objective: %s 16 | 17 | Here is the room description: %s 18 | 19 | What command do you want to execute next? Reply with 20 | just a command in lowercase and nothing else. 21 | """) % (info['objective'], info['description'])) 22 | 23 | print("=== Send this to chat.openai.com and type the reply...") 24 | 25 | while len(commands) < max_steps: 26 | cmd = input(">>> ") 27 | commands.append(cmd) 28 | obs, r, is_done, info = env.step(cmd) 29 | if is_done: 30 | print(f"You won in {len(commands)} steps! " 31 | f"Don't forget to congratulate ChatGPT!") 32 | return True 33 | 34 | print(textwrap.dedent("""\ 35 | Last command result: %s 36 | Room description: %s 37 | 38 | What's the next command? 39 | """) % (obs, info['description'])) 40 | print("=== Send this to chat.openai.com and type the reply...") 41 | 42 | print(f"Wasn't able to solve after {max_steps} steps, commands: {commands}") 43 | return False 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument("-g", "--game", default="simple", 49 | help="Game prefix to be used during training, default=simple") 50 | parser.add_argument("indices", nargs='+', type=int, default=[1], help="Game indices to test on, default=1") 51 | args = parser.parse_args() 52 | 53 | count_games, count_won = 0, 0 54 | for index in args.indices: 55 | env_id = register_game( 56 | gamefile=f"games/{args.game}{index}.ulx", 57 | request_infos=EnvInfos(description=True, objective=True), 58 | ) 59 | env = gym.make(env_id) 60 | count_games += 1 61 | print(f"Starting game {index}\n") 62 | if play_game(env): 63 | count_won += 1 64 | print(f"Played {count_games}, won {count_won}") -------------------------------------------------------------------------------- /Chapter13/conftest.py: -------------------------------------------------------------------------------- 1 | # this file adds current dir to the pytest path for modules import -------------------------------------------------------------------------------- /Chapter13/games/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.ni 3 | *.ulx 4 | *.z8 -------------------------------------------------------------------------------- /Chapter13/games/make_games.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | tw-make custom --world-size 5 --nb-objects 10 --quest-length 5 --quest-breadth 1 --seed 0 --output simple-val.ulx 3 | 4 | # change the range to generate more games 5 | for i in `seq 1 20`; do 6 | tw-make custom --world-size 5 --nb-objects 10 --quest-length 5 --quest-breadth 1 --seed $i --output simple$i.ulx 7 | done 8 | -------------------------------------------------------------------------------- /Chapter13/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter13/lib/__init__.py 
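The loop in chatgpt_auto.py above is a plain LangChain request/parse cycle: a ChatPromptTemplate is filled with the game objective and the current room description, the resulting messages are sent to ChatOpenAI, and StrOutputParser turns the reply into the next text command. A minimal sketch of a single round, assuming OPENAI_API_KEY is set; the objective and description strings below are made-up placeholders rather than output of a real TextWorld game:

from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", "You're playing the interactive fiction game. "
               "Reply with just a command in lowercase and nothing else"),
    ("system", "Game objective: {objective}"),
    ("user", "Room description: {description}"),
    ("user", "What command you want to execute next?"),
])
llm = ChatOpenAI()
parser = StrOutputParser()

# placeholder game state, normally taken from env.reset() / env.step()
msgs = prompt.invoke({
    "objective": "Find the key and unlock the wooden door",
    "description": "You are in a small kitchen. A key lies on the table.",
})
ai_msg = llm.invoke(msgs)      # one chat completion request
cmd = parser.invoke(ai_msg)    # e.g. "take key"
print(cmd)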
-------------------------------------------------------------------------------- /Chapter13/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::DeprecationWarning -------------------------------------------------------------------------------- /Chapter13/requirements.txt: -------------------------------------------------------------------------------- 1 | textworld==1.6.1 2 | transformers==4.46.0 3 | sentence-transformers==3.2.1 4 | langchain-openai==0.3.4 5 | langchain==0.2.3 -------------------------------------------------------------------------------- /Chapter13/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter13/tests/__init__.py -------------------------------------------------------------------------------- /Chapter13/tests/test_preproc.py: -------------------------------------------------------------------------------- 1 | from pytest import mark 2 | 3 | from lib.preproc import RelativeDirectionWrapper 4 | 5 | 6 | @mark.parametrize("abs_act, dir_name, exp_rel_act", [ 7 | ("go north", "north", "go forward"), 8 | ("go south", "south", "go forward"), 9 | ("go east", "east", "go forward"), 10 | ("go east", "north", "go right"), 11 | ("go west", "north", "go left"), 12 | ("go south", "north", "go back"), 13 | ("go west", "east", "go back"), 14 | ("go west", "south", "go right"), 15 | ("go east", "south", "go left"), 16 | ("go north", "south", "go back"), 17 | ("go south", "east", "go right"), 18 | ("go south", "west", "go left"), 19 | ]) 20 | def test_abs_to_rel(abs_act, dir_name, exp_rel_act): 21 | dir_idx = RelativeDirectionWrapper.ABSOLUTE_DIRS.index(dir_name) 22 | rel_act = RelativeDirectionWrapper.abs_to_rel(abs_act, dir_idx) 23 | assert isinstance(rel_act, str) 24 | assert rel_act == exp_rel_act 25 | 26 | 27 | @mark.parametrize("rel_act, dir_name, exp_abs_act", [ 28 | ("go forward", "north", "go north"), 29 | ("go right", "north", "go east"), 30 | ("go back", "north", "go south"), 31 | ("go left", "north", "go west"), 32 | 33 | ("go forward", "east", "go east"), 34 | ("go right", "east", "go south"), 35 | ("go back", "east", "go west"), 36 | ("go left", "east", "go north"), 37 | ]) 38 | def test_rel_to_abs(rel_act, dir_name, exp_abs_act): 39 | dir_idx = RelativeDirectionWrapper.ABSOLUTE_DIRS.index(dir_name) 40 | abs_act = RelativeDirectionWrapper.rel_to_abs(rel_act, dir_idx) 41 | assert isinstance(abs_act, str) 42 | assert abs_act == exp_abs_act 43 | 44 | 45 | @mark.parametrize("rel_act, dir_name, exp_new_dir", [ 46 | ("go forward", "north", "north"), 47 | ("go right", "north", "east"), 48 | ("go left", "north", "west"), 49 | ("go back", "north", "south"), 50 | 51 | ("go forward", "west", "west"), 52 | ("go right", "west", "north"), 53 | ("go left", "west", "south"), 54 | ("go back", "west", "east"), 55 | ]) 56 | def test_rel_execute(rel_act, dir_name, exp_new_dir): 57 | dir_idx = RelativeDirectionWrapper.ABSOLUTE_DIRS.index(dir_name) 58 | new_dir = RelativeDirectionWrapper.rel_execute(rel_act, dir_idx) 59 | assert isinstance(new_dir, int) 60 | new_dir_name = RelativeDirectionWrapper.ABSOLUTE_DIRS[new_dir] 61 | assert new_dir_name == exp_new_dir 62 | 63 | 64 | def test_update_vocabs(): 65 | v, v_r = {}, {} 66 | RelativeDirectionWrapper.update_vocabs(v, v_r) 67 | assert len(v) == 4 68 | assert 
len(v_r) == 4 69 | assert v == {0: "right", 1: "forward", 2: "left", 3: "back"} 70 | assert v_r == {"right": 0, "forward": 1, "left": 2, "back": 3} 71 | 72 | v, v_r = {0: "word", 1: "left"}, {"word": 0, "left": 1} 73 | RelativeDirectionWrapper.update_vocabs(v, v_r) 74 | assert len(v) == 5 75 | assert len(v_r) == 5 76 | assert v == {0: "word", 1: "left", 2: "right", 3: "forward", 4: "back"} 77 | assert v_r == {"word": 0, "left": 1, "right": 2, "forward": 3, "back": 4} 78 | -------------------------------------------------------------------------------- /Chapter14/.gitignore: -------------------------------------------------------------------------------- 1 | out -------------------------------------------------------------------------------- /Chapter14/adhoc/01_wob_create.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time 3 | import gymnasium as gym 4 | import miniwob 5 | from miniwob.action import ActionTypes 6 | 7 | RENDER_ENV = False 8 | 9 | 10 | if __name__ == "__main__": 11 | gym.register_envs(miniwob) 12 | 13 | env = gym.make('miniwob/click-test-2-v1', render_mode='human' if RENDER_ENV else None) 14 | print(env) 15 | try: 16 | # Start a new episode. 17 | obs, info = env.reset() 18 | print("Obs keys:", list(obs.keys())) 19 | print("Info dict:", info) 20 | assert obs["utterance"] == "Click button ONE." 21 | assert obs["fields"] == (("target", "ONE"),) 22 | print("Screenshot shape:", obs['screenshot'].shape) 23 | if RENDER_ENV: 24 | # to let you look at the environment. 25 | time.sleep(2) 26 | 27 | # Find the HTML element with text "ONE". 28 | target_elems = [e for e in obs['dom_elements'] if e['text'] == "ONE"] 29 | assert target_elems 30 | print("Target elem:", target_elems[0]) 31 | 32 | # Click on the element. 
33 | action = env.unwrapped.create_action( 34 | ActionTypes.CLICK_ELEMENT, ref=target_elems[0]["ref"]) 35 | obs, reward, terminated, truncated, info = env.step(action) 36 | print(reward, terminated, info) 37 | finally: 38 | env.close() 39 | -------------------------------------------------------------------------------- /Chapter14/adhoc/02_act_clicks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import time 3 | 4 | import numpy as np 5 | import gymnasium 6 | import miniwob 7 | import typing as tt 8 | from miniwob.action import ActionTypes, ActionSpaceConfig 9 | 10 | RENDER_ENV = True 11 | 12 | BIN_DX = 10 13 | BIN_DY = 10 14 | SIZE_Y = 210 15 | SIZE_X = 160 16 | BINS_X = SIZE_X // BIN_DX 17 | BINS_Y = SIZE_Y // BIN_DY 18 | 19 | 20 | 21 | def close_bins(elems: tt.Tuple[dict, ...]) -> tt.Tuple[int, int]: 22 | elem_ids = {e['ref']: e for e in elems} 23 | close_elem = None 24 | for e in elems: 25 | if e['text'] == 'Close': 26 | close_elem = e 27 | break 28 | # need to roll back while ref is negative 29 | while close_elem['ref'] < 0: 30 | close_elem = elem_ids[close_elem['parent']] 31 | x = close_elem['left'][0] + close_elem['width'][0] / 2.0 32 | y = close_elem['top'][0] + close_elem['height'][0] / 2.0 33 | return x // BIN_DX, y // BIN_DY 34 | 35 | 36 | 37 | if __name__ == "__main__": 38 | gymnasium.register_envs(miniwob) 39 | 40 | act_cfg = ActionSpaceConfig( 41 | action_types=(ActionTypes.CLICK_COORDS, ), 42 | coord_bins=(BINS_X, BINS_Y), 43 | ) 44 | env = gymnasium.make( 45 | 'miniwob/click-dialog-v1', 46 | render_mode='human' if RENDER_ENV else None, 47 | action_space_config=act_cfg, 48 | ) 49 | print(env) 50 | print(env.action_space) 51 | try: 52 | # Start a new episode. 53 | obs, info = env.reset() 54 | print("Obs keys:", list(obs.keys())) 55 | print("Info dict:", info) 56 | print("Screenshot shape:", obs['screenshot'].shape) 57 | coords = close_bins(obs['dom_elements']) 58 | 59 | action = { 60 | "action_type": 0, 61 | "coords": np.array(coords, dtype=np.int8) 62 | } 63 | print("action", action) 64 | if RENDER_ENV: 65 | time.sleep(3) 66 | obs, reward, is_done, is_trunc, info = env.step(action) 67 | print(reward, is_done, info) 68 | 69 | # Brute force to check that our action is correct (comment step() call above) 70 | if False: 71 | is_done = False 72 | for y in range(BINS_Y): 73 | for x in range(BINS_X): 74 | action = { 75 | "action_type": 0, 76 | "coords": np.array((x, y), dtype=np.int8) 77 | } 78 | obs, reward, is_done, is_trunc, info = env.step(action) 79 | if is_done: 80 | print("Episode done:", action) 81 | print(reward, is_done, info) 82 | break 83 | if is_done: 84 | break 85 | if RENDER_ENV: 86 | input() 87 | finally: 88 | env.close() 89 | -------------------------------------------------------------------------------- /Chapter14/adhoc/03_clicker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import time 4 | 5 | sys.path.append(".") 6 | import typing as tt 7 | from lib import wob 8 | 9 | RENDER_ENV = True 10 | 11 | 12 | def close_bins(elems: tt.Tuple[dict, ...]) -> int: 13 | elem_ids = {e['ref']: e for e in elems} 14 | close_elem = None 15 | for e in elems: 16 | if e['text'] == 'Close': 17 | close_elem = e 18 | break 19 | # need to roll back while ref is negative 20 | while close_elem['ref'] < 0: 21 | close_elem = elem_ids[close_elem['parent']] 22 | print(close_elem) 23 | x = close_elem['left'][0] #+ close_elem['width'][0] / 2.0 
24 | y = close_elem['top'][0] #+ close_elem['height'][0] / 2.0 25 | i = int(x // wob.BIN_SIZE) 26 | j = int((y - wob.Y_OFS) // wob.BIN_SIZE) - 1 27 | print(f"found elem x={x}, y={y} -> i={i}, j={j} = {i} + {j*16}") 28 | return i + 16*j 29 | 30 | if __name__ == "__main__": 31 | env = wob.MiniWoBClickWrapper.create( 32 | 'miniwob/click-dialog-v1', keep_obs=True, 33 | render_mode='human' if RENDER_ENV else None 34 | ) 35 | print(env) 36 | print(env.action_space) 37 | print(env.observation_space) 38 | try: 39 | # Start a new episode. 40 | obs, info = env.reset() 41 | orig_obs = info.pop(wob.MiniWoBClickWrapper.FULL_OBS_KEY) 42 | print("Obs shape:", obs.shape) 43 | print("Info dict:", info) 44 | action = close_bins(orig_obs['dom_elements']) 45 | print("action", action) 46 | 47 | # switch between detected close action and brute force mode 48 | if False: 49 | obs, reward, is_done, is_trunc, info = env.step(action) 50 | info.pop(wob.MiniWoBClickWrapper.FULL_OBS_KEY) 51 | print(reward, is_done, info) 52 | else: 53 | is_done = False 54 | for action in range(env.action_space.n): 55 | time.sleep(0.001) 56 | obs, reward, is_done, is_trunc, info = env.step(action) 57 | info.pop(wob.MiniWoBClickWrapper.FULL_OBS_KEY) 58 | print(action, "=>", reward, is_done, info) 59 | if is_done: 60 | print("Episode done:", action) 61 | break 62 | finally: 63 | env.close() 64 | -------------------------------------------------------------------------------- /Chapter14/adhoc/04_load_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | sys.path.append(".") 4 | import argparse 5 | import pathlib 6 | 7 | from lib import demos 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-s", "--save", help="If given, save observations to this image prefix") 13 | parser.add_argument("-i", "--input", help="Input file to parse", 14 | default="demos/click-dialog/click-dialog_0421123844.json") 15 | args = parser.parse_args() 16 | p = pathlib.Path(args.input) 17 | res = demos.load_demo_file(p, gamma=0.99, steps=2) 18 | for idx, e in enumerate(res): 19 | print(f"obs={e.state.shape}, act={e.action}, r={e.reward}, last={e.last_state is None}") 20 | if args.save is not None: 21 | name = f"{args.save}_{idx:04d}_a={e.action}.png" 22 | demos.save_obs_image(e.state, e.action, name) 23 | print("Saved to", name) 24 | -------------------------------------------------------------------------------- /Chapter14/adhoc/05_join_obs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | sys.path.append(".") 4 | import argparse 5 | import pathlib 6 | import pickle 7 | import json 8 | from lib import demos 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("--dat", required=True, help="Data file in json format") 14 | parser.add_argument("--obs", required=True, help="Observations in pickle format") 15 | parser.add_argument("--save", default=False, action="store_true", help="Save images from observations") 16 | args = parser.parse_args() 17 | 18 | p = pathlib.Path(args.obs) 19 | rel_obs = pickle.loads(p.read_bytes()) 20 | p = pathlib.Path(args.dat) 21 | data = json.loads(p.read_text()) 22 | 23 | if args.save: 24 | for k in sorted(rel_obs.keys()): 25 | f = f"{k:05d}.png" 26 | demos.save_obs_image(rel_obs[k]['screenshot'], action=None, file_name=f, transpose=False) 27 | new_data = demos.join_obs(data, 
rel_obs) 28 | pass -------------------------------------------------------------------------------- /Chapter14/adhoc/06_save_traj.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool saves trajectories from several games using the given model 4 | """ 5 | import sys 6 | sys.path.append(".") 7 | import pathlib 8 | import argparse 9 | import torch 10 | import torch.nn.functional as F 11 | import numpy as np 12 | 13 | from lib import model, wob, demos 14 | 15 | ENV_NAME = 'miniwob/count-sides-v1' 16 | 17 | if __name__ == "__main__": 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("-m", "--model", required=True, help="Model file") 20 | parser.add_argument("-o", "--output", required=True, help="Dir to save screenshots") 21 | args = parser.parse_args() 22 | 23 | env = wob.MiniWoBClickWrapper.create(ENV_NAME) 24 | 25 | net = model.Model(input_shape=wob.WOB_SHAPE, n_actions=env.action_space.n) 26 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 27 | out_dir = pathlib.Path(args.output) 28 | out_dir.mkdir(parents=True, exist_ok=True) 29 | 30 | obs, info = env.reset() 31 | step_idx = 0 32 | 33 | while True: 34 | obs_v = torch.tensor(np.expand_dims(obs, axis=0)) 35 | logits_v = net(obs_v)[0] 36 | policy = F.softmax(logits_v, dim=1).data.numpy()[0] 37 | action = np.random.choice(len(policy), p=policy) 38 | 39 | new_obs, reward, done, is_tr, info = env.step(action) 40 | print(f"{step_idx}: act={action}, r={reward}, done={done}, tr={is_tr}: {info}") 41 | 42 | p = out_dir / f"scr_{step_idx:03d}_act={action}_r={reward:.2f}_d={done:d}_tr={is_tr:d}.png" 43 | demos.save_obs_image(obs, action, str(p)) 44 | obs = new_obs 45 | step_idx += 1 46 | if is_tr or done: 47 | break 48 | p = out_dir / f"scr_{step_idx:03d}.png" 49 | demos.save_obs_image(obs, action=None, file_name=str(p)) 50 | 51 | env.close() 52 | -------------------------------------------------------------------------------- /Chapter14/adhoc/06_save_traj_vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool saves trajectories from several games using the given model, vectorized version 4 | """ 5 | import sys 6 | sys.path.append(".") 7 | import pathlib 8 | import argparse 9 | import torch 10 | import torch.nn.functional as F 11 | import numpy as np 12 | import gymnasium as gym 13 | 14 | from lib import model, wob, demos 15 | 16 | ENV_NAME = 'miniwob/count-sides-v1' 17 | N_ENVS = 4 18 | 19 | if __name__ == "__main__": 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("-m", "--model", required=True, help="Model file") 22 | parser.add_argument("-o", "--output", required=True, help="Dir to save screenshots") 23 | parser.add_argument("-a", type=int, help="If given, this action will be executed") 24 | args = parser.parse_args() 25 | 26 | envs = [ 27 | lambda: wob.MiniWoBClickWrapper.create(ENV_NAME) 28 | for _ in range(N_ENVS) 29 | ] 30 | env = gym.vector.AsyncVectorEnv(envs) 31 | 32 | net = model.Model(input_shape=wob.WOB_SHAPE, n_actions=env.single_action_space.n) 33 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 34 | out_dir = pathlib.Path(args.output) 35 | for i in range(N_ENVS): 36 | (out_dir / str(i)).mkdir(parents=True, exist_ok=True) 37 | 38 | obs, info = env.reset() 39 | step_idx = 0 40 | done_envs = set() 41 | 42 | while len(done_envs) < N_ENVS: 43 | obs_v = 
torch.tensor(obs) 44 | logits_v = net(obs_v)[0] 45 | policy = F.softmax(logits_v, dim=1).data.numpy() 46 | actions = [ 47 | np.random.choice(len(policy[i]), p=policy[i]) if args.a is None else args.a 48 | for i in range(N_ENVS) 49 | ] 50 | 51 | new_obs, rewards, dones, is_trs, infos = env.step(actions) 52 | for i, (action, reward, done, is_tr) in enumerate(zip(actions, rewards, dones, is_trs)): 53 | b_x, b_y = wob.action_to_bins(action) 54 | print(f"{step_idx}-{i}: act={action}, b={b_x}_{b_y}, r={reward}, done={done}, tr={is_tr}") 55 | p = out_dir / str(i) / f"scr_{step_idx:03d}_act={action}_b={b_x}-{b_y}_r={reward:.2f}_d={done:d}_tr={is_tr:d}.png" 56 | demos.save_obs_image(obs[i], action, str(p)) 57 | if is_tr or done: 58 | done_envs.add(i) 59 | obs = new_obs 60 | step_idx += 1 61 | 62 | env.close() 63 | -------------------------------------------------------------------------------- /Chapter14/lib/common.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import typing as tt 4 | 5 | import numpy as np 6 | import torch 7 | from torch.utils.tensorboard.writer import SummaryWriter 8 | 9 | import ptan 10 | 11 | class RewardTracker: 12 | def __init__(self, writer: SummaryWriter): 13 | self.writer = writer 14 | 15 | def __enter__(self): 16 | self.ts = time.time() 17 | self.ts_frame = 0 18 | self.total_rewards = [] 19 | return self 20 | 21 | def __exit__(self, *args): 22 | self.writer.close() 23 | 24 | def reward(self, reward: float, frame: int, 25 | epsilon: tt.Optional[float] = None): 26 | self.total_rewards.append(reward) 27 | speed = (frame - self.ts_frame) / (time.time() - self.ts) 28 | self.ts_frame = frame 29 | self.ts = time.time() 30 | mean_reward = np.mean(self.total_rewards[-100:]) 31 | epsilon_str = "" 32 | if epsilon is not None: 33 | epsilon_str = f", eps {epsilon:.2f}" 34 | print(f"{frame}: done {len(self.total_rewards)} games, " 35 | f"mean reward {mean_reward:.3f}, " 36 | f"speed {speed:.2f} f/s{epsilon_str}") 37 | sys.stdout.flush() 38 | if epsilon is not None: 39 | self.writer.add_scalar("epsilon", epsilon, frame) 40 | self.writer.add_scalar("speed", speed, frame) 41 | self.writer.add_scalar("reward_100", mean_reward, frame) 42 | self.writer.add_scalar("reward", reward, frame) 43 | return mean_reward if len(self.total_rewards) > 30 else None 44 | 45 | 46 | def unpack_batch(batch, net, last_val_gamma, device="cpu", states_preprocessor=ptan.agent.default_states_preprocessor): 47 | """ 48 | Convert batch into training tensors 49 | :param batch: 50 | :param net: 51 | :return: states variable, actions tensor, reference values variable 52 | """ 53 | states = [] 54 | actions = [] 55 | rewards = [] 56 | not_done_idx = [] 57 | last_states = [] 58 | for idx, exp in enumerate(batch): 59 | states.append(exp.state) 60 | actions.append(int(exp.action)) 61 | rewards.append(exp.reward) 62 | if exp.last_state is not None: 63 | not_done_idx.append(idx) 64 | last_states.append(exp.last_state) 65 | states_v = states_preprocessor(states) 66 | if torch.is_tensor(states_v): 67 | states_v = states_v.to(device) 68 | actions_t = torch.LongTensor(actions).to(device) 69 | 70 | # handle rewards 71 | rewards_np = np.array(rewards, dtype=np.float32) 72 | if not_done_idx: 73 | last_states_v = states_preprocessor(last_states) 74 | if torch.is_tensor(last_states_v): 75 | last_states_v = last_states_v.to(device) 76 | last_vals_v = net(last_states_v)[1] 77 | last_vals_np = last_vals_v.data.cpu().numpy()[:, 0] 78 | rewards_np[not_done_idx] += 
last_val_gamma * last_vals_np 79 | 80 | ref_vals_v = torch.FloatTensor(rewards_np).to(device) 81 | return states_v, actions_t, ref_vals_v 82 | -------------------------------------------------------------------------------- /Chapter14/requirements.txt: -------------------------------------------------------------------------------- 1 | miniwob==1.0 2 | nltk==3.8.1 3 | bottle==0.12.25 4 | -------------------------------------------------------------------------------- /Chapter14/wob_click_mm_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gym 4 | import universe 5 | import numpy as np 6 | 7 | import torch 8 | import torch.nn.functional as F 9 | 10 | from lib import wob_vnc, model_vnc 11 | 12 | 13 | ENV_NAME = "wob.mini.ClickTab-v0" 14 | REMOTE_ADDR = 'vnc://gpu:5910+15910' 15 | 16 | # docker run -d -p 5910:5900 -p 15910:15900 --privileged --ipc host --cap-add SYS_ADMIN 92756d1f08ac 17 | 18 | 19 | def step_env(env, action): 20 | idle_count = 0 21 | while True: 22 | obs, reward, is_done, info = env.step([action]) 23 | if obs[0] is None: 24 | idle_count += 1 25 | continue 26 | break 27 | return obs[0], reward[0], is_done[0], info, idle_count 28 | 29 | 30 | if __name__ == "__main__": 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument("-m", "--model", help="Model file to load") 33 | parser.add_argument("-n", "--name", required=True, help="Prefix to save screenshots") 34 | parser.add_argument("--count", type=int, default=1, help="Count of runs to play, default=1") 35 | parser.add_argument("--env", default=ENV_NAME, help="Environment name to solve, default=" + ENV_NAME) 36 | args = parser.parse_args() 37 | 38 | env_name = args.env 39 | if not env_name.startswith('wob.mini.'): 40 | env_name = "wob.mini." 
+ env_name 41 | 42 | env = gym.make(env_name) 43 | env = universe.wrappers.experimental.SoftmaxClickMouse(env) 44 | env = wob_vnc.MiniWoBCropper(env, keep_text=True) 45 | wob_vnc.configure(env, REMOTE_ADDR) 46 | 47 | net = model_vnc.ModelMultimodal(input_shape=wob_vnc.WOB_SHAPE, n_actions=env.action_space.n) 48 | if args.model: 49 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 50 | preprocessor = model_vnc.MultimodalPreprocessor.load(args.model[:-4] + ".pre") 51 | else: 52 | preprocessor = model_vnc.MultimodalPreprocessor() 53 | env.reset() 54 | 55 | for round_idx in range(args.count): 56 | action = env.action_space.sample() 57 | step_idx = 0 58 | while True: 59 | obs, reward, done, info, idle_count = step_env(env, action) 60 | print(step_idx, reward, done, idle_count) 61 | img_name = "%s_r%02d_s%04d_%.3f_i%02d_d%d.png" % ( 62 | args.name, round_idx, step_idx, reward, idle_count, int(done)) 63 | obs_v = preprocessor([obs]) 64 | logits_v = net(obs_v)[0] 65 | policy = F.softmax(logits_v, dim=1).data.numpy()[0] 66 | action = np.random.choice(len(policy), p=policy) 67 | wob_vnc.save_obs(obs[0], img_name, action=action) 68 | step_idx += 1 69 | if done or reward != 0: 70 | print("Round %d done" % round_idx) 71 | break 72 | pass 73 | -------------------------------------------------------------------------------- /Chapter14/wob_click_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from lib import wob, model 9 | 10 | 11 | ENV_NAME = 'miniwob/click-dialog-v1' 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("-m", "--model", help="Model file to load") 17 | parser.add_argument("--count", type=int, default=1, help="Count of episodes to play, default=1") 18 | parser.add_argument("--env", default=ENV_NAME, help="Environment name to solve, default=" + ENV_NAME) 19 | parser.add_argument("--verbose", default=False, action='store_true', help="Display every step") 20 | parser.add_argument("--render", default=False, action='store_true', help="Show browser window") 21 | args = parser.parse_args() 22 | 23 | env_name = args.env 24 | if not env_name.startswith('miniwob/'): 25 | env_name = "miniwob/" + env_name 26 | 27 | render_mode = 'human' if args.render else None 28 | env = wob.MiniWoBClickWrapper.create(env_name, render_mode=render_mode) 29 | 30 | net = model.Model(input_shape=wob.WOB_SHAPE, n_actions=env.action_space.n) 31 | if args.model: 32 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 33 | 34 | steps_count = 0 35 | reward_sum = 0 36 | 37 | for round_idx in range(args.count): 38 | step_idx = 0 39 | obs, info = env.reset() 40 | while True: 41 | obs_v = torch.tensor(np.expand_dims(obs, axis=0)) 42 | logits_v = net(obs_v)[0] 43 | policy = F.softmax(logits_v, dim=1).data.numpy()[0] 44 | action = np.random.choice(len(policy), p=policy) 45 | 46 | obs, reward, done, is_tr, info = env.step(action) 47 | if args.verbose: 48 | print(step_idx, reward, done, info) 49 | 50 | step_idx += 1 51 | reward_sum += reward 52 | steps_count += 1 53 | if done: 54 | print("Round %d done" % round_idx) 55 | break 56 | print("Done %d rounds, mean steps %.2f, mean reward %.3f" % ( 57 | args.count, steps_count / args.count, reward_sum / args.count 58 | )) 59 | 60 | if args.render: 61 | input("Press 
enter to close the browser >>> ") 62 | env.close() 63 | 64 | pass 65 | -------------------------------------------------------------------------------- /Chapter15/01_check_env.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | 4 | ENV_ID = "MinitaurBulletEnv-v0" 5 | ENTRY = "pybullet_envs.bullet.minitaur_gym_env:MinitaurBulletEnv" 6 | RENDER = True 7 | 8 | 9 | if __name__ == "__main__": 10 | gym.register(ENV_ID, entry_point=ENTRY, max_episode_steps=1000, 11 | reward_threshold=15.0, disable_env_checker=True) 12 | env = gym.make(ENV_ID, render=RENDER) 13 | 14 | print("Observation space:", env.observation_space) 15 | print("Action space:", env.action_space) 16 | print(env) 17 | print(env.reset()) 18 | input("Press any key to exit\n") 19 | env.close() 20 | -------------------------------------------------------------------------------- /Chapter15/03_play_a2c.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gymnasium as gym 4 | 5 | from lib import model, common 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 14 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 15 | args = parser.parse_args() 16 | 17 | common.register_env() 18 | env = gym.make(common.ENV_ID, render_mode='rgb_array') 19 | if args.record is not None: 20 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 21 | 22 | net = model.ModelA2C(env.observation_space.shape[0], env.action_space.shape[0]) 23 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 24 | 25 | obs, _ = env.reset() 26 | total_reward = 0.0 27 | total_steps = 0 28 | while True: 29 | obs_v = torch.FloatTensor(np.expand_dims(obs, 0)) 30 | mu_v, var_v, val_v = net(obs_v) 31 | action = mu_v.squeeze(dim=0).data.numpy() 32 | action = np.clip(action, -1, 1) 33 | obs, reward, done, is_tr, _ = env.step(action) 34 | total_reward += reward 35 | total_steps += 1 36 | if done or is_tr: 37 | break 38 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 39 | env.close() 40 | -------------------------------------------------------------------------------- /Chapter15/05_play_ddpg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gymnasium as gym 4 | 5 | from lib import model, common 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 14 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 15 | args = parser.parse_args() 16 | 17 | common.register_env() 18 | env = gym.make(common.ENV_ID, render_mode='rgb_array') 19 | if args.record is not None: 20 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 21 | 22 | net = model.DDPGActor(env.observation_space.shape[0], env.action_space.shape[0]) 23 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 24 | 25 | obs, _ = env.reset() 26 | total_reward = 0.0 27 | total_steps = 0 28 | while True: 29 | obs_v = 
torch.FloatTensor(np.expand_dims(obs, 0)) 30 | mu_v = net(obs_v) 31 | action = mu_v.squeeze(dim=0).data.numpy() 32 | action = np.clip(action, -1, 1) 33 | obs, reward, done, is_tr, _ = env.step(action) 34 | total_reward += reward 35 | total_steps += 1 36 | if done or is_tr: 37 | break 38 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 39 | env.close() -------------------------------------------------------------------------------- /Chapter15/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter15/lib/__init__.py -------------------------------------------------------------------------------- /Chapter15/lib/common.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import numpy as np 3 | import torch 4 | import ptan 5 | 6 | 7 | ENV_ID = "MinitaurBulletEnv-v0" 8 | ENTRY = "pybullet_envs.bullet.minitaur_gym_env:MinitaurBulletEnv" 9 | 10 | 11 | 12 | def register_env(): 13 | # Small hack to fix render_modes metadata 14 | from pybullet_envs.bullet.minitaur_gym_env import MinitaurBulletEnv 15 | MinitaurBulletEnv.metadata['render_modes'] = MinitaurBulletEnv.metadata.pop('render.modes') 16 | 17 | # register environment in gymnasium registry, not gym's 18 | gym.register( 19 | ENV_ID, entry_point=ENTRY, 20 | max_episode_steps=1000, reward_threshold=15.0, 21 | apply_api_compatibility=True, 22 | disable_env_checker=True, 23 | ) 24 | 25 | 26 | def unpack_batch_a2c(batch, net, last_val_gamma, device="cpu"): 27 | """ 28 | Convert batch into training tensors 29 | :param batch: 30 | :param net: 31 | :return: states variable, actions tensor, reference values variable 32 | """ 33 | states = [] 34 | actions = [] 35 | rewards = [] 36 | not_done_idx = [] 37 | last_states = [] 38 | for idx, exp in enumerate(batch): 39 | states.append(exp.state) 40 | actions.append(exp.action) 41 | rewards.append(exp.reward) 42 | if exp.last_state is not None: 43 | not_done_idx.append(idx) 44 | last_states.append(exp.last_state) 45 | states_v = ptan.agent.float32_preprocessor(states).to(device) 46 | actions_v = torch.FloatTensor(np.asarray(actions)).to(device) 47 | 48 | # handle rewards 49 | rewards_np = np.array(rewards, dtype=np.float32) 50 | if not_done_idx: 51 | last_states_v = ptan.agent.float32_preprocessor(last_states).to(device) 52 | last_vals_v = net(last_states_v)[2] 53 | last_vals_np = last_vals_v.data.cpu().numpy()[:, 0] 54 | rewards_np[not_done_idx] += last_val_gamma * last_vals_np 55 | 56 | ref_vals_v = torch.FloatTensor(rewards_np).to(device) 57 | return states_v, actions_v, ref_vals_v 58 | 59 | 60 | def unpack_batch_ddqn(batch, device="cpu"): 61 | states, actions, rewards, dones, last_states = [], [], [], [], [] 62 | for exp in batch: 63 | states.append(exp.state) 64 | actions.append(exp.action) 65 | rewards.append(exp.reward) 66 | dones.append(exp.last_state is None) 67 | if exp.last_state is None: 68 | last_states.append(exp.state) 69 | else: 70 | last_states.append(exp.last_state) 71 | states_v = ptan.agent.float32_preprocessor(states).to(device) 72 | actions_v = ptan.agent.float32_preprocessor(actions).to(device) 73 | rewards_v = ptan.agent.float32_preprocessor(rewards).to(device) 74 | last_states_v = ptan.agent.float32_preprocessor(last_states).to(device) 75 | dones_t = torch.BoolTensor(dones).to(device) 76 | return states_v, 
actions_v, rewards_v, dones_t, last_states_v 77 | -------------------------------------------------------------------------------- /Chapter15/requirements.txt: -------------------------------------------------------------------------------- 1 | pybullet==3.2.6 2 | gym==0.25.1 3 | numpy<2 -------------------------------------------------------------------------------- /Chapter16/02_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import gymnasium as gym 4 | 5 | from lib import common, model, kfac 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 14 | parser.add_argument("-e", "--env", choices=list(common.ENV_PARAMS.keys()), 15 | default='cheetah', help="Environment name to use, default=cheehah") 16 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 17 | parser.add_argument("--acktr", default=False, action='store_true', help="Enable Acktr-specific tweaks") 18 | parser.add_argument("--mujoco", default=False, action='store_true', help="Enable MuJoCo, default=PyBullet") 19 | args = parser.parse_args() 20 | 21 | env_id = common.register_env(args.env, args.mujoco) 22 | env = gym.make(env_id, render_mode='rgb_array') 23 | if args.record is not None: 24 | env = gym.wrappers.RecordVideo(env, video_folder=args.record) 25 | 26 | net = model.ModelActor(env.observation_space.shape[0], env.action_space.shape[0]) 27 | if args.acktr: 28 | opt = kfac.KFACOptimizer(net) 29 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 30 | 31 | obs, _ = env.reset() 32 | total_reward = 0.0 33 | total_steps = 0 34 | while True: 35 | obs_v = torch.FloatTensor(obs) 36 | mu_v = net(obs_v) 37 | action = mu_v.squeeze(dim=0).data.numpy() 38 | action = np.clip(action, -1, 1) 39 | if np.isscalar(action): 40 | action = [action] 41 | obs, reward, done, is_tr, _ = env.step(action) 42 | total_reward += reward 43 | total_steps += 1 44 | if done or is_tr: 45 | break 46 | print("In %d steps we got %.3f reward" % (total_steps, total_reward)) 47 | -------------------------------------------------------------------------------- /Chapter16/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter16/lib/__init__.py -------------------------------------------------------------------------------- /Chapter16/lib/trpo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | 5 | 6 | def get_flat_params_from(model): 7 | params = [] 8 | for param in model.parameters(): 9 | params.append(param.data.view(-1)) 10 | 11 | flat_params = torch.cat(params) 12 | return flat_params 13 | 14 | 15 | def set_flat_params_to(model, flat_params): 16 | prev_ind = 0 17 | for param in model.parameters(): 18 | flat_size = int(np.prod(list(param.size()))) 19 | param.data.copy_( 20 | flat_params[prev_ind:prev_ind + flat_size].view(param.size())) 21 | prev_ind += flat_size 22 | 23 | 24 | def conjugate_gradients(Avp, b, nsteps, residual_tol=1e-10, device="cpu"): 25 | x = torch.zeros(b.size()).to(device) 26 | r = b.clone() 27 | p = b.clone() 28 | rdotr = torch.dot(r, 
r) 29 | for i in range(nsteps): 30 | _Avp = Avp(p) 31 | alpha = rdotr / torch.dot(p, _Avp) 32 | x += alpha * p 33 | r -= alpha * _Avp 34 | new_rdotr = torch.dot(r, r) 35 | betta = new_rdotr / rdotr 36 | p = r + betta * p 37 | rdotr = new_rdotr 38 | if rdotr < residual_tol: 39 | break 40 | return x 41 | 42 | 43 | def linesearch(model, 44 | f, 45 | x, 46 | fullstep, 47 | expected_improve_rate, 48 | max_backtracks=10, 49 | accept_ratio=.1): 50 | fval = f().data 51 | for (_n_backtracks, stepfrac) in enumerate(.5**np.arange(max_backtracks)): 52 | xnew = x + fullstep * stepfrac 53 | set_flat_params_to(model, xnew) 54 | newfval = f().data 55 | actual_improve = fval - newfval 56 | expected_improve = expected_improve_rate * stepfrac 57 | ratio = actual_improve / expected_improve 58 | 59 | if ratio.item() > accept_ratio and actual_improve.item() > 0: 60 | return True, xnew 61 | return False, x 62 | 63 | 64 | def trpo_step(model, get_loss, get_kl, max_kl, damping, device="cpu"): 65 | loss = get_loss() 66 | grads = torch.autograd.grad(loss, model.parameters()) 67 | loss_grad = torch.cat([grad.view(-1) for grad in grads]).data 68 | 69 | def Fvp(v): 70 | kl = get_kl() 71 | kl = kl.mean() 72 | 73 | grads = torch.autograd.grad(kl, model.parameters(), create_graph=True) 74 | flat_grad_kl = torch.cat([grad.view(-1) for grad in grads]) 75 | 76 | v_v = v.clone().detach().to(device) 77 | kl_v = (flat_grad_kl * v_v).sum() 78 | grads = torch.autograd.grad(kl_v, model.parameters()) 79 | flat_grad_grad_kl = torch.cat([grad.contiguous().view(-1) for grad in grads]).data 80 | 81 | return flat_grad_grad_kl + v * damping 82 | 83 | stepdir = conjugate_gradients(Fvp, -loss_grad, 10, device=device) 84 | 85 | shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True) 86 | 87 | lm = torch.sqrt(shs / max_kl) 88 | fullstep = stepdir / lm[0] 89 | 90 | neggdotstepdir = (-loss_grad * stepdir).sum(0, keepdim=True) 91 | 92 | prev_params = get_flat_params_from(model) 93 | success, new_params = linesearch(model, get_loss, prev_params, fullstep, 94 | neggdotstepdir / lm[0]) 95 | set_flat_params_to(model, new_params) 96 | 97 | return loss 98 | -------------------------------------------------------------------------------- /Chapter16/requirements.txt: -------------------------------------------------------------------------------- 1 | gymnasium[mujoco]==0.29.0 -------------------------------------------------------------------------------- /Chapter17/.gitignore: -------------------------------------------------------------------------------- 1 | res 2 | -------------------------------------------------------------------------------- /Chapter17/01_cartpole_es.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import time 4 | import numpy as np 5 | import typing as tt 6 | 7 | import torch 8 | import torch.nn as nn 9 | 10 | from torch.utils.tensorboard.writer import SummaryWriter 11 | 12 | from lib import common 13 | 14 | 15 | MAX_BATCH_EPISODES = 100 16 | MAX_BATCH_STEPS = 10000 17 | NOISE_STD = 0.001 18 | LEARNING_RATE = 0.001 19 | 20 | 21 | 22 | class Net(nn.Module): 23 | def __init__(self, obs_size: int, action_size: int): 24 | super(Net, self).__init__() 25 | self.net = nn.Sequential( 26 | nn.Linear(obs_size, 32), 27 | nn.ReLU(), 28 | nn.Linear(32, action_size), 29 | nn.Softmax(dim=1) 30 | ) 31 | 32 | def forward(self, x: torch.Tensor) -> torch.Tensor: 33 | return self.net(x) 34 | 35 | 36 | def train_step(net: Net, batch_noise: 
tt.List[common.TNoise], batch_reward: tt.List[float], 37 | writer: SummaryWriter, step_idx: int): 38 | weighted_noise = None 39 | norm_reward = np.array(batch_reward) 40 | norm_reward -= np.mean(norm_reward) 41 | s = np.std(norm_reward) 42 | if abs(s) > 1e-6: 43 | norm_reward /= s 44 | 45 | for noise, reward in zip(batch_noise, norm_reward): 46 | if weighted_noise is None: 47 | weighted_noise = [reward * p_n for p_n in noise] 48 | else: 49 | for w_n, p_n in zip(weighted_noise, noise): 50 | w_n += reward * p_n 51 | m_updates = [] 52 | for p, p_update in zip(net.parameters(), weighted_noise): 53 | update = p_update / (len(batch_reward) * NOISE_STD) 54 | p.data += LEARNING_RATE * update 55 | m_updates.append(torch.norm(update)) 56 | writer.add_scalar("update_l2", np.mean(m_updates), step_idx) 57 | 58 | 59 | if __name__ == "__main__": 60 | writer = SummaryWriter(comment="-cartpole-es") 61 | env = gym.make("CartPole-v1") 62 | 63 | net = Net(env.observation_space.shape[0], env.action_space.n) 64 | print(net) 65 | 66 | step_idx = 0 67 | while True: 68 | t_start = time.time() 69 | batch_noise = [] 70 | batch_reward = [] 71 | batch_steps = 0 72 | for _ in range(MAX_BATCH_EPISODES): 73 | noise, neg_noise = common.sample_noise(net) 74 | batch_noise.append(noise) 75 | batch_noise.append(neg_noise) 76 | reward, steps = common.eval_with_noise( 77 | env, net, noise, NOISE_STD) 78 | batch_reward.append(reward) 79 | batch_steps += steps 80 | reward, steps = common.eval_with_noise( 81 | env, net, neg_noise, NOISE_STD) 82 | batch_reward.append(reward) 83 | batch_steps += steps 84 | if batch_steps > MAX_BATCH_STEPS: 85 | break 86 | 87 | step_idx += 1 88 | m_reward = float(np.mean(batch_reward)) 89 | if m_reward > 199: 90 | print("Solved in %d steps" % step_idx) 91 | break 92 | 93 | train_step(net, batch_noise, batch_reward, writer, step_idx) 94 | writer.add_scalar("reward_mean", m_reward, step_idx) 95 | writer.add_scalar("reward_std", np.std(batch_reward), step_idx) 96 | writer.add_scalar("reward_max", np.max(batch_reward), step_idx) 97 | writer.add_scalar("batch_episodes", len(batch_reward), step_idx) 98 | writer.add_scalar("batch_steps", batch_steps, step_idx) 99 | speed = batch_steps / (time.time() - t_start) 100 | writer.add_scalar("speed", speed, step_idx) 101 | print("%d: reward=%.2f, speed=%.2f f/s" % ( 102 | step_idx, m_reward, speed)) 103 | -------------------------------------------------------------------------------- /Chapter17/03_cartpole_ga.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import gymnasium as gym 3 | import copy 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | 9 | from torch.utils.tensorboard.writer import SummaryWriter 10 | from lib import common 11 | 12 | 13 | NOISE_STD = 0.01 14 | POPULATION_SIZE = 50 15 | PARENTS_COUNT = 10 16 | 17 | 18 | class Net(nn.Module): 19 | def __init__(self, obs_size: int, action_size: int): 20 | super(Net, self).__init__() 21 | self.net = nn.Sequential( 22 | nn.Linear(obs_size, 32), 23 | nn.ReLU(), 24 | nn.Linear(32, action_size), 25 | nn.Softmax(dim=1) 26 | ) 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | return self.net(x) 30 | 31 | 32 | def mutate_parent(net: Net) -> Net: 33 | new_net = copy.deepcopy(net) 34 | for p in new_net.parameters(): 35 | noise = np.random.normal(size=p.data.size()) 36 | noise_t = torch.FloatTensor(noise) 37 | p.data += NOISE_STD * noise_t 38 | return new_net 39 | 40 | 41 | if __name__ == "__main__": 42 | env = 
gym.make("CartPole-v1") 43 | writer = SummaryWriter(comment="-cartpole-ga") 44 | 45 | gen_idx = 0 46 | nets = [ 47 | Net(env.observation_space.shape[0], env.action_space.n) 48 | for _ in range(POPULATION_SIZE) 49 | ] 50 | population = [ 51 | (net, common.evaluate(env, net)) 52 | for net in nets 53 | ] 54 | while True: 55 | population.sort(key=lambda p: p[1], reverse=True) 56 | rewards = [p[1] for p in population[:PARENTS_COUNT]] 57 | reward_mean = np.mean(rewards) 58 | reward_max = np.max(rewards) 59 | reward_std = np.std(rewards) 60 | 61 | writer.add_scalar("reward_mean", reward_mean, gen_idx) 62 | writer.add_scalar("reward_std", reward_std, gen_idx) 63 | writer.add_scalar("reward_max", reward_max, gen_idx) 64 | print("%d: reward_mean=%.2f, reward_max=%.2f, reward_std=%.2f" % ( 65 | gen_idx, reward_mean, reward_max, reward_std)) 66 | if reward_mean > 199: 67 | print("Solved in %d steps" % gen_idx) 68 | break 69 | 70 | # generate next population 71 | prev_population = population 72 | population = [population[0]] 73 | for _ in range(POPULATION_SIZE-1): 74 | parent_idx = np.random.randint(0, PARENTS_COUNT) 75 | parent = prev_population[parent_idx][0] 76 | net = mutate_parent(parent) 77 | fitness = common.evaluate(env, net) 78 | population.append((net, fitness)) 79 | gen_idx += 1 80 | writer.close() 81 | -------------------------------------------------------------------------------- /Chapter17/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter17/lib/__init__.py -------------------------------------------------------------------------------- /Chapter17/lib/common.py: -------------------------------------------------------------------------------- 1 | import typing as tt 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | import gymnasium as gym 6 | 7 | 8 | TNoise = tt.List[torch.Tensor] 9 | 10 | 11 | def sample_noise( 12 | net: nn.Module, 13 | device: torch.device = torch.device('cpu') 14 | ) -> tt.Tuple[TNoise, TNoise]: 15 | pos = [] 16 | neg = [] 17 | for p in net.parameters(): 18 | noise = np.random.normal(size=p.data.size()) 19 | pos.append(torch.FloatTensor(noise).to(device)) 20 | neg.append(torch.FloatTensor(-noise).to(device)) 21 | return pos, neg 22 | 23 | 24 | def evaluate(env: gym.Env, net: nn.Module, get_max_action: bool = True, 25 | device: torch.device = torch.device('cpu')) -> tt.Tuple[float, int]: 26 | obs, _ = env.reset() 27 | reward = 0.0 28 | steps = 0 29 | while True: 30 | obs_v = torch.FloatTensor(np.expand_dims(obs, 0)).to(device) 31 | act_v = net(obs_v) 32 | if get_max_action: 33 | act = act_v.max(dim=1)[1].data.numpy()[0] 34 | else: 35 | act = act_v.data.cpu().numpy()[0] 36 | obs, r, done, is_tr, _ = env.step(act) 37 | reward += r 38 | steps += 1 39 | if done or is_tr: 40 | break 41 | return reward, steps 42 | 43 | 44 | def eval_with_noise(env: gym.Env, net: nn.Module, noise: TNoise, noise_std: float, 45 | get_max_action: bool = True, device: torch.device = torch.device("cpu") 46 | ) -> tt.Tuple[float, int]: 47 | old_params = net.state_dict() 48 | for p, p_n in zip(net.parameters(), noise): 49 | p.data += noise_std * p_n 50 | r, s = evaluate(env, net, get_max_action=get_max_action, device=device) 51 | net.load_state_dict(old_params) 52 | return r, s 53 | -------------------------------------------------------------------------------- 
/Chapter18/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter18/lib/__init__.py -------------------------------------------------------------------------------- /Chapter18/riverswim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import random 3 | import argparse 4 | import collections 5 | import typing as tt 6 | 7 | SEED = 2 8 | 9 | 10 | def get_action(state: int, total_states: int) -> int: 11 | """ 12 | Return action from the given state. Actions are selected randomly 13 | :param state: state we're currently in 14 | :return: 0 means left, 1 is right 15 | """ 16 | if state == 1: 17 | return 1 18 | if state == total_states: 19 | return 0 20 | return random.choice([0, 1]) 21 | 22 | 23 | def do_action(state: int, action: int) -> int: 24 | """ 25 | Simulate the action from the given state 26 | """ 27 | # left action always succeeds and brings us to the left 28 | if action == 0: 29 | return state-1 30 | 31 | if state == 1: 32 | return random.choices([1, 2], weights=[0.4, 0.6])[0] 33 | # the rest of states are the same 34 | delta = random.choices([-1, 0, 1], weights=[0.05, 0.6, 0.35])[0] 35 | return state + delta 36 | 37 | 38 | if __name__ == "__main__": 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("-n", "--steps", type=int, default=100, help="Amount of steps to simulate, default=100") 41 | parser.add_argument("--episode-length", type=int, default=10, help="Limit of one episode, default=10") 42 | parser.add_argument("--seed", type=int, default=SEED, help="Seed to use, default=%d" % SEED) 43 | parser.add_argument("--env-len", type=int, default=6, help="Amount of states in the environment, default=6") 44 | args = parser.parse_args() 45 | random.seed(args.seed) 46 | 47 | states_count: tt.Counter[int] = collections.Counter() 48 | state = 1 49 | episode_step = 0 50 | 51 | for _ in range(args.steps): 52 | action = get_action(state, args.env_len) 53 | state = do_action(state, action) 54 | states_count[state] += 1 55 | episode_step += 1 56 | if episode_step == args.episode_length: 57 | state = 1 58 | episode_step = 0 59 | 60 | for state in range(1, args.env_len+1): 61 | print("%d:\t%d" % (state, states_count[state])) 62 | -------------------------------------------------------------------------------- /Chapter18/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter18/tests/__init__.py -------------------------------------------------------------------------------- /Chapter18/tests/test_ppo.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from numpy import testing 3 | 4 | from lib import ppo 5 | 6 | 7 | class TestPPO(unittest.TestCase): 8 | def test_adv_ref(self): 9 | vals = [0, 0, 0, 0, 0] 10 | dones = [False, False, True, False, False] 11 | rewards = [1, 1, 1, 1, 1] 12 | 13 | adv_t, ref_t = ppo.calc_adv_ref(vals, dones, rewards, 1.0, 1.0) 14 | adv = adv_t.detach().numpy() 15 | ref = ref_t.detach().numpy() 16 | 17 | testing.assert_array_equal(ref, [3, 2, 1, 1]) 18 | testing.assert_array_equal(ref, adv) 19 | 20 | adv_t, ref_t = ppo.calc_adv_ref(vals, dones, rewards, 0.9, 
1.0) 21 | adv = adv_t.detach().numpy() 22 | ref = ref_t.detach().numpy() 23 | 24 | testing.assert_array_almost_equal(ref, [2.71, 1.9, 1., 1.]) 25 | testing.assert_array_almost_equal(ref, adv) 26 | 27 | 28 | pass 29 | 30 | -------------------------------------------------------------------------------- /Chapter19/.gitignore: -------------------------------------------------------------------------------- 1 | db* 2 | *.dat 3 | rec* 4 | -------------------------------------------------------------------------------- /Chapter19/01_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import pathlib 4 | 5 | import gymnasium as gym 6 | 7 | from lib import common, rlhf 8 | import ptan 9 | 10 | import numpy as np 11 | import torch 12 | import torch.nn.functional as F 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 18 | parser.add_argument("-e", "--env", default="SeaquestNoFrameskip-v4", 19 | help="Environment name to use, default=SeaquestNoFrameskip-v4") 20 | parser.add_argument("-r", "--record", help="If specified, sets the recording dir, default=Disabled") 21 | parser.add_argument("-n", type=int, help="Count of experiments to run") 22 | parser.add_argument("--reward", help="Path to reward model, if not given - use env reward") 23 | args = parser.parse_args() 24 | 25 | rounds = args.n if args.n is not None else 1 26 | logs = [] 27 | 28 | for round in range(rounds): 29 | video_folder = args.record 30 | if args.n is not None: 31 | video_folder += "-" + str(round) 32 | env = gym.make(args.env, render_mode='rgb_array') 33 | if args.record is not None: 34 | env = gym.wrappers.RecordVideo(env, video_folder=video_folder) 35 | if args.reward is not None: 36 | p = pathlib.Path(args.reward) 37 | env = rlhf.RewardModelWrapper(env, p, dev=torch.device("cpu")) 38 | env = ptan.common.wrappers.wrap_dqn(env, clip_reward=False) 39 | print(env) 40 | 41 | net = common.AtariA2C(env.observation_space.shape, env.action_space.n) 42 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 43 | 44 | obs, _ = env.reset() 45 | total_reward = 0.0 46 | total_steps = 0 47 | while True: 48 | obs_v = torch.FloatTensor(obs).unsqueeze(0) 49 | policy_v = net(obs_v)[0] 50 | policy_v = F.softmax(policy_v, dim=1) 51 | probs = policy_v[0].detach().cpu().numpy() 52 | action = np.random.choice(len(probs), p=probs) 53 | obs, reward, done, is_tr, _ = env.step(action) 54 | total_reward += reward 55 | total_steps += 1 56 | if done or is_tr: 57 | break 58 | if total_steps > 100000: 59 | break 60 | logs.append("%d: %d steps we got %.3f reward" % (round, total_steps, total_reward)) 61 | env.close() 62 | print("\n".join(logs)) 63 | -------------------------------------------------------------------------------- /Chapter19/02_label_ui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Web interface to label stored data 4 | """ 5 | import argparse 6 | 7 | from nicegui import ui 8 | import typing as tt 9 | 10 | from lib import ui_tools, rlhf 11 | 12 | db: tt.Optional[rlhf.Database] = None 13 | to_label: tt.List[rlhf.HumanLabel] = [] 14 | 15 | 16 | 17 | def label_ui(): 18 | with ui.splitter().classes("w-full") as splitter: 19 | with splitter.before: 20 | ui.label("List with data samples") 21 | with splitter.after: 22 | 
ui.label("Interface with gif") 23 | 24 | 25 | @ui.page(ui_tools.URL_ROOT, title="RLHF db overview") 26 | def view_root(): 27 | ui_tools.drawers(ui_tools.URL_ROOT) 28 | ui.label(f"DB path: {db.db_root}") 29 | ui.label(f"Trajectories: {len(db.paths)}") 30 | ui.label(f"Human Labels: {len(db.labels)}") 31 | 32 | 33 | @ui.page(ui_tools.URL_LABEL, title="RLHF label data") 34 | def view_label(): 35 | ui_tools.drawers(ui_tools.URL_LABEL) 36 | ui_tools.label_list_view(db, to_label) 37 | 38 | 39 | @ui.page(ui_tools.URL_DATA, title="RLHF existing data") 40 | def view_label(): 41 | ui_tools.drawers(ui_tools.URL_DATA) 42 | # make a copy, just in case 43 | labels_list = list(db.labels) 44 | ui_tools.label_list_view(db, labels_list, show_resample_list=False) 45 | 46 | 47 | if __name__ in {"__main__", "__mp_main__"}: 48 | parser = argparse.ArgumentParser() 49 | parser.add_argument("-d", "--db", required=True, help="DB path to label") 50 | args = parser.parse_args() 51 | 52 | db = rlhf.load_db(args.db) 53 | to_label = rlhf.sample_to_label(db) 54 | 55 | ui.run(host='0.0.0.0', port=8080, show=False) 56 | -------------------------------------------------------------------------------- /Chapter19/adhoc/obs_to_gif.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pickle 3 | import argparse 4 | import pathlib 5 | from PIL import Image 6 | 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("-i", "--input", required=True, help="Input file name") 11 | parser.add_argument("-o", "--output", required=True, help="Output file name") 12 | args = parser.parse_args() 13 | 14 | dat = pathlib.Path(args.input).read_bytes() 15 | steps = pickle.loads(dat) 16 | print(len(steps)) 17 | sh = steps[0].obs.shape 18 | im = Image.new("RGB", (sh[1], sh[0]), (0, 0, 0)) 19 | images = [ 20 | Image.fromarray(step.obs) 21 | for step in steps 22 | ] 23 | im.save(args.output, save_all=True, append_images=images, 24 | duration=300, loop=0) 25 | -------------------------------------------------------------------------------- /Chapter19/adhoc/rw_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | sys.path.append(".") 4 | import gymnasium as gym 5 | import pathlib 6 | import torch 7 | import argparse 8 | 9 | from lib import rlhf 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("-r", "--reward", required=True, 15 | help="Path to reward model file") 16 | parser.add_argument("-d", "--dev", default="cuda") 17 | args = parser.parse_args() 18 | dev = torch.device(args.dev) 19 | 20 | e = gym.make("SeaquestNoFrameskip-v4") 21 | p = pathlib.Path(args.reward) 22 | e = rlhf.RewardModelWrapper(e, p, dev) 23 | r, _ = e.reset() 24 | obs, r, is_done, is_tr, extra = e.step(0) 25 | print(obs.shape) 26 | print(r) 27 | -------------------------------------------------------------------------------- /Chapter19/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter19/lib/__init__.py -------------------------------------------------------------------------------- /Chapter19/requirements.txt: -------------------------------------------------------------------------------- 1 | nicegui==1.4.26 2 | 
-------------------------------------------------------------------------------- /Chapter20/.gitignore: -------------------------------------------------------------------------------- 1 | res 2 | -------------------------------------------------------------------------------- /Chapter20/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/lib/__init__.py -------------------------------------------------------------------------------- /Chapter20/play-mu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import time 4 | import argparse 5 | 6 | from lib import game 7 | from lib import muzero as mu 8 | 9 | import torch 10 | 11 | 12 | MCTS_SEARCHES = 10 13 | MCTS_BATCH_SIZE = 8 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("models", nargs='+', help="The list of models (at least 2) to play against each other") 19 | parser.add_argument("-r", "--rounds", type=int, default=2, help="Count of rounds to perform for every pair, default=2") 20 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable CUDA") 21 | args = parser.parse_args() 22 | device = torch.device("cuda" if args.cuda else "cpu") 23 | params = mu.MuZeroParams(dev=device) 24 | 25 | nets = [] 26 | for fname in args.models: 27 | net = mu.MuZeroModels(mu.OBS_SHAPE, game.GAME_COLS) 28 | d = torch.load(fname, map_location=lambda storage, loc: storage, weights_only=True) 29 | net.set_state_dict(d) 30 | net.to(device) 31 | nets.append((fname, net)) 32 | 33 | total_agent = {} 34 | total_pairs = {} 35 | 36 | for idx1, n1 in enumerate(nets): 37 | for idx2, n2 in enumerate(nets): 38 | if idx1 == idx2: 39 | continue 40 | wins, losses, draws = 0, 0, 0 41 | ts = time.time() 42 | for _ in range(args.rounds): 43 | r, _ = mu.play_game(n1[1], n2[1], params, temperature=0) 44 | if r > 0.5: 45 | wins += 1 46 | elif r < -0.5: 47 | losses += 1 48 | else: 49 | draws += 1 50 | speed_games = args.rounds / (time.time() - ts) 51 | name_1, name_2 = n1[0], n2[0] 52 | print("%s vs %s -> w=%d, l=%d, d=%d" % (name_1, name_2, wins, losses, draws)) 53 | sys.stderr.write("Speed %.2f games/s\n" % speed_games) 54 | sys.stdout.flush() 55 | game.update_counts(total_agent, name_1, (wins, losses, draws)) 56 | game.update_counts(total_agent, name_2, (losses, wins, draws)) 57 | game.update_counts(total_pairs, (name_1, name_2), (wins, losses, draws)) 58 | 59 | # leaderboard by total wins 60 | total_leaders = list(total_agent.items()) 61 | total_leaders.sort(reverse=True, key=lambda p: p[1][0]) 62 | 63 | print("Leaderboard:") 64 | for name, (wins, losses, draws) in total_leaders: 65 | print("%s: \t w=%d, l=%d, d=%d" % (name, wins, losses, draws)) 66 | -------------------------------------------------------------------------------- /Chapter20/play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import time 4 | import argparse 5 | 6 | from lib import game, model 7 | 8 | import torch 9 | 10 | 11 | MCTS_SEARCHES = 10 12 | MCTS_BATCH_SIZE = 8 13 | 14 | 15 | if __name__ == "__main__": 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("models", nargs='+', help="The list of models (at least 2) to play against each other") 18 | 
parser.add_argument("-r", "--rounds", type=int, default=2, help="Count of rounds to perform for every pair") 19 | parser.add_argument("--cuda", default=False, action="store_true", help="Enable CUDA") 20 | args = parser.parse_args() 21 | device = torch.device("cuda" if args.cuda else "cpu") 22 | 23 | nets = [] 24 | for fname in args.models: 25 | net = model.Net(model.OBS_SHAPE, game.GAME_COLS) 26 | net.load_state_dict(torch.load(fname, map_location=lambda storage, loc: storage, weights_only=True)) 27 | net = net.to(device) 28 | nets.append((fname, net)) 29 | 30 | total_agent = {} 31 | total_pairs = {} 32 | 33 | for idx1, n1 in enumerate(nets): 34 | for idx2, n2 in enumerate(nets): 35 | if idx1 == idx2: 36 | continue 37 | wins, losses, draws = 0, 0, 0 38 | ts = time.time() 39 | for _ in range(args.rounds): 40 | r, _ = model.play_game(mcts_stores=None, replay_buffer=None, net1=n1[1], net2=n2[1], steps_before_tau_0=0, 41 | mcts_searches=MCTS_SEARCHES, mcts_batch_size=MCTS_BATCH_SIZE, device=device) 42 | if r > 0.5: 43 | wins += 1 44 | elif r < -0.5: 45 | losses += 1 46 | else: 47 | draws += 1 48 | speed_games = args.rounds / (time.time() - ts) 49 | name_1, name_2 = n1[0], n2[0] 50 | print("%s vs %s -> w=%d, l=%d, d=%d" % (name_1, name_2, wins, losses, draws)) 51 | sys.stderr.write("Speed %.2f games/s\n" % speed_games) 52 | sys.stdout.flush() 53 | game.update_counts(total_agent, name_1, (wins, losses, draws)) 54 | game.update_counts(total_agent, name_2, (losses, wins, draws)) 55 | game.update_counts(total_pairs, (name_1, name_2), (wins, losses, draws)) 56 | 57 | # leaderboard by total wins 58 | total_leaders = list(total_agent.items()) 59 | total_leaders.sort(reverse=True, key=lambda p: p[1][0]) 60 | 61 | print("Leaderboard:") 62 | for name, (wins, losses, draws) in total_leaders: 63 | print("%s: \t w=%d, l=%d, d=%d" % (name, wins, losses, draws)) 64 | -------------------------------------------------------------------------------- /Chapter20/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tests/__init__.py -------------------------------------------------------------------------------- /Chapter20/tests/test_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from lib import game, model 5 | 6 | 7 | class TestEncoding(unittest.TestCase): 8 | def test_encoding(self): 9 | s = [[0, 1, 0], [0], [1, 1, 1], [], [1], [], []] 10 | batch_v = model.state_lists_to_batch([s, s], [game.PLAYER_BLACK, game.PLAYER_WHITE]) 11 | batch = batch_v.data.numpy() 12 | np.testing.assert_equal(batch, [ 13 | # black player's view 14 | [ 15 | # player 16 | [ 17 | [0, 0, 0, 0, 0, 0, 0], 18 | [0, 0, 0, 0, 0, 0, 0], 19 | [0, 0, 0, 0, 0, 0, 0], 20 | [0, 0, 1, 0, 0, 0, 0], 21 | [1, 0, 1, 0, 0, 0, 0], 22 | [0, 0, 1, 0, 1, 0, 0], 23 | ], 24 | # opponent 25 | [ 26 | [0, 0, 0, 0, 0, 0, 0], 27 | [0, 0, 0, 0, 0, 0, 0], 28 | [0, 0, 0, 0, 0, 0, 0], 29 | [1, 0, 0, 0, 0, 0, 0], 30 | [0, 0, 0, 0, 0, 0, 0], 31 | [1, 1, 0, 0, 0, 0, 0], 32 | ] 33 | ], 34 | # white player's view 35 | [ 36 | # player 37 | [ 38 | [0, 0, 0, 0, 0, 0, 0], 39 | [0, 0, 0, 0, 0, 0, 0], 40 | [0, 0, 0, 0, 0, 0, 0], 41 | [1, 0, 0, 0, 0, 0, 0], 42 | [0, 0, 0, 0, 0, 0, 0], 43 | [1, 1, 0, 0, 0, 0, 0], 44 | ], 45 | # opponent 46 | [ 47 | [0, 0, 0, 0, 0, 0, 0], 48 | [0, 0, 0, 0, 0, 0, 
0], 49 | [0, 0, 0, 0, 0, 0, 0], 50 | [0, 0, 1, 0, 0, 0, 0], 51 | [1, 0, 1, 0, 0, 0, 0], 52 | [0, 0, 1, 0, 1, 0, 0], 53 | ] 54 | ], 55 | ]) 56 | 57 | 58 | pass 59 | -------------------------------------------------------------------------------- /Chapter20/tests/test_muzero.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from lib import muzero as mu 3 | from lib import game 4 | 5 | 6 | def test_node(): 7 | n = mu.MCTSNode(0.5, first_plays=True) 8 | assert not n.is_expanded 9 | assert n.value == 0 10 | 11 | 12 | def test_mcts(): 13 | params = mu.MuZeroParams() 14 | models = mu.MuZeroModels(mu.OBS_SHAPE, game.GAME_COLS) 15 | min_max = mu.MinMaxStats() 16 | root = mu.run_mcts(0, game.INITIAL_STATE, params, models, 17 | search_rounds=10, min_max=min_max) 18 | assert root.is_expanded 19 | assert len(root.children) == game.GAME_COLS 20 | assert root.visit_count == 11 21 | 22 | 23 | def test_action_selection(): 24 | params = mu.MuZeroParams() 25 | root = mu.MCTSNode(0.5, first_plays=True) 26 | np.random.seed(10) 27 | v = root.select_action(1, params) 28 | assert v == 1 29 | for a in range(params.actions_count): 30 | root.children[a] = mu.MCTSNode(0.1, first_plays=False) 31 | root.children[0].visit_count = 100 32 | v = root.select_action(0.0000001, params) 33 | assert v == 0 34 | v = root.select_action(0.1, params) 35 | assert v == 0 36 | 37 | 38 | def test_play_game(): 39 | params = mu.MuZeroParams() 40 | models = mu.MuZeroModels(mu.OBS_SHAPE, game.GAME_COLS) 41 | reward, episode = mu.play_game( 42 | models, models, params, temperature=0, 43 | init_state=8516337133269602564 44 | ) 45 | assert episode -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/final-short.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tournament/2ed/final-short.png -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/final.csv: -------------------------------------------------------------------------------- 1 | model_index,wins 2 | 6,0.5321052631578947 3 | 8,0.5236842105263158 4 | 10,0.4268421052631579 5 | 18,0.3813157894736842 6 | 20,0.4676315789473684 7 | 25,0.40131578947368424 8 | 37,0.4713157894736842 9 | 38,0.44105263157894736 10 | 46,0.47578947368421054 11 | 49,0.4886842105263158 12 | 50,0.5081578947368421 13 | 57,0.48210526315789476 14 | 66,0.5565789473684211 15 | 68,0.5344736842105263 16 | 72,0.5613157894736842 17 | 78,0.44552631578947366 18 | 87,0.5673684210526316 19 | 88,0.5671052631578948 20 | 91,0.5931578947368421 21 | 105,0.5692105263157895 22 | -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/final_plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv(\"final.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | 
"metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "fig = plt.figure()\n", 35 | "ax1 = fig.add_subplot(111)\n", 36 | "\n", 37 | "ax1.plot(df.model_index, df.wins, color='black', linewidth=.8, linestyle='-')\n", 38 | "ax1.grid(True, axis='both')\n", 39 | "ax1.set_xlabel(\"Model index\")\n", 40 | "ax1.set_xlim(0, max(df.model_index))\n", 41 | "ax1.set_ylabel(\"Win ratio\")\n", 42 | "plt.savefig(\"final.svg\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python [default]", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.5.2" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/semi-common.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tournament/2ed/semi-common.png -------------------------------------------------------------------------------- /Chapter20/tournament/2ed/semi-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter20/tournament/2ed/semi-scores.png -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/final.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ./play.py --cuda -r 10 saves/t1/best_088_39300.dat saves/t1/best_025_09900.dat saves/t1/best_022_08200.dat \ 4 | saves/t1/best_021_08100.dat saves/t1/best_009_03400.dat saves/t1/best_014_04700.dat saves/t1/best_008_02700.dat \ 5 | saves/t1/best_010_03500.dat saves/t1/best_029_11800.dat saves/t1/best_007_02300.dat \ 6 | saves/t2/best_069_41500.dat saves/t2/best_070_42200.dat saves/t2/best_066_38900.dat saves/t2/best_071_42600.dat \ 7 | saves/t2/best_059_33700.dat saves/t2/best_049_27500.dat saves/t2/best_068_41300.dat saves/t2/best_048_26700.dat \ 8 | saves/t2/best_058_32100.dat saves/t2/best_076_45200.dat > final.txt 9 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/final_plot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import matplotlib.pyplot as plt\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 15, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "df = pd.read_csv(\"final.csv\")" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 17, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | 
"source": [ 34 | "fig = plt.figure()\n", 35 | "ax1 = fig.add_subplot(111)\n", 36 | "\n", 37 | "ax1.plot(df.model_index, df.wins, color='black', linewidth=.8, linestyle='-')\n", 38 | "ax1.grid(True, axis='both')\n", 39 | "ax1.set_xlabel(\"Model index\")\n", 40 | "ax1.set_xlim(0, max(df.model_index))\n", 41 | "ax1.set_ylabel(\"Win ratio\")\n", 42 | "plt.savefig(\"final.svg\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python [default]", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.5.2" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/mu-v1-wins.csv: -------------------------------------------------------------------------------- 1 | Wall time,Step,Value 2 | 0,5,0.6578947368421053 3 | 0,10,0.8921052631578947 4 | 0,15,0.7842105263157895 5 | 0,20,0.4263157894736842 6 | 0,25,0.41578947368421054 7 | 0,30,0.3868421052631579 8 | 0,35,0.4421052631578947 9 | 0,40,0.5368421052631579 10 | 0,45,0.55 11 | 0,50,0.5078947368421053 12 | 0,55,0.41578947368421054 13 | 0,60,0.42894736842105263 14 | 0,65,0.33157894736842103 15 | 0,70,0.37105263157894736 16 | 0,75,0.29210526315789476 17 | 0,80,0.32894736842105265 18 | 0,85,0.37105263157894736 19 | 0,90,0.3815789473684211 20 | 0,95,0.4131578947368421 21 | 0,100,0.48947368421052634 22 | 0,105,0.45 23 | 0,110,0.46842105263157896 24 | 0,115,0.46578947368421053 25 | 0,120,0.4394736842105263 26 | 0,125,0.48157894736842105 27 | 0,130,0.5184210526315789 28 | 0,135,0.5789473684210527 29 | 0,140,0.7026315789473684 30 | 0,145,0.7 31 | 0,150,0.718421052631579 32 | 0,155,0.7552631578947369 33 | 0,160,0.6210526315789474 34 | 0,165,0.6657894736842105 35 | 0,170,0.5421052631578948 36 | 0,175,0.4473684210526316 37 | 0,180,0.3447368421052632 38 | 0,185,0.29736842105263156 39 | 0,190,0.3815789473684211 40 | 0,195,0.48157894736842105 41 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/v1-wins.csv: -------------------------------------------------------------------------------- 1 | Wall time,Step,Value 2 | 0,1,0.45738636363636365 3 | 0,2,0.475 4 | 0,3,0.4727272727272727 5 | 0,4,0.5022727272727273 6 | 0,5,0.5136363636363637 7 | 0,6,0.5107954545454545 8 | 0,7,0.5568181818181818 9 | 0,8,0.5670454545454545 10 | 0,9,0.5738636363636364 11 | 0,10,0.5664772727272728 12 | 0,11,0.5522727272727272 13 | 0,12,0.5272727272727272 14 | 0,13,0.5375 15 | 0,14,0.5698863636363637 16 | 0,15,0.49772727272727274 17 | 0,16,0.55625 18 | 0,17,0.5261363636363636 19 | 0,18,0.5426136363636364 20 | 0,19,0.5443181818181818 21 | 0,20,0.5113636363636364 22 | 0,21,0.5778409090909091 23 | 0,22,0.58125 24 | 0,23,0.5380681818181818 25 | 0,24,0.5477272727272727 26 | 0,25,0.5818181818181818 27 | 0,26,0.5210227272727272 28 | 0,27,0.4875 29 | 0,28,0.5107954545454545 30 | 0,29,0.5630681818181819 31 | 0,30,0.5289772727272727 32 | 0,31,0.5102272727272728 33 | 0,32,0.5022727272727273 34 | 0,33,0.47556818181818183 35 | 0,34,0.46761363636363634 36 | 
0,35,0.48522727272727273 37 | 0,36,0.4630681818181818 38 | 0,37,0.44488636363636364 39 | 0,38,0.4653409090909091 40 | 0,39,0.4636363636363636 41 | 0,40,0.4596590909090909 42 | 0,41,0.45738636363636365 43 | 0,42,0.46988636363636366 44 | 0,43,0.4414772727272727 45 | 0,44,0.45340909090909093 46 | 0,45,0.48125 47 | 0,46,0.44829545454545455 48 | 0,47,0.48125 49 | 0,48,0.49318181818181817 50 | 0,49,0.4659090909090909 51 | 0,50,0.4511363636363636 52 | 0,51,0.48863636363636365 53 | 0,52,0.45340909090909093 54 | 0,53,0.4642045454545455 55 | 0,54,0.45454545454545453 56 | 0,55,0.46193181818181817 57 | 0,56,0.45397727272727273 58 | 0,57,0.4630681818181818 59 | 0,58,0.49886363636363634 60 | 0,59,0.5113636363636364 61 | 0,60,0.5051136363636364 62 | 0,61,0.49261363636363636 63 | 0,62,0.46704545454545454 64 | 0,63,0.49261363636363636 65 | 0,64,0.4732954545454545 66 | 0,65,0.4596590909090909 67 | 0,66,0.4744318181818182 68 | 0,67,0.4335227272727273 69 | 0,68,0.5119318181818182 70 | 0,69,0.4903409090909091 71 | 0,70,0.5164772727272727 72 | 0,71,0.48806818181818185 73 | 0,72,0.4653409090909091 74 | 0,73,0.49829545454545454 75 | 0,74,0.46136363636363636 76 | 0,75,0.4732954545454545 77 | 0,76,0.4903409090909091 78 | 0,77,0.4948863636363636 79 | 0,78,0.5426136363636364 80 | 0,79,0.5085227272727273 81 | 0,80,0.4965909090909091 82 | 0,81,0.5045454545454545 83 | 0,82,0.4778409090909091 84 | 0,83,0.48465909090909093 85 | 0,84,0.48295454545454547 86 | 0,85,0.48920454545454545 87 | 0,86,0.5551136363636363 88 | 0,87,0.5380681818181818 89 | 0,88,0.5835227272727272 90 | 0,89,0.4909090909090909 91 | -------------------------------------------------------------------------------- /Chapter20/tournament/3ed/v2-wins.csv: -------------------------------------------------------------------------------- 1 | Wall time,Step,Value 2 | 0,1,0.4404494382022472 3 | 0,2,0.3938202247191011 4 | 0,3,0.40224719101123596 5 | 0,4,0.4449438202247191 6 | 0,5,0.3938202247191011 7 | 0,6,0.3921348314606742 8 | 0,7,0.4174157303370786 9 | 0,8,0.44719101123595506 10 | 0,9,0.47752808988764045 11 | 0,10,0.48707865168539327 12 | 0,11,0.4584269662921348 13 | 0,12,0.46348314606741575 14 | 0,13,0.46629213483146065 15 | 0,14,0.48707865168539327 16 | 0,15,0.4398876404494382 17 | 0,16,0.48820224719101124 18 | 0,17,0.48764044943820223 19 | 0,18,0.4696629213483146 20 | 0,19,0.4657303370786517 21 | 0,20,0.48089887640449436 22 | 0,21,0.4410112359550562 23 | 0,22,0.49887640449438203 24 | 0,23,0.4926966292134832 25 | 0,24,0.4820224719101124 26 | 0,25,0.5022471910112359 27 | 0,26,0.5056179775280899 28 | 0,27,0.48820224719101124 29 | 0,28,0.451123595505618 30 | 0,29,0.5112359550561798 31 | 0,30,0.5168539325842697 32 | 0,31,0.4348314606741573 33 | 0,32,0.5151685393258427 34 | 0,33,0.5337078651685393 35 | 0,34,0.5483146067415731 36 | 0,35,0.551123595505618 37 | 0,36,0.5275280898876404 38 | 0,37,0.4966292134831461 39 | 0,38,0.48820224719101124 40 | 0,39,0.4910112359550562 41 | 0,40,0.5280898876404494 42 | 0,41,0.5415730337078651 43 | 0,42,0.5157303370786517 44 | 0,43,0.4646067415730337 45 | 0,44,0.500561797752809 46 | 0,45,0.48707865168539327 47 | 0,46,0.48707865168539327 48 | 0,47,0.5028089887640449 49 | 0,48,0.552247191011236 50 | 0,49,0.5561797752808989 51 | 0,50,0.49382022471910114 52 | 0,51,0.4691011235955056 53 | 0,52,0.5162921348314606 54 | 0,53,0.48820224719101124 55 | 0,54,0.5123595505617977 56 | 0,55,0.451123595505618 57 | 0,56,0.5022471910112359 58 | 0,57,0.5292134831460674 59 | 0,58,0.5516853932584269 60 | 0,59,0.5612359550561797 61 | 
0,60,0.5044943820224719 62 | 0,61,0.4853932584269663 63 | 0,62,0.5073033707865169 64 | 0,63,0.4926966292134832 65 | 0,64,0.5404494382022472 66 | 0,65,0.5297752808988764 67 | 0,66,0.5646067415730337 68 | 0,67,0.5382022471910113 69 | 0,68,0.5561797752808989 70 | 0,69,0.5747191011235955 71 | 0,70,0.5707865168539326 72 | 0,71,0.5634831460674158 73 | 0,72,0.5376404494382022 74 | 0,73,0.4589887640449438 75 | 0,74,0.5162921348314606 76 | 0,75,0.5432584269662921 77 | 0,76,0.5516853932584269 78 | 0,77,0.501123595505618 79 | 0,78,0.5320224719101123 80 | 0,79,0.5129213483146068 81 | 0,80,0.5235955056179775 82 | 0,81,0.5455056179775281 83 | 0,82,0.548876404494382 84 | 0,83,0.4960674157303371 85 | 0,84,0.5140449438202247 86 | 0,85,0.547752808988764 87 | 0,86,0.47191011235955055 88 | 0,87,0.5134831460674157 89 | 0,88,0.5078651685393258 90 | 0,89,0.499438202247191 91 | 0,90,0.5348314606741573 92 | -------------------------------------------------------------------------------- /Chapter21/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | runs 3 | saves 4 | *.png 5 | .ipynb_checkpoints 6 | *.log 7 | -------------------------------------------------------------------------------- /Chapter21/csvs/2ed/README.md: -------------------------------------------------------------------------------- 1 | Description of produced test results 2 | 3 | # First results 4 | 5 | Test results from first models (paper versus zero-goal method). Solve tool run for 30k MCTS searches 6 | (but due to bug, actual amount of steps in some tests was much lower). 7 | 8 | ```` 9 | c2x2-paper-d200-t1.csv 10 | c2x2-zero-goal-d200-t1.csv 11 | c3x3-paper-d200-t1.csv 12 | c3x3-zero-goal-d200-no-decay.csv 13 | c3x3-zero-goal-d200-t1.csv 14 | ```` 15 | 16 | Analysis of the results are in notebook 17 | https://github.com/Shmuma/rl/blob/master/articles/01_rubic/nbs/01_paper-vs-zero_goal.ipynb 18 | 19 | # Fix of wrong steps 20 | 21 | Fixed with https://github.com/Shmuma/rl/commit/793aebc81b7bf323a8db930e8224521700383af5#diff-b9a7f0478383b0f6ad54ae87c8769b03 22 | 23 | ```` 24 | c2x2-paper-d200-t1-v2.csv 25 | c2x2-zero-goal-d200-t1-v2.csv 26 | c3x3-paper-d200-t1-v2.csv 27 | c3x3-zero-goal-d200-no-decay-v2.csv 28 | c3x3-zero-goal-d200-t1-v2.csv 29 | ```` 30 | 31 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube2x2_d3.txt: -------------------------------------------------------------------------------- 1 | 10,1,0 2 | 11,4,3 3 | 3,2,11 4 | 1,10,11 5 | 8,1,9 6 | 6,1,3 7 | 3,8,9 8 | 0,8,3 9 | 11,10,11 10 | 8,6,3 11 | 7,9,4 12 | 0,2,11 13 | 6,5,4 14 | 2,3,5 15 | 1,1,6 16 | 1,5,5 17 | 9,4,0 18 | 11,7,8 19 | 1,6,1 20 | 8,4,9 21 | 5,9,11 22 | 1,0,10 23 | 3,4,1 24 | 3,1,6 25 | 4,7,10 26 | 5,2,5 27 | 5,3,10 28 | 4,11,10 29 | 10,1,9 30 | 10,2,11 31 | 3,2,7 32 | 6,4,11 33 | 8,3,10 34 | 5,0,3 35 | 0,5,6 36 | 4,1,3 37 | 9,11,3 38 | 10,7,6 39 | 10,7,2 40 | 4,2,3 41 | 11,8,8 42 | 4,11,9 43 | 6,9,6 44 | 5,3,2 45 | 8,7,0 46 | 1,2,10 47 | 2,10,6 48 | 9,1,6 49 | 6,9,7 50 | 8,4,8 51 | 0,10,11 52 | 1,10,8 53 | 4,5,1 54 | 4,6,2 55 | 7,0,11 56 | 11,4,8 57 | 2,1,10 58 | 4,8,9 59 | 3,2,5 60 | 2,0,9 61 | 5,7,0 62 | 1,5,4 63 | 3,0,3 64 | 9,1,1 65 | 11,7,8 66 | 2,2,10 67 | 7,8,4 68 | 8,9,6 69 | 3,8,11 70 | 11,3,11 71 | 4,6,10 72 | 10,5,7 73 | 8,7,3 74 | 3,1,5 75 | 0,9,8 76 | 3,3,0 77 | 1,11,10 78 | 0,3,1 79 | 0,5,1 80 | 8,3,4 81 | 10,7,3 82 | 8,11,9 83 | 9,7,3 84 | 7,6,3 85 | 1,1,10 86 | 6,5,6 87 | 6,7,11 88 | 0,10,10 89 | 10,1,0 90 | 6,11,1 91 | 3,3,3 92 | 
8,7,2 93 | 6,2,4 94 | 7,3,1 95 | 7,8,1 96 | 0,10,8 97 | 0,1,3 98 | 2,6,7 99 | 7,3,6 100 | 0,2,6 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube2x2_d4.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11 2 | 4,3,3,2 3 | 11,1,10,11 4 | 8,1,9,6 5 | 0,0,1,3 6 | 3,8,9,0 7 | 8,3,11,10 8 | 11,8,6,3 9 | 7,9,4,0 10 | 2,11,6,5 11 | 4,2,3,5 12 | 1,1,6,1 13 | 5,5,9,4 14 | 0,11,7,8 15 | 1,6,1,8 16 | 4,9,5,9 17 | 3,11,1,0 18 | 10,3,4,1 19 | 3,1,6,4 20 | 7,10,5,2 21 | 5,5,3,10 22 | 4,11,10,10 23 | 1,9,10,2 24 | 8,11,3,2 25 | 7,6,4,11 26 | 8,3,10,5 27 | 0,3,0,5 28 | 6,4,1,3 29 | 9,11,3,10 30 | 7,6,10,7 31 | 2,4,2,3 32 | 11,8,8,4 33 | 11,9,6,9 34 | 6,5,3,2 35 | 8,7,0,1 36 | 2,10,2,10 37 | 6,9,1,6 38 | 6,9,7,8 39 | 4,8,0,10 40 | 11,1,10,8 41 | 4,5,1,4 42 | 6,2,7,0 43 | 11,11,4,8 44 | 2,1,10,10 45 | 8,9,2,5 46 | 2,0,9,5 47 | 7,0,1,5 48 | 4,3,0,3 49 | 9,1,1,11 50 | 7,8,10,7 51 | 8,4,8,9 52 | 6,3,8,11 53 | 11,3,11,4 54 | 6,10,10,5 55 | 7,8,7,3 56 | 3,1,5,0 57 | 9,8,3,3 58 | 0,1,11,10 59 | 0,3,1,0 60 | 5,1,8,3 61 | 4,7,3,8 62 | 2,11,9,9 63 | 7,3,7,6 64 | 3,1,1,10 65 | 6,5,6,6 66 | 7,11,0,10 67 | 10,10,1,0 68 | 6,11,1,3 69 | 3,3,8,7 70 | 2,6,2,4 71 | 7,3,1,8 72 | 1,0,10,8 73 | 0,1,3,2 74 | 6,7,7,3 75 | 6,2,6,6 76 | 4,7,4,6 77 | 11,11,8,10 78 | 11,7,2,3 79 | 4,3,0,9 80 | 11,8,0,11 81 | 5,0,0,9 82 | 7,8,8,0 83 | 8,1,2,1 84 | 9,1,10,3 85 | 6,1,9,9 86 | 9,0,9,1 87 | 6,10,9,9 88 | 8,5,4,3 89 | 10,11,3,4 90 | 6,2,10,10 91 | 4,7,5,1 92 | 0,7,9,9 93 | 1,1,8,3 94 | 8,4,2,5 95 | 1,3,5,4 96 | 2,7,8,11 97 | 4,9,10,8 98 | 0,10,8,4 99 | 10,1,2,4 100 | 1,1,11,8 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube2x2_d5.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11,4 2 | 3,3,2,11,1 3 | 10,11,8,1,9 4 | 6,1,3,3,8 5 | 9,0,8,3,11 6 | 10,11,8,6,3 7 | 7,9,4,0,2 8 | 11,6,5,4,2 9 | 3,5,1,1,6 10 | 1,5,5,9,4 11 | 0,11,7,8,1 12 | 6,1,8,4,9 13 | 5,9,11,1,0 14 | 10,3,4,1,3 15 | 1,6,4,7,10 16 | 5,2,5,5,3 17 | 10,11,10,10,1 18 | 9,10,2,11,3 19 | 2,7,6,4,11 20 | 8,3,10,5,0 21 | 3,0,5,6,4 22 | 1,3,11,3,10 23 | 7,6,10,7,2 24 | 4,2,3,11,8 25 | 8,4,11,9,6 26 | 9,6,5,3,2 27 | 8,7,0,1,2 28 | 10,2,10,6,9 29 | 1,6,6,9,7 30 | 8,4,8,0,10 31 | 11,1,10,8,4 32 | 10,5,1,4,6 33 | 2,7,0,11,11 34 | 4,8,8,1,10 35 | 4,8,9,2,5 36 | 2,0,9,5,7 37 | 0,1,5,4,3 38 | 0,3,1,1,11 39 | 7,8,10,7,8 40 | 2,4,8,9,6 41 | 3,8,11,11,3 42 | 11,4,6,10,10 43 | 5,7,8,7,3 44 | 3,1,5,0,9 45 | 8,3,3,0,1 46 | 11,10,0,3,1 47 | 0,5,1,8,3 48 | 4,7,3,8,11 49 | 9,9,7,3,7 50 | 6,3,1,1,10 51 | 6,5,6,6,7 52 | 11,0,10,10,10 53 | 1,0,11,1,3 54 | 3,3,8,7,2 55 | 6,2,4,7,3 56 | 1,8,1,0,10 57 | 8,0,1,3,2 58 | 6,7,7,3,6 59 | 0,2,6,6,4 60 | 7,4,6,11,11 61 | 8,10,11,7,2 62 | 3,4,3,0,9 63 | 11,8,0,11,0 64 | 0,9,7,8,8 65 | 2,0,8,1,2 66 | 1,9,1,10,3 67 | 6,1,9,9,9 68 | 0,9,1,6,10 69 | 9,9,8,5,4 70 | 3,10,11,3,4 71 | 6,2,10,10,7 72 | 5,1,0,7,9 73 | 9,1,1,8,3 74 | 8,4,2,5,1 75 | 3,5,4,2,7 76 | 8,11,4,9,10 77 | 8,0,10,8,4 78 | 10,1,2,4,1 79 | 1,11,8,4,4 80 | 9,11,3,10,10 81 | 4,8,7,4,0 82 | 1,10,6,4,0 83 | 0,5,2,10,2 84 | 11,7,8,11,6 85 | 8,0,1,1,11 86 | 2,0,5,9,8 87 | 2,6,2,0,4 88 | 5,0,5,3,10 89 | 3,10,1,5,8 90 | 6,9,11,2,3 91 | 2,2,6,2,11 92 | 5,6,10,11,3 93 | 4,2,11,1,6 94 | 0,7,3,3,7 95 | 5,4,3,3,0 96 | 10,3,6,5,4 97 | 1,4,5,10,8 98 | 6,10,8,5,0 99 | 1,4,2,9,4 100 | 0,1,9,6,5 101 | -------------------------------------------------------------------------------- 
/Chapter21/cubes_tests/2ed/cube2x2_d6.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11,4,3 2 | 3,2,11,1,10,11 3 | 8,1,9,6,1,3 4 | 3,8,9,0,8,3 5 | 11,10,11,8,6,3 6 | 7,9,4,0,2,11 7 | 6,5,4,2,3,5 8 | 1,1,6,1,5,5 9 | 9,4,0,11,7,8 10 | 1,6,1,8,4,9 11 | 5,9,11,1,0,10 12 | 3,4,1,3,1,6 13 | 4,7,10,5,2,5 14 | 5,3,10,11,10,10 15 | 1,9,10,2,11,3 16 | 2,7,6,4,11,8 17 | 3,10,5,0,3,0 18 | 5,6,4,1,3,11 19 | 5,3,10,7,6,10 20 | 7,2,4,2,3,11 21 | 8,8,4,11,9,6 22 | 9,6,5,3,2,7 23 | 1,0,1,2,10,2 24 | 10,6,9,1,6,6 25 | 9,7,8,4,8,0 26 | 10,11,1,10,8,4 27 | 10,5,1,4,6,2 28 | 7,0,11,11,4,8 29 | 2,1,10,10,8,9 30 | 3,2,5,2,0,9 31 | 5,7,0,1,5,4 32 | 3,0,3,1,1,11 33 | 7,8,10,7,8,4 34 | 8,9,6,3,8,11 35 | 11,3,11,4,6,10 36 | 10,5,7,8,7,3 37 | 3,1,5,0,9,8 38 | 3,3,0,1,11,10 39 | 0,3,1,0,5,1 40 | 8,3,4,7,3,8 41 | 2,11,9,9,7,3 42 | 7,6,3,1,1,10 43 | 6,5,6,6,7,11 44 | 0,10,10,10,1,0 45 | 6,11,1,3,3,3 46 | 8,7,2,6,2,4 47 | 7,3,1,8,1,0 48 | 10,8,0,1,3,2 49 | 6,7,7,3,6,2 50 | 6,6,4,7,4,6 51 | 11,11,8,10,11,7 52 | 2,3,4,3,0,9 53 | 11,8,0,11,0,0 54 | 9,7,8,8,0,8 55 | 1,2,1,9,1,10 56 | 3,6,1,9,9,9 57 | 0,9,1,6,10,9 58 | 9,8,5,4,3,10 59 | 11,3,4,6,2,10 60 | 10,7,5,1,0,7 61 | 9,9,1,1,8,3 62 | 8,4,2,5,1,3 63 | 5,4,2,7,8,11 64 | 4,9,10,8,0,10 65 | 8,4,1,2,4,1 66 | 1,11,8,4,4,9 67 | 3,11,3,10,10,8 68 | 7,4,0,1,10,6 69 | 4,0,0,5,2,10 70 | 4,2,11,7,8,11 71 | 6,8,0,1,1,11 72 | 2,0,5,9,8,6 73 | 2,0,4,5,0,5 74 | 3,10,3,10,1,5 75 | 8,6,9,11,2,3 76 | 2,2,6,2,11,6 77 | 10,11,3,4,2,11 78 | 1,6,7,3,3,7 79 | 5,4,3,3,0,10 80 | 3,6,5,4,1,4 81 | 5,10,8,6,10,8 82 | 5,0,1,4,2,9 83 | 4,0,1,9,6,5 84 | 11,6,9,8,1,6 85 | 9,4,0,11,6,8 86 | 8,10,11,11,11,10 87 | 3,5,6,1,10,5 88 | 9,5,10,1,11,4 89 | 8,4,6,5,6,11 90 | 4,8,3,6,10,6 91 | 10,11,2,9,9,4 92 | 6,8,0,4,4,3 93 | 6,9,9,10,5,7 94 | 7,7,10,3,8,7 95 | 11,2,10,1,4,8 96 | 10,10,9,5,1,3 97 | 10,3,3,2,0,0 98 | 3,7,9,1,6,10 99 | 9,11,11,6,7,6 100 | 3,2,10,11,0,1 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube3x3_d10.txt: -------------------------------------------------------------------------------- 1 | 10,1,0,11,4,3,3,2,11,1 2 | 10,11,8,1,9,6,0,0,1,3 3 | 3,8,9,0,8,3,11,10,11,8 4 | 6,3,7,9,4,0,2,11,6,5 5 | 4,2,3,5,1,1,6,1,5,5 6 | 9,4,0,11,7,8,1,6,1,8 7 | 4,10,9,5,9,3,11,1,0,10 8 | 3,4,1,3,1,6,4,7,10,5 9 | 2,5,5,3,10,4,11,10,10,1 10 | 9,10,2,8,11,3,2,7,6,4 11 | 10,11,8,3,10,5,0,3,0,5 12 | 6,4,1,3,9,11,5,3,10,7 13 | 6,10,7,2,4,2,3,11,8,8 14 | 4,11,9,6,9,6,5,3,2,8 15 | 7,1,0,1,2,10,2,10,6,9 16 | 1,6,6,9,7,8,4,8,0,10 17 | 11,1,10,8,4,10,5,1,4,6 18 | 2,7,0,11,11,4,8,2,8,1 19 | 10,4,10,8,9,3,2,5,2,8 20 | 8,0,9,5,7,0,1,5,4,3 21 | 0,3,9,1,1,11,7,1,8,2 22 | 2,10,7,8,2,4,8,9,6,3 23 | 8,11,11,3,11,4,6,10,10,5 24 | 7,8,7,1,3,3,1,5,0,9 25 | 8,3,9,3,0,1,11,10,0,3 26 | 1,0,5,1,8,3,4,10,7,3 27 | 8,2,11,9,9,7,3,7,6,3 28 | 1,1,10,6,5,6,6,7,11,0 29 | 10,10,10,1,0,6,11,5,1,3 30 | 3,3,8,7,2,6,2,4,7,3 31 | 1,7,8,1,0,10,8,0,1,3 32 | 2,6,7,7,3,6,0,2,6,0 33 | 6,4,7,4,6,11,11,8,10,11 34 | 7,2,3,4,3,0,9,11,8,0 35 | 11,5,0,0,9,7,8,8,2,0 36 | 8,1,2,1,9,1,10,3,6,1 37 | 9,3,9,9,0,9,1,6,10,9 38 | 9,8,5,4,3,10,11,5,3,4 39 | 6,2,10,10,4,7,5,1,0,7 40 | 9,9,1,1,8,3,8,4,2,5 41 | 1,3,5,4,2,7,8,11,4,9 42 | 10,8,0,10,8,4,10,1,2,4 43 | 1,1,11,8,2,4,4,9,3,11 44 | 5,3,10,10,4,8,7,4,0,1 45 | 10,6,4,0,0,5,2,10,4,2 46 | 11,7,8,11,6,8,0,1,1,11 47 | 2,8,0,5,9,8,2,6,2,0 48 | 4,5,0,5,3,10,3,10,1,5 49 | 8,6,9,11,2,3,2,2,6,0 50 | 2,11,5,6,10,11,3,4,2,11 51 | 1,6,0,7,3,3,7,5,4,3 52 | 3,0,10,3,6,5,4,1,4,5 53 | 10,8,6,10,8,5,0,1,4,2 54 | 
9,4,0,1,9,6,5,11,5,6 55 | 9,8,1,6,9,3,4,0,11,6 56 | 0,8,8,10,11,11,11,10,3,5 57 | 6,1,10,5,9,5,10,1,11,4 58 | 8,4,10,6,5,6,11,4,8,2 59 | 3,6,10,6,10,11,2,9,9,4 60 | 6,8,0,4,4,3,6,9,9,10 61 | 5,7,7,7,10,3,8,7,11,2 62 | 10,1,4,8,10,10,9,5,1,3 63 | 10,4,3,3,2,0,0,3,7,9 64 | 1,7,6,10,9,3,11,11,6,7 65 | 6,3,2,10,11,0,1,6,3,2 66 | 11,8,7,0,8,3,1,7,2,7 67 | 10,8,8,9,5,7,9,11,8,6 68 | 8,7,2,11,7,7,4,3,10,4 69 | 8,7,10,3,4,7,1,11,4,3 70 | 4,5,5,8,1,2,2,3,6,11 71 | 2,11,3,1,6,6,5,8,7,6 72 | 0,3,6,6,9,11,0,9,6,7 73 | 0,5,4,6,6,8,11,11,8,9 74 | 3,7,3,4,6,7,0,6,5,10 75 | 10,6,11,2,7,2,9,8,0,6 76 | 9,9,10,0,1,10,6,2,7,2 77 | 0,4,6,5,3,7,5,5,6,4 78 | 6,4,1,7,0,11,8,0,5,3 79 | 10,1,10,0,0,3,3,0,9,2 80 | 3,2,7,10,1,9,3,7,11,4 81 | 5,2,9,9,11,11,1,2,4,1 82 | 9,0,4,9,10,6,6,11,3,1 83 | 9,11,10,3,1,11,4,10,9,1 84 | 9,0,5,8,6,10,5,1,8,10 85 | 5,0,6,7,1,6,5,10,7,11 86 | 2,6,2,11,8,10,4,9,8,7 87 | 7,6,11,9,4,5,3,1,4,7 88 | 3,7,9,9,10,6,5,0,7,5 89 | 2,7,3,5,4,5,4,9,11,4 90 | 8,0,8,3,1,3,11,6,7,8 91 | 3,11,7,10,11,7,7,0,1,4 92 | 3,6,11,3,4,10,9,5,7,8 93 | 8,5,6,11,8,5,5,11,7,4 94 | 4,4,3,1,11,3,5,1,11,8 95 | 11,2,3,3,11,7,4,11,9,8 96 | 9,4,1,3,4,3,5,2,4,0 97 | 11,8,2,4,0,0,8,4,11,2 98 | 10,7,1,0,9,4,7,7,7,5 99 | 2,0,4,7,1,1,6,7,1,9 100 | 10,10,0,2,2,9,4,1,3,1 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube3x3_d3.txt: -------------------------------------------------------------------------------- 1 | 10,1,0 2 | 11,4,3 3 | 3,2,11 4 | 1,10,11 5 | 8,1,9 6 | 6,0,0 7 | 1,3,3 8 | 8,9,0 9 | 8,3,11 10 | 10,11,8 11 | 6,3,7 12 | 9,4,0 13 | 2,11,6 14 | 5,4,2 15 | 3,5,1 16 | 1,6,1 17 | 5,5,9 18 | 4,0,11 19 | 7,8,1 20 | 6,1,8 21 | 4,10,9 22 | 5,9,3 23 | 11,1,0 24 | 10,3,4 25 | 1,3,1 26 | 6,4,7 27 | 10,5,2 28 | 5,5,3 29 | 10,4,11 30 | 10,10,1 31 | 9,10,2 32 | 8,11,3 33 | 2,7,6 34 | 4,10,11 35 | 8,3,10 36 | 5,0,3 37 | 0,5,6 38 | 4,1,3 39 | 9,11,5 40 | 3,10,7 41 | 6,10,7 42 | 2,4,2 43 | 3,11,8 44 | 8,4,11 45 | 9,6,9 46 | 6,5,3 47 | 2,8,7 48 | 1,0,1 49 | 2,10,2 50 | 10,6,9 51 | 1,6,6 52 | 9,7,8 53 | 4,8,0 54 | 10,11,1 55 | 10,8,4 56 | 10,5,1 57 | 4,6,2 58 | 7,0,11 59 | 11,4,8 60 | 2,8,1 61 | 10,4,10 62 | 8,9,3 63 | 2,5,2 64 | 8,8,0 65 | 9,5,7 66 | 0,1,5 67 | 4,3,0 68 | 3,9,1 69 | 1,11,7 70 | 1,8,2 71 | 2,10,7 72 | 8,2,4 73 | 8,9,6 74 | 3,8,11 75 | 11,3,11 76 | 4,6,10 77 | 10,5,7 78 | 8,7,1 79 | 3,3,1 80 | 5,0,9 81 | 8,3,9 82 | 3,0,1 83 | 11,10,0 84 | 3,1,0 85 | 5,1,8 86 | 3,4,10 87 | 7,3,8 88 | 2,11,9 89 | 9,7,3 90 | 7,6,3 91 | 1,1,10 92 | 6,5,6 93 | 6,7,11 94 | 0,10,10 95 | 10,1,0 96 | 6,11,5 97 | 1,3,3 98 | 3,8,7 99 | 2,6,2 100 | 4,7,3 101 | -------------------------------------------------------------------------------- /Chapter21/cubes_tests/2ed/cube3x3_d3_norepeat.txt: -------------------------------------------------------------------------------- 1 | 10,1,0 2 | 11,4,3 3 | 3,2,11 4 | 1,10,11 5 | 8,1,9 6 | 6,1,3 7 | 3,8,9 8 | 0,8,3 9 | 11,10,11 10 | 8,6,3 11 | 7,9,4 12 | 0,2,11 13 | 6,5,4 14 | 2,3,5 15 | 1,1,6 16 | 1,5,5 17 | 9,4,0 18 | 11,7,8 19 | 1,6,1 20 | 8,4,9 21 | 5,9,11 22 | 1,0,10 23 | 3,4,1 24 | 3,1,6 25 | 4,7,10 26 | 5,2,5 27 | 5,3,10 28 | 4,11,10 29 | 10,1,9 30 | 10,2,11 31 | 3,2,7 32 | 6,4,11 33 | 8,3,10 34 | 5,0,3 35 | 0,5,6 36 | 4,1,3 37 | 9,11,3 38 | 10,7,6 39 | 10,7,2 40 | 4,2,3 41 | 11,8,8 42 | 4,11,9 43 | 6,9,6 44 | 5,3,2 45 | 8,7,0 46 | 1,2,10 47 | 2,10,6 48 | 9,1,6 49 | 6,9,7 50 | 8,4,8 51 | 0,10,11 52 | 1,10,8 53 | 4,5,1 54 | 4,6,2 55 | 7,0,11 56 | 11,4,8 57 | 2,1,10 58 | 4,8,9 59 | 3,2,5 60 | 2,0,9 61 | 5,7,0 62 | 1,5,4 63 | 3,0,3 64 
| 9,1,1 65 | 11,7,8 66 | 2,2,10 67 | 7,8,4 68 | 8,9,6 69 | 3,8,11 70 | 11,3,11 71 | 4,6,10 72 | 10,5,7 73 | 8,7,3 74 | 3,1,5 75 | 0,9,8 76 | 3,3,0 77 | 1,11,10 78 | 0,3,1 79 | 0,5,1 80 | 8,3,4 81 | 10,7,3 82 | 8,11,9 83 | 9,7,3 84 | 7,6,3 85 | 1,1,10 86 | 6,5,6 87 | 6,7,11 88 | 0,10,10 89 | 10,1,0 90 | 6,11,1 91 | 3,3,3 92 | 8,7,2 93 | 6,2,4 94 | 7,3,1 95 | 7,8,1 96 | 0,10,8 97 | 0,1,3 98 | 2,6,7 99 | 7,3,6 100 | 0,2,6 101 | -------------------------------------------------------------------------------- /Chapter21/gen_cubes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Tool to generate test set for solver 4 | """ 5 | import argparse 6 | import random 7 | 8 | from libcube import cubes 9 | 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument("-e", "--env", required=True, help="Type of env to train, supported types=%s" % cubes.names()) 14 | parser.add_argument("-n", "--number", type=int, default=10, help="Amount of scramble rounds, default=10") 15 | parser.add_argument("-d", "--depth", type=int, default=100, help="Scramble depth, default=10") 16 | parser.add_argument("--seed", type=int, default=42, help="Seed to use, if zero, no seed used. default=42") 17 | parser.add_argument("-o", "--output", required=True, help="Output file to produce") 18 | args = parser.parse_args() 19 | 20 | if args.seed: 21 | random.seed(args.seed) 22 | 23 | cube_env = cubes.get(args.env) 24 | assert isinstance(cube_env, cubes.CubeEnv) 25 | 26 | with open(args.output, "w+t", encoding="utf-8") as fd_out: 27 | for _ in range(args.number): 28 | s = cube_env.initial_state 29 | path = [] 30 | prev_a = None 31 | for _ in range(args.depth): 32 | a = cube_env.sample_action(prev_action=prev_a) 33 | path.append(a.value) 34 | s = cube_env.transform(s, a) 35 | prev_a = a 36 | fd_out.write(",".join(map(str, path)) + "\n") 37 | -------------------------------------------------------------------------------- /Chapter21/ini/README.md: -------------------------------------------------------------------------------- 1 | Configuration files with training/testing settings. 2 | 3 | # cube2x2-paper-d200 4 | Method from the paper applied to 2x2 cube with scrambling depth 200 during the training. 5 | 6 | Best policy is achieved after 8k batches (3.5 hours on 1080Ti), after 10k batches training diverges. 
7 | 8 | # cube2x2-zero-goal-d200 9 | The same as in paper, but target value for goal states set to zero, which helps convergence a lot 10 | -------------------------------------------------------------------------------- /Chapter21/ini/cube2x2-paper-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube2x2 3 | run_name=paper 4 | 5 | [train] 6 | cuda=True 7 | lr=1e-5 8 | batch_size=10000 9 | scramble_depth=200 10 | report_batches=10 11 | checkpoint_batches=100 12 | lr_decay=True 13 | lr_decay_gamma=0.95 14 | lr_decay_batches=1000 15 | -------------------------------------------------------------------------------- /Chapter21/ini/cube2x2-zero-goal-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube2x2 3 | run_name=zero-goal 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=4000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=5e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.99 26 | ; interval between decays 27 | lr_decay_batches=100 28 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-paper-d20.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=paper 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=paper 8 | ; limit of batches to train (train iterations) 9 | max_batches=100000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; batches to keep in scramble buffer 17 | scramble_buffer_batches=10 18 | ; after how many iterations push fresh batch into the scramble buffer 19 | push_scramble_buffer_iters=100 20 | ; how deeply to scramble cube 21 | scramble_depth=20 22 | ; how frequently to report training progress 23 | report_batches=10 24 | ; how frequently to save model (if commented out, won't be saved) 25 | ;checkpoint_batches=100 26 | ; enables LR decay 27 | lr_decay=True 28 | ; LR decay gamma (if enabled) 29 | lr_decay_gamma=0.95 30 | ; interval between decays 31 | lr_decay_batches=1000 32 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-paper-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=paper 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=paper 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=4000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | ;checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 
24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.95 26 | ; interval between decays 27 | lr_decay_batches=1000 28 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d20-noweight.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal-noweight 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | max_batches=100000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; batches to keep in scramble buffer 17 | scramble_buffer_batches=10 18 | ; after how many iterations push fresh batch into the scramble buffer 19 | push_scramble_buffer_iters=100 20 | ; how deeply to scramble cube 21 | scramble_depth=20 22 | ; how frequently to report training progress 23 | report_batches=10 24 | ; how frequently to save model (if commented out, won't be saved) 25 | checkpoint_batches=1000 26 | ; enables LR decay 27 | lr_decay=False 28 | ; LR decay gamma (if enabled) 29 | lr_decay_gamma=0.95 30 | ; interval between decays 31 | lr_decay_batches=1000 32 | ; perform weighting of training samples inverse by scramble depth, default=True 33 | weight_samples=False 34 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d20.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | max_batches=100000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=1e-4 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; batches to keep in scramble buffer 17 | scramble_buffer_batches=10 18 | ; after how many iterations push fresh batch into the scramble buffer 19 | push_scramble_buffer_iters=100 20 | ; how deeply to scramble cube 21 | scramble_depth=20 22 | ; how frequently to report training progress 23 | report_batches=10 24 | ; how frequently to save model (if commented out, won't be saved) 25 | ;checkpoint_batches=100 26 | ; enables LR decay 27 | lr_decay=True 28 | ; LR decay gamma (if enabled) 29 | lr_decay_gamma=0.95 30 | ; interval between decays 31 | lr_decay_batches=1000 32 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d200-slow-decay.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal-slow-decay 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=40000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=5e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.99 26 | ; interval between decays 27 | 
lr_decay_batches=200 28 | -------------------------------------------------------------------------------- /Chapter21/ini/cube3x3-zero-goal-d200.ini: -------------------------------------------------------------------------------- 1 | [general] 2 | cube_type=cube3x3 3 | run_name=zero-goal 4 | 5 | [train] 6 | ; how to calculate target values, default is 'paper' 7 | value_targets_method=zero_goal_value 8 | ; limit of batches to train (train iterations) 9 | ;max_batches=40000 10 | ; use cuda 11 | cuda=True 12 | ; learning rate 13 | lr=5e-5 14 | ; count of cubes in single batch 15 | batch_size=10000 16 | ; how deeply to scramble cube 17 | scramble_depth=200 18 | ; how frequently to report training progress 19 | report_batches=10 20 | ; how frequently to save model (if commented out, won't be saved) 21 | checkpoint_batches=100 22 | ; enables LR decay 23 | lr_decay=True 24 | ; LR decay gamma (if enabled) 25 | lr_decay_gamma=0.95 26 | ; interval between decays 27 | lr_decay_batches=200 28 | -------------------------------------------------------------------------------- /Chapter21/libcube/conf.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import configparser 3 | 4 | 5 | class Config: 6 | """ 7 | Configuration for train/test/solve 8 | """ 9 | log = logging.getLogger("Config") 10 | 11 | def __init__(self, file_name): 12 | self.data = configparser.ConfigParser() 13 | self.log.info("Reading config file %s", file_name) 14 | if not self.data.read(file_name): 15 | raise ValueError("Config file %s not found" % file_name) 16 | 17 | # sections acessors 18 | @property 19 | def sect_general(self): 20 | return self.data['general'] 21 | 22 | @property 23 | def sect_train(self): 24 | return self.data['train'] 25 | 26 | # general section 27 | @property 28 | def cube_type(self): 29 | return self.sect_general['cube_type'] 30 | 31 | @property 32 | def run_name(self): 33 | return self.sect_general['run_name'] 34 | 35 | # train section 36 | @property 37 | def train_scramble_depth(self): 38 | return self.sect_train.getint('scramble_depth') 39 | 40 | @property 41 | def train_cuda(self): 42 | return self.sect_train.getboolean('cuda', fallback=False) 43 | 44 | @property 45 | def train_learning_rate(self): 46 | return self.sect_train.getfloat('lr') 47 | 48 | @property 49 | def train_batch_size(self): 50 | return self.sect_train.getint('batch_size') 51 | 52 | @property 53 | def train_report_batches(self): 54 | return self.sect_train.getint('report_batches') 55 | 56 | @property 57 | def train_checkpoint_batches(self): 58 | return self.sect_train.getint('checkpoint_batches') 59 | 60 | @property 61 | def train_lr_decay_enabled(self): 62 | return self.sect_train.getboolean('lr_decay', fallback=False) 63 | 64 | @property 65 | def train_lr_decay_batches(self): 66 | return self.sect_train.getint('lr_decay_batches') 67 | 68 | @property 69 | def train_lr_decay_gamma(self): 70 | return self.sect_train.getfloat('lr_decay_gamma', fallback=1.0) 71 | 72 | @property 73 | def train_value_targets_method(self): 74 | return self.sect_train.get('value_targets_method', fallback='paper') 75 | 76 | @property 77 | def train_max_batches(self): 78 | return self.sect_train.getint('max_batches') 79 | 80 | @property 81 | def scramble_buffer_batches(self): 82 | return self.sect_train.getint("scramble_buffer_batches", 10) 83 | 84 | @property 85 | def push_scramble_buffer_iters(self): 86 | return self.sect_train.getint('push_scramble_buffer_iters', 100) 87 | 88 | @property 89 | def 
weight_samples(self): 90 | return self.sect_train.getboolean('weight_samples', True) 91 | 92 | # higher-level functions 93 | def train_name(self, suffix=None): 94 | res = "%s-%s-d%d" % (self.cube_type, self.run_name, self.train_scramble_depth) 95 | if suffix is not None: 96 | res += "-" + suffix 97 | return res 98 | -------------------------------------------------------------------------------- /Chapter21/libcube/cubes/__init__.py: -------------------------------------------------------------------------------- 1 | from ._env import CubeEnv, get, names 2 | from . import cube3x3 3 | from . import cube2x2 4 | 5 | __all__ = ('CubeEnv', 'get', 'names') 6 | -------------------------------------------------------------------------------- /Chapter21/libcube/cubes/_common.py: -------------------------------------------------------------------------------- 1 | def _permute(t, m, is_inv=False): 2 | """ 3 | Perform permutation of tuple according to mapping m 4 | """ 5 | r = list(t) 6 | for from_idx, to_idx in m: 7 | if is_inv: 8 | r[from_idx] = t[to_idx] 9 | else: 10 | r[to_idx] = t[from_idx] 11 | return r 12 | 13 | 14 | def _rotate(corner_ort, corners): 15 | """ 16 | Rotate given corners 120 degrees 17 | """ 18 | r = list(corner_ort) 19 | for c, angle in corners: 20 | r[c] = (r[c] + angle) % 3 21 | return r 22 | 23 | 24 | # orient corner cubelet 25 | def _map_orient(cols, orient_id): 26 | if orient_id == 0: 27 | return cols 28 | elif orient_id == 1: 29 | return cols[2], cols[0], cols[1] 30 | else: 31 | return cols[1], cols[2], cols[0] 32 | 33 | -------------------------------------------------------------------------------- /Chapter21/models/.gitattributes: -------------------------------------------------------------------------------- 1 | *.dat filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/2x2-paper/best_3.2572e-02.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b8d40fbf6fb3d9c4d0e06ed3dc45f1fb80f2da32a44b8c406fc8167d198482b1 3 | size 45148722 4 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/2x2-zg/chpt_017000.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2b011478a90bee5e5139196a3b189e246e32776e62dff10d5e46d2ace937a034 3 | size 45148658 4 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/3x3-paper/best_3.1818e-02.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a6e324dc71d107e223392594fc584c8c4b29ece2a1e4c4050da82c3df6988287 3 | size 49867314 4 | -------------------------------------------------------------------------------- /Chapter21/models/3ed/3x3-zg/chpt_026400.dat: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a11ea7ab683162e94805731e20db958d1defcef1e59c5e4fd9f017a40635903c 3 | size 49867250 4 | -------------------------------------------------------------------------------- /Chapter21/requirements.txt: -------------------------------------------------------------------------------- 1 | nose 2 | seaborn 3 | torch 4 | numpy 5 | tqdm 6 | tensorboard-pytorch 7 | 
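The .ini files listed above are read through the `Config` wrapper defined in `Chapter21/libcube/conf.py` above. A minimal usage sketch; the path is one of the chapter's own configuration files, and the commented values follow directly from that file and from `Config.train_name`:

````
from libcube.conf import Config

config = Config("ini/cube2x2-zero-goal-d200.ini")

print(config.cube_type)                   # cube2x2
print(config.run_name)                    # zero-goal
print(config.train_scramble_depth)        # 200
print(config.train_value_targets_method)  # zero_goal_value
# used to build run/save directory names:
print(config.train_name(suffix="t1"))     # cube2x2-zero-goal-d200-t1
````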
-------------------------------------------------------------------------------- /Chapter21/run_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | ./solver.py -e cube2x2 -m saves/cube2x2-paper-d200-2x2-d200-t1/best_3.2572e-02.dat --max-steps 30000 --cuda -o c2x2-paper-d200-t1.csv & 4 | ./solver.py -e cube2x2 -m saves/cube2x2-zero-goal-d200-2x2-d200-zg-t2/best_1.3816e-02.dat --max-steps 30000 --cuda -o c2x2-zero-goal-d200-t1.csv & 5 | ./solver.py -e cube3x3 --cuda --max-steps 30000 -m saves/cube3x3-paper-d200-3x3-paper-d200-t1/best_3.1818e-02.dat -o c3x3-paper-d200-t1.csv & 6 | ./solver.py -e cube3x3 --cuda --max-steps 30000 -m saves/cube3x3-zero-goal-d200-3x3-zg-d200-t1/best_2.0891e-02.dat -o c3x3-zero-goal-d200-t1.csv & 7 | #./solver.py -e cube3x3 --cuda --max-steps 30000 -m saves/cube3x3-zero-goal-d200-no-decay/best_2.1798e-02.dat -o c3x3-zero-goal-d200-no-decay-v2.csv & 8 | -------------------------------------------------------------------------------- /Chapter21/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter21/tests/__init__.py -------------------------------------------------------------------------------- /Chapter21/tests/libcube/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter21/tests/libcube/__init__.py -------------------------------------------------------------------------------- /Chapter21/tests/libcube/cubes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter21/tests/libcube/cubes/__init__.py -------------------------------------------------------------------------------- /Chapter21/train_debug.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Ad-hoc utility to analyze trained model and various training process details 4 | """ 5 | import argparse 6 | import logging 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | from libcube import cubes 15 | from libcube import model 16 | 17 | 18 | log = logging.getLogger("train_debug") 19 | 20 | 21 | # How many data to generate for plots 22 | MAX_DEPTH = 10 23 | ROUND_COUNTS = 100 24 | # debug params 25 | #MAX_DEPTH = 5 26 | #ROUND_COUNTS = 2 27 | 28 | 29 | def gen_states(cube_env, max_depth, round_counts): 30 | """ 31 | Generate random states of various scramble depth 32 | :param cube_env: CubeEnv instance 33 | :return: list of list of (state, correct_action_index) pairs 34 | """ 35 | assert isinstance(cube_env, cubes.CubeEnv) 36 | 37 | result = [[] for _ in range(max_depth)] 38 | for _ in range(round_counts): 39 | data = cube_env.scramble_cube(max_depth, return_inverse=True) 40 | for depth, state, inv_action in data: 41 | result[depth-1].append((state, inv_action.value)) 42 | return result 43 | 44 | 45 | if __name__ == "__main__": 46 | sns.set() 47 | 48 | logging.basicConfig(format="%(asctime)-15s %(levelname)s 
%(message)s", level=logging.INFO) 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument("-e", "--env", required=True, help="Type of env to train, supported types=%s" % cubes.names()) 51 | parser.add_argument("-m", "--model", required=True, help="Model file to load") 52 | parser.add_argument("-o", "--output", required=True, help="Output prefix for plots") 53 | args = parser.parse_args() 54 | 55 | cube_env = cubes.get(args.env) 56 | log.info("Selected cube: %s", cube_env) 57 | net = model.Net(cube_env.encoded_shape, len(cube_env.action_enum)) 58 | net.load_state_dict(torch.load(args.model, map_location=lambda storage, loc: storage, weights_only=True)) 59 | net.eval() 60 | log.info("Network loaded from %s", args.model) 61 | 62 | # model.make_train_data(cube_env, net, device='cpu', batch_size=10, scramble_depth=2, shuffle=False) 63 | 64 | states_by_depth = gen_states(cube_env, max_depth=MAX_DEPTH, round_counts=ROUND_COUNTS) 65 | # for idx, states in enumerate(states_by_depth): 66 | # log.info("%d: %s", idx, states) 67 | 68 | # flatten returned data 69 | data = [] 70 | for depth, states in enumerate(states_by_depth): 71 | for s, inv_action in states: 72 | data.append((depth+1, s, inv_action)) 73 | depths, states, inv_actions = map(list, zip(*data)) 74 | 75 | # process states with net 76 | enc_states = model.encode_states(cube_env, states) 77 | enc_states_t = torch.tensor(enc_states) 78 | policy_t, value_t = net(enc_states_t) 79 | value_t = value_t.squeeze(-1) 80 | value = value_t.cpu().detach().numpy() 81 | policy = F.softmax(policy_t, dim=1).cpu().detach().numpy() 82 | 83 | # plot value per depth of scramble 84 | plot = sns.lineplot(depths, value) 85 | plot.set_title("Values per depths") 86 | plot.get_figure().savefig(args.output + "-vals_vs_depths.png") 87 | 88 | # plot action match 89 | plt.clf() 90 | actions = np.argmax(policy, axis=1) 91 | actions_match = (actions == inv_actions).astype(np.int8) 92 | plot = sns.lineplot(depths, actions_match) 93 | plot.set_title("Actions accuracy per depths") 94 | plot.get_figure().savefig(args.output + "-acts_vs_depths.png") 95 | 96 | pass 97 | -------------------------------------------------------------------------------- /Chapter22/.gitignore: -------------------------------------------------------------------------------- 1 | render 2 | -------------------------------------------------------------------------------- /Chapter22/battle_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import torch 4 | import ptan 5 | from lib import model, data 6 | 7 | from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model", required=True, 13 | help="Model file to load") 14 | parser.add_argument("--map-size", type=int, default=data.MAP_SIZE, 15 | help="Size of the map, default=" + str(data.MAP_SIZE)) 16 | parser.add_argument("--render", default="render/battle.mp4", 17 | help="Name of the video file to render, default=render/battle.mp4") 18 | parser.add_argument("--walls", type=int, default=data.COUNT_WALLS, 19 | help="Count of walls, default=" + str(data.COUNT_WALLS)) 20 | parser.add_argument("--a", type=int, default=data.COUNT_BATTLERS, 21 | help="Count of tigers, default=" + str(data.COUNT_BATTLERS)) 22 | parser.add_argument("--b", type=int, default=data.COUNT_BATTLERS, 23 | help="Count of deer, default=" + str(data.COUNT_BATTLERS)) 
24 | 25 | args = parser.parse_args() 26 | 27 | env = data.BattleEnv( 28 | map_size=args.map_size, 29 | count_walls=args.walls, 30 | count_a=args.a, 31 | count_b=args.b, 32 | render_mode="rgb_array", 33 | ) 34 | recorder = VideoRecorder(env, args.render) 35 | net = model.DQNModel( 36 | env.observation_spaces['a_0'].shape, 37 | env.action_spaces['a_0'].n, 38 | ) 39 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 40 | a_agent = ptan.agent.DQNAgent( 41 | net, ptan.actions.ArgmaxActionSelector()) 42 | b_agent = data.RandomMAgent(env, env.handles[0]) 43 | 44 | obs = env.reset() 45 | recorder.capture_frame() 46 | total_reward = 0.0 47 | total_steps = 0 48 | 49 | while env.agents: 50 | actions = {} 51 | b_obs = [ 52 | obs[agent_id] 53 | for agent_id in env.agents 54 | if agent_id.startswith("a") 55 | ] 56 | a_acts, _ = a_agent(b_obs) 57 | ofs = 0 58 | for agent_id in env.agents: 59 | if agent_id.startswith("a"): 60 | actions[agent_id] = a_acts[ofs] 61 | ofs += 1 62 | 63 | b_obs = [ 64 | obs[agent_id] 65 | for agent_id in env.agents 66 | if agent_id.startswith("b") 67 | ] 68 | b_acts, _ = b_agent(b_obs) 69 | ofs = 0 70 | for agent_id in env.agents: 71 | if agent_id.startswith("b"): 72 | actions[agent_id] = b_acts[ofs] 73 | ofs += 1 74 | 75 | obs, rewards, dones, _, _ = env.step(actions) 76 | recorder.capture_frame() 77 | total_steps += 1 78 | for agent_id, reward in rewards.items(): 79 | if agent_id.startswith("a"): 80 | total_reward += reward 81 | 82 | print("Episode steps: %d" % total_steps) 83 | print("Total reward: %.3f" % total_reward) 84 | print("Mean reward: %.3f" % (total_reward / args.a)) 85 | recorder.close() -------------------------------------------------------------------------------- /Chapter22/forest_random.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder 3 | from lib import data 4 | from PIL import Image 5 | import pathlib 6 | import numpy as np 7 | 8 | RENDER_DIR = "render" 9 | 10 | 11 | def save_render(render: np.ndarray, path: pathlib.Path, step: int): 12 | img = Image.fromarray(render) 13 | p = path / f"render_{step:04d}.png" 14 | img.save(str(p)) 15 | 16 | 17 | if __name__ == "__main__": 18 | env = data.ForestEnv(render_mode="rgb_array") 19 | recorder = VideoRecorder(env, RENDER_DIR + "/forest-random.mp4") 20 | sum_rewards = {agent_id: 0.0 for agent_id in env.agents} 21 | sum_steps = {agent_id: 0 for agent_id in env.agents} 22 | obs = env.reset() 23 | recorder.capture_frame() 24 | assert isinstance(obs, dict) 25 | print(f"tiger_0: obs {obs['tiger_0'].shape}, act: {env.action_space('tiger_0')}") 26 | print(f"deer_0: obs {obs['deer_0'].shape}, act: {env.action_space('deer_0')}\n") 27 | step = 0 28 | save_render(env.render(), pathlib.Path(RENDER_DIR), step) 29 | 30 | while env.agents: 31 | actions = {agent_id: env.action_space(agent_id).sample() for agent_id in env.agents} 32 | all_obs, all_rewards, all_dones, all_trunc, all_info = env.step(actions) 33 | recorder.capture_frame() 34 | for agent_id, r in all_rewards.items(): 35 | sum_rewards[agent_id] += r 36 | sum_steps[agent_id] += 1 37 | step += 1 38 | save_render(env.render(), pathlib.Path(RENDER_DIR), step) 39 | 40 | final_rewards = list(sum_rewards.items()) 41 | final_rewards.sort(key=lambda p: p[1], reverse=True) 42 | for agent_id, r in final_rewards[:20]: 43 | print(f"{agent_id}: got {r:.2f} in {sum_steps[agent_id]} steps") 44 | 
recorder.close() -------------------------------------------------------------------------------- /Chapter22/forest_tigers_play.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import torch 4 | import ptan 5 | from lib import model, data 6 | 7 | from gymnasium.wrappers.monitoring.video_recorder import VideoRecorder 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("-m", "--model", required=True, 13 | help="Model file to load") 14 | parser.add_argument("--map-size", type=int, default=data.MAP_SIZE, 15 | help="Size of the map, default=" + str(data.MAP_SIZE)) 16 | parser.add_argument("--render", default="render/video.mp4", 17 | help="Name of the video file to render, default=render/video.mp4") 18 | parser.add_argument("--walls", type=int, default=data.COUNT_WALLS, 19 | help="Count of walls, default=" + str(data.COUNT_WALLS)) 20 | parser.add_argument("--tigers", type=int, default=data.COUNT_TIGERS, 21 | help="Count of tigers, default=" + str(data.COUNT_TIGERS)) 22 | parser.add_argument("--deer", type=int, default=data.COUNT_DEER, 23 | help="Count of deer, default=" + str(data.COUNT_DEER)) 24 | parser.add_argument("--mode", default='forest', choices=['forest', 'double_attack'], 25 | help="GridWorld mode, could be 'forest' or 'double_attack', default='forest'") 26 | 27 | args = parser.parse_args() 28 | 29 | if args.mode == 'forest': 30 | env = data.ForestEnv( 31 | map_size=args.map_size, 32 | count_walls=args.walls, 33 | count_tigers=args.tigers, 34 | count_deer=args.deer, 35 | render_mode="rgb_array", 36 | ) 37 | elif args.mode == 'double_attack': 38 | env = data.DoubleAttackEnv( 39 | map_size=args.map_size, 40 | count_walls=args.walls, 41 | count_tigers=args.tigers, 42 | count_deer=args.deer, 43 | render_mode="rgb_array", 44 | ) 45 | else: 46 | raise RuntimeError() 47 | recorder = VideoRecorder(env, args.render) 48 | net = model.DQNModel( 49 | env.observation_spaces['tiger_0'].shape, 50 | env.action_spaces['tiger_0'].n, 51 | ) 52 | net.load_state_dict(torch.load(args.model, map_location=torch.device('cpu'), weights_only=True)) 53 | tiger_agent = ptan.agent.DQNAgent( 54 | net, ptan.actions.ArgmaxActionSelector()) 55 | deer_agent = data.RandomMAgent(env, env.handles[0]) 56 | 57 | obs = env.reset() 58 | recorder.capture_frame() 59 | total_reward = 0.0 60 | total_steps = 0 61 | 62 | while env.agents: 63 | actions = {} 64 | tiger_obs = [ 65 | obs[agent_id] 66 | for agent_id in env.agents 67 | if agent_id.startswith("tiger") 68 | ] 69 | tiger_acts, _ = tiger_agent(tiger_obs) 70 | ofs = 0 71 | for agent_id in env.agents: 72 | if agent_id.startswith("tiger"): 73 | actions[agent_id] = tiger_acts[ofs] 74 | ofs += 1 75 | 76 | deer_obs = [ 77 | obs[agent_id] 78 | for agent_id in env.agents 79 | if agent_id.startswith("deer") 80 | ] 81 | deer_acts, _ = deer_agent(deer_obs) 82 | ofs = 0 83 | for agent_id in env.agents: 84 | if agent_id.startswith("deer"): 85 | actions[agent_id] = deer_acts[ofs] 86 | ofs += 1 87 | 88 | obs, rewards, dones, _, _ = env.step(actions) 89 | recorder.capture_frame() 90 | total_steps += 1 91 | for agent_id, reward in rewards.items(): 92 | if agent_id.startswith("tiger"): 93 | total_reward += reward 94 | 95 | print("Episode steps: %d" % total_steps) 96 | print("Total reward: %.3f" % total_reward) 97 | print("Mean reward: %.3f" % (total_reward / args.tigers)) 98 | recorder.close() 
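Both `battle_play.py` and `forest_tigers_play.py` above repeat the same per-group pattern: gather the observations of all agents whose id starts with a given prefix, query the batched ptan agent once, then scatter the returned actions back onto those agent ids. A minimal sketch of that pattern as a standalone helper; the name `group_actions` is illustrative and is not part of the chapter's `lib` package:

````
import typing as tt


def group_actions(obs: tt.Dict[str, tt.Any], agents: tt.List[str],
                  prefix: str, agent) -> tt.Dict[str, tt.Any]:
    # agent ids of one group, in the order they appear in env.agents
    ids = [agent_id for agent_id in agents if agent_id.startswith(prefix)]
    # batch the group's observations and query the (ptan) agent once
    acts, _ = agent([obs[agent_id] for agent_id in ids])
    # map the batched actions back onto the individual agents
    return {agent_id: act for agent_id, act in zip(ids, acts)}
````

With such a helper, the action-building part of the play loop reduces to merging `group_actions(obs, env.agents, "tiger", tiger_agent)` with `group_actions(obs, env.agents, "deer", deer_agent)` before calling `env.step(actions)`.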
-------------------------------------------------------------------------------- /Chapter22/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Third-Edition/3ac5b72d954cfd331164afcbe58dd35c13d3c8bd/Chapter22/lib/__init__.py -------------------------------------------------------------------------------- /Chapter22/lib/common.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta, datetime 2 | from types import SimpleNamespace 3 | from typing import Iterable 4 | import warnings 5 | 6 | import ptan 7 | import ptan.ignite as ptan_ignite 8 | from ignite.engine import Engine 9 | from ignite.metrics import RunningAverage 10 | from ignite.contrib.handlers import tensorboard_logger as tb_logger 11 | 12 | 13 | class EpsilonTracker: 14 | def __init__(self, selector: ptan.actions.EpsilonGreedyActionSelector, 15 | params: SimpleNamespace): 16 | self.selector = selector 17 | self.params = params 18 | self.frame(0) 19 | 20 | def frame(self, frame_idx: int): 21 | eps = self.params.epsilon_start - \ 22 | frame_idx / self.params.epsilon_frames 23 | self.selector.epsilon = max(self.params.epsilon_final, eps) 24 | 25 | 26 | def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer, 27 | initial: int, batch_size: int): 28 | buffer.populate(initial) 29 | while True: 30 | buffer.populate(1) 31 | yield buffer.sample(batch_size) 32 | 33 | 34 | def setup_ignite(engine: Engine, params: SimpleNamespace, 35 | exp_source, run_name: str, 36 | extra_metrics: Iterable[str] = (), 37 | loss_metrics: Iterable[str] = ('loss', )): 38 | warnings.simplefilter("ignore", category=UserWarning) 39 | handler = ptan_ignite.EndOfEpisodeHandler( 40 | exp_source, bound_avg_reward=params.stop_reward) 41 | handler.attach(engine) 42 | ptan_ignite.EpisodeFPSHandler().attach(engine) 43 | 44 | @engine.on(ptan_ignite.EpisodeEvents.EPISODE_COMPLETED) 45 | def episode_completed(trainer: Engine): 46 | passed = trainer.state.metrics.get('time_passed', 0) 47 | print("Episode %d: reward=%.4f, steps=%s, " 48 | "speed=%.1f f/s, elapsed=%s" % ( 49 | trainer.state.episode, trainer.state.episode_reward, 50 | trainer.state.episode_steps, 51 | trainer.state.metrics.get('avg_fps', 0), 52 | timedelta(seconds=int(passed)))) 53 | 54 | @engine.on(ptan_ignite.EpisodeEvents.BOUND_REWARD_REACHED) 55 | def game_solved(trainer: Engine): 56 | passed = trainer.state.metrics['time_passed'] 57 | print("Game solved in %s, after %d episodes " 58 | "and %d iterations!" 
% ( 59 | timedelta(seconds=int(passed)), 60 | trainer.state.episode, trainer.state.iteration)) 61 | trainer.should_terminate = True 62 | 63 | now = datetime.now().isoformat(timespec='minutes') 64 | logdir = f"runs/{now}-{params.run_name}-{run_name}" 65 | tb = tb_logger.TensorboardLogger(log_dir=logdir) 66 | for loss_name in loss_metrics: 67 | run_avg = RunningAverage(output_transform=lambda v: v[loss_name]) 68 | run_avg.attach(engine, "avg_" + loss_name) 69 | 70 | metrics = ['reward', 'steps', 'avg_reward'] 71 | handler = tb_logger.OutputHandler( 72 | tag="episodes", metric_names=metrics) 73 | event = ptan_ignite.EpisodeEvents.EPISODE_COMPLETED 74 | tb.attach(engine, log_handler=handler, event_name=event) 75 | 76 | # write to tensorboard every 100 iterations 77 | ptan_ignite.PeriodicEvents().attach(engine) 78 | metrics = ['avg_loss', 'avg_fps'] 79 | metrics.extend(extra_metrics) 80 | handler = tb_logger.OutputHandler( 81 | tag="train", metric_names=metrics, 82 | output_transform=lambda a: a) 83 | event = ptan_ignite.PeriodEvents.ITERS_100_COMPLETED 84 | tb.attach(engine, log_handler=handler, event_name=event) 85 | -------------------------------------------------------------------------------- /Chapter22/lib/model.py: -------------------------------------------------------------------------------- 1 | import ptan 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | from typing import List 6 | 7 | 8 | class DQNModel(nn.Module): 9 | def __init__(self, view_shape, n_actions): 10 | super(DQNModel, self).__init__() 11 | 12 | self.view_conv = nn.Sequential( 13 | nn.Conv2d(view_shape[0], 32, kernel_size=3, padding=0), 14 | nn.ReLU(), 15 | nn.Conv2d(32, 16, kernel_size=2, padding=1), # padding was added for deer model 16 | nn.ReLU(), 17 | nn.Flatten(), 18 | ) 19 | view_out_size = self.view_conv(torch.zeros(1, *view_shape)).size()[-1] 20 | self.fc = nn.Sequential( 21 | nn.Linear(view_out_size, 128), 22 | nn.ReLU(), 23 | nn.Linear(128, n_actions) 24 | ) 25 | 26 | def forward(self, x): 27 | conv_out = self.view_conv(x) 28 | return self.fc(conv_out) 29 | 30 | 31 | def unpack_batch(batch: List[ptan.experience.ExperienceFirstLast]): 32 | states, actions, rewards, dones, last_states = [],[],[],[],[] 33 | for exp in batch: 34 | states.append(exp.state) 35 | actions.append(exp.action) 36 | rewards.append(exp.reward) 37 | dones.append(exp.last_state is None) 38 | if exp.last_state is None: 39 | lstate = exp.state # the result will be masked anyway 40 | else: 41 | lstate = exp.last_state 42 | last_states.append(lstate) 43 | return states, np.array(actions), \ 44 | np.array(rewards, dtype=np.float32), \ 45 | np.array(dones, dtype=bool), \ 46 | last_states 47 | 48 | 49 | def calc_loss_dqn(batch, net, tgt_net, preprocessor, gamma, device="cpu"): 50 | states, actions, rewards, dones, next_states = \ 51 | unpack_batch(batch) 52 | 53 | states = preprocessor(states).to(device) 54 | next_states = preprocessor(next_states).to(device) 55 | 56 | actions_v = torch.tensor(actions).to(device) 57 | rewards_v = torch.tensor(rewards).to(device) 58 | done_mask = torch.BoolTensor(dones).to(device) 59 | 60 | actions_v = actions_v.unsqueeze(-1) 61 | state_action_vals = net(states).gather(1, actions_v) 62 | state_action_vals = state_action_vals.squeeze(-1) 63 | with torch.no_grad(): 64 | next_state_vals = tgt_net(next_states).max(1)[0] 65 | next_state_vals[done_mask] = 0.0 66 | 67 | bellman_vals = next_state_vals.detach() * gamma + rewards_v 68 | return nn.MSELoss()(state_action_vals, bellman_vals) 69 | 
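`calc_loss_dqn` above expects a batch of ptan `ExperienceFirstLast` transitions plus a `preprocessor` that turns a list of raw observations into a tensor. A sketch of a single optimisation step built around it; `net`, `tgt_net`, `buffer` and `preprocessor` are assumed to be constructed elsewhere (the chapter's training scripts do this), and the hyperparameter values are placeholders, so this only illustrates the call sequence:

````
import torch.optim as optim

GAMMA = 0.99        # assumed discount factor
BATCH_SIZE = 32     # assumed batch size

# assumed to already exist:
#   net          - DQNModel defined above
#   tgt_net      - ptan.agent.TargetNet(net)
#   buffer       - a populated ptan experience replay buffer
#   preprocessor - callable turning a list of observations into a tensor
optimizer = optim.Adam(net.parameters(), lr=1e-4)

batch = buffer.sample(BATCH_SIZE)
optimizer.zero_grad()
loss_v = calc_loss_dqn(batch, net, tgt_net.target_model,
                       preprocessor, gamma=GAMMA, device="cpu")
loss_v.backward()
optimizer.step()
# in real training the target network is synced every N steps, not every step
tgt_net.sync()
````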
-------------------------------------------------------------------------------- /Chapter22/requirements.txt: -------------------------------------------------------------------------------- 1 | magent2==0.3.3 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Max Lapan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gymnasium[atari]==0.29.1 2 | gymnasium[classic-control]==0.29.1 3 | gymnasium[accept-rom-license]==0.29.1 4 | moviepy==1.0.3 5 | numpy<2 6 | opencv-python==4.10.0.84 7 | torch==2.5.0 8 | torchvision==0.20.0 9 | pytorch-ignite==0.5.1 10 | tensorboard==2.18.0 11 | mypy==1.8.0 12 | ptan==0.8.1 13 | stable-baselines3==2.3.2 14 | torchrl==0.6.0 15 | ray[tune]==2.37.0 16 | pytest 17 | -------------------------------------------------------------------------------- /tools/avg_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import csv 3 | import pathlib 4 | import argparse 5 | import itertools 6 | import typing as tt 7 | from dataclasses import dataclass 8 | 9 | 10 | @dataclass 11 | class Series: 12 | start_wall: float 13 | time_deltas: tt.List[float] 14 | steps: tt.List[int] 15 | values: tt.List[float] 16 | 17 | @classmethod 18 | def read(cls, path: pathlib.Path) -> "Series": 19 | start_wall = None 20 | deltas = [] 21 | steps = [] 22 | values = [] 23 | with path.open('rt', encoding='utf-8') as fd: 24 | reader = csv.DictReader(fd) 25 | for r in reader: 26 | assert isinstance(r, dict) 27 | t = float(r["Wall time"]) 28 | if start_wall is None: 29 | start_wall = t 30 | deltas.append(t - start_wall) 31 | steps.append(int(r["Step"])) 32 | values.append(float(r["Value"])) 33 | return Series(start_wall=start_wall, time_deltas=deltas, steps=steps, values=values) 34 | 35 | def write(self, path: pathlib.Path): 36 | with path.open('wt', encoding='utf-8') as fd: 37 | writer = csv.DictWriter(fd, ('Wall time', 'Step', 'Value')) 38 | writer.writeheader() 39 | for dt, s, v in zip(self.time_deltas, self.steps, self.values): 40 | writer.writerow({ 41 | 'Wall time': self.start_wall + dt, 42 | 'Step': s, 43 | 'Value': v, 44 | }) 45 | 46 | def 
__iter__(self) -> tt.Generator[tt.Tuple[float, int, float], None, None]: 47 | yield from zip(self.time_deltas, self.steps, self.values) 48 | 49 | 50 | def mean_max_step(series: tt.List[Series]) -> float: 51 | return sum(map(lambda s: s.steps[-1], series)) / len(series) 52 | 53 | 54 | def avg_entries(entries: tt.Tuple[tt.Optional[tt.Tuple[float, int, float]], ...], 55 | do_sum: bool = False) -> tt.Tuple[float, int, float]: 56 | deltas = [] 57 | steps = [] 58 | values = [] 59 | for entry in entries: 60 | if entry is None: 61 | continue 62 | d, s, v = entry 63 | deltas.append(d) 64 | steps.append(s) 65 | values.append(v) 66 | if do_sum: 67 | return sum(deltas), int(sum(steps)), sum(values) 68 | else: 69 | return sum(deltas) / len(deltas), int(sum(steps) / len(steps)), sum(values) / len(values) 70 | 71 | 72 | def average_series(series: tt.List[Series], do_sum: bool = False) -> Series: 73 | mean_steps = mean_max_step(series) 74 | start_wall = series[0].start_wall 75 | deltas = [] 76 | steps = [] 77 | values = [] 78 | 79 | for vals in itertools.zip_longest(*series): 80 | dt, s, v = avg_entries(vals, do_sum=do_sum) 81 | if s <= mean_steps: 82 | deltas.append(dt) 83 | steps.append(s) 84 | values.append(v) 85 | return Series(start_wall=start_wall, time_deltas=deltas, steps=steps, values=values) 86 | 87 | 88 | if __name__ == "__main__": 89 | parser = argparse.ArgumentParser() 90 | parser.add_argument("-o", "--output", required=True, help="Output csv file to produce") 91 | parser.add_argument("--sum", default=False, action="store_true", help="Perform summation instead of average") 92 | parser.add_argument("files", nargs='+', help="Input csv files") 93 | args = parser.parse_args() 94 | 95 | series = [Series.read(pathlib.Path(n)) for n in args.files] 96 | res = average_series(series, do_sum=args.sum) 97 | res.write(pathlib.Path(args.output)) -------------------------------------------------------------------------------- /tools/ch12/norm_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | MEAN = 10 6 | SAMPLES = 1000000 7 | X_MAX = MEAN * 2 8 | BINS = 100 9 | STEP = X_MAX / BINS 10 | 11 | 12 | if __name__ == "__main__": 13 | c1, c2, c5 = [0] * BINS, [0] * BINS, [0] * BINS 14 | for _ in range(SAMPLES): 15 | v1 = np.random.normal(loc=MEAN, scale=1.0) 16 | v2 = np.random.normal(loc=MEAN, scale=2.0) 17 | v5 = np.random.normal(loc=MEAN, scale=5.0) 18 | if 0 <= v1 <= X_MAX: 19 | b = int(BINS * v1 / X_MAX) 20 | c1[b] += 1 21 | if 0 <= v2 <= X_MAX: 22 | b = int(BINS * v2 / X_MAX) 23 | c2[b] += 1 24 | if 0 <= v5 <= X_MAX: 25 | b = int(BINS * v5 / X_MAX) 26 | c5[b] += 1 27 | x = [STEP * i for i in range(BINS)] 28 | y1 = [c / SAMPLES for c in c1] 29 | y2 = [c / SAMPLES for c in c2] 30 | y5 = [c / SAMPLES for c in c5] 31 | print(x) 32 | print(y1) 33 | 34 | fig = plt.figure() 35 | ax1 = fig.add_subplot(111) 36 | ax1.plot(x, y1, color='black', linewidth=1.2, linestyle='-') 37 | ax1.plot(x, y2, color='black', linewidth=1.2, linestyle=':') 38 | ax1.plot(x, y5, color='black', linewidth=1.2, linestyle='--') 39 | ax1.legend(["Variance = 1.0", "Variance = 2.0", "Variance = 5.0"], 40 | loc='upper right', fancybox=True) 41 | ax1.set_xlim(0, X_MAX) 42 | ax1.grid(True, axis='both') 43 | ax1.set_title("Gaussian Distribution with mean = 10.0") 44 | plt.savefig("norm_dist.svg") 45 | pass --------------------------------------------------------------------------------
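`tools/avg_csv.py` above merges several TensorBoard CSV exports that share the `Wall time,Step,Value` layout of the tournament files earlier in the repository, either via its command line (`python tools/avg_csv.py -o out.csv file1.csv file2.csv ...`) or programmatically. A sketch of the programmatic use; it assumes `Series` and `average_series` from that file are in scope, and the input paths are illustrative ones taken from `Chapter20/tournament/3ed`:

````
import pathlib

# assumes Series and average_series from tools/avg_csv.py are importable here
files = ["Chapter20/tournament/3ed/v1-wins.csv",
         "Chapter20/tournament/3ed/v2-wins.csv"]
series = [Series.read(pathlib.Path(name)) for name in files]
avg = average_series(series)               # element-wise mean over the runs,
                                           # truncated at the mean final step
avg.write(pathlib.Path("v-avg-wins.csv"))  # same three-column CSV layout
````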