├── .gitignore ├── AlphaGoZero_Intruduction ├── alphago_zero_introduction.pdf ├── alphago_zero_introduction.tex └── fig │ ├── 1_bq8g8w1ti-qi-r2asH-7Xg.png │ ├── alpha_mcts0.png │ ├── alpha_mcts1.png │ ├── alpha_mcts2.png │ ├── alpha_mcts3.png │ ├── alpha_mcts4.png │ ├── alphagozero_mcts1.png │ ├── alphagozero_mcts2.png │ ├── autodidactic_iteration.png │ ├── convolutional_layer.png │ ├── deepcube_method.png │ ├── deepcube_nn.png │ ├── evaluate_network.png │ ├── expanded_tree.png │ ├── expert_policies.png │ ├── game.png │ ├── game_state.png │ ├── go_state_space.png │ ├── mcts_2_to_4.png │ ├── mcts_backpropagation.png │ ├── mcts_expansion.png │ ├── mcts_iterations.png │ ├── mcts_process.png │ ├── mcts_selection.png │ ├── mcts_simulation.png │ ├── neural_netwrok_architecture.png │ ├── reinforcement_learning.png │ ├── residual_layer.png │ ├── retrain_network.png │ ├── rna_folding │ ├── input_state.png │ ├── native.png │ ├── native_mat.png │ ├── nn.png │ ├── output.png │ ├── rna_folding.pptx │ ├── s0.png │ ├── s1_1.png │ ├── s1_2.png │ ├── s1_3.png │ ├── s2_1.png │ ├── s3_1.png │ └── sn.png │ ├── rna_folding1.png │ ├── rna_folding2.png │ ├── rna_folding3.png │ ├── rubiks_cube.png │ ├── rubiks_cube_action.png │ ├── rubiks_cube_state.png │ ├── s0.png │ ├── self_play.png │ ├── the_policy_head.png │ └── the_value_head.png ├── AlphaGomoku ├── alpha_gomoku.py ├── cpu_node.sh ├── gomoku.py ├── loss │ ├── Residual_CNN_8x8_loss.png │ ├── Simple_CNN_19x19_loss.png │ └── Simple_CNN_8x8_loss.png ├── mcts.py ├── models │ ├── Residual_CNN_8x8_3000.h5 │ ├── Simple_CNN_19x19_3000.h5 │ ├── Simple_CNN_19x19_5000.h5 │ └── Simple_CNN_8x8_3000.h5 └── neural_network.py ├── DDDQN └── Doom-Deadly-Corridor │ ├── deadly_corridor.cfg │ └── deadly_corridor.wad ├── DDPG └── Ant │ ├── DDPG_Ant-v2.pth │ └── DDPG_Ant.py ├── DQN ├── Atari_Space_Invaders │ ├── DQN_Atari_Space_Invaders.py │ ├── Space Invaders (1983) (CCE) (C-820).bin │ ├── model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ └── model.ckpt.meta │ └── train_log │ │ └── events.out.tfevents.1530462157.MKK └── Doom │ ├── DQN_Doom.py │ ├── basic.cfg │ ├── basic.wad │ ├── model │ ├── checkpoint │ ├── model.ckpt.data-00000-of-00001 │ ├── model.ckpt.index │ └── model.ckpt.meta │ └── train_log │ └── events.out.tfevents.1524621481.MKK ├── LICENSE ├── MCTS ├── MCTS_Gomoku.py └── MCTS_TicTacToe.py ├── PG ├── Cartpole_pytorch │ ├── PG_CartPole-v0.pth │ └── PG_CartPole.py ├── Cartpole_tensorflow │ ├── PG_Cartpole.py │ └── model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ └── model.ckpt.meta └── Doom-Deathmatch │ ├── PG_Doom_Deathmatch.py │ ├── defend_the_center.cfg │ └── defend_the_center.wad ├── PPO └── HalfCheetah │ ├── PPO_HalfCheetah-v2.pth │ └── PPO_HalfCheetah.py ├── QLearning ├── QLearning_FrozenLake.py ├── QLearning_Taxi_v2.py ├── QLearning_TicTacToe.py └── game.py ├── README.md ├── Roms └── Roms.zip ├── imgs ├── Bellman_equation.png ├── DQN.png ├── DQN2.png ├── DQN_loss.png ├── DQN_neural_network.png ├── DQN_neural_network2.png ├── PER.png ├── alphagomoku.png ├── ant.gif ├── ddpg_algorithm.svg ├── doom_loss.png ├── double_DQN.png ├── dueling_DQN1.png ├── dueling_DQN2.png ├── fixed_q_targets.png ├── frozenlake.png ├── halfcheetah.gif ├── mcts_gomoku.png ├── pdf_0.png ├── pdf_1.png ├── pdf_2.png ├── pdf_3.png ├── pg_algorithm.svg ├── pg_doom_deathmatch.png ├── pg_loss.png ├── pg_mean_reward.png ├── pg_network.png ├── play_atari_space_invaders.gif ├── play_cartpole.gif ├── play_doom.gif 
├── play_doom_deadly_corridor.gif ├── play_doom_deathmatch.gif ├── policy_gradients.png ├── ppo_algorithm.svg ├── sumtree.png ├── taxi1.png ├── taxi2.png ├── taxi3.png ├── taxi4.png ├── taxi5.png ├── taxi6.png ├── tic1.png ├── tic2.png ├── tic3.png ├── tic4.png ├── tic5.png ├── tic6.png └── tic7.png └── test ├── Airstriker-Genesis-Level1-000000.bk2 ├── Airstriker-Genesis-Level1-000000.mp4 ├── test_gym.py ├── test_mujoco.py └── test_retro.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/alphago_zero_introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/alphago_zero_introduction.pdf -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/1_bq8g8w1ti-qi-r2asH-7Xg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/1_bq8g8w1ti-qi-r2asH-7Xg.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts0.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts3.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts4.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alphagozero_mcts1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alphagozero_mcts1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alphagozero_mcts2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alphagozero_mcts2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/autodidactic_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/autodidactic_iteration.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/convolutional_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/convolutional_layer.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/deepcube_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/deepcube_method.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/deepcube_nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/deepcube_nn.png 
-------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/evaluate_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/evaluate_network.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/expanded_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/expanded_tree.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/expert_policies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/expert_policies.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/game.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/game.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/game_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/game_state.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/go_state_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/go_state_space.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_2_to_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_2_to_4.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_backpropagation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_backpropagation.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_expansion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_expansion.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_iterations.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_iterations.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_process.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_selection.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_simulation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_simulation.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/neural_netwrok_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/neural_netwrok_architecture.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/reinforcement_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/reinforcement_learning.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/residual_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/residual_layer.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/retrain_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/retrain_network.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/input_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/input_state.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/native.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/native.png 
-------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/native_mat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/native_mat.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/nn.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/output.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/rna_folding.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/rna_folding.pptx -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s0.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s1_1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s1_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s1_2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s1_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s1_3.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s2_1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s3_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s3_1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/sn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/sn.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding3.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rubiks_cube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rubiks_cube.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rubiks_cube_action.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rubiks_cube_action.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rubiks_cube_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rubiks_cube_state.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/s0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/s0.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/self_play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/self_play.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/the_policy_head.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/the_policy_head.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/the_value_head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/the_value_head.png -------------------------------------------------------------------------------- /AlphaGomoku/alpha_gomoku.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import random 4 | import numpy as np 5 | from collections import deque 6 | from gomoku import Gomoku 7 | from mcts import MCTS 8 | from neural_network import Residual_CNN, Simple_CNN 9 | 10 | #====================== 11 | # Configuration 12 | #====================== 13 | # 8x8 14 | game_board_width = 8 15 | mcts_playout_itermax_train = 400 16 | mcts_playout_itermax_play = 1000 17 | model_file = 'Simple_CNN_8x8_3000' 18 | policy_network = Simple_CNN # or Residual_CNN 19 | #====================== 20 | # 19x19 21 | # game_board_width = 19 22 | # mcts_playout_itermax_train = 800 23 | # mcts_playout_itermax_play = 1000 24 | # model_file = 'Simple_CNN_19x19_3000' 25 | # policy_network = Simple_CNN 26 | #====================== 27 | 28 | def random_play(game): 29 | return random.choice(game.actions()) 30 | 31 | def human_play(): 32 | t = input('[*] Your turn (i j): ') 33 | a, b = t.split(' ') 34 | i, j = int(a), int(b) 35 | return (i, j) 36 | 37 | def play_game(): 38 | game = Gomoku(game_board_width) 39 | policy = policy_network(input_dim=game.nn_input.shape, output_dim=game.w**2) 40 | policy.load(model_file) 41 | mcts_player = MCTS(policy, mcts_playout_itermax_play) 42 | 43 | starting_player = random.choice([1,2]) 44 | game.reset(starting_player) 45 | mcts_player.set_rootnode(starting_player) 46 | while not game.is_end: 47 | print(game) 48 | # print(game.nn_input) 49 | 50 | if game.current_player == 1: # Player X 51 | action, _ = mcts_player.get_move(game) 52 | else: # Player O 53 | action = human_play() 54 | 55 | game.move(action) 56 | mcts_player.update_with_move(action, game) 57 | 58 | print("[*] Player %s move: %s\n" % (['X', 'O'][game.player_just_moved-1], action)) 59 | 60 | print(game) 61 | if game.winner > 0: 62 | print("[*] Player %s win" % ['X', 'O'][game.winner-1]) 63 | else: 64 | print("[*] Player draw") 65 | 66 | def self_play(game, player, render=False): 67 | starting_player = random.choice([1,2]) 68 | game.reset(starting_player) 69 | player.set_rootnode(starting_player) 70 | board_states, mcts_probs, cur_players = [], [], [] 71 | 72 | while not game.is_end: 73 | if render: print(game) 74 | 75 | action, action_probs = player.get_move(game, stochastically=True, show_node=render) 76 | 77 | board_states.append(game.nn_input) 78 | mcts_probs.append(action_probs) 79 | cur_players.append(game.current_player) 80 | 81 | game.move(action) 82 | player.update_with_move(action, game) 83 | 84 | if render: print("[*] Player %s move: %s\n" % (['X', 'O'][game.player_just_moved-1], action)) 85 | 86 | rewards = list(map(game.reward, cur_players)) 87 | 88 | if render: 89 | print(game) 90 | if game.winner > 0: 91 | print("[*] Player %s win" % ['X', 'O'][game.winner-1]) 92 | else: 93 | print("[*] 
Player draw") 94 | 95 | return list(zip(board_states, mcts_probs, rewards)), game.winner, starting_player 96 | 97 | def augment_data(play_data): 98 | # augment the data set by rotation and flipping 99 | extend_data = [] 100 | for state, pi, z in play_data: 101 | w = state.shape[-1] 102 | 103 | for i in [1, 2, 3, 4]: 104 | # rotate counterclockwise 105 | equi_state = np.array([np.rot90(s, i) for s in state]) 106 | equi_pi = np.rot90(pi.reshape((w, w)), i) 107 | extend_data.append((equi_state, equi_pi.flatten(), z)) 108 | # flip horizontally 109 | equi_state = np.array([np.fliplr(s) for s in equi_state]) 110 | equi_pi =np.fliplr(equi_pi) 111 | extend_data.append((equi_state, equi_pi.flatten(), z)) 112 | 113 | return extend_data 114 | 115 | 116 | def train(): 117 | game_episode_num = 3000 118 | selfplay_batch_size = 1 119 | data_buffer_size = 10000 120 | check_step = 10 121 | train_batch_size = 512 122 | 123 | data_buffer = deque(maxlen=data_buffer_size) 124 | 125 | game = Gomoku(game_board_width) 126 | policy = policy_network(input_dim=game.nn_input.shape, output_dim=game.w**2) 127 | mcts_player = MCTS(policy, mcts_playout_itermax_train) 128 | winner_num = [0] * 3 129 | 130 | print('[*] Start self play') 131 | # game episode 132 | for i in range(game_episode_num): 133 | 134 | # get train data 135 | start_time = time.time() 136 | for _ in range(selfplay_batch_size): 137 | play_data, winner, starting_player = self_play(game, mcts_player) 138 | episode_len = len(play_data) 139 | extend_data = augment_data(play_data) 140 | data_num = len(extend_data) 141 | data_buffer.extend(extend_data) 142 | winner_num[winner] += 1 143 | end_time = time.time() 144 | 145 | print('[*] Episode: {}, length: {}, start: {}, winner: {}, data: {}, time: {}s, win ratio: X {:.1f}%, O {:.1f}%, - {:.1f}%'.format( 146 | i+1, episode_len, ['-', 'X', 'O'][starting_player], ['-', 'X', 'O'][winner], data_num, int(end_time - start_time), 147 | winner_num[1] / (i+1) * selfplay_batch_size * 100, 148 | winner_num[2] / (i+1) * selfplay_batch_size * 100, 149 | winner_num[0] / (i+1) * selfplay_batch_size * 100, 150 | )) 151 | 152 | # train 153 | if len(data_buffer) > train_batch_size: 154 | mini_batch = random.sample(data_buffer, train_batch_size) 155 | state_batch = np.array([d[0] for d in mini_batch]) 156 | pi_batch = np.array([d[1] for d in mini_batch]) 157 | z_batch = np.array([d[2] for d in mini_batch]) 158 | 159 | policy.train(state_batch, [z_batch, pi_batch]) 160 | 161 | # check current policy model and save the params 162 | if (i + 1) % check_step == 0: 163 | policy.loss_history.plot_loss('loss.png') 164 | print('[*] Save current policy model') 165 | policy.save(model_file) 166 | print('[*] done') 167 | 168 | 169 | 170 | if __name__ == "__main__": 171 | if sys.argv[1] == '--train': 172 | train() 173 | elif sys.argv[1] == '--play': 174 | play_game() 175 | -------------------------------------------------------------------------------- /AlphaGomoku/cpu_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -n 2 4 | #SBATCH -N 1 5 | #SBATCH -w node1 6 | #SBATCH -o slurm.out 7 | #SBATCH -e slurm.err 8 | 9 | python alpha_gomoku.py --train -------------------------------------------------------------------------------- /AlphaGomoku/gomoku.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class GameState: 4 | def __init__(self): 5 | self.player_just_moved = 2 6 | 7 | def clone(self): 8 | st = 
GameState() 9 | st.player_just_moved = self.player_just_moved 10 | return st 11 | 12 | def move(self, action): 13 | self.player_just_moved = 3 - self.player_just_moved 14 | 15 | def actions(self): 16 | """ Get all possible moves from this state. 17 | """ 18 | 19 | def win(self, player): 20 | """ Get the game result from the viewpoint of player. 21 | """ 22 | 23 | def end(self): 24 | """ Whether the game is end or not 25 | """ 26 | 27 | def __repr__(self): 28 | pass 29 | 30 | class Gomoku(GameState): 31 | def __init__(self, w=8): # 15x15 32 | self.w = w 33 | self.reset() 34 | 35 | def reset(self, current_player=1): 36 | w = self.w 37 | self.current_player = current_player 38 | self.first_player = current_player 39 | self.player_just_moved = 3 - current_player 40 | self.board = [] # 0 = empty, 1 = player 1 (X), 2 = player 2 (O) 41 | 42 | for y in range(w): 43 | self.board.append([0] * w) 44 | self.is_end = False 45 | self.winner = -1 # 0 = draw, 1 = player 1 (X), 2 = player 2 (O) 46 | 47 | # 1 if stone here and 0 if stone not here 48 | # fisrt 1 stack - position of current player's stones 49 | # next 1 stack - position of last player's stones 50 | # next 1 stack - position of last player's move 51 | # last 1 stack - All 1 if it's first play, all 0 if it's second play 52 | self.nn_input = np.zeros((4, w, w)) 53 | self.nn_input[-1] = 1 54 | 55 | def clone(self): 56 | st = Gomoku() 57 | st.w = self.w 58 | st.current_player = self.current_player 59 | st.first_player = self.first_player 60 | st.player_just_moved = self.player_just_moved 61 | st.board = [self.board[i][:] for i in range(self.w)] 62 | st.nn_input = np.copy(self.nn_input) 63 | return st 64 | 65 | def move(self, action): 66 | a, b = action 67 | assert 0 <= a <= self.w and 0 <= b <= self.w and self.board[a][b] == 0 68 | self.board[a][b] = self.current_player 69 | self.player_just_moved = self.current_player 70 | self.current_player = 3 - self.current_player 71 | self.check_end(action) 72 | 73 | # update nn_input 74 | self.nn_input = np.zeros((4, self.w, self.w)) 75 | for i in range(self.w): 76 | for j in range(self.w): 77 | s = self.board[i][j] 78 | if s == self.current_player: self.nn_input[0, i, j] = 1 79 | elif s == self.player_just_moved: self.nn_input[1, i, j] = 1 80 | self.nn_input[2, a, b] = 1 81 | self.nn_input[3] = 1 if self.current_player == self.first_player else 0 82 | 83 | def actions(self): 84 | return [(i, j) for i in range(self.w) for j in range(self.w) if self.board[i][j] == 0] 85 | 86 | def check_end(self, action): 87 | a, b = action 88 | for i in range(5): 89 | if i <= a <= self.w - (5 - i) and i <= b <= self.w - (5 - i): 90 | if self.board[a-i][b-i] == self.board[a-i+1][b-i+1] == self.board[a-i+2][b-i+2] == self.board[a-i+3][b-i+3] == self.board[a-i+4][b-i+4]: 91 | self.is_end = True 92 | self.winner = self.player_just_moved 93 | return 94 | if i <= a <= self.w - (5 - i): 95 | if self.board[a-i][b] == self.board[a-i+1][b] == self.board[a-i+2][b] == self.board[a-i+3][b] == self.board[a-i+4][b]: 96 | self.is_end = True 97 | self.winner = self.player_just_moved 98 | return 99 | if i <= a <= self.w - (5 - i) and (4 - i) <= b <= (self.w - i - 1): 100 | if self.board[a-i][b+i] == self.board[a-i+1][b+i-1] == self.board[a-i+2][b+i-2] == self.board[a-i+3][b+i-3] == self.board[a-i+4][b+i-4]: 101 | self.is_end = True 102 | self.winner = self.player_just_moved 103 | return 104 | if i <= b <= self.w - (5 - i): 105 | if self.board[a][b-i] == self.board[a][b-i+1] == self.board[a][b-i+2] == self.board[a][b-i+3] == 
self.board[a][b-i+4]: 106 | self.is_end = True 107 | self.winner = self.player_just_moved 108 | return 109 | 110 | if self.actions() == []: 111 | self.is_end = True 112 | self.winner = 0 113 | 114 | def reward(self, player): 115 | if self.winner == 0: # tie 116 | return 0 117 | if self.winner == player: 118 | return 1 119 | elif self.winner == 3 - player: 120 | return -1 121 | if self.winner == -1: 122 | return 0 123 | 124 | def __repr__(self): 125 | row = '{:>2} ' + ' | '.join(['{}'] * self.w) + ' ' 126 | line = '\n ' + ('----' * self.w)[:-1] + '\n' 127 | s = ' ' + '%2d ' * self.w % tuple(range(self.w)) + '\n' 128 | s += line.join([row.format(i, *map(lambda j: [' ', 'X', 'O'][j], self.board[i])) for i in range(self.w)]) 129 | return s 130 | -------------------------------------------------------------------------------- /AlphaGomoku/loss/Residual_CNN_8x8_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/loss/Residual_CNN_8x8_loss.png -------------------------------------------------------------------------------- /AlphaGomoku/loss/Simple_CNN_19x19_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/loss/Simple_CNN_19x19_loss.png -------------------------------------------------------------------------------- /AlphaGomoku/loss/Simple_CNN_8x8_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/loss/Simple_CNN_8x8_loss.png -------------------------------------------------------------------------------- /AlphaGomoku/mcts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def softmax(x): 5 | p = np.exp(x - np.max(x)) 6 | p /= np.sum(p) 7 | return p 8 | 9 | class Node: 10 | def __init__(self, action=None, parent=None, player=None, prior_p=1.0): 11 | self.action = action 12 | self.parent = parent 13 | self.childs = {} 14 | 15 | self.W = 0 # total action value 16 | self.N = 0 # visit count 17 | self.Q = 0 # mean action value 18 | self.P = prior_p # prior probability of selecting that edge 19 | 20 | self.current_player = player 21 | self.next_player = 3 - player 22 | 23 | def select(self): 24 | # You should think carefully why use -Q here 25 | return max(self.childs.values(), key=lambda c: -c.Q + c.U()) 26 | 27 | def expand(self, actions, probs): 28 | for i in range(len(actions)): 29 | a, p = actions[i], probs[i] 30 | n = Node(a, self, self.next_player, p) 31 | self.childs[a] = n 32 | 33 | def update(self, v): 34 | self.N += 1 35 | self.W += v 36 | self.Q = self.W / self.N 37 | # self.Q += (v - self.Q) / self.N 38 | 39 | def back_propagate(self, v): 40 | self.update(v) 41 | if self.parent: 42 | self.parent.back_propagate(-v) 43 | 44 | def U(self, c_puct=5.0): 45 | # c_puct -- a number in (0, inf) controlling the relative impact of 46 | # values (Q) and prior probability (P) on this node's score 47 | # it is a constant determining the level of exploration 48 | if self.parent: 49 | return c_puct * self.P * np.sqrt(self.parent.N) / (1 + self.N) 50 | return 0 51 | 52 | def __repr__(self): 53 | return "[A: %s, P: %.2f, Q+U: %.2f, W/N: %.1f/%d]" \ 54 | % 
(self.action, self.P, self.Q + self.U(), self.W, self.N) 55 | 56 | def show_node_tree(self, indent=0): 57 | print("| " * indent + str(self)) 58 | 59 | for c in self.childs.values(): 60 | c.show_node_tree(indent+1) 61 | 62 | def show_children_nodes(self): 63 | print('\n[*] Child Nodes') 64 | for c in self.childs.values(): print(c) 65 | 66 | 67 | class MCTS: 68 | def __init__(self, neural_network_fn, playout_itermax, playout_depth=4): 69 | self.f = neural_network_fn # (p, v) = f(s) 70 | self.playout_itermax = playout_itermax 71 | self.playout_depth = playout_depth 72 | 73 | def set_rootnode(self, starting_player): 74 | self.rootnode = Node(player=starting_player) 75 | 76 | def get_move(self, state, stochastically=False, show_node=False, verbose=False): 77 | for i in range(self.playout_itermax): 78 | self.playout(self.rootnode, state.clone()) 79 | 80 | if show_node: 81 | if verbose: self.rootnode.show_node_tree() 82 | else: self.rootnode.show_children_nodes() 83 | 84 | action_probs = np.zeros((state.w, state.w)) 85 | acts, probs = [], [] 86 | for c in self.rootnode.childs.values(): 87 | acts.append(c.action) 88 | probs.append(c.N) 89 | probs = softmax(probs) 90 | for a, p in zip(acts, probs): 91 | action_probs[a] = p 92 | action_probs = action_probs.flatten() 93 | 94 | if stochastically: 95 | # add Dirichlet Noise for exploration (for self-play training) 96 | epsilon = 0.25 97 | eta = 0.3 # Dirichlet noise 98 | i = np.random.choice( 99 | len(acts), 100 | p = (1 - epsilon) * probs + epsilon * np.random.dirichlet(eta * np.ones(len(probs))) 101 | ) 102 | action = acts[i] 103 | else: # deterministically, for competitive play 104 | action = max(self.rootnode.childs.values(), key=lambda c: c.N).action 105 | 106 | return action, action_probs 107 | 108 | def update_with_move(self, action, state): 109 | if action in self.rootnode.childs: 110 | self.rootnode = self.rootnode.childs[action] 111 | self.rootnode.parent = None 112 | else: 113 | self.rootnode = Node(player=state.player_just_moved) 114 | 115 | def playout(self, node, state): 116 | #======================================= 117 | # MCTS without neural network 118 | #======================================= 119 | # # Select & Expand 120 | # for i in range(self.playout_depth): 121 | # if node.childs == {}: 122 | # node.expand(state.actions()) 123 | # 124 | # node = node.select() 125 | # state.move(node.action) 126 | # 127 | # # Rollout 128 | # self.rollout(state) 129 | # 130 | # # Backpropagate 131 | # while node != None: 132 | # node.update(state.reward(node.player_just_moved)) 133 | # node = node.parent 134 | #======================================= 135 | 136 | 137 | # Select 138 | while 1: 139 | if node.childs == {}: 140 | break 141 | 142 | node = node.select() 143 | state.move(node.action) 144 | 145 | # Rollout 146 | v, a, p = self.evaluate_state(state) 147 | 148 | if state.is_end: 149 | v = -1 150 | else: 151 | # Expand 152 | node.expand(a, p) 153 | 154 | # Backpropagate 155 | node.back_propagate(v) 156 | 157 | 158 | def evaluate_state(self, state): 159 | x = state.nn_input.reshape((1, *state.nn_input.shape)) 160 | value, probs = self.f.pred(x) 161 | v = value[0, 0] 162 | a = state.actions() 163 | p = [] 164 | probs = probs.reshape((state.w, state.w)) 165 | for i, j in a: 166 | p.append(probs[i, j]) 167 | p = np.array(p) 168 | if p.sum() > 0: p /= p.sum() 169 | return v, a, p 170 | 171 | def rollout(self, state): 172 | while not state.is_end: 173 | state.move(random.choice(state.actions())) 174 | 175 | 
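A note on the -c.Q + c.U() term in Node.select: each node stores Q as the mean value from the viewpoint of the player to move at that node, and back_propagate negates the value at every step up the tree, so the parent (the player actually choosing the move) maximizes the negation of its child's Q plus the exploration bonus U.

To exercise mcts.py and gomoku.py without a trained Keras model, any object exposing a pred(x) method that returns a (value, probs) pair in the shapes evaluate_state expects will do. The following is a minimal sanity-check sketch; the UniformPolicy stub and the playout count of 50 are illustrative choices, not part of the repository:

import numpy as np
from gomoku import Gomoku
from mcts import MCTS

class UniformPolicy:
    # Illustrative stand-in for Simple_CNN / Residual_CNN: a zero value
    # estimate and uniform move priors, shaped like the (value, probs)
    # output that MCTS.evaluate_state reads from policy.pred(x).
    def pred(self, x):
        batch, w = x.shape[0], x.shape[-1]
        value = np.zeros((batch, 1))
        probs = np.full((batch, w * w), 1.0 / (w * w))
        return value, probs

game = Gomoku(8)                                  # fresh 8x8 board, player 1 (X) to move
player = MCTS(UniformPolicy(), playout_itermax=50)
player.set_rootnode(starting_player=1)
action, pi = player.get_move(game)                # pi: visit-count distribution over the 64 cells
print('suggested move:', action)

With uniform priors and a zero value head the search reduces to plain PUCT exploration driven by visit counts, which makes it a convenient way to check the tree logic before plugging in the trained networks from AlphaGomoku/models.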
-------------------------------------------------------------------------------- /AlphaGomoku/models/Residual_CNN_8x8_3000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Residual_CNN_8x8_3000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/models/Simple_CNN_19x19_3000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Simple_CNN_19x19_3000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/models/Simple_CNN_19x19_5000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Simple_CNN_19x19_5000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/models/Simple_CNN_8x8_3000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Simple_CNN_8x8_3000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/neural_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from keras.models import Sequential, Model, load_model 4 | from keras.layers import Input, Dense, Conv2D, Flatten, BatchNormalization, Activation, LeakyReLU, add 5 | from keras.optimizers import SGD, Adam 6 | from keras import regularizers 7 | from keras.callbacks import TensorBoard, Callback 8 | 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | 14 | import os 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 16 | 17 | class LossHistory(Callback): 18 | def __init__(self): 19 | self.losses = [] 20 | self.policy_head_losses = [] 21 | self.value_head_losses = [] 22 | 23 | def on_epoch_end(self, epoch, logs={}): 24 | self.losses.append(logs.get('loss')) 25 | self.policy_head_losses.append(logs.get('policy_head_loss')) 26 | self.value_head_losses.append(logs.get('value_head_loss')) 27 | 28 | def plot_loss(self, img_file): 29 | fig = plt.figure() 30 | ax = fig.add_subplot(1, 1, 1) 31 | ax.plot(self.losses) 32 | ax.plot(self.policy_head_losses) 33 | ax.plot(self.value_head_losses) 34 | plt.title('Model loss') 35 | plt.ylabel('loss') 36 | plt.xlabel('episode') 37 | plt.legend(['loss', 'policy_head_loss', 'value_head_loss'], loc='upper right') 38 | plt.savefig(img_file) 39 | plt.close(fig) 40 | 41 | class NetworkModel: 42 | def __init__(self): 43 | pass 44 | 45 | def train(self, states, targets): 46 | # model_log = TensorBoard(log_dir='./logs') 47 | return self.model.fit(states, targets, verbose=self.verbose, callbacks=[self.loss_history]) 48 | 49 | def pred(self, x): 50 | return self.model.predict(x) 51 | 52 | def load(self, name): 53 | return self.model.load_weights('models/{}.h5'.format(name)) 54 | 55 | def save(self, name): 56 | self.model.save_weights('models/{}.h5'.format(name)) 57 | 58 | def info(self): 59 | self.model.summary() 60 | 61 | 62 | class Residual_CNN(NetworkModel): 63 | 64 | def 
__init__(self, input_dim, output_dim): 65 | self.input_dim = input_dim 66 | self.output_dim = output_dim 67 | 68 | self.conv_layer_filters = 64 69 | self.conv_layer_kernel_size = (3, 3) 70 | self.residual_layer_num = 2 71 | self.value_head_hidden_layer_size = 20 72 | 73 | self.learning_rate = 0.1 74 | self.momentum = 0.9 75 | self.reg_const = 0.0001 76 | 77 | self.verbose = True 78 | 79 | self.model = self.build_model() 80 | self.loss_history = LossHistory() 81 | 82 | def build_model(self): 83 | """Construct a convolutional neural network with Resnet-style skip connections. 84 | 85 | Network Diagram: [value head] 86 | |---------------------------------| /---C---B---R---F---D---R---D---T 87 | I-----C-----B-----R---o---C-----B-----R-----C-----B-----M-----R--- ..... ---| 88 | \___________/ \___________________________________/ \---C---B---R---F---D---S [polich head] 89 | [Convolutional layer] [Residual layer] 90 | 91 | I - input 92 | B - BatchNormalization 93 | R - Rectifier non-linearity, LeakyReLU 94 | T - tanh 95 | C - Conv2D 96 | F - Flatten 97 | D - Dense 98 | M - merge, add 99 | S - Softmax 100 | O - output 101 | """ 102 | main_input = Input(shape=self.input_dim, name='main_input') 103 | 104 | x = self.conv_layer(main_input, self.conv_layer_filters, self.conv_layer_kernel_size) 105 | for _ in range(self.residual_layer_num): 106 | x = self.residual_layer(x, self.conv_layer_filters, self.conv_layer_kernel_size) 107 | 108 | vh = self.value_head(x) 109 | ph = self.policy_head(x) 110 | 111 | model = Model(inputs=main_input, outputs=[vh, ph]) 112 | model.compile( 113 | loss=['mean_squared_error', 'categorical_crossentropy'], 114 | optimizer=SGD(lr=self.learning_rate, momentum=self.momentum) 115 | ) 116 | 117 | return model 118 | 119 | def conv_layer(self, x, filters, kernel_size): 120 | conv = Conv2D( 121 | filters = filters, 122 | kernel_size = kernel_size, 123 | strides = (1, 1), 124 | padding = 'same', 125 | data_format = 'channels_first', 126 | use_bias = False, 127 | activation = 'linear', 128 | kernel_regularizer = regularizers.l2(self.reg_const) 129 | )(x) 130 | bn = BatchNormalization(axis=1)(conv) 131 | lrelu = LeakyReLU()(bn) 132 | return lrelu 133 | 134 | def residual_layer(self, x, filters, kernel_size): 135 | conv_1 = self.conv_layer(x, filters, kernel_size) 136 | conv_2 = Conv2D( 137 | filters = filters, 138 | kernel_size = kernel_size, 139 | strides = (1, 1), 140 | padding = 'same', 141 | data_format = 'channels_first', 142 | use_bias = False, 143 | activation = 'linear', 144 | kernel_regularizer = regularizers.l2(self.reg_const) 145 | )(conv_1) 146 | bn = BatchNormalization(axis=1)(conv_2) 147 | merge_layer = add([x, bn]) 148 | lrelu = LeakyReLU()(merge_layer) 149 | return lrelu 150 | 151 | def value_head(self, x): 152 | x = self.conv_layer(x, 1, (1, 1)) 153 | x = Flatten()(x) 154 | x = Dense( 155 | self.value_head_hidden_layer_size, 156 | use_bias = False, 157 | activation = 'linear', 158 | kernel_regularizer = regularizers.l2(self.reg_const) 159 | )(x) 160 | x = LeakyReLU()(x) 161 | x = Dense( 162 | 1, 163 | use_bias = False, 164 | activation = 'tanh', 165 | kernel_regularizer = regularizers.l2(self.reg_const), 166 | name = 'value_head' 167 | )(x) 168 | return x 169 | 170 | def policy_head(self, x): 171 | x = self.conv_layer(x, 2, (1, 1)) 172 | x = Flatten()(x) 173 | x = Dense( 174 | self.output_dim, 175 | use_bias = False, 176 | activation = 'softmax', 177 | kernel_regularizer = regularizers.l2(self.reg_const), 178 | name = 'policy_head' 179 | )(x) 180 | return x 181 | 182 
| 183 | class Simple_CNN(NetworkModel): 184 | def __init__(self, input_dim, output_dim): 185 | self.input_dim = input_dim 186 | self.output_dim = output_dim 187 | self.l2_const = 1e-4 188 | 189 | self.verbose = True 190 | 191 | self.model = self.build_model() 192 | self.loss_history = LossHistory() 193 | 194 | def build_model(self): 195 | """ 196 | Network Diagram: 197 | 2(1x1) 64 1 198 | 32(3x3) 64(3x3) 128(3x3) /-----C-----F-----D-----D-----T [value head] 199 | I-----C-----R-----C-----R-----C-----R-----| 200 | \_____________________________/ \-----C-----F-----D-----S [polich head] 201 | [Convolutional layer] 4(1x1) w^2 202 | 203 | I - input 204 | B - BatchNormalization 205 | R - ReLU 206 | T - tanh 207 | C - Conv2D 208 | F - Flatten 209 | D - Dense 210 | S - Softmax 211 | """ 212 | main_input = Input(shape=self.input_dim, name='main_input') 213 | 214 | x = self.conv_layer(main_input, 32, (3, 3)) 215 | x = self.conv_layer(x, 64, (3, 3)) 216 | x = self.conv_layer(x, 128, (3, 3)) 217 | 218 | vh = self.value_head(x) 219 | ph = self.policy_head(x) 220 | 221 | model = Model(main_input, [vh, ph]) 222 | model.compile( 223 | optimizer=Adam(), 224 | loss=['mean_squared_error', 'categorical_crossentropy'] 225 | ) 226 | return model 227 | 228 | def conv_layer(self, x, filters, kernel_size, padding='same'): 229 | conv = Conv2D( 230 | filters = filters, 231 | kernel_size = kernel_size, 232 | padding = padding, 233 | data_format = 'channels_first', 234 | activation = 'relu', 235 | kernel_regularizer = regularizers.l2(self.l2_const) 236 | )(x) 237 | return conv 238 | 239 | def value_head(self, x): 240 | x = self.conv_layer(x, 2, (1, 1), 'valid') 241 | x = Flatten()(x) 242 | x = Dense(64, kernel_regularizer=regularizers.l2(self.l2_const))(x) 243 | x = Dense( 244 | 1, 245 | kernel_regularizer = regularizers.l2(self.l2_const), 246 | activation = 'tanh', 247 | name = 'value_head' 248 | )(x) 249 | return x 250 | 251 | def policy_head(self, x): 252 | x = self.conv_layer(x, 4, (1, 1), 'valid') 253 | x = Flatten()(x) 254 | x = Dense( 255 | self.output_dim, 256 | kernel_regularizer = regularizers.l2(self.l2_const), 257 | activation = 'softmax', 258 | name = 'policy_head' 259 | )(x) 260 | return x 261 | 262 | -------------------------------------------------------------------------------- /DDDQN/Doom-Deadly-Corridor/deadly_corridor.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = deadly_corridor.wad 6 | 7 | # Skill 5 is reccomanded for the scenario to be a challenge. 
8 | doom_skill = 5 9 | 10 | # Rewards 11 | death_penalty = 100 12 | #living_reward = 0 13 | 14 | # Rendering options 15 | screen_resolution = RES_160X120 16 | screen_format = GRAY8 17 | render_hud = true 18 | render_crosshair = false 19 | render_weapon = true 20 | render_decals = false 21 | render_particles = false 22 | window_visible = true 23 | 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | MOVE_LEFT 30 | MOVE_RIGHT 31 | ATTACK 32 | MOVE_FORWARD 33 | MOVE_BACKWARD 34 | TURN_LEFT 35 | TURN_RIGHT 36 | } 37 | 38 | # Game variables that will be in the state 39 | available_game_variables = { HEALTH } 40 | 41 | mode = PLAYER 42 | 43 | 44 | -------------------------------------------------------------------------------- /DDDQN/Doom-Deadly-Corridor/deadly_corridor.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DDDQN/Doom-Deadly-Corridor/deadly_corridor.wad -------------------------------------------------------------------------------- /DDPG/Ant/DDPG_Ant-v2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DDPG/Ant/DDPG_Ant-v2.pth -------------------------------------------------------------------------------- /DDPG/Ant/DDPG_Ant.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import gym 5 | import time 6 | 7 | def logger_print(logger, key, with_min_and_max=False): 8 | if with_min_and_max: 9 | print(f'{key+":":13s} {np.mean(logger[key]):.4f} {np.min(logger[key]):.4f}(min) {np.max(logger[key]):.4f}(max) {np.std(logger[key]):.4f}(std)') 10 | else: 11 | print(f'{key+":":13s} {np.mean(logger[key]):.4f}') 12 | 13 | def get_parameter_number(net): 14 | total_num = sum(p.numel() for p in net.parameters()) 15 | trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad) 16 | return {'Total': total_num, 'Trainable': trainable_num} 17 | 18 | def weight_init(m): 19 | ''' 20 | Code from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 21 | Usage: 22 | model = Model() 23 | model.apply(weight_init) 24 | ''' 25 | if isinstance(m, nn.Linear): 26 | nn.init.xavier_normal_(m.weight.data) 27 | nn.init.normal_(m.bias.data) 28 | 29 | class ReplayBuffer: 30 | """ 31 | A simple FIFO experience replay buffer for DDPG agents. 
32 | """ 33 | def __init__(self, obs_dim, act_dim, size): 34 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 35 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 36 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 37 | self.rews_buf = np.zeros(size, dtype=np.float32) 38 | self.done_buf = np.zeros(size, dtype=np.float32) 39 | self.ptr, self.size, self.max_size = 0, 0, size 40 | 41 | def store(self, obs, act, rew, next_obs, done): 42 | self.obs1_buf[self.ptr] = obs 43 | self.obs2_buf[self.ptr] = next_obs 44 | self.acts_buf[self.ptr] = act 45 | self.rews_buf[self.ptr] = rew 46 | self.done_buf[self.ptr] = done 47 | self.ptr = (self.ptr + 1) % self.max_size 48 | self.size = min(self.size + 1, self.max_size) 49 | 50 | def sample_batch(self, batch_size=32): 51 | idxs = np.random.randint(0, self.size, size=batch_size) 52 | return dict(obs1=self.obs1_buf[idxs], 53 | obs2=self.obs2_buf[idxs], 54 | acts=self.acts_buf[idxs], 55 | rews=self.rews_buf[idxs], 56 | done=self.done_buf[idxs]) 57 | 58 | class MLP(nn.Module): 59 | def __init__(self, sizes, activation=nn.Tanh, output_activation=None): 60 | super().__init__() 61 | 62 | net = [] 63 | for i in range(len(sizes)-1): 64 | net.append(nn.Linear(sizes[i], sizes[i+1])) 65 | if i == len(sizes) - 2: 66 | if output_activation is not None: 67 | net.append(output_activation()) 68 | else: 69 | net.append(activation()) 70 | 71 | self.mlp = nn.Sequential(*net) 72 | 73 | def forward(self, x): 74 | return self.mlp(x) 75 | 76 | 77 | class Actor_Critic(nn.Module): 78 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.ReLU, output_activation=nn.Tanh, action_space=None): 79 | super().__init__() 80 | 81 | self.actor = MLP([obs_dim] + hidden_sizes + [act_dim], activation, output_activation) 82 | self.critic = MLP([obs_dim + act_dim] + hidden_sizes + [1], activation, None) 83 | 84 | """ 85 | Deep Deterministic Policy Gradient (DDPG) 86 | """ 87 | def ddpg( 88 | env_name, 89 | ac_kwargs=dict(), 90 | seed=0, 91 | steps_per_epoch=5000, 92 | epochs=100, 93 | replay_size=int(1e6), 94 | gamma=0.99, 95 | polyak=0.995, 96 | pi_lr=1e-3, 97 | q_lr=1e-3, 98 | batch_size=100, 99 | start_steps=10000, 100 | act_noise=0.1, 101 | max_ep_len=1000 102 | ): 103 | """ 104 | 105 | Args: 106 | env_fn : A function which creates a copy of the environment. 107 | The environment must satisfy the OpenAI Gym API. 108 | 109 | actor_critic: A function which takes in placeholder symbols 110 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 111 | outputs from the agent's Tensorflow computation graph: 112 | 113 | =========== ================ ====================================== 114 | Symbol Shape Description 115 | =========== ================ ====================================== 116 | ``pi`` (batch, act_dim) | Deterministically computes actions 117 | | from policy given states. 118 | ``q`` (batch,) | Gives the current estimate of Q* for 119 | | states in ``x_ph`` and actions in 120 | | ``a_ph``. 121 | ``q_pi`` (batch,) | Gives the composition of ``q`` and 122 | | ``pi`` for states in ``x_ph``: 123 | | q(x, pi(x)). 124 | =========== ================ ====================================== 125 | 126 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 127 | function you provided to DDPG. 128 | 129 | seed (int): Seed for random number generators. 130 | 131 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 132 | for the agent and the environment in each epoch. 
133 | 134 | epochs (int): Number of epochs to run and train agent. 135 | 136 | replay_size (int): Maximum length of replay buffer. 137 | 138 | gamma (float): Discount factor. (Always between 0 and 1.) 139 | 140 | polyak (float): Interpolation factor in polyak averaging for target 141 | networks. Target networks are updated towards main networks 142 | according to: 143 | 144 | .. math:: \\theta_{\\text{targ}} \\leftarrow 145 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 146 | 147 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 148 | close to 1.) 149 | 150 | pi_lr (float): Learning rate for policy. 151 | 152 | q_lr (float): Learning rate for Q-networks. 153 | 154 | batch_size (int): Minibatch size for SGD. 155 | 156 | start_steps (int): Number of steps for uniform-random action selection, 157 | before running real policy. Helps exploration. 158 | 159 | act_noise (float): Stddev for Gaussian exploration noise added to 160 | policy at training time. (At test time, no noise is added.) 161 | 162 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 163 | 164 | """ 165 | print(locals()) 166 | 167 | torch.manual_seed(seed) 168 | np.random.seed(seed) 169 | if torch.cuda.is_available(): 170 | torch.cuda.manual_seed_all(seed) 171 | 172 | env = gym.make(env_name) 173 | test_env = gym.make(env_name) 174 | obs_dim = env.observation_space.shape[0] 175 | act_dim = env.action_space.shape[0] 176 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 177 | act_limit = env.action_space.high[0] 178 | 179 | # Share information about action space with policy architecture 180 | ac_kwargs['action_space'] = env.action_space 181 | 182 | # Experience buffer 183 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 184 | 185 | # Model 186 | main_ac = Actor_Critic(obs_dim, act_dim, **ac_kwargs) 187 | target_ac = Actor_Critic(obs_dim, act_dim, **ac_kwargs) 188 | print(main_ac) 189 | print(f'\nNumber of parameters: {get_parameter_number(main_ac)}\n') 190 | main_ac.apply(weight_init) 191 | 192 | pi_optimizer = torch.optim.Adam(main_ac.actor.parameters(), lr=pi_lr) 193 | q_optimizer = torch.optim.Adam(main_ac.critic.parameters(), lr=q_lr) 194 | mse_loss = nn.MSELoss() 195 | 196 | # copy main_ac nn parameters to target_ac 197 | for v_targ, v_main in zip(target_ac.parameters(), main_ac.parameters()): 198 | v_targ.data.copy_(v_main.data) 199 | 200 | # Main loop: collect experience in env and update/log each epoch 201 | t = 0 202 | start_time = time.time() 203 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 204 | max_avg_ret = -np.inf 205 | 206 | for epoch in range(epochs): 207 | logger = { 208 | 'LossQ': [], 209 | 'QVals': [], 210 | 'LossPi': [], 211 | 'EpRet': [], 212 | 'EpLen': [], 213 | 'TestEpRet': [], 214 | 'TestEpLen': [] 215 | } 216 | 217 | for _ in range(steps_per_epoch): 218 | """ 219 | Until start_steps have elapsed, randomly sample actions 220 | from a uniform distribution for better exploration. Afterwards, 221 | use the learned policy (with some noise, via act_noise). 
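            After start_steps, the actor output is scaled by act_limit, perturbed with
            zero-mean Gaussian noise of stddev act_noise, and clipped back into
            [-act_limit, act_limit] (see the branch just below).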
222 | """ 223 | if t > start_steps: 224 | with torch.no_grad(): 225 | pi = act_limit * main_ac.actor(torch.tensor(o, dtype=torch.float)) 226 | pi = pi.numpy() + act_noise * np.random.randn(act_dim) 227 | a = np.clip(pi, -act_limit, act_limit) 228 | else: 229 | a = env.action_space.sample() 230 | 231 | o2, r, d, _ = env.step(a) 232 | ep_ret += r 233 | ep_len += 1 234 | 235 | # Ignore the "done" signal if it comes from hitting the time 236 | # horizon (that is, when it's an artificial terminal signal 237 | # that isn't based on the agent's state) 238 | d = False if ep_len==max_ep_len else d 239 | 240 | # Store experience to replay buffer 241 | replay_buffer.store(o, a, r, o2, d) 242 | 243 | # Super critical, easy to overlook step: make sure to update 244 | # most recent observation! 245 | o = o2 246 | 247 | if d or (ep_len == max_ep_len): 248 | """ 249 | Perform all DDPG updates at the end of the trajectory, 250 | in accordance with tuning done by TD3 paper authors. 251 | """ 252 | for _ in range(ep_len): 253 | batch = replay_buffer.sample_batch(batch_size) 254 | obs1 = torch.tensor(batch['obs1'], dtype=torch.float) 255 | obs2 = torch.tensor(batch['obs2'], dtype=torch.float) 256 | acts = torch.tensor(batch['acts'], dtype=torch.float) 257 | rews = torch.tensor(batch['rews'], dtype=torch.float).unsqueeze(1) 258 | done = torch.tensor(batch['done'], dtype=torch.float).unsqueeze(1) 259 | 260 | # Q-learning update 261 | q = main_ac.critic(torch.cat([obs1, acts], dim=-1)) 262 | pi_targ = act_limit * target_ac.actor(obs2) 263 | q_pi_targ = target_ac.critic(torch.cat([obs2, pi_targ], dim=-1)) 264 | backup = rews + gamma * (1 - done) * q_pi_targ 265 | q_loss = mse_loss(q, backup.detach()) 266 | 267 | q_optimizer.zero_grad() 268 | q_loss.backward() 269 | q_optimizer.step() 270 | logger['LossQ'].append(q_loss.item()) 271 | logger['QVals'] += q.squeeze().tolist() 272 | 273 | # Policy update 274 | pi = act_limit * main_ac.actor(obs1) 275 | q_pi = main_ac.critic(torch.cat([obs1, pi], dim=-1)) 276 | pi_loss = -q_pi.mean() 277 | 278 | pi_optimizer.zero_grad() 279 | pi_loss.backward() 280 | pi_optimizer.step() 281 | 282 | logger['LossPi'].append(pi_loss.item()) 283 | 284 | # Target update 285 | for v_targ, v_main in zip(target_ac.parameters(), main_ac.parameters()): 286 | v_targ.data.copy_(polyak * v_targ.data + (1 - polyak) * v_main.data) 287 | 288 | logger['EpRet'].append(ep_ret) 289 | logger['EpLen'].append(ep_len) 290 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 291 | 292 | t += 1 293 | 294 | # Test the performance of the deterministic version of the agent. 
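        # Evaluation sketch: ten test episodes per epoch are rolled out with the current
        # main actor and no exploration noise; actions are clipped to [-act_limit, act_limit]
        # and the resulting returns/lengths are logged as TestEpRet / TestEpLen below.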
295 | with torch.no_grad(): 296 | for _ in range(10): 297 | ob, ret, done, test_ep_ret, test_ep_len = test_env.reset(), 0, False, 0, 0 298 | while not(done or (test_ep_len == max_ep_len)): 299 | # Take deterministic actions at test time withour noise 300 | pi = act_limit * main_ac.actor(torch.tensor(ob, dtype=torch.float)) 301 | act = np.clip(pi, -act_limit, act_limit) 302 | ob, ret, done, _ = test_env.step(act) 303 | test_ep_ret += ret 304 | test_ep_len += 1 305 | logger['TestEpRet'].append(test_ep_ret) 306 | logger['TestEpLen'].append(test_ep_len) 307 | 308 | # Log info about epoch 309 | print('-'*40) 310 | print(f'Epoch: {epoch}') 311 | print(f'TotalEnvInteracts: {t}') 312 | logger_print(logger, 'EpRet', True) 313 | logger_print(logger, 'EpLen') 314 | logger_print(logger, 'TestEpRet', True) 315 | logger_print(logger, 'TestEpLen') 316 | logger_print(logger, 'QVals', True) 317 | logger_print(logger, 'LossPi') 318 | logger_print(logger, 'LossQ') 319 | print(f'Time: {time.time()-start_time:.4f}s') 320 | print('-'*40+'\n') 321 | 322 | # Save model 323 | if np.mean(logger['EpRet']) > max_avg_ret: 324 | max_avg_ret = np.mean(logger['EpRet']) 325 | torch.save(main_ac.state_dict(), 'DDPG_{}.pth'.format(env_name)) 326 | 327 | env.close() 328 | 329 | if __name__ == '__main__': 330 | import argparse 331 | parser = argparse.ArgumentParser() 332 | parser.add_argument('--env', type=str, default='Ant-v2') 333 | parser.add_argument('--hid', type=int, default=300) 334 | parser.add_argument('--l', type=int, default=2) 335 | parser.add_argument('--gamma', type=float, default=0.99) 336 | parser.add_argument('--seed', '-s', type=int, default=0) 337 | parser.add_argument('--epochs', type=int, default=50) 338 | parser.add_argument('--exp_name', type=str, default='ddpg') 339 | args = parser.parse_args() 340 | 341 | ddpg( 342 | args.env, 343 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 344 | gamma=args.gamma, 345 | seed=args.seed, 346 | epochs=args.epochs 347 | ) -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/DQN_Atari_Space_Invaders.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import retro 4 | from skimage import transform 5 | from skimage.color import rgb2gray 6 | from collections import deque 7 | import random 8 | import sys 9 | import time 10 | 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | import os 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 16 | 17 | 18 | ########################################### 19 | # Constant 20 | stack_size = 4 21 | frame_size = (110, 84) 22 | # Global variables 23 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 24 | ########################################### 25 | 26 | def create_environment(): 27 | env = retro.make(game='SpaceInvaders-Atari2600') 28 | possible_actions = np.array(np.identity(env.action_space.n, dtype=np.int).tolist()) 29 | return env, possible_actions 30 | 31 | def test_environment(): 32 | env, possible_actions = create_environment() 33 | episodes = 1 34 | 35 | for _ in range(episodes): 36 | env.reset() 37 | done = False 38 | 39 | while not done: 40 | env.render() 41 | choice = random.randint(0, action_size - 1) 42 | action = possible_actions[choice] 43 | state, reward, done, info = env.step(action) 44 | 45 | env.close() 46 | 47 | def preprocess_frame(frame): 48 | gray = rgb2gray(frame) 49 | cropped_frame = gray[8:-12, 4:-12] 50 | 
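    # Note: rgb2gray already returns floats in [0, 1], so the division by 255 below
    # rescales the cropped frame to [0, 1/255] rather than normalizing it to [0, 1].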
normalized_frame = cropped_frame / 255.0 51 | preprocessed_frame = transform.resize(normalized_frame, frame_size) 52 | return preprocessed_frame 53 | 54 | def stack_frames(state, is_new_episode=False): 55 | global stacked_frames 56 | frame = preprocess_frame(state) 57 | 58 | if is_new_episode: 59 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 60 | 61 | for _ in range(stack_size): 62 | stacked_frames.append(frame) 63 | else: 64 | stacked_frames.append(frame) 65 | 66 | return np.stack(stacked_frames, axis=2) 67 | 68 | 69 | class DQNetwork: 70 | def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'): 71 | with tf.variable_scope(name): 72 | self.inputs = tf.placeholder(tf.float32, [None, *state_size], name='inputs') 73 | self.actions = tf.placeholder(tf.float32, [None, action_size], name='actions') 74 | self.target_q = tf.placeholder(tf.float32, [None], name='target_q') 75 | 76 | conv1 = tf.layers.conv2d( 77 | inputs = self.inputs, 78 | filters = 32, 79 | kernel_size = [8, 8], 80 | strides = [4, 4], 81 | padding = 'VALID', 82 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 83 | name = 'conv1' 84 | ) 85 | conv1_out = tf.nn.elu(conv1, name='conv1_out') 86 | 87 | conv2 = tf.layers.conv2d( 88 | inputs = conv1_out, 89 | filters = 64, 90 | kernel_size = [4, 4], 91 | strides = [2, 2], 92 | padding = 'VALID', 93 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 94 | name = 'conv2' 95 | ) 96 | conv2_out = tf.nn.elu(conv2, name='conv2_out') 97 | 98 | conv3 = tf.layers.conv2d( 99 | inputs = conv2_out, 100 | filters = 64, 101 | kernel_size = [3, 3], 102 | strides = [2, 2], 103 | padding = 'VALID', 104 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 105 | name = 'conv3' 106 | ) 107 | conv3_out = tf.nn.elu(conv3, name='conv3_out') 108 | 109 | flatten = tf.contrib.layers.flatten(conv3_out) 110 | fc = tf.layers.dense( 111 | inputs = flatten, 112 | units = 512, 113 | activation = tf.nn.elu, 114 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 115 | name = 'fc' 116 | ) 117 | self.output = tf.layers.dense( 118 | inputs = fc, 119 | units = action_size, 120 | activation = None, 121 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 122 | name = 'output' 123 | ) 124 | 125 | self.q = tf.reduce_sum(tf.multiply(self.output, self.actions)) 126 | self.loss = tf.reduce_mean(tf.square(self.target_q - self.q)) 127 | self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 128 | 129 | 130 | class Memory(): 131 | def __init__(self, max_size): 132 | self.buffer = deque(maxlen=max_size) 133 | 134 | def add(self, experience): 135 | self.buffer.append(experience) 136 | 137 | def sample(self, batch_size): 138 | buffer_size = len(self.buffer) 139 | index = np.random.choice( 140 | np.arange(buffer_size), 141 | size = batch_size, 142 | replace = False 143 | ) 144 | return [self.buffer[i] for i in index] 145 | 146 | 147 | def train(): 148 | env, possible_actions = create_environment() 149 | 150 | # set hyperparameters 151 | ########################################### 152 | state_size = [*frame_size, stack_size] 153 | action_size = env.action_space.n 154 | learning_rate = 0.00025 155 | total_episodes = 100 156 | check_step = 5 157 | max_steps = 50000 158 | batch_size = 64 159 | explore_start = 1.0 160 | explore_stop = 0.01 161 | decay_rate = 0.00001 162 | gamma = 0.9 163 | pretrain_length = batch_size 164 | memory_size = 1000000 165 | 
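    # explore_start / explore_stop / decay_rate parameterize the epsilon-greedy schedule
    # used in the training loop below:
    #   explore_probability = explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)
    # so exploration decays smoothly from 1.0 towards 0.01 as decay_step grows.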
########################################### 166 | 167 | 168 | # pre-populate train samples 169 | ########################################### 170 | memory = Memory(max_size=memory_size) 171 | state = env.reset() 172 | state = stack_frames(state, True) 173 | 174 | for i in range(pretrain_length): 175 | choice = random.randint(0, action_size - 1) 176 | action = possible_actions[choice] 177 | new_state, reward, done, _ = env.step(action) 178 | new_state = stack_frames(new_state) 179 | 180 | if done: 181 | new_state = np.zeros(state.shape) 182 | memory.add((state, action, reward, new_state, done)) 183 | state = env.reset() 184 | state = stack_frames(state, True) 185 | else: 186 | memory.add((state, action, reward, new_state, done)) 187 | state = new_state 188 | ########################################### 189 | 190 | # train DQN 191 | ########################################### 192 | tf.reset_default_graph() 193 | DQN = DQNetwork(state_size, action_size, learning_rate) 194 | 195 | writer = tf.summary.FileWriter('train_log') 196 | tf.summary.scalar('Loss', DQN.loss) 197 | write_op = tf.summary.merge_all() 198 | saver = tf.train.Saver() 199 | 200 | with tf.Session() as sess: 201 | sess.run(tf.global_variables_initializer()) 202 | 203 | decay_step = 0 204 | loss = None 205 | 206 | for episode in range(1, total_episodes+1): 207 | step = 0 208 | episode_rewards = [] 209 | 210 | state = env.reset() 211 | state = stack_frames(state, True) 212 | 213 | while step < max_steps: 214 | step += 1 215 | decay_step += 1 216 | 217 | exp_exp_tradeoff = np.random.rand() 218 | explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) 219 | 220 | if explore_probability > exp_exp_tradeoff: 221 | choice = random.randint(0, action_size - 1) 222 | else: 223 | qs = sess.run(DQN.output, feed_dict={ 224 | DQN.inputs: state.reshape((1, *state.shape)) 225 | }) 226 | choice = np.argmax(qs) 227 | 228 | action = possible_actions[choice] 229 | new_state, reward, done, _ = env.step(action) 230 | 231 | env.render() 232 | episode_rewards.append(reward) 233 | 234 | if done: 235 | total_reward = np.sum(episode_rewards) 236 | 237 | new_state = np.zeros(frame_size) 238 | new_state = stack_frames(new_state) 239 | memory.add((state, action, reward, new_state, done)) 240 | 241 | print( 242 | '[*] Episode: {}, total reward: {}, explore p: {:.4f}, train loss: {:.4f}'.format( 243 | episode, total_reward, explore_probability, loss 244 | ) 245 | ) 246 | break 247 | else: 248 | new_state = stack_frames(new_state) 249 | memory.add((state, action, reward, new_state, done)) 250 | state = new_state 251 | 252 | # learning part 253 | ################ 254 | batch = memory.sample(batch_size) 255 | states_mb = np.array([b[0] for b in batch], ndmin=3) 256 | actions_mb = np.array([b[1] for b in batch]) 257 | rewards_mb = np.array([b[2] for b in batch]) 258 | new_states_mb = np.array([b[3] for b in batch], ndmin=3) 259 | dones_mb = np.array([b[4] for b in batch]) 260 | 261 | target_q_mb = [] 262 | new_state_q_mb = sess.run(DQN.output, feed_dict={ 263 | DQN.inputs: new_states_mb, 264 | }) 265 | 266 | for i in range(batch_size): 267 | is_done = dones_mb[i] 268 | if is_done: 269 | target_q_mb.append(rewards_mb[i]) 270 | else: 271 | t = rewards_mb[i] + gamma * np.max(new_state_q_mb) 272 | target_q_mb.append(t) 273 | 274 | target_q_mb = np.array(target_q_mb) 275 | 276 | loss, _ = sess.run([DQN.loss, DQN.optimizer], feed_dict={ 277 | DQN.inputs: states_mb, 278 | DQN.actions: actions_mb, 279 | DQN.target_q: target_q_mb 
280 | }) 281 | 282 | summary = sess.run(write_op, feed_dict={ 283 | DQN.inputs: states_mb, 284 | DQN.actions: actions_mb, 285 | DQN.target_q: target_q_mb 286 | }) 287 | writer.add_summary(summary, episode) 288 | writer.flush() 289 | ################ 290 | 291 | if episode % check_step == 0: 292 | save_path = saver.save(sess, './model/model.ckpt') 293 | print('[*] Model Saved:', save_path) 294 | 295 | print('[*] Train done') 296 | env.close() 297 | ########################################### 298 | 299 | 300 | def play(): 301 | env, possible_actions = create_environment() 302 | 303 | with tf.Session() as sess: 304 | total_rewards = 0 305 | 306 | state_size = [*frame_size, stack_size] 307 | action_size = env.action_space.n 308 | learning_rate = 0.00025 309 | DQN = DQNetwork(state_size, action_size, learning_rate) 310 | 311 | saver = tf.train.Saver() 312 | saver.restore(sess, './model/model.ckpt') 313 | 314 | # start game 315 | state = env.reset() 316 | state = stack_frames(state, True) 317 | done = False 318 | 319 | while not done: 320 | state_q = sess.run(DQN.output, feed_dict={ 321 | DQN.inputs: state.reshape((1, *state.shape)) 322 | }) 323 | choice = np.argmax(state_q) 324 | action = possible_actions[choice] 325 | new_state, reward, done, _ = env.step(action) 326 | 327 | env.render() 328 | total_rewards += reward 329 | state = stack_frames(new_state) 330 | 331 | print('[*] total score:', total_rewards) 332 | 333 | env.close() 334 | 335 | 336 | if __name__ == '__main__': 337 | if sys.argv[1] == '--train': 338 | train() 339 | elif sys.argv[1] == '--play': 340 | play() 341 | elif sys.argv[1] == '--test': 342 | test_environment() 343 | -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/Space Invaders (1983) (CCE) (C-820).bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/Space Invaders (1983) (CCE) (C-820).bin -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/model/model.ckpt.index -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/model/model.ckpt.meta -------------------------------------------------------------------------------- 
/DQN/Atari_Space_Invaders/train_log/events.out.tfevents.1530462157.MKK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/train_log/events.out.tfevents.1530462157.MKK -------------------------------------------------------------------------------- /DQN/Doom/DQN_Doom.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from vizdoom import DoomGame 4 | import random 5 | import time 6 | from skimage import transform 7 | from collections import deque 8 | 9 | import os 10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 11 | 12 | 13 | 14 | def create_environment(): 15 | game = DoomGame() 16 | game.load_config("basic.cfg") 17 | game.set_doom_scenario_path("basic.wad") 18 | game.init() 19 | 20 | left = [1, 0, 0] 21 | right = [0, 1, 0] 22 | shoot = [0, 0, 1] 23 | possible_actions = [left, right, shoot] 24 | return game, possible_actions 25 | 26 | def test_environment(): 27 | game, actions = create_environment() 28 | episodes = 1 29 | 30 | for _ in range(episodes): 31 | game.new_episode() 32 | 33 | while not game.is_episode_finished(): 34 | state = game.get_state() 35 | 36 | img = state.screen_buffer # 当前游戏画面, 2D array 37 | misc = state.game_variables # [50.] 38 | action = random.choice(actions) 39 | reward = game.make_action(action) 40 | print(action, 'reward:', reward) 41 | time.sleep(0.02) 42 | 43 | print('[*] Result:', game.get_total_reward()) 44 | time.sleep(2) 45 | 46 | game.close() 47 | 48 | 49 | def preprocess_frame(state): 50 | cropped_frame = state[30:-10, 30:-30] 51 | normalized_frame = cropped_frame / 255.0 52 | preprocessed_frame = transform.resize(normalized_frame, [84, 84]) 53 | return preprocessed_frame 54 | 55 | 56 | def stack_states(stacked_frames, state): 57 | frame = preprocess_frame(state) 58 | stacked_frames.append(frame) 59 | stacked_state = np.stack(stacked_frames, axis=2) 60 | return stacked_state 61 | 62 | 63 | class build_DQNetwork: 64 | def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'): 65 | self.state_size = state_size 66 | self.action_size = action_size 67 | self.learning_rate = learning_rate 68 | 69 | with tf.variable_scope(name): 70 | # 84x84x4 71 | self.inputs = tf.placeholder(tf.float32, [None, *state_size], name='inputs') 72 | self.actions = tf.placeholder(tf.float32, [None, action_size], name='actions') 73 | self.target_Q = tf.placeholder(tf.float32, [None], name='target') 74 | 75 | # 20x20x32 76 | self.conv1 = tf.layers.conv2d(inputs = self.inputs, 77 | filters = 32, 78 | kernel_size = [8, 8], 79 | strides = [4, 4], 80 | padding = 'VALID', 81 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 82 | name = 'conv1') 83 | self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, 84 | training = True, 85 | epsilon = 1e-5, 86 | name = 'batch_norm1') 87 | self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name='conv1_out') 88 | 89 | # 9x9x64 90 | self.conv2 = tf.layers.conv2d(inputs = self.conv1_out, 91 | filters = 64, 92 | kernel_size = [4, 4], 93 | strides = [2, 2], 94 | padding = 'VALID', 95 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 96 | name = 'conv2') 97 | self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, 98 | training = True, 99 | epsilon = 1e-5, 100 | name = 'batch_norm2') 101 | self.conv2_out = tf.nn.elu(self.conv2_batchnorm, 
name='conv2_out') 102 | 103 | # 3x3x128 104 | self.conv3 = tf.layers.conv2d(inputs = self.conv2_out, 105 | filters = 128, 106 | kernel_size = [4, 4], 107 | strides = [2, 2], 108 | padding = 'VALID', 109 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 110 | name = 'conv3') 111 | self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, 112 | training = True, 113 | epsilon = 1e-5, 114 | name = 'batch_norm3') 115 | self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name='conv3_out') 116 | 117 | # 1152 118 | self.flatten = tf.layers.flatten(self.conv3_out) 119 | # 512 120 | self.fc = tf.layers.dense(inputs = self.flatten, 121 | units = 512, 122 | activation = tf.nn.elu, 123 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 124 | name = 'fc1') 125 | # 3 126 | self.output = tf.layers.dense(inputs = self.fc, 127 | units = 3, 128 | activation = None, 129 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 130 | name = 'output') 131 | 132 | # Q is our predicted Q value 133 | self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions), axis=1) 134 | # # The loss is the difference between our predicted Q and the Q_target 135 | self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q)) 136 | self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss) 137 | 138 | 139 | class Memory(): 140 | def __init__(self, max_size): 141 | self.buffer = deque(maxlen=max_size) 142 | 143 | def add(self, experience): 144 | self.buffer.append(experience) 145 | 146 | def sample(self, batch_size): 147 | buffer_size = len(self.buffer) 148 | index = np.random.choice(np.arange(buffer_size), size=batch_size, replace=False) 149 | return [self.buffer[i] for i in index] 150 | 151 | 152 | def train(): 153 | game, possible_actions = create_environment() 154 | 155 | # Set Hyperparameters 156 | ##################### 157 | state_size = [84, 84, 4] 158 | action_size = game.get_available_buttons_size() 159 | learning_rate = 0.0002 160 | 161 | total_episodes = 5000 162 | max_steps = 100 163 | batch_size = 64 164 | 165 | explore_max = 1.0 166 | explore_min = 0.01 167 | decay_rate = 0.0001 168 | gamma = 0.99 169 | 170 | pretrain_length = batch_size 171 | memory_size = 50000 172 | stack_size = 4 173 | 174 | stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], 175 | maxlen=stack_size) 176 | memory = Memory(max_size=memory_size) 177 | ##################### 178 | 179 | 180 | # make pretrain samples 181 | ########################################### 182 | game.new_episode() 183 | 184 | for i in range(pretrain_length): 185 | if i == 0: 186 | state = game.get_state().screen_buffer 187 | state = stack_states(stacked_frames, state) 188 | 189 | action = random.choice(possible_actions) 190 | reward = game.make_action(action) 191 | done = game.is_episode_finished() 192 | 193 | if done: 194 | next_state = np.zeros(state.shape) 195 | memory.add((state, action, reward, next_state, done)) 196 | game.new_episode() 197 | else: 198 | next_state = game.get_state().screen_buffer 199 | next_state = stack_states(stacked_frames, next_state) 200 | memory.add((state, action, reward, next_state, done)) 201 | 202 | state = next_state 203 | ########################################### 204 | 205 | 206 | # train deep Q neural network 207 | ########################################### 208 | tf.reset_default_graph() 209 | DQNetwork = build_DQNetwork(state_size, action_size, learning_rate) 210 | 211 | writer = tf.summary.FileWriter('train_log') 212 | 
tf.summary.scalar('loss', DQNetwork.loss) 213 | saver = tf.train.Saver() 214 | 215 | rewards_list = [] 216 | decay_step = 0 217 | game.init() 218 | 219 | with tf.Session() as sess: 220 | sess.run(tf.global_variables_initializer()) 221 | 222 | for episode in range(total_episodes): 223 | game.new_episode() 224 | 225 | step = 0 226 | frame = game.get_state().screen_buffer 227 | state = stack_states(stacked_frames, frame) 228 | 229 | while step < max_steps: 230 | step += 1 231 | decay_step += 1 232 | 233 | exp_exp_tradeoff = np.random.rand() 234 | explore_probability = explore_min + (explore_max - explore_min) * np.exp(-decay_rate * decay_step) 235 | 236 | if explore_probability > exp_exp_tradeoff: 237 | action = random.choice(possible_actions) 238 | else: 239 | Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs: state.reshape(1, *state.shape)}) 240 | action = possible_actions[int(np.argmax(Qs))] 241 | 242 | reward = game.make_action(action) 243 | done = game.is_episode_finished() 244 | 245 | if done: 246 | next_state = np.zeros((84, 84), dtype=np.int) 247 | next_state = stack_states(stacked_frames, next_state) 248 | total_reward = game.get_total_reward() 249 | formated_str = 'Episode: {}, Total reward: {}, Training loss: {:.4f}, Explore P: {:.4f}' 250 | print(formated_str.format(episode, total_reward, loss, explore_probability)) 251 | 252 | rewards_list.append((episode, total_reward)) 253 | memory.add((state, action, reward, next_state, done)) 254 | step = max_steps 255 | else: 256 | next_state = game.get_state().screen_buffer 257 | next_state = stack_states(stacked_frames, next_state) 258 | memory.add((state, action, reward, next_state, done)) 259 | state = next_state 260 | 261 | # train DQNetwork == update Qtable 262 | batch = memory.sample(batch_size) 263 | states = np.array([each[0] for each in batch], ndmin=3) 264 | actions = np.array([each[1] for each in batch]) 265 | rewards = np.array([each[2] for each in batch]) 266 | next_states = np.array([each[3] for each in batch]) 267 | dones = np.array([each[4] for each in batch]) 268 | 269 | target_Qs_batch = [] 270 | target_Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs: next_states}) 271 | 272 | for i in range(batch_size): 273 | terminal = dones[i] 274 | 275 | if terminal: 276 | target_Qs_batch.append(rewards[i]) 277 | else: 278 | target = rewards[i] + gamma * np.max(target_Qs[i]) 279 | target_Qs_batch.append(target) 280 | 281 | targets = np.array([each for each in target_Qs_batch]) 282 | 283 | loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer], 284 | feed_dict={DQNetwork.inputs: states, 285 | DQNetwork.target_Q: targets, 286 | DQNetwork.actions: actions}) 287 | 288 | # Write TF Summaries 289 | summary = sess.run(tf.summary.merge_all(), 290 | feed_dict={DQNetwork.inputs: states, 291 | DQNetwork.target_Q: targets, 292 | DQNetwork.actions: actions}) 293 | writer.add_summary(summary, episode) 294 | writer.flush() 295 | 296 | if episode % 5 == 0: 297 | save_path = saver.save(sess, './model/model.ckpt') 298 | print('[*] Model Saved:', save_path) 299 | print('Train done') 300 | ########################################### 301 | 302 | 303 | def play(): 304 | with tf.Session() as sess: 305 | state_size = [84, 84, 4] 306 | action_size = 3 307 | learning_rate = 0.0002 308 | DQNetwork = build_DQNetwork(state_size, action_size, learning_rate) 309 | 310 | saver = tf.train.Saver() 311 | saver.restore(sess, "./model/model.ckpt") 312 | 313 | game, possible_actions = create_environment() 314 | totalScore = 0 315 | episodes = 10 
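        # Evaluation: the greedy action (argmax over the network's Q-values) is taken at
        # every step for `episodes` games, and the average total reward is printed at the end.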
316 | stack_size = 4 317 | stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], 318 | maxlen=stack_size) 319 | 320 | for i in range(episodes): 321 | game.new_episode() 322 | 323 | while not game.is_episode_finished(): 324 | frame = game.get_state().screen_buffer 325 | state = stack_states(stacked_frames, frame) 326 | 327 | Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs: state.reshape((1, *state.shape))}) 328 | action = possible_actions[int(np.argmax(Qs))] 329 | game.make_action(action) 330 | 331 | score = game.get_total_reward() 332 | print("Episode {} Score: {}".format(i, score)) 333 | totalScore += score 334 | 335 | print("[*] Average Score: ", totalScore / episodes) 336 | game.close() 337 | 338 | 339 | if __name__ == '__main__': 340 | import sys 341 | if sys.argv[1] == '--train': 342 | train() 343 | elif sys.argv[1] == '--play': 344 | play() 345 | 346 | -------------------------------------------------------------------------------- /DQN/Doom/basic.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = basic.wad 6 | doom_map = map01 7 | 8 | # Rewards 9 | living_reward = -1 10 | 11 | # Rendering options 12 | screen_resolution = RES_160X120 13 | screen_format = GRAY8 14 | render_hud = True 15 | render_crosshair = false 16 | render_weapon = true 17 | render_decals = true 18 | render_particles = true 19 | window_visible = true 20 | 21 | # make episodes start after 20 tics (after unholstering the gun) 22 | episode_start_time = 14 23 | 24 | # make episodes finish after 300 actions (tics) 25 | episode_timeout = 300 26 | 27 | # Available buttons 28 | available_buttons = 29 | { 30 | MOVE_LEFT 31 | MOVE_RIGHT 32 | ATTACK 33 | } 34 | 35 | # Game variables that will be in the state 36 | available_game_variables = { AMMO2} 37 | 38 | mode = PLAYER 39 | doom_skill = 5 40 | -------------------------------------------------------------------------------- /DQN/Doom/basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/basic.wad -------------------------------------------------------------------------------- /DQN/Doom/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /DQN/Doom/model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /DQN/Doom/model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/model/model.ckpt.index -------------------------------------------------------------------------------- /DQN/Doom/model/model.ckpt.meta: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/model/model.ckpt.meta -------------------------------------------------------------------------------- /DQN/Doom/train_log/events.out.tfevents.1524621481.MKK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/train_log/events.out.tfevents.1524621481.MKK -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MCTS/MCTS_Gomoku.py: -------------------------------------------------------------------------------- 1 | from math import * 2 | import random 3 | import numpy as np 4 | 5 | class GameState: 6 | def __init__(self): 7 | self.player_just_moved = 2 8 | 9 | def clone(self): 10 | st = GameState() 11 | st.player_just_moved = self.player_just_moved 12 | return st 13 | 14 | def move(self, action): 15 | self.player_just_moved = 3 - self.player_just_moved 16 | 17 | def actions(self): 18 | """ Get all possible moves from this state. 19 | """ 20 | 21 | def win(self, player): 22 | """ Get the game result from the viewpoint of player. 23 | """ 24 | 25 | def end(self): 26 | """ Whether the game is end or not 27 | """ 28 | 29 | def __repr__(self): 30 | pass 31 | 32 | class Gomoku(GameState): 33 | def __init__(self, w=8): # 15x15 34 | self.player_just_moved = 2 35 | self.board = [] # 0 = empty, 1 = player 1 (X), 2 = player 2 (O) 36 | self.w = w 37 | for y in range(w): 38 | self.board.append([0] * w) 39 | 40 | def clone(self): 41 | st = Gomoku() 42 | st.player_just_moved = self.player_just_moved 43 | st.board = [self.board[i][:] for i in range(self.w)] 44 | st.w = self.w 45 | return st 46 | 47 | def move(self, action): 48 | a, b = action 49 | assert 0 <= a <= self.w and 0 <= b <= self.w and self.board[a][b] == 0 50 | self.player_just_moved = 3 - self.player_just_moved 51 | self.board[a][b] = self.player_just_moved 52 | 53 | def actions(self): 54 | return [(i, j) for i in range(self.w) for j in range(self.w) if self.board[i][j] == 0] 55 | 56 | def check_five(self, i, j, player): 57 | if 2 <= i < self.w-2 and 2 <= j < self.w-2 and self.board[i-2][j-2] == self.board[i-1][j-1] == self.board[i][j] == self.board[i+1][j+1] == self.board[i+2][j+2] == player: 58 | return 1 59 | elif 2 <= j < self.w-2 and self.board[i][j-2] == self.board[i][j-1] == self.board[i][j] == self.board[i][j+1] == self.board[i][j+2] == player: 60 | return 1 61 | elif 2 <= i < self.w-2 and 2 <= j < self.w-2 and self.board[i+2][j-2] == self.board[i+1][j-1] == self.board[i][j] == self.board[i-1][j+1] == self.board[i-2][j+2] == player: 62 | return 1 63 | elif 2 <= i < self.w-2 and self.board[i-2][j] == self.board[i-1][j] == self.board[i][j] == self.board[i+1][j] == self.board[i+2][j] == player: 64 | return 1 65 | return 0 66 | 67 | def win(self, player): 68 | for i in range(self.w): 69 | for j in range(self.w): 70 | if self.check_five(i, j, player): 71 | return 1 72 | elif self.check_five(i, j, 3-player): 73 | return 0 74 | if self.actions() == []: return 0.5 75 | return -1 76 | 77 | def end(self): 78 | return self.win(1) >= 0 79 | 80 | def __repr__(self): 81 | row = '{:>2} ' + ' | '.join(['{}'] * self.w) + ' ' 82 | line = '\n ' + ('----' * self.w)[:-1] + '\n' 83 | s = ' ' + '%2d ' * self.w % tuple(range(self.w)) + '\n' 84 | s += line.join([row.format(i, *map(lambda j: [' ', 'X', 'O'][j], self.board[i])) for i in range(self.w)]) 85 | return s 86 | 87 | class Node: 88 | def __init__(self, action=None, parent=None, state=None): 89 | self.action = action 90 | self.parent = parent 91 | self.childs = [] 92 | self.W = 0 93 | self.N = 0 94 | self.untried_actions = state.actions() 95 | self.player_just_moved = state.player_just_moved 96 | 97 | def select(self): 98 | s = sorted(self.childs, key = lambda c: c.U())[-1] 99 | return s 100 | 101 | def add_child(self, a, s): 102 | n = Node(a, self, s) 103 | self.untried_actions.remove(a) 104 | self.childs.append(n) 105 | 
return n 106 | 107 | def update(self, result): 108 | self.N += 1 109 | self.W += result 110 | 111 | def U(self): 112 | if self.parent: 113 | return self.W / self.N + sqrt(2 * log(self.parent.N) / self.N) 114 | return 0 115 | 116 | def __repr__(self): 117 | return "[A: %s, U: %.2f, W/N: %.1f/%d, Untried: %s]" \ 118 | % (self.action, self.U(), self.W, self.N, self.untried_actions) 119 | 120 | def show_node_tree(self, indent=0): 121 | print("| " * indent + str(self)) 122 | 123 | for c in self.childs: 124 | c.show_node_tree(indent+1) 125 | 126 | def show_children_nodes(self): 127 | print('\n[*] Child Nodes') 128 | for c in self.childs: print(c) 129 | 130 | 131 | def UCT(rootstate, itermax, verbose=False): 132 | rootnode = Node(state=rootstate) 133 | 134 | for i in range(itermax): 135 | node = rootnode 136 | state = rootstate.clone() 137 | 138 | # Select 139 | while node.untried_actions == [] and node.childs != []: 140 | node = node.select() 141 | state.move(node.action) 142 | 143 | # Expand 144 | if node.untried_actions != []: 145 | action = random.choice(node.untried_actions) 146 | state.move(action) 147 | node = node.add_child(action, state) 148 | 149 | # Rollout 150 | while state.actions() != []: 151 | state.move(random.choice(state.actions())) 152 | 153 | # Backpropagate 154 | while node != None: 155 | node.update(state.win(node.player_just_moved)) 156 | node = node.parent 157 | 158 | if verbose: rootnode.show_node_tree() 159 | else: rootnode.show_children_nodes() 160 | 161 | return sorted(rootnode.childs, key = lambda c: c.N)[-1].action 162 | 163 | def random_play(game): 164 | return random.choice(game.actions()) 165 | 166 | def human_play(): 167 | t = input('[*] Your turn (i j): ') 168 | a, b = t.split(' ') 169 | i, j = int(a), int(b) 170 | return (i, j) 171 | 172 | def play_game(): 173 | game = Gomoku() 174 | 175 | while not game.end(): 176 | print(game) 177 | 178 | if game.player_just_moved == 1: 179 | # action = UCT(game, 1000) # Player O 180 | action = random_play(game) 181 | else: 182 | action = UCT(game, 10000) # Player X 183 | # action = human_play() 184 | 185 | game.move(action) 186 | print("[*] Player %s move: %s\n" % (['X', 'O'][game.player_just_moved-1], action)) 187 | 188 | print(game) 189 | r = game.win(game.player_just_moved) 190 | if r == 1: 191 | print("[*] Player %s win" % ['X', 'O'][game.player_just_moved-1]) 192 | elif r == 0: 193 | print("[*] Player %s win" % ['X', 'O'][2-game.player_just_moved]) 194 | else: 195 | print("[*] Player draw") 196 | 197 | if __name__ == "__main__": 198 | play_game() 199 | -------------------------------------------------------------------------------- /MCTS/MCTS_TicTacToe.py: -------------------------------------------------------------------------------- 1 | from math import * 2 | import random 3 | 4 | class Game: 5 | def __init__(self): 6 | self.player_just_moved = 2 7 | 8 | def clone(self): 9 | st = GameState() 10 | st.player_just_moved = self.player_just_moved 11 | return st 12 | 13 | def move(self, action): 14 | self.player_just_moved = 3 - self.player_just_moved 15 | 16 | def actions(self): 17 | """ Get all possible moves from this state. 18 | """ 19 | 20 | def win(self, player): 21 | """ Get the game result from the viewpoint of player. 
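            Returns 1 if player has won, 0 if the opponent has won, and 0.5 for a draw;
            while the game is still undecided the TicTacToe implementation below falls
            through and returns None.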
22 | """ 23 | 24 | def end(self): 25 | """ Whether the game is end or not 26 | """ 27 | 28 | def __repr__(self): 29 | pass 30 | 31 | class TicTacToe(Game): 32 | def __init__(self): 33 | self.player_just_moved = 2 34 | self.board = [0] * 9 # 0 = empty, 1 = player 1 (X), 2 = player 2 (O) 35 | 36 | def clone(self): 37 | st = TicTacToe() 38 | st.player_just_moved = self.player_just_moved 39 | st.board = self.board[:] 40 | return st 41 | 42 | def move(self, action): 43 | assert action >= 0 and action <= 8 and action == int(action) and self.board[action] == 0 44 | self.player_just_moved = 3 - self.player_just_moved 45 | self.board[action] = self.player_just_moved 46 | 47 | def actions(self): 48 | return [i for i in range(9) if self.board[i] == 0] 49 | 50 | def win(self, player): 51 | for (x,y,z) in [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]: 52 | if self.board[x] == self.board[y] == self.board[z]: 53 | if self.board[x] == player: 54 | return 1 55 | else: 56 | return 0 57 | if self.actions() == []: return 0.5 # draw 58 | 59 | def end(self): 60 | return self.actions() == [] or self.win(1) == 1 or self.win(2) == 1 61 | 62 | def __repr__(self): 63 | line = '\n-----------\n' 64 | row = " {} | {} | {}" 65 | s = (row + line + row + line + row).format(*map(lambda i: [' ', 'X', 'O'][i], self.board)) 66 | return s 67 | 68 | class Node: 69 | def __init__(self, action=None, parent=None, state=None): 70 | self.action = action 71 | self.parent = parent 72 | self.childs = [] 73 | self.W = 0 74 | self.N = 0 75 | self.untried_actions = state.actions() 76 | self.player_just_moved = state.player_just_moved 77 | 78 | def select(self): 79 | s = sorted(self.childs, key = lambda c: c.U())[-1] 80 | return s 81 | 82 | def add_child(self, a, s): 83 | n = Node(a, self, s) 84 | self.untried_actions.remove(a) 85 | self.childs.append(n) 86 | return n 87 | 88 | def update(self, result): 89 | self.N += 1 90 | self.W += result 91 | 92 | def U(self): 93 | if self.parent: 94 | return self.W / self.N + sqrt(2 * log(self.parent.N) / self.N) 95 | return 0 96 | 97 | def __repr__(self): 98 | return "[A: %s, U: %.2f, W/N: %.1f/%d, Untried: %s]" \ 99 | % (self.action, self.U(), self.W, self.N, self.untried_actions) 100 | 101 | def show_node_tree(self, indent=0): 102 | print("| " * indent + str(self)) 103 | 104 | for c in self.childs: 105 | c.show_node_tree(indent+1) 106 | 107 | def show_children_nodes(self): 108 | print('\n[*] Child Nodes') 109 | for c in self.childs: print(c) 110 | 111 | 112 | def UCT(rootstate, itermax, verbose=False): 113 | rootnode = Node(state=rootstate) 114 | 115 | for i in range(itermax): 116 | node = rootnode 117 | state = rootstate.clone() 118 | 119 | # Select 120 | while node.untried_actions == [] and node.childs != []: 121 | node = node.select() 122 | state.move(node.action) 123 | 124 | # Expand 125 | if node.untried_actions != []: 126 | action = random.choice(node.untried_actions) 127 | state.move(action) 128 | node = node.add_child(action, state) 129 | 130 | # Rollout 131 | while state.actions() != []: 132 | state.move(random.choice(state.actions())) 133 | 134 | # Backpropagate 135 | while node != None: 136 | node.update(state.win(node.player_just_moved)) 137 | node = node.parent 138 | 139 | if verbose: rootnode.show_node_tree() 140 | else: rootnode.show_children_nodes() 141 | 142 | return sorted(rootnode.childs, key = lambda c: c.N)[-1].action 143 | 144 | def play_game(): 145 | game = TicTacToe() 146 | 147 | while not game.end(): 148 | print(game) 149 | 150 | if 
game.player_just_moved == 1: 151 | action = UCT(game, 1000) # Player O 152 | else: 153 | action = UCT(game, 100) # Player X 154 | 155 | game.move(action) 156 | print("[*] Player %s move: %d\n" % (['X', 'O'][game.player_just_moved-1], action)) 157 | 158 | print(game) 159 | r = game.win(game.player_just_moved) 160 | if r == 1: 161 | print("[*] Player %s win" % ['X', 'O'][game.player_just_moved-1]) 162 | elif r == 0: 163 | print("[*] Player %s win" % ['X', 'O'][2-game.player_just_moved]) 164 | else: 165 | print("[*] Player draw") 166 | 167 | if __name__ == "__main__": 168 | play_game() 169 | -------------------------------------------------------------------------------- /PG/Cartpole_pytorch/PG_CartPole-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_pytorch/PG_CartPole-v0.pth -------------------------------------------------------------------------------- /PG/Cartpole_pytorch/PG_CartPole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions import Categorical 4 | import numpy as np 5 | import gym 6 | from gym.spaces import Discrete, Box 7 | import argparse 8 | import random 9 | 10 | seed = 1 11 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 12 | torch.manual_seed(seed) 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | if torch.cuda.is_available(): 16 | torch.cuda.manual_seed_all(seed) 17 | DEBUG = False 18 | else: 19 | DEBUG = True 20 | 21 | def weight_init(m): 22 | ''' 23 | Code from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 24 | Usage: 25 | model = Model() 26 | model.apply(weight_init) 27 | ''' 28 | if isinstance(m, nn.Linear): 29 | nn.init.xavier_normal_(m.weight.data) 30 | nn.init.normal_(m.bias.data) 31 | 32 | def reward_to_go(rews): 33 | n = len(rews) 34 | rtgs = np.zeros_like(rews) 35 | for i in reversed(range(n)): 36 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 37 | return rtgs 38 | 39 | class MLP(nn.Module): 40 | def __init__(self, sizes, activation=nn.Tanh, output_activation=None): 41 | super().__init__() 42 | 43 | net = [] 44 | for i in range(len(sizes)-1): 45 | net.append(nn.Linear(sizes[i], sizes[i+1])) 46 | if i == len(sizes) - 2: 47 | if output_activation is not None: 48 | net.append(output_activation()) 49 | else: 50 | net.append(activation()) 51 | 52 | self.mlp = nn.Sequential( 53 | *net, 54 | nn.Softmax(dim=-1) 55 | ) 56 | 57 | def forward(self, x): 58 | return self.mlp(x) 59 | 60 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 61 | epochs=50, batch_size=5000, render=False): 62 | 63 | # make environment, check spaces, get obs / act dims 64 | env = gym.make(env_name) 65 | assert isinstance(env.observation_space, Box), \ 66 | "This example only works for envs with continuous state spaces." 67 | assert isinstance(env.action_space, Discrete), \ 68 | "This example only works for envs with discrete action spaces." 69 | 70 | obs_dim = env.observation_space.shape[0] 71 | n_acts = env.action_space.n 72 | 73 | policy = MLP(sizes=[obs_dim]+hidden_sizes+[n_acts]) 74 | policy.apply(weight_init) 75 | optimizer = torch.optim.Adam(policy.parameters(), lr=lr) 76 | 77 | # for training policy 78 | def train_one_epoch(): 79 | # make some empty lists for logging. 
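        # (Reward-to-go policy gradient: every logged log pi(a_t|s_t) is later weighted by
        # the return from step t onward, computed by reward_to_go above, and the update
        # minimizes loss = -mean_t[ log pi(a_t|s_t) * sum_{t'>=t} r_{t'} ].)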
80 | batch_obs = [] # for observations 81 | batch_acts = [] # for actions 82 | batch_weights = [] # for reward-to-go weighting in policy gradient 83 | batch_rets = [] # for measuring episode returns 84 | batch_lens = [] # for measuring episode lengths 85 | 86 | # reset episode-specific variables 87 | obs = env.reset() # first obs comes from starting distribution 88 | done = False # signal from environment that episode is over 89 | ep_rews = [] # list for rewards accrued throughout ep 90 | 91 | # render first episode of each epoch 92 | finished_rendering_this_epoch = False 93 | 94 | # collect experience by acting in the environment with current policy 95 | policy.eval() 96 | while True: 97 | # rendering 98 | if (not finished_rendering_this_epoch) and render: 99 | env.render() 100 | 101 | # save obs 102 | batch_obs.append(obs.copy()) 103 | 104 | # act in the environment 105 | with torch.no_grad(): 106 | act_probs = policy(torch.tensor(obs, dtype=torch.float)) 107 | dist = Categorical(act_probs) 108 | act = dist.sample().item() 109 | 110 | obs, rew, done, _ = env.step(act) 111 | 112 | # save action, reward 113 | batch_acts.append(act) 114 | ep_rews.append(rew) 115 | 116 | if done: 117 | # if episode is over, record info about episode 118 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 119 | batch_rets.append(ep_ret) 120 | batch_lens.append(ep_len) 121 | 122 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 123 | batch_weights += list(reward_to_go(ep_rews)) 124 | 125 | # reset episode-specific variables 126 | obs, done, ep_rews = env.reset(), False, [] 127 | 128 | # won't render again this epoch 129 | finished_rendering_this_epoch = True 130 | 131 | # end experience loop if we have enough of it 132 | if len(batch_obs) > batch_size: 133 | break 134 | 135 | # take a single policy gradient update step 136 | policy.train() 137 | batch_obs = torch.tensor(batch_obs, dtype=torch.float) 138 | batch_acts = torch.tensor(batch_acts) 139 | batch_weights = torch.tensor(batch_weights) 140 | 141 | batch_act_probs = policy(batch_obs) 142 | dist = Categorical(batch_act_probs) 143 | log_probs = dist.log_prob(batch_acts) 144 | loss = (- log_probs * batch_weights).mean() 145 | 146 | optimizer.zero_grad() 147 | loss.backward() 148 | optimizer.step() 149 | 150 | return loss, batch_rets, batch_lens 151 | 152 | # training loop 153 | max_avg_ret = 0 154 | for i in range(epochs): 155 | batch_loss, batch_rets, batch_lens = train_one_epoch() 156 | print(f'epoch: {i:2d} loss: {batch_loss:.3f} episode average rewards: {np.mean(batch_rets):.3f} episode average len: {np.mean(batch_lens):.3f}') 157 | 158 | if np.mean(batch_rets) > max_avg_ret: 159 | max_avg_ret = np.mean(batch_rets) 160 | torch.save(policy.state_dict(), 'PG_{}.pth'.format(env_name)) 161 | 162 | env.close() 163 | 164 | 165 | if __name__ == '__main__': 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 168 | parser.add_argument('--render', action='store_true') 169 | parser.add_argument('--lr', type=float, default=1e-2) 170 | parser.add_argument('--epochs', type=int, default=50) 171 | args = parser.parse_args() 172 | print('\nUsing reward-to-go formulation of policy gradient.\n') 173 | train(env_name=args.env_name, render=args.render, lr=args.lr, epochs=args.epochs) -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/PG_Cartpole.py: -------------------------------------------------------------------------------- 1 | import 
tensorflow as tf 2 | import numpy as np 3 | import gym 4 | import sys 5 | import time 6 | 7 | 8 | def create_environment(): 9 | env = gym.make('CartPole-v0') 10 | env = env.unwrapped 11 | env.seed(1) 12 | 13 | state = env.reset() 14 | state_size = len(state) 15 | action_size = env.action_space.n 16 | 17 | return env, state_size, action_size 18 | 19 | def test_environment(): 20 | env, _, _ = create_environment() 21 | episodes = 1 22 | 23 | for _ in range(episodes): 24 | print(env.reset()) 25 | env.render() 26 | total_rewards = 0 27 | done = False 28 | 29 | while not done: 30 | action = env.action_space.sample() 31 | state, reward, done, info = env.step(action) 32 | env.render() 33 | 34 | total_rewards += reward 35 | print('action:', action, 'reward:', reward) 36 | time.sleep(0.5) 37 | 38 | print('[*] Total Reward:',total_rewards) 39 | 40 | def discount_and_normalize_rewards(episode_rewards, gamma): 41 | discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float32) 42 | cumulative = 0.0 43 | for i in reversed(range(len(episode_rewards))): 44 | cumulative = cumulative * gamma + episode_rewards[i] 45 | discounted_episode_rewards[i] = cumulative 46 | 47 | mean = np.mean(discounted_episode_rewards) 48 | std = np.std(discounted_episode_rewards) 49 | discounted_episode_rewards = (discounted_episode_rewards - mean) / std 50 | 51 | return discounted_episode_rewards 52 | 53 | 54 | class PGNetwork(): 55 | 56 | def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): 57 | self.state_size = state_size 58 | self.action_size = action_size 59 | self.learning_rate = learning_rate 60 | 61 | with tf.name_scope(name): 62 | self.input_state = tf.placeholder(tf.float32, [None, state_size], name='input_state') 63 | self.input_action = tf.placeholder(tf.int32, [None, action_size], name='input_action') 64 | self.input_rewards = tf.placeholder(tf.float32, [None, ], name='input_rewards') 65 | self.input_mean_reward = tf.placeholder(tf.float32, name='input_mean_reward') 66 | 67 | fc1 = tf.contrib.layers.fully_connected( 68 | inputs = self.input_state, 69 | num_outputs = 10, 70 | activation_fn = tf.nn.relu, 71 | weights_initializer = tf.contrib.layers.xavier_initializer()) 72 | fc2 = tf.contrib.layers.fully_connected( 73 | inputs = fc1, 74 | num_outputs = action_size, 75 | activation_fn = tf.nn.relu, 76 | weights_initializer = tf.contrib.layers.xavier_initializer()) 77 | fc3 = tf.contrib.layers.fully_connected( 78 | inputs = fc2, 79 | num_outputs = action_size, 80 | activation_fn = None, 81 | weights_initializer = tf.contrib.layers.xavier_initializer()) 82 | 83 | self.output_action = tf.nn.softmax(fc3) 84 | neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc3, labels=self.input_action) 85 | self.loss = tf.reduce_mean(neg_log_prob * self.input_rewards) 86 | self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 87 | 88 | 89 | 90 | def train(): 91 | env, state_size, action_size = create_environment() 92 | # Hyperparameters 93 | max_episodes = 10000 94 | learning_rate = 0.01 95 | gamma = 0.95 96 | 97 | tf.reset_default_graph() 98 | PG = PGNetwork(state_size, action_size, learning_rate) 99 | 100 | writer = tf.summary.FileWriter('PG_Cartpole_log') 101 | tf.summary.scalar('Loss', PG.loss) 102 | tf.summary.scalar('Reward mean', PG.input_mean_reward) 103 | write_op = tf.summary.merge_all() 104 | saver = tf.train.Saver() 105 | 106 | 107 | all_rewards = [] 108 | total_rewards = 0 109 | maximum_reward_recorded = 0 110 | episode_states, episode_actions, 
episode_rewards = [], [], [] 111 | 112 | with tf.Session() as sess: 113 | sess.run(tf.global_variables_initializer()) 114 | 115 | for episode in range(max_episodes): 116 | episode_rewards_sum = 0 117 | state = env.reset() 118 | env.render() 119 | done = False 120 | 121 | while not done: 122 | output_action = sess.run(PG.output_action, feed_dict={PG.input_state: state.reshape([1, 4])}) 123 | action = np.random.choice(range(action_size), p=output_action.ravel()) 124 | 125 | new_state, reward, done, info = env.step(action) 126 | env.render() 127 | 128 | episode_states.append(state) 129 | a = np.zeros(action_size) 130 | a[action] = 1 131 | episode_actions.append(a) 132 | episode_rewards.append(reward) 133 | 134 | state = new_state 135 | 136 | episode_rewards_sum = np.sum(episode_rewards) 137 | all_rewards.append(episode_rewards_sum) 138 | total_rewards = np.sum(all_rewards) 139 | mean_reward = np.divide(total_rewards, episode + 1) 140 | maximum_reward_recorded = np.amax(all_rewards) 141 | 142 | print('='*20) 143 | print('Episode:', episode) 144 | print('Reward:', episode_rewards_sum) 145 | print('Mean Reward:', mean_reward) 146 | print('Max reward so far:', maximum_reward_recorded) 147 | 148 | episode_rewards = discount_and_normalize_rewards(episode_rewards, gamma) 149 | loss, _ = sess.run([PG.loss, PG.train], feed_dict={ 150 | PG.input_state: np.vstack(np.array(episode_states)), 151 | PG.input_action: np.vstack(np.array(episode_actions)), 152 | PG.input_rewards: episode_rewards 153 | }) 154 | 155 | summary = sess.run(write_op, feed_dict={ 156 | PG.input_state: np.vstack(np.array(episode_states)), 157 | PG.input_action: np.vstack(np.array(episode_actions)), 158 | PG.input_rewards: episode_rewards, 159 | PG.input_mean_reward: mean_reward 160 | }) 161 | 162 | writer.add_summary(summary, episode) 163 | writer.flush() 164 | episode_states, episode_actions, episode_rewards = [], [], [] 165 | 166 | if episode % 5 == 0: 167 | save_path = saver.save(sess, './model/model.ckpt') 168 | print('[*] Model Saved:', save_path) 169 | 170 | print('Train done') 171 | 172 | def play(): 173 | env, state_size, action_size = create_environment() 174 | learning_rate = 0.01 175 | 176 | with tf.Session() as sess: 177 | PG = PGNetwork(state_size, action_size, learning_rate) 178 | saver = tf.train.Saver() 179 | saver.restore(sess, "./model/model.ckpt") 180 | 181 | state = env.reset() 182 | env.render() 183 | done = False 184 | episode_rewards = [] 185 | while not done: 186 | output_action = sess.run(PG.output_action, feed_dict={PG.input_state: state.reshape([1, 4])}) 187 | action = np.random.choice(range(action_size), p=output_action.ravel()) 188 | 189 | state, reward, done, info = env.step(action) 190 | env.render() 191 | episode_rewards.append(reward) 192 | 193 | episode_rewards_sum = np.sum(episode_rewards) 194 | print('Episode Rewards:', episode_rewards_sum) 195 | 196 | if __name__ == '__main__': 197 | if sys.argv[1] == '--train': 198 | train() 199 | elif sys.argv[1] == '--play': 200 | play() 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- 
/PG/Cartpole_tensorflow/model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_tensorflow/model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_tensorflow/model/model.ckpt.index -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_tensorflow/model/model.ckpt.meta -------------------------------------------------------------------------------- /PG/Doom-Deathmatch/PG_Doom_Deathmatch.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from vizdoom import DoomGame 4 | import random 5 | import time 6 | from skimage import transform 7 | from collections import deque 8 | import sys 9 | 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | import os 14 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 15 | 16 | 17 | ########################################### 18 | # Constant 19 | stack_size = 4 20 | frame_size = (100, 160) 21 | # Global variables 22 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 23 | ########################################### 24 | 25 | 26 | def create_environment(): 27 | game = DoomGame() 28 | game.load_config('defend_the_center.cfg') 29 | game.set_doom_scenario_path('defend_the_center.wad') 30 | 31 | game.init() 32 | possible_actions = np.identity(3, dtype=int).tolist() 33 | return game, possible_actions 34 | 35 | def test_environment(): 36 | game, possible_actions = create_environment() 37 | episodes = 1 38 | 39 | for _ in range(episodes): 40 | game.new_episode() 41 | 42 | while not game.is_episode_finished(): 43 | state = game.get_state() 44 | 45 | img = state.screen_buffer # the current game frame, 2D array 46 | misc = state.game_variables # [50.]
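            # screen_buffer is the raw grayscale frame (screen_format GRAY8,
            # RES_320X240 in defend_the_center.cfg); game_variables holds the
            # scenario's available_game_variables (AMMO2 and HEALTH in that cfg)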
47 | action = random.choice(possible_actions) 48 | reward = game.make_action(action) 49 | print(action, 'reward:', reward) 50 | time.sleep(0.02) 51 | 52 | print('[*] Result:', game.get_total_reward()) 53 | time.sleep(2) 54 | 55 | game.close() 56 | 57 | def preprocess_frame(frame): 58 | cropped_frame = frame[40:, :] 59 | normalized_frame = cropped_frame / 255.0 60 | preprocessed_frame = transform.resize(normalized_frame, frame_size) 61 | return preprocessed_frame 62 | 63 | def stack_frames(state, is_new_episode=False): 64 | global stacked_frames 65 | frame = preprocess_frame(state) 66 | 67 | if is_new_episode: 68 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 69 | 70 | for _ in range(stack_size): 71 | stacked_frames.append(frame) 72 | else: 73 | stacked_frames.append(frame) 74 | 75 | return np.stack(stacked_frames, axis=2) 76 | 77 | def discount_and_normalize_rewards(episode_rewards, gamma): 78 | discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float32) 79 | cumulative = 0.0 80 | for i in reversed(range(len(episode_rewards))): 81 | cumulative = cumulative * gamma + episode_rewards[i] 82 | discounted_episode_rewards[i] = cumulative 83 | 84 | mean = np.mean(discounted_episode_rewards) 85 | std = np.std(discounted_episode_rewards) 86 | discounted_episode_rewards = (discounted_episode_rewards - mean) / std 87 | 88 | return discounted_episode_rewards 89 | 90 | 91 | class PGNetwork: 92 | def __init__(self, state_size, action_size, learning_rate=0.0001, name='PGNetwork'): 93 | with tf.variable_scope(name): 94 | self.inputs = tf.placeholder(tf.float32, [None, *state_size], name='inputs') 95 | self.actions = tf.placeholder(tf.float32, [None, action_size], name='actions') 96 | self.discounted_episode_rewards = tf.placeholder(tf.float32, [None, ], name='discounted_episode_rewards') 97 | self.mean_reward = tf.placeholder(tf.float32, name='mean_reward') 98 | 99 | conv1 = tf.layers.conv2d( 100 | inputs = self.inputs, 101 | filters = 32, 102 | kernel_size = [8, 8], 103 | strides = [4, 4], 104 | padding = 'VALID', 105 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 106 | name = 'conv1' 107 | ) 108 | conv1_batchnorm = tf.layers.batch_normalization( 109 | conv1, 110 | training = True, 111 | epsilon = 1e-5, 112 | name = 'conv1_batchnorm' 113 | ) 114 | conv1_out = tf.nn.elu(conv1_batchnorm, name='conv1_out') 115 | 116 | conv2 = tf.layers.conv2d( 117 | inputs = conv1_out, 118 | filters = 64, 119 | kernel_size = [4, 4], 120 | strides = [2, 2], 121 | padding = 'VALID', 122 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 123 | name = 'conv2' 124 | ) 125 | conv2_batchnorm = tf.layers.batch_normalization( 126 | conv2, 127 | training = True, 128 | epsilon = 1e-5, 129 | name = 'conv2_batchnorm' 130 | ) 131 | conv2_out = tf.nn.elu(conv2_batchnorm, name='conv2_out') 132 | 133 | conv3 = tf.layers.conv2d( 134 | inputs = conv2_out, 135 | filters = 128, 136 | kernel_size = [4, 4], 137 | strides = [2, 2], 138 | padding = 'VALID', 139 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 140 | name = 'conv3' 141 | ) 142 | conv3_batchnorm = tf.layers.batch_normalization( 143 | conv3, 144 | training = True, 145 | epsilon = 1e-5, 146 | name = 'conv3_batchnorm' 147 | ) 148 | conv3_out = tf.nn.elu(conv3_batchnorm, name='conv3_out') 149 | 150 | flatten = tf.layers.flatten(conv3_out) 151 | fc1 = tf.layers.dense( 152 | inputs = flatten, 153 | units = 512, 154 | activation = tf.nn.elu, 155 | 
kernel_initializer = tf.contrib.layers.xavier_initializer(), 156 | name = 'fc1' 157 | ) 158 | fc2 = tf.layers.dense( 159 | inputs = fc1, 160 | units = action_size, 161 | activation = None, 162 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 163 | name = 'fc2' 164 | ) 165 | self.output = tf.nn.softmax(fc2) 166 | 167 | neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc2, labels=self.actions) 168 | self.loss = tf.reduce_mean(neg_log_prob * self.discounted_episode_rewards) 169 | self.train = tf.train.RMSPropOptimizer(learning_rate).minimize(self.loss) 170 | 171 | 172 | def train(): 173 | game, possible_actions = create_environment() 174 | 175 | # set hyperparameters 176 | ########################################### 177 | state_size = [*frame_size, stack_size] 178 | action_size = game.get_available_buttons_size() 179 | learning_rate = 0.0001 180 | total_episodes = 5000 181 | batch_size = 1000 182 | gamma = 0.99 183 | check_step = 5 184 | ########################################### 185 | 186 | # train PG 187 | ########################################### 188 | tf.reset_default_graph() 189 | PG = PGNetwork(state_size, action_size, learning_rate) 190 | 191 | writer = tf.summary.FileWriter('train_log') 192 | tf.summary.scalar('Loss', PG.loss) 193 | tf.summary.scalar('Reward mean', PG.mean_reward) 194 | write_op = tf.summary.merge_all() 195 | saver = tf.train.Saver() 196 | 197 | all_rewards = [] 198 | total_rewards = 0 199 | maximum_reward_recorded = 0 200 | 201 | with tf.Session() as sess: 202 | sess.run(tf.global_variables_initializer()) 203 | 204 | for episode in range(1, total_episodes+1): 205 | episode_states, episode_actions, episode_rewards = [], [], [] 206 | 207 | game.new_episode() 208 | state = game.get_state().screen_buffer 209 | state = stack_frames(state, True) 210 | 211 | while not game.is_episode_finished(): 212 | state = game.get_state().screen_buffer 213 | state = stack_frames(state) 214 | 215 | action_prob = sess.run(PG.output, feed_dict={ 216 | PG.inputs: state.reshape((1, *state_size)) 217 | }) 218 | action = np.random.choice(range(action_size), p=action_prob.ravel()) 219 | action = possible_actions[action] 220 | reward = game.make_action(action) 221 | 222 | episode_states.append(state) 223 | episode_actions.append(action) 224 | episode_rewards.append(reward) 225 | 226 | episode_rewards_sum = np.sum(episode_rewards) 227 | all_rewards.append(episode_rewards_sum) 228 | total_rewards = np.sum(all_rewards) 229 | mean_reward = np.divide(total_rewards, episode + 1) 230 | maximum_reward_recorded = np.amax(all_rewards) 231 | 232 | 233 | episode_rewards = discount_and_normalize_rewards(episode_rewards, gamma) 234 | loss, _ = sess.run([PG.loss, PG.train], feed_dict={ 235 | PG.inputs: np.array(episode_states), 236 | PG.actions: np.array(episode_actions), 237 | PG.discounted_episode_rewards: episode_rewards 238 | }) 239 | 240 | summary = sess.run(write_op, feed_dict={ 241 | PG.inputs: np.array(episode_states), 242 | PG.actions: np.array(episode_actions), 243 | PG.discounted_episode_rewards: episode_rewards, 244 | PG.mean_reward: mean_reward 245 | }) 246 | 247 | writer.add_summary(summary, episode) 248 | writer.flush() 249 | 250 | print('='*30) 251 | print('[*] Episode:', episode) 252 | print('[*] Reward:', episode_rewards_sum) 253 | print('[*] Mean Reward:', mean_reward) 254 | print('[*] Max reward so far:', maximum_reward_recorded) 255 | print('[*] Loss:', loss) 256 | 257 | if episode % check_step == 0: 258 | save_path = saver.save(sess, './model/model.ckpt') 
259 | print('[*] Model Saved:', save_path) 260 | 261 | print('[*] Train done') 262 | game.close() 263 | ########################################### 264 | 265 | def play(): 266 | game, possible_actions = create_environment() 267 | 268 | state_size = [*frame_size, stack_size] 269 | action_size = game.get_available_buttons_size() 270 | PG = PGNetwork(state_size, action_size) 271 | 272 | with tf.Session() as sess: 273 | saver = tf.train.Saver() 274 | saver.restore(sess, "./model/model.ckpt") 275 | 276 | game.new_episode() 277 | frame = game.get_state().screen_buffer 278 | state = stack_frames(frame, True) 279 | 280 | while not game.is_episode_finished(): 281 | frame = game.get_state().screen_buffer 282 | state = stack_frames(frame) 283 | 284 | action_prob = sess.run(PG.output, feed_dict={ 285 | PG.inputs: state.reshape((1, *state_size)) 286 | }) 287 | action = np.random.choice(range(action_size), p=action_prob.ravel()) 288 | action = possible_actions[action] 289 | game.make_action(action) 290 | 291 | score = game.get_total_reward() 292 | print("[*] Score: ", score) 293 | 294 | game.close() 295 | 296 | if __name__ == '__main__': 297 | if sys.argv[1] == '--train': 298 | train() 299 | elif sys.argv[1] == '--play': 300 | play() 301 | elif sys.argv[1] == '--test': 302 | test_environment() 303 | -------------------------------------------------------------------------------- /PG/Doom-Deathmatch/defend_the_center.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = defend_the_center.wad 6 | 7 | # Rewards 8 | death_penalty = 1 9 | 10 | # Rendering options 11 | screen_resolution = RES_320X240 12 | screen_format = GRAY8 13 | render_hud = True 14 | render_crosshair = false 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | window_visible = true 19 | 20 | # make episodes start after 10 tics (after unholstering the gun) 21 | episode_start_time = 10 22 | 23 | # make episodes finish after 2100 actions (tics) 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | TURN_LEFT 30 | TURN_RIGHT 31 | ATTACK 32 | } 33 | 34 | # Game variables that will be in the state 35 | available_game_variables = { AMMO2 HEALTH } 36 | 37 | mode = PLAYER 38 | doom_skill = 3 39 | -------------------------------------------------------------------------------- /PG/Doom-Deathmatch/defend_the_center.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Doom-Deathmatch/defend_the_center.wad -------------------------------------------------------------------------------- /PPO/HalfCheetah/PPO_HalfCheetah-v2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PPO/HalfCheetah/PPO_HalfCheetah-v2.pth -------------------------------------------------------------------------------- /PPO/HalfCheetah/PPO_HalfCheetah.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions import 
Categorical 4 | import numpy as np 5 | import gym 6 | from gym.spaces import Discrete, Box 7 | import time 8 | import random 9 | import scipy.signal 10 | 11 | def logger_print(logger, key, with_min_and_max=False): 12 | if with_min_and_max: 13 | print(f'{key+":":13s} {np.mean(logger[key]):.4f}\t{np.min(logger[key]):.4f}(min) {np.max(logger[key]):.4f}(max) {np.std(logger[key]):.4f}(std)') 14 | else: 15 | print(f'{key+":":13s} {np.mean(logger[key]):.4f}') 16 | 17 | def get_parameter_number(net): 18 | total_num = sum(p.numel() for p in net.parameters()) 19 | trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad) 20 | return {'Total': total_num, 'Trainable': trainable_num} 21 | 22 | def weight_init(m): 23 | ''' 24 | Code from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 25 | Usage: 26 | model = Model() 27 | model.apply(weight_init) 28 | ''' 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_normal_(m.weight.data) 31 | nn.init.normal_(m.bias.data) 32 | 33 | def discount_cumsum(x, discount): 34 | """ 35 | magic from rllab for computing discounted cumulative sums of vectors. 36 | 37 | input: 38 | vector x: [x0, x1, x2] 39 | 40 | output: 41 | [x0 + discount * x1 + discount^2 * x2, x1 + discount * x2, x2] 42 | """ 43 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 44 | 45 | class PPOBuffer: 46 | def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95): 47 | self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32) 48 | self.act_buf = np.zeros((size, act_dim), dtype=np.float32) 49 | self.adv_buf = np.zeros(size, dtype=np.float32) 50 | self.rew_buf = np.zeros(size, dtype=np.float32) 51 | self.ret_buf = np.zeros(size, dtype=np.float32) 52 | self.val_buf = np.zeros(size, dtype=np.float32) 53 | self.logp_buf = np.zeros(size, dtype=np.float32) 54 | self.gamma, self.lam = gamma, lam 55 | self.ptr, self.path_start_idx, self.max_size = 0, 0, size 56 | 57 | def store(self, obs, act, rew, val, logp): 58 | """ 59 | Append one timestep of agent-environment interaction to the buffer. 60 | """ 61 | assert self.ptr < self.max_size # buffer has to have room so you can store 62 | i = self.ptr 63 | self.obs_buf[i] = obs 64 | self.act_buf[i] = act 65 | self.rew_buf[i] = rew 66 | self.val_buf[i] = val 67 | self.logp_buf[i] = logp 68 | self.ptr += 1 69 | 70 | def finish_path(self, last_val=0): 71 | """ 72 | Call this at the end of a trajectory, or when one gets cut off 73 | by an epoch ending. This looks back in the buffer to where the 74 | trajectory started, and uses rewards and value estimates from 75 | the whole trajectory to compute advantage estimates with GAE-Lambda, 76 | as well as compute the rewards-to-go for each state, to use as 77 | the targets for the value function. 78 | 79 | The "last_val" argument should be 0 if the trajectory ended 80 | because the agent reached a terminal state (died), and otherwise 81 | should be V(s_T), the value function estimated for the last state. 82 | This allows us to bootstrap the reward-to-go calculation to account 83 | for timesteps beyond the arbitrary episode horizon (or epoch cutoff). 
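        In symbols, with discount factor gamma and GAE parameter lam, the code
        below computes delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and the
        advantage estimate A_t = sum_{l>=0} (gamma * lam)^l * delta_{t+l}, while
        the value-function target is the reward-to-go R_t = sum_{l>=0} gamma^l * r_{t+l}.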
84 | """ 85 | path_slice = slice(self.path_start_idx, self.ptr) 86 | rews = np.append(self.rew_buf[path_slice], last_val) 87 | vals = np.append(self.val_buf[path_slice], last_val) 88 | 89 | # the next two lines implement GAE-Lambda advantage calculation 90 | deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] 91 | self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam) 92 | 93 | # the next line computes rewards-to-go, to be targets for the value function 94 | self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1] 95 | 96 | self.path_start_idx = self.ptr 97 | 98 | def get(self): 99 | """ 100 | Call this at the end of an epoch to get all of the data from 101 | the buffer, with advantages appropriately normalized (shifted to have 102 | mean zero and std one). Also, resets some pointers in the buffer. 103 | """ 104 | assert self.ptr == self.max_size # buffer has to be full before you can get 105 | self.ptr, self.path_start_idx = 0, 0 106 | # the next two lines implement the advantage normalization trick 107 | adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf) 108 | self.adv_buf = (self.adv_buf - adv_mean) / adv_std 109 | return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf] 110 | 111 | class MLP(nn.Module): 112 | def __init__(self, sizes, activation=nn.Tanh, output_activation=None): 113 | super().__init__() 114 | 115 | net = [] 116 | for i in range(len(sizes)-1): 117 | net.append(nn.Linear(sizes[i], sizes[i+1])) 118 | if i == len(sizes) - 2: 119 | if output_activation is not None: 120 | net.append(output_activation()) 121 | else: 122 | net.append(activation()) 123 | 124 | self.mlp = nn.Sequential(*net) 125 | 126 | def forward(self, x): 127 | return self.mlp(x) 128 | 129 | class MLP_Categorical_Policy(nn.Module): 130 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.Tanh, output_activation=None): 131 | super().__init__() 132 | 133 | self.mlp = MLP([obs_dim] + hidden_sizes + [act_dim], activation, output_activation) 134 | self.softmax = nn.Softmax(dim=-1) 135 | 136 | def forward(self, x): 137 | x = self.mlp(x) 138 | p = self.softmax(x) 139 | dist = Categorical(p) 140 | a = dist.sample() 141 | log_p = dist.log_prob(a) 142 | return a.item(), log_p 143 | 144 | class MLP_Gaussian_Policy(nn.Module): 145 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.Tanh, output_activation=None): 146 | super().__init__() 147 | 148 | self.mlp = MLP([obs_dim] + hidden_sizes + [act_dim], activation, output_activation) 149 | self.pi = torch.tensor(np.pi, dtype=torch.float) 150 | 151 | def forward(self, x, a=None): 152 | mu = self.mlp(x) 153 | log_std = -0.5 * torch.ones(mu.shape[-1], dtype=torch.float) 154 | std = torch.exp(log_std) 155 | if not self.training: 156 | a = mu + torch.randn(mu.shape) * std 157 | # gaussian likelihood 158 | pre_sum = -0.5 * ( ((a-mu) / (torch.exp(log_std) + 1e-8))**2 + 2*log_std + torch.log(2*self.pi) ) 159 | logp = pre_sum.sum(dim=-1) 160 | return a, logp 161 | 162 | class Actor_Critic(nn.Module): 163 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.Tanh, output_activation=None, action_space=None): 164 | super().__init__() 165 | 166 | if isinstance(action_space, Box): 167 | policy = MLP_Gaussian_Policy 168 | elif isinstance(action_space, Discrete): 169 | policy = MLP_Categorical_Policy 170 | 171 | self.actor = policy(obs_dim, act_dim, hidden_sizes, activation, output_activation) 172 | self.critic = MLP([obs_dim] + hidden_sizes + [1], activation, output_activation) 173 
| 174 | def forward(self, x, a=None): 175 | v = self.critic(x) 176 | if self.training: 177 | _, logp = self.actor(x, a) 178 | return logp, v 179 | else: 180 | a, logp = self.actor(x) 181 | return a, logp, v 182 | 183 | """ 184 | Proximal Policy Optimization (by clipping), with early stopping based on approximate KL 185 | """ 186 | def train( 187 | env_name, 188 | ac_kwargs=dict(), 189 | seed=0, 190 | steps_per_epoch=4000, 191 | epochs=50, 192 | gamma=0.99, 193 | clip_ratio=0.2, 194 | pi_lr=3e-4, 195 | vf_lr=1e-3, 196 | train_pi_iters=80, 197 | train_v_iters=80, 198 | lam=0.97, 199 | max_ep_len=1000, 200 | target_kl=0.01, 201 | save_freq=10 202 | ): 203 | """ 204 | 205 | Args: 206 | actor_critic: A function which takes in placeholder symbols 207 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 208 | outputs from the agent's Tensorflow computation graph: 209 | 210 | =========== ================ ====================================== 211 | Symbol Shape Description 212 | =========== ================ ====================================== 213 | ``pi`` (batch, act_dim) | Samples actions from policy given 214 | | states. 215 | ``logp`` (batch,) | Gives log probability, according to 216 | | the policy, of taking actions ``a_ph`` 217 | | in states ``x_ph``. 218 | ``logp_pi`` (batch,) | Gives log probability, according to 219 | | the policy, of the action sampled by 220 | | ``pi``. 221 | ``v`` (batch,) | Gives the value estimate for states 222 | | in ``x_ph``. (Critical: make sure 223 | | to flatten this!) 224 | =========== ================ ====================================== 225 | 226 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 227 | function you provided to PPO. 228 | 229 | seed (int): Seed for random number generators. 230 | 231 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 232 | for the agent and the environment in each epoch. 233 | 234 | epochs (int): Number of epochs of interaction (equivalent to 235 | number of policy updates) to perform. 236 | 237 | gamma (float): Discount factor. (Always between 0 and 1.) 238 | 239 | clip_ratio (float): Hyperparameter for clipping in the policy objective. 240 | Roughly: how far can the new policy go from the old policy while 241 | still profiting (improving the objective function)? The new policy 242 | can still go farther than the clip_ratio says, but it doesn't help 243 | on the objective anymore. (Usually small, 0.1 to 0.3.) 244 | 245 | pi_lr (float): Learning rate for policy optimizer. 246 | 247 | vf_lr (float): Learning rate for value function optimizer. 248 | 249 | train_pi_iters (int): Maximum number of gradient descent steps to take 250 | on policy loss per epoch. (Early stopping may cause optimizer 251 | to take fewer than this.) 252 | 253 | train_v_iters (int): Number of gradient descent steps to take on 254 | value function per epoch. 255 | 256 | lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, 257 | close to 1.) 258 | 259 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 260 | 261 | target_kl (float): Roughly what KL divergence we think is appropriate 262 | between new and old policies after an update. This will get used 263 | for early stopping. (Usually small, 0.01 or 0.05.) 264 | 265 | logger_kwargs (dict): Keyword args for EpochLogger. 266 | 267 | save_freq (int): How often (in terms of gap between epochs) to save 268 | the current policy and value function. 
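    The policy update below maximizes the PPO-Clip surrogate objective
        L(s, a) = min( r(theta) * A(s, a), clip(r(theta), 1 - clip_ratio, 1 + clip_ratio) * A(s, a) ),
    where r(theta) = pi_theta(a|s) / pi_theta_old(a|s) is the probability ratio,
    and policy gradient steps stop early once the sampled KL divergence between
    the new and old policy exceeds 1.5 * target_kl.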
269 | 270 | """ 271 | print(locals()) 272 | 273 | torch.manual_seed(seed) 274 | random.seed(seed) 275 | np.random.seed(seed) 276 | if torch.cuda.is_available(): 277 | torch.cuda.manual_seed_all(seed) 278 | 279 | env = gym.make(env_name) 280 | obs_dim = env.observation_space.shape[0] 281 | act_dim = env.action_space.shape[0] 282 | 283 | # Share information about action space with policy architecture 284 | ac_kwargs['action_space'] = env.action_space 285 | 286 | # Experience buffer 287 | local_steps_per_epoch = steps_per_epoch 288 | buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) 289 | 290 | # Model 291 | actor_critic = Actor_Critic(obs_dim, act_dim, **ac_kwargs) 292 | print(actor_critic) 293 | print(f'\nNumber of parameters: {get_parameter_number(actor_critic)}\n') 294 | actor_critic.apply(weight_init) 295 | actor_optimizer = torch.optim.Adam(actor_critic.actor.parameters(), lr=pi_lr) 296 | critic_optimizer = torch.optim.Adam(actor_critic.critic.parameters(), lr=vf_lr) 297 | 298 | start_time = time.time() 299 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 300 | 301 | # Main loop: collect experience in env and update/log each epoch 302 | max_avg_ret = -np.inf 303 | for epoch in range(epochs): 304 | 305 | logger = { 306 | 'VVals': [], 307 | 'EpRet': [], 308 | 'EpLen': [], 309 | 'StopIter': [], 310 | 'LossPi': [], 311 | 'LossV': [], 312 | 'KL': [], 313 | 'Entropy': [], 314 | 'ClipFrac': [], 315 | 'DeltaLossPi': [], 316 | 'DeltaLossV': [] 317 | } 318 | 319 | actor_critic.eval() 320 | with torch.no_grad(): 321 | for t in range(local_steps_per_epoch): 322 | a, logp, v = actor_critic(torch.tensor(o, dtype=torch.float)) 323 | # breakpoint() 324 | 325 | # save and log 326 | buf.store(o, a, r, v, logp) 327 | logger['VVals'].append(v) 328 | 329 | o, r, d, _ = env.step(a) 330 | ep_ret += r 331 | ep_len += 1 332 | 333 | terminal = d or (ep_len == max_ep_len) 334 | if terminal or (t==local_steps_per_epoch-1): 335 | # if trajectory didn't reach terminal state, bootstrap value target 336 | last_val = r if d else actor_critic(torch.tensor(o, dtype=torch.float))[-1].item() 337 | buf.finish_path(last_val) 338 | if terminal: 339 | # only save EpRet / EpLen if trajectory finished 340 | logger['EpRet'].append(ep_ret) 341 | logger['EpLen'].append(ep_len) 342 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 343 | 344 | # Perform PPO update! 
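        # Note on the update below: min_adv equals (1 + clip_ratio) * adv where adv >= 0
        # and (1 - clip_ratio) * adv where adv < 0, so torch.min(ratio * adv, min_adv)
        # is identical to min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv),
        # i.e. the PPO-Clip surrogate, negated to form the policy loss pi_loss.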
345 | obs_buf, act_buf, adv_buf, ret_buf, logp_buf = buf.get() 346 | obs = torch.tensor(obs_buf, dtype=torch.float) 347 | acts = torch.tensor(act_buf, dtype=torch.float) 348 | logp_old = torch.tensor(logp_buf, dtype=torch.float) 349 | adv = torch.tensor(adv_buf, dtype=torch.float) 350 | ret = torch.tensor(ret_buf, dtype=torch.float) 351 | 352 | actor_critic.train() 353 | with torch.no_grad(): 354 | logp, v = actor_critic(obs, acts) 355 | ratio = torch.exp(logp - logp_old) # pi(a|s) / pi_old(a|s) 356 | min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv) 357 | 358 | pi_l_old= - torch.min(ratio * adv, min_adv).mean() 359 | v_l_old = ((ret - v)**2).mean() 360 | ent = (-logp).mean() # a sample estimate for entropy, also easy to compute 361 | 362 | # Training 363 | for i in range(train_pi_iters): 364 | _, logp = actor_critic.actor(obs, acts) 365 | ratio = torch.exp(logp - logp_old) # pi(a|s) / pi_old(a|s) 366 | min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv) 367 | pi_loss = - torch.min(ratio * adv, min_adv).mean() 368 | kl = (logp_old - logp).mean() # a sample estimate for KL-divergence, easy to compute 369 | 370 | actor_optimizer.zero_grad() 371 | pi_loss.backward() 372 | actor_optimizer.step() 373 | 374 | if kl > 1.5 * target_kl: 375 | # print('Early stopping at step %d due to reaching max kl.'%i) 376 | break 377 | 378 | logger['StopIter'].append(i) 379 | 380 | for _ in range(train_v_iters): 381 | v = actor_critic.critic(obs) 382 | v_loss = ((ret - v)**2).mean() 383 | 384 | critic_optimizer.zero_grad() 385 | v_loss.backward() 386 | critic_optimizer.step() 387 | 388 | # Log changes from update 389 | with torch.no_grad(): 390 | logp, v = actor_critic(obs, acts) 391 | ratio = torch.exp(logp - logp_old) # pi(a|s) / pi_old(a|s) 392 | min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv) 393 | pi_l_new= - torch.min(ratio * adv, min_adv).mean() 394 | v_l_new = ((ret - v)**2).mean() 395 | kl = (logp_old - logp).mean() 396 | clipped = np.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) 397 | cf = clipped.float().mean() 398 | 399 | logger['LossPi'].append(pi_l_new) 400 | logger['LossV'].append(v_l_new) 401 | logger['KL'].append(kl) 402 | logger['Entropy'].append(ent) 403 | logger['ClipFrac'].append(cf) 404 | logger['DeltaLossPi'].append(pi_l_new - pi_l_old) 405 | logger['DeltaLossV'].append(v_l_new - v_l_old) 406 | 407 | # Log info about epoch 408 | print('-'*40) 409 | print(f'Epoch: {epoch}') 410 | print(f'TotalEnvInteracts: {(epoch+1)*steps_per_epoch}') 411 | logger_print(logger, 'EpRet', True) 412 | logger_print(logger, 'EpLen') 413 | logger_print(logger, 'VVals', True) 414 | logger_print(logger, 'LossPi') 415 | logger_print(logger, 'LossV') 416 | logger_print(logger, 'DeltaLossPi') 417 | logger_print(logger, 'DeltaLossV') 418 | logger_print(logger, 'Entropy') 419 | logger_print(logger, 'KL') 420 | logger_print(logger, 'ClipFrac') 421 | logger_print(logger, 'StopIter') 422 | print(f'Time: {time.time()-start_time:.4f}s') 423 | print('-'*40+'\n') 424 | 425 | # Save model 426 | if np.mean(logger['EpRet']) > max_avg_ret: 427 | max_avg_ret = np.mean(logger['EpRet']) 428 | torch.save(actor_critic.state_dict(), 'PPO_{}.pth'.format(env_name)) 429 | 430 | env.close() 431 | 432 | if __name__ == '__main__': 433 | import argparse 434 | parser = argparse.ArgumentParser() 435 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 436 | parser.add_argument('--hid', type=int, default=64) 437 | parser.add_argument('--l', type=int, default=2) 
438 | parser.add_argument('--gamma', type=float, default=0.99) 439 | parser.add_argument('--seed', '-s', type=int, default=0) 440 | parser.add_argument('--cpu', type=int, default=4) 441 | parser.add_argument('--steps', type=int, default=4000) 442 | parser.add_argument('--epochs', type=int, default=50) 443 | parser.add_argument('--exp_name', type=str, default='ppo') 444 | args = parser.parse_args() 445 | 446 | train( 447 | args.env, 448 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 449 | gamma=args.gamma, 450 | seed=args.seed, 451 | steps_per_epoch=args.steps, 452 | epochs=args.epochs 453 | ) -------------------------------------------------------------------------------- /QLearning/QLearning_FrozenLake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import random 4 | 5 | env = gym.make('FrozenLake-v0') 6 | action_size = env.action_space.n 7 | state_size = env.observation_space.n 8 | 9 | qtable = np.zeros((state_size, action_size)) 10 | 11 | total_episodes = 1000 12 | learning_rate = 0.8 13 | max_steps = 99 14 | gamma = 0.95 15 | 16 | epsilon = 1.0 17 | max_epsilon = 1.0 18 | min_epsilon = 0.01 19 | decay_rate = 0.01 20 | 21 | rewards = [] 22 | for episode in range(total_episodes): 23 | state = env.reset() 24 | total_rewards = 0 25 | 26 | for step in range(max_steps): 27 | exp_exp_tradeoff = random.uniform(0, 1) 28 | if exp_exp_tradeoff > epsilon: 29 | action = np.argmax(qtable[state]) 30 | else: 31 | action = env.action_space.sample() 32 | 33 | new_state, reward, done, info = env.step(action) 34 | qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state]) - qtable[state, action]) 35 | 36 | state = new_state 37 | total_rewards += reward 38 | if done: break 39 | 40 | epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode+1)) 41 | rewards.append(total_rewards) 42 | 43 | print('[*] episode {}, total reward {}, average score {}'.format(episode, total_rewards, sum(rewards)/(episode+1))) 44 | 45 | print(qtable) 46 | 47 | # Play the game 48 | 49 | for episode in range(1): 50 | state = env.reset() 51 | print('*'*20) 52 | print('EPISODE ', episode) 53 | 54 | for step in range(max_steps): 55 | env.render() 56 | action = np.argmax(qtable[state]) 57 | input() 58 | state, reward, done, info = env.step(action) 59 | if done: break 60 | 61 | env.close() 62 | 63 | -------------------------------------------------------------------------------- /QLearning/QLearning_Taxi_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import random 4 | 5 | env = gym.make("Taxi-v2") 6 | 7 | action_size = env.action_space.n 8 | state_size = env.observation_space.n 9 | qtable = np.zeros((state_size, action_size)) 10 | 11 | # Hyperparameters 12 | total_episodes = 50000 13 | total_test_episodes = 100 14 | max_steps = 99 15 | learning_rate = 0.7 16 | gamma = 0.618 17 | epsilon = 1.0 18 | max_epsilon = 1.0 19 | min_epsilon = 0.01 20 | decay_rate = 0.01 21 | 22 | # Train 23 | for episode in range(total_episodes): 24 | state = env.reset() 25 | 26 | for step in range(max_steps): 27 | exp_exp_tradeoff = random.uniform(0, 1) 28 | if exp_exp_tradeoff > epsilon: 29 | action = np.argmax(qtable[state, :]) 30 | else: 31 | action = env.action_space.sample() 32 | 33 | new_state, reward, done, info = env.step(action) 34 | qtable[state, action] += learning_rate * (reward + gamma * np.max(qtable[new_state, 
:]) - qtable[state, action]) 35 | 36 | state = new_state 37 | if done: break 38 | 39 | epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode+1)) 40 | 41 | 42 | # Play the Game 43 | rewards = [] 44 | for episode in range(total_test_episodes): 45 | state = env.reset() 46 | total_rewards = 0 47 | 48 | print('='*20) 49 | print("[*] Episode", episode) 50 | print('='*20) 51 | 52 | for step in range(max_steps): 53 | env.render() 54 | action = np.argmax(qtable[state, :]) 55 | state, reward, done, info = env.step(action) 56 | total_rewards += reward 57 | 58 | if done: 59 | rewards.append(total_rewards) 60 | print('[*] Score', total_rewards) 61 | break 62 | 63 | env.close() 64 | print('[*] Average Score: ' + str(sum(rewards) / total_test_episodes)) -------------------------------------------------------------------------------- /QLearning/QLearning_TicTacToe.py: -------------------------------------------------------------------------------- 1 | import game 2 | import numpy as np 3 | import random 4 | 5 | class RandomPlayer(): 6 | def __init__(self): 7 | self.name = 'Random' 8 | self.win_n = 0 9 | 10 | def action(self, state, actions): 11 | return random.choice(actions) 12 | 13 | def reward(self, reward, state): 14 | if reward == 1: 15 | self.win_n += 1 16 | 17 | def episode_end(self, episode): 18 | pass 19 | 20 | class QLearningPlayer(): 21 | def __init__(self): 22 | self.name = 'Q-Learning' 23 | self.q = {} 24 | self.init_q = 1 # "optimistic" 1.0 initial values 25 | self.lr = 0.3 26 | self.gamma = 0.9 27 | self.epsilon = 1.0 28 | self.max_epsilon = 1.0 29 | self.min_epsilon = 0.01 30 | self.decay_rate = 0.01 31 | self.action_n = 9 32 | self.win_n = 0 33 | 34 | self.last_state = (' ',) * 9 35 | self.last_action = -1 36 | 37 | def action(self, state, actions): 38 | state = tuple(state) 39 | self.last_state = state 40 | 41 | r = random.uniform(0, 1) 42 | if r > self.epsilon: 43 | if self.q.get(state): 44 | i = np.argmax([self.q[state][a] for a in actions]) 45 | action = actions[i] 46 | else: 47 | self.q[state] = [self.init_q] * self.action_n 48 | action = random.choice(actions) 49 | else: 50 | action = random.choice(actions) 51 | 52 | self.last_action = action 53 | return action 54 | 55 | def reward(self, reward, state): 56 | if self.last_action >= 0: 57 | if reward == 1: 58 | self.win_n += 1 59 | 60 | state = tuple(state) 61 | if self.q.get(self.last_state): 62 | q = self.q[self.last_state][self.last_action] 63 | else: 64 | self.q[self.last_state] = [self.init_q] * self.action_n 65 | q = self.init_q 66 | 67 | self.q[self.last_state][self.last_action] = q + self.lr * (reward + self.gamma * np.max(self.q.get(state, [self.init_q]*self.action_n)) - q) 68 | 69 | def episode_end(self, episode): 70 | # epsilon decay 71 | self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_rate*(episode+1)) 72 | 73 | def print_q(self): 74 | for k,v in self.q.items(): 75 | print(k,v) 76 | 77 | class HumanPlayer(): 78 | def __init__(self): 79 | self.name = 'Human' 80 | 81 | def action(self, state, actions): 82 | a = int(input('your move:')) - 1 83 | return a 84 | 85 | 86 | def train(trails_num, p1, p2, env): 87 | for episode in range(trails_num): 88 | 89 | state, win, done, info = env.reset(X=p1, O=p2) 90 | 91 | for (cur_player, oth_player) in env.player_turn(): 92 | #env.render() 93 | action = cur_player.action(state, env.action_space) 94 | state, win, done, info = env.step(action) 95 | 96 | if done: 97 | if win: 98 | cur_player.reward(1, state) 99 | 
oth_player.reward(-1, state) 100 | else: 101 | cur_player.reward(0.5, state) 102 | oth_player.reward(0.5, state) 103 | #env.render() 104 | break 105 | else: 106 | oth_player.reward(0, state) 107 | 108 | env.playerX.episode_end(episode) 109 | env.playerO.episode_end(episode) 110 | 111 | print('='*20) 112 | print('Train result - %d episodes' % trails_num) 113 | print('{} win rate: {}'.format(p1.name, p1.win_n / trails_num)) 114 | print('{} win rate: {}'.format(p2.name, p2.win_n / trails_num)) 115 | print('players draw rate: {}'.format((trails_num - p1.win_n - p2.win_n) / trails_num)) 116 | print('='*20) 117 | 118 | 119 | def play(p1, p2, env): 120 | while 1: 121 | state, win, done, info = env.reset(X=p1, O=p2) 122 | for (cp, op) in env.player_turn(): 123 | print() 124 | env.render() 125 | action = cp.action(state, env.action_space) 126 | state, win, done, info = env.step(action) 127 | if done: 128 | env.render() 129 | break 130 | 131 | if __name__ == '__main__': 132 | env = game.make('TicTacToe') 133 | p1 = QLearningPlayer() 134 | p2 = QLearningPlayer() 135 | p3 = HumanPlayer() 136 | p4 = RandomPlayer() 137 | 138 | train(100000, p1, p4, env) 139 | print() 140 | print('Human play') 141 | print() 142 | 143 | play(p1, p3, env) 144 | -------------------------------------------------------------------------------- /QLearning/game.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def make(game_name): 4 | if game_name == 'TicTacToe': 5 | return TicTacToe() 6 | 7 | class TicTacToe(): 8 | 9 | def __init__(self): 10 | self.reset() 11 | 12 | def render(self): 13 | line = '\n-----------\n' 14 | row = " {} | {} | {}" 15 | print((row + line + row + line + row).format(*self.state)) 16 | print(self.info) 17 | 18 | def step(self, action): 19 | #print(action) 20 | self.state[action] = self.cur_player 21 | self.action_space.remove(action) 22 | 23 | self.check_end() 24 | if self.is_end: 25 | if self.is_win: 26 | self.info = 'player{} win!'.format(self.cur_player) 27 | else: 28 | self.info = 'players draw' 29 | else: 30 | self.info = 'player{} turn'.format(self.cur_player) 31 | return (self.state, self.is_win, self.is_end, self.info) 32 | 33 | def reset(self, X=None, O=None): 34 | self.state = [' '] * 9 35 | self.action_space = list(range(9)) 36 | self.is_end = False 37 | self.is_win = False 38 | self.info = 'new game' 39 | self.playerX = X 40 | self.playerO = O 41 | self.cur_player = random.choice(['O','X']) 42 | return (self.state, self.is_win, self.is_end, self.info) 43 | 44 | def player_turn(self): 45 | while 1: 46 | if self.cur_player == 'O': 47 | cur = self.playerO 48 | oth = self.playerX 49 | else: 50 | cur = self.playerX 51 | oth = self.playerO 52 | 53 | self.info = 'player{} turn'.format(self.cur_player) 54 | yield (cur, oth) 55 | 56 | self.cur_player = 'OX'.replace(self.cur_player, '') 57 | 58 | def check_end(self): 59 | for a,b,c in [(0,1,2), (3,4,5), (6,7,8), 60 | (0,3,6), (1,4,7), (2,5,8), 61 | (0,4,8), (2,4,6)]: 62 | if self.cur_player == self.state[a] == self.state[b] == self.state[c]: 63 | self.is_win = True 64 | self.is_end = True 65 | return 66 | 67 | if not any([s == ' ' for s in self.state]): 68 | self.is_win = False 69 | self.is_end = True 70 | return 71 | 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Reinforcement Learning 2 | ====================== 3 | 4 | Reinforcing Your Learning of 
Reinforcement Learning. 5 | 6 | These are some notes I took and some code I wrote while learning reinforcement learning. I created this GitHub project mainly so that we can learn from and communicate with each other, and also to make it easier for others to find material on reinforcement learning. As for why I am learning reinforcement learning: mainly because I want to apply the AlphaZero approach (Monte Carlo tree search combined with deep learning) to RNA structure prediction. I have already made some attempts, for example searching for the secondary structure folding path of an RNA molecule. 7 | 8 | The first book I read was [Reinforcement Learning: An Introduction (Second edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) by Richard S. Sutton and Andrew G. Barto. 9 | 10 | While reading the book, I also wrote some simple code based on articles found online, listed in order below. 11 | 12 | 13 | Table of contents 14 | ================= 15 | 16 | * [Q-Learning](#q-learning) 17 | * [Frozen Lake Game](#frozen-lake-game) 18 | * [Tic Tac Toe](#tic-tac-toe) 19 | * [Taxi v2](#taxi-v2) 20 | * [Deep Q-Learning Network (DQN)](#deep-q-learning-network) 21 | * [Doom Game](#doom-game) 22 | * [Atari Space Invaders](#atari-space-invaders) 23 | * [Dueling Double DQN & Prioritized Experience Replay](#dueling-double-dqn-and-prioritized-experience-replay) 24 | * [Doom Deadly Corridor](#doom-deadly-corridor) 25 | * [Policy Gradients (PG)](#policy-gradients) 26 | * [CartPole Game](#cartpole-game) 27 | * [Doom Deathmatch](#doom-deathmatch) 28 | * [Advantage Actor Critic (A2C)](#advantage-actor-critic) 29 | * [Asynchronous Advantage Actor Critic (A3C)](#asynchronous-advantage-actor-critic) 30 | * [Proximal Policy Optimization (PPO)](#proximal-policy-optimization) 31 | * [Half Cheetah](#half-cheetah) 32 | * [Deep Deterministic Policy Gradient (DDPG)](#deep-deterministic-policy-gradient) 33 | * [Ant](#ant) 34 | * [AlphaGoZero Introduction](#alphagozero-introduction) 35 | * [Monte Carlo Tree Search (MCTS)](#monte-carlo-tree-search) 36 | * [Gomoku](#gomoku) 37 | * [AlphaGomoku](#alphagomoku) 38 | * [RNA Folding Path](#rna-folding-path) 39 | * [Atari Game Roms](#atari-game-roms) 40 | 41 | 42 | Q-Learning 43 | ========== 44 | 45 | **Bellman equation:** 46 |  47 | 48 | Frozen Lake Game 49 | ---------------- 50 | 51 |