├── .gitignore ├── AlphaGoZero_Intruduction ├── alphago_zero_introduction.pdf ├── alphago_zero_introduction.tex └── fig │ ├── 1_bq8g8w1ti-qi-r2asH-7Xg.png │ ├── alpha_mcts0.png │ ├── alpha_mcts1.png │ ├── alpha_mcts2.png │ ├── alpha_mcts3.png │ ├── alpha_mcts4.png │ ├── alphagozero_mcts1.png │ ├── alphagozero_mcts2.png │ ├── autodidactic_iteration.png │ ├── convolutional_layer.png │ ├── deepcube_method.png │ ├── deepcube_nn.png │ ├── evaluate_network.png │ ├── expanded_tree.png │ ├── expert_policies.png │ ├── game.png │ ├── game_state.png │ ├── go_state_space.png │ ├── mcts_2_to_4.png │ ├── mcts_backpropagation.png │ ├── mcts_expansion.png │ ├── mcts_iterations.png │ ├── mcts_process.png │ ├── mcts_selection.png │ ├── mcts_simulation.png │ ├── neural_netwrok_architecture.png │ ├── reinforcement_learning.png │ ├── residual_layer.png │ ├── retrain_network.png │ ├── rna_folding │ ├── input_state.png │ ├── native.png │ ├── native_mat.png │ ├── nn.png │ ├── output.png │ ├── rna_folding.pptx │ ├── s0.png │ ├── s1_1.png │ ├── s1_2.png │ ├── s1_3.png │ ├── s2_1.png │ ├── s3_1.png │ └── sn.png │ ├── rna_folding1.png │ ├── rna_folding2.png │ ├── rna_folding3.png │ ├── rubiks_cube.png │ ├── rubiks_cube_action.png │ ├── rubiks_cube_state.png │ ├── s0.png │ ├── self_play.png │ ├── the_policy_head.png │ └── the_value_head.png ├── AlphaGomoku ├── alpha_gomoku.py ├── cpu_node.sh ├── gomoku.py ├── loss │ ├── Residual_CNN_8x8_loss.png │ ├── Simple_CNN_19x19_loss.png │ └── Simple_CNN_8x8_loss.png ├── mcts.py ├── models │ ├── Residual_CNN_8x8_3000.h5 │ ├── Simple_CNN_19x19_3000.h5 │ ├── Simple_CNN_19x19_5000.h5 │ └── Simple_CNN_8x8_3000.h5 └── neural_network.py ├── DDDQN └── Doom-Deadly-Corridor │ ├── deadly_corridor.cfg │ └── deadly_corridor.wad ├── DDPG └── Ant │ ├── DDPG_Ant-v2.pth │ └── DDPG_Ant.py ├── DQN ├── Atari_Space_Invaders │ ├── DQN_Atari_Space_Invaders.py │ ├── Space Invaders (1983) (CCE) (C-820).bin │ ├── model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ └── model.ckpt.meta │ └── train_log │ │ └── events.out.tfevents.1530462157.MKK └── Doom │ ├── DQN_Doom.py │ ├── basic.cfg │ ├── basic.wad │ ├── model │ ├── checkpoint │ ├── model.ckpt.data-00000-of-00001 │ ├── model.ckpt.index │ └── model.ckpt.meta │ └── train_log │ └── events.out.tfevents.1524621481.MKK ├── LICENSE ├── MCTS ├── MCTS_Gomoku.py └── MCTS_TicTacToe.py ├── PG ├── Cartpole_pytorch │ ├── PG_CartPole-v0.pth │ └── PG_CartPole.py ├── Cartpole_tensorflow │ ├── PG_Cartpole.py │ └── model │ │ ├── checkpoint │ │ ├── model.ckpt.data-00000-of-00001 │ │ ├── model.ckpt.index │ │ └── model.ckpt.meta └── Doom-Deathmatch │ ├── PG_Doom_Deathmatch.py │ ├── defend_the_center.cfg │ └── defend_the_center.wad ├── PPO └── HalfCheetah │ ├── PPO_HalfCheetah-v2.pth │ └── PPO_HalfCheetah.py ├── QLearning ├── QLearning_FrozenLake.py ├── QLearning_Taxi_v2.py ├── QLearning_TicTacToe.py └── game.py ├── README.md ├── Roms └── Roms.zip ├── imgs ├── Bellman_equation.png ├── DQN.png ├── DQN2.png ├── DQN_loss.png ├── DQN_neural_network.png ├── DQN_neural_network2.png ├── PER.png ├── alphagomoku.png ├── ant.gif ├── ddpg_algorithm.svg ├── doom_loss.png ├── double_DQN.png ├── dueling_DQN1.png ├── dueling_DQN2.png ├── fixed_q_targets.png ├── frozenlake.png ├── halfcheetah.gif ├── mcts_gomoku.png ├── pdf_0.png ├── pdf_1.png ├── pdf_2.png ├── pdf_3.png ├── pg_algorithm.svg ├── pg_doom_deathmatch.png ├── pg_loss.png ├── pg_mean_reward.png ├── pg_network.png ├── play_atari_space_invaders.gif ├── play_cartpole.gif ├── play_doom.gif 
├── play_doom_deadly_corridor.gif ├── play_doom_deathmatch.gif ├── policy_gradients.png ├── ppo_algorithm.svg ├── sumtree.png ├── taxi1.png ├── taxi2.png ├── taxi3.png ├── taxi4.png ├── taxi5.png ├── taxi6.png ├── tic1.png ├── tic2.png ├── tic3.png ├── tic4.png ├── tic5.png ├── tic6.png └── tic7.png └── test ├── Airstriker-Genesis-Level1-000000.bk2 ├── Airstriker-Genesis-Level1-000000.mp4 ├── test_gym.py ├── test_mujoco.py └── test_retro.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/alphago_zero_introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/alphago_zero_introduction.pdf -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/1_bq8g8w1ti-qi-r2asH-7Xg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/1_bq8g8w1ti-qi-r2asH-7Xg.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts0.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts3.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alpha_mcts4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alpha_mcts4.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alphagozero_mcts1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alphagozero_mcts1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/alphagozero_mcts2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/alphagozero_mcts2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/autodidactic_iteration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/autodidactic_iteration.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/convolutional_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/convolutional_layer.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/deepcube_method.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/deepcube_method.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/deepcube_nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/deepcube_nn.png 
-------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/evaluate_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/evaluate_network.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/expanded_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/expanded_tree.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/expert_policies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/expert_policies.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/game.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/game.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/game_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/game_state.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/go_state_space.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/go_state_space.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_2_to_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_2_to_4.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_backpropagation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_backpropagation.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_expansion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_expansion.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_iterations.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_iterations.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_process.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_selection.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/mcts_simulation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/mcts_simulation.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/neural_netwrok_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/neural_netwrok_architecture.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/reinforcement_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/reinforcement_learning.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/residual_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/residual_layer.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/retrain_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/retrain_network.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/input_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/input_state.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/native.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/native.png 
-------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/native_mat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/native_mat.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/nn.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/output.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/rna_folding.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/rna_folding.pptx -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s0.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s1_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s1_1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s1_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s1_2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s1_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s1_3.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s2_1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/s3_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/s3_1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding/sn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding/sn.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding1.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding2.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rna_folding3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rna_folding3.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rubiks_cube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rubiks_cube.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rubiks_cube_action.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rubiks_cube_action.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/rubiks_cube_state.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/rubiks_cube_state.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/s0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/s0.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/self_play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/self_play.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/the_policy_head.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/the_policy_head.png -------------------------------------------------------------------------------- /AlphaGoZero_Intruduction/fig/the_value_head.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGoZero_Intruduction/fig/the_value_head.png -------------------------------------------------------------------------------- /AlphaGomoku/alpha_gomoku.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import random 4 | import numpy as np 5 | from collections import deque 6 | from gomoku import Gomoku 7 | from mcts import MCTS 8 | from neural_network import Residual_CNN, Simple_CNN 9 | 10 | #====================== 11 | # Configuration 12 | #====================== 13 | # 8x8 14 | game_board_width = 8 15 | mcts_playout_itermax_train = 400 16 | mcts_playout_itermax_play = 1000 17 | model_file = 'Simple_CNN_8x8_3000' 18 | policy_network = Simple_CNN # or Residual_CNN 19 | #====================== 20 | # 19x19 21 | # game_board_width = 19 22 | # mcts_playout_itermax_train = 800 23 | # mcts_playout_itermax_play = 1000 24 | # model_file = 'Simple_CNN_19x19_3000' 25 | # policy_network = Simple_CNN 26 | #====================== 27 | 28 | def random_play(game): 29 | return random.choice(game.actions()) 30 | 31 | def human_play(): 32 | t = input('[*] Your turn (i j): ') 33 | a, b = t.split(' ') 34 | i, j = int(a), int(b) 35 | return (i, j) 36 | 37 | def play_game(): 38 | game = Gomoku(game_board_width) 39 | policy = policy_network(input_dim=game.nn_input.shape, output_dim=game.w**2) 40 | policy.load(model_file) 41 | mcts_player = MCTS(policy, mcts_playout_itermax_play) 42 | 43 | starting_player = random.choice([1,2]) 44 | game.reset(starting_player) 45 | mcts_player.set_rootnode(starting_player) 46 | while not game.is_end: 47 | print(game) 48 | # print(game.nn_input) 49 | 50 | if game.current_player == 1: # Player X 51 | action, _ = mcts_player.get_move(game) 52 | else: # Player O 53 | action = human_play() 54 | 55 | game.move(action) 56 | mcts_player.update_with_move(action, game) 57 | 58 | print("[*] Player %s move: %s\n" % (['X', 'O'][game.player_just_moved-1], action)) 59 | 60 | print(game) 61 | if game.winner > 0: 62 | print("[*] Player %s win" % ['X', 'O'][game.winner-1]) 63 | else: 64 | print("[*] Player draw") 65 | 66 | def self_play(game, player, render=False): 67 | starting_player = random.choice([1,2]) 68 | game.reset(starting_player) 69 | player.set_rootnode(starting_player) 70 | board_states, mcts_probs, cur_players = [], [], [] 71 | 72 | while not game.is_end: 73 | if render: print(game) 74 | 75 | action, action_probs = player.get_move(game, stochastically=True, show_node=render) 76 | 77 | board_states.append(game.nn_input) 78 | mcts_probs.append(action_probs) 79 | cur_players.append(game.current_player) 80 | 81 | game.move(action) 82 | player.update_with_move(action, game) 83 | 84 | if render: print("[*] Player %s move: %s\n" % (['X', 'O'][game.player_just_moved-1], action)) 85 | 86 | rewards = list(map(game.reward, cur_players)) 87 | 88 | if render: 89 | print(game) 90 | if game.winner > 0: 91 | print("[*] Player %s win" % ['X', 'O'][game.winner-1]) 92 | else: 93 | print("[*] 
Player draw") 94 | 95 | return list(zip(board_states, mcts_probs, rewards)), game.winner, starting_player 96 | 97 | def augment_data(play_data): 98 | # augment the data set by rotation and flipping 99 | extend_data = [] 100 | for state, pi, z in play_data: 101 | w = state.shape[-1] 102 | 103 | for i in [1, 2, 3, 4]: 104 | # rotate counterclockwise 105 | equi_state = np.array([np.rot90(s, i) for s in state]) 106 | equi_pi = np.rot90(pi.reshape((w, w)), i) 107 | extend_data.append((equi_state, equi_pi.flatten(), z)) 108 | # flip horizontally 109 | equi_state = np.array([np.fliplr(s) for s in equi_state]) 110 | equi_pi =np.fliplr(equi_pi) 111 | extend_data.append((equi_state, equi_pi.flatten(), z)) 112 | 113 | return extend_data 114 | 115 | 116 | def train(): 117 | game_episode_num = 3000 118 | selfplay_batch_size = 1 119 | data_buffer_size = 10000 120 | check_step = 10 121 | train_batch_size = 512 122 | 123 | data_buffer = deque(maxlen=data_buffer_size) 124 | 125 | game = Gomoku(game_board_width) 126 | policy = policy_network(input_dim=game.nn_input.shape, output_dim=game.w**2) 127 | mcts_player = MCTS(policy, mcts_playout_itermax_train) 128 | winner_num = [0] * 3 129 | 130 | print('[*] Start self play') 131 | # game episode 132 | for i in range(game_episode_num): 133 | 134 | # get train data 135 | start_time = time.time() 136 | for _ in range(selfplay_batch_size): 137 | play_data, winner, starting_player = self_play(game, mcts_player) 138 | episode_len = len(play_data) 139 | extend_data = augment_data(play_data) 140 | data_num = len(extend_data) 141 | data_buffer.extend(extend_data) 142 | winner_num[winner] += 1 143 | end_time = time.time() 144 | 145 | print('[*] Episode: {}, length: {}, start: {}, winner: {}, data: {}, time: {}s, win ratio: X {:.1f}%, O {:.1f}%, - {:.1f}%'.format( 146 | i+1, episode_len, ['-', 'X', 'O'][starting_player], ['-', 'X', 'O'][winner], data_num, int(end_time - start_time), 147 | winner_num[1] / (i+1) * selfplay_batch_size * 100, 148 | winner_num[2] / (i+1) * selfplay_batch_size * 100, 149 | winner_num[0] / (i+1) * selfplay_batch_size * 100, 150 | )) 151 | 152 | # train 153 | if len(data_buffer) > train_batch_size: 154 | mini_batch = random.sample(data_buffer, train_batch_size) 155 | state_batch = np.array([d[0] for d in mini_batch]) 156 | pi_batch = np.array([d[1] for d in mini_batch]) 157 | z_batch = np.array([d[2] for d in mini_batch]) 158 | 159 | policy.train(state_batch, [z_batch, pi_batch]) 160 | 161 | # check current policy model and save the params 162 | if (i + 1) % check_step == 0: 163 | policy.loss_history.plot_loss('loss.png') 164 | print('[*] Save current policy model') 165 | policy.save(model_file) 166 | print('[*] done') 167 | 168 | 169 | 170 | if __name__ == "__main__": 171 | if sys.argv[1] == '--train': 172 | train() 173 | elif sys.argv[1] == '--play': 174 | play_game() 175 | -------------------------------------------------------------------------------- /AlphaGomoku/cpu_node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -n 2 4 | #SBATCH -N 1 5 | #SBATCH -w node1 6 | #SBATCH -o slurm.out 7 | #SBATCH -e slurm.err 8 | 9 | python alpha_gomoku.py --train -------------------------------------------------------------------------------- /AlphaGomoku/gomoku.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class GameState: 4 | def __init__(self): 5 | self.player_just_moved = 2 6 | 7 | def clone(self): 8 | st = 
GameState() 9 | st.player_just_moved = self.player_just_moved 10 | return st 11 | 12 | def move(self, action): 13 | self.player_just_moved = 3 - self.player_just_moved 14 | 15 | def actions(self): 16 | """ Get all possible moves from this state. 17 | """ 18 | 19 | def win(self, player): 20 | """ Get the game result from the viewpoint of player. 21 | """ 22 | 23 | def end(self): 24 | """ Whether the game is end or not 25 | """ 26 | 27 | def __repr__(self): 28 | pass 29 | 30 | class Gomoku(GameState): 31 | def __init__(self, w=8): # 15x15 32 | self.w = w 33 | self.reset() 34 | 35 | def reset(self, current_player=1): 36 | w = self.w 37 | self.current_player = current_player 38 | self.first_player = current_player 39 | self.player_just_moved = 3 - current_player 40 | self.board = [] # 0 = empty, 1 = player 1 (X), 2 = player 2 (O) 41 | 42 | for y in range(w): 43 | self.board.append([0] * w) 44 | self.is_end = False 45 | self.winner = -1 # 0 = draw, 1 = player 1 (X), 2 = player 2 (O) 46 | 47 | # 1 if stone here and 0 if stone not here 48 | # fisrt 1 stack - position of current player's stones 49 | # next 1 stack - position of last player's stones 50 | # next 1 stack - position of last player's move 51 | # last 1 stack - All 1 if it's first play, all 0 if it's second play 52 | self.nn_input = np.zeros((4, w, w)) 53 | self.nn_input[-1] = 1 54 | 55 | def clone(self): 56 | st = Gomoku() 57 | st.w = self.w 58 | st.current_player = self.current_player 59 | st.first_player = self.first_player 60 | st.player_just_moved = self.player_just_moved 61 | st.board = [self.board[i][:] for i in range(self.w)] 62 | st.nn_input = np.copy(self.nn_input) 63 | return st 64 | 65 | def move(self, action): 66 | a, b = action 67 | assert 0 <= a <= self.w and 0 <= b <= self.w and self.board[a][b] == 0 68 | self.board[a][b] = self.current_player 69 | self.player_just_moved = self.current_player 70 | self.current_player = 3 - self.current_player 71 | self.check_end(action) 72 | 73 | # update nn_input 74 | self.nn_input = np.zeros((4, self.w, self.w)) 75 | for i in range(self.w): 76 | for j in range(self.w): 77 | s = self.board[i][j] 78 | if s == self.current_player: self.nn_input[0, i, j] = 1 79 | elif s == self.player_just_moved: self.nn_input[1, i, j] = 1 80 | self.nn_input[2, a, b] = 1 81 | self.nn_input[3] = 1 if self.current_player == self.first_player else 0 82 | 83 | def actions(self): 84 | return [(i, j) for i in range(self.w) for j in range(self.w) if self.board[i][j] == 0] 85 | 86 | def check_end(self, action): 87 | a, b = action 88 | for i in range(5): 89 | if i <= a <= self.w - (5 - i) and i <= b <= self.w - (5 - i): 90 | if self.board[a-i][b-i] == self.board[a-i+1][b-i+1] == self.board[a-i+2][b-i+2] == self.board[a-i+3][b-i+3] == self.board[a-i+4][b-i+4]: 91 | self.is_end = True 92 | self.winner = self.player_just_moved 93 | return 94 | if i <= a <= self.w - (5 - i): 95 | if self.board[a-i][b] == self.board[a-i+1][b] == self.board[a-i+2][b] == self.board[a-i+3][b] == self.board[a-i+4][b]: 96 | self.is_end = True 97 | self.winner = self.player_just_moved 98 | return 99 | if i <= a <= self.w - (5 - i) and (4 - i) <= b <= (self.w - i - 1): 100 | if self.board[a-i][b+i] == self.board[a-i+1][b+i-1] == self.board[a-i+2][b+i-2] == self.board[a-i+3][b+i-3] == self.board[a-i+4][b+i-4]: 101 | self.is_end = True 102 | self.winner = self.player_just_moved 103 | return 104 | if i <= b <= self.w - (5 - i): 105 | if self.board[a][b-i] == self.board[a][b-i+1] == self.board[a][b-i+2] == self.board[a][b-i+3] == 
self.board[a][b-i+4]: 106 | self.is_end = True 107 | self.winner = self.player_just_moved 108 | return 109 | 110 | if self.actions() == []: 111 | self.is_end = True 112 | self.winner = 0 113 | 114 | def reward(self, player): 115 | if self.winner == 0: # tie 116 | return 0 117 | if self.winner == player: 118 | return 1 119 | elif self.winner == 3 - player: 120 | return -1 121 | if self.winner == -1: 122 | return 0 123 | 124 | def __repr__(self): 125 | row = '{:>2} ' + ' | '.join(['{}'] * self.w) + ' ' 126 | line = '\n ' + ('----' * self.w)[:-1] + '\n' 127 | s = ' ' + '%2d ' * self.w % tuple(range(self.w)) + '\n' 128 | s += line.join([row.format(i, *map(lambda j: [' ', 'X', 'O'][j], self.board[i])) for i in range(self.w)]) 129 | return s 130 | -------------------------------------------------------------------------------- /AlphaGomoku/loss/Residual_CNN_8x8_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/loss/Residual_CNN_8x8_loss.png -------------------------------------------------------------------------------- /AlphaGomoku/loss/Simple_CNN_19x19_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/loss/Simple_CNN_19x19_loss.png -------------------------------------------------------------------------------- /AlphaGomoku/loss/Simple_CNN_8x8_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/loss/Simple_CNN_8x8_loss.png -------------------------------------------------------------------------------- /AlphaGomoku/mcts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | def softmax(x): 5 | p = np.exp(x - np.max(x)) 6 | p /= np.sum(p) 7 | return p 8 | 9 | class Node: 10 | def __init__(self, action=None, parent=None, player=None, prior_p=1.0): 11 | self.action = action 12 | self.parent = parent 13 | self.childs = {} 14 | 15 | self.W = 0 # total action value 16 | self.N = 0 # visit count 17 | self.Q = 0 # mean action value 18 | self.P = prior_p # prior probability of selecting that edge 19 | 20 | self.current_player = player 21 | self.next_player = 3 - player 22 | 23 | def select(self): 24 | # You should think carefully why use -Q here 25 | return max(self.childs.values(), key=lambda c: -c.Q + c.U()) 26 | 27 | def expand(self, actions, probs): 28 | for i in range(len(actions)): 29 | a, p = actions[i], probs[i] 30 | n = Node(a, self, self.next_player, p) 31 | self.childs[a] = n 32 | 33 | def update(self, v): 34 | self.N += 1 35 | self.W += v 36 | self.Q = self.W / self.N 37 | # self.Q += (v - self.Q) / self.N 38 | 39 | def back_propagate(self, v): 40 | self.update(v) 41 | if self.parent: 42 | self.parent.back_propagate(-v) 43 | 44 | def U(self, c_puct=5.0): 45 | # c_puct -- a number in (0, inf) controlling the relative impact of 46 | # values (Q) and prior probability (P) on this node's score 47 | # it is a constant determining the level of exploration 48 | if self.parent: 49 | return c_puct * self.P * np.sqrt(self.parent.N) / (1 + self.N) 50 | return 0 51 | 52 | def __repr__(self): 53 | return "[A: %s, P: %.2f, Q+U: %.2f, W/N: %.1f/%d]" \ 54 | % 
(self.action, self.P, self.Q + self.U(), self.W, self.N) 55 | 56 | def show_node_tree(self, indent=0): 57 | print("| " * indent + str(self)) 58 | 59 | for c in self.childs.values(): 60 | c.show_node_tree(indent+1) 61 | 62 | def show_children_nodes(self): 63 | print('\n[*] Child Nodes') 64 | for c in self.childs.values(): print(c) 65 | 66 | 67 | class MCTS: 68 | def __init__(self, neural_network_fn, playout_itermax, playout_depth=4): 69 | self.f = neural_network_fn # (p, v) = f(s) 70 | self.playout_itermax = playout_itermax 71 | self.playout_depth = playout_depth 72 | 73 | def set_rootnode(self, starting_player): 74 | self.rootnode = Node(player=starting_player) 75 | 76 | def get_move(self, state, stochastically=False, show_node=False, verbose=False): 77 | for i in range(self.playout_itermax): 78 | self.playout(self.rootnode, state.clone()) 79 | 80 | if show_node: 81 | if verbose: self.rootnode.show_node_tree() 82 | else: self.rootnode.show_children_nodes() 83 | 84 | action_probs = np.zeros((state.w, state.w)) 85 | acts, probs = [], [] 86 | for c in self.rootnode.childs.values(): 87 | acts.append(c.action) 88 | probs.append(c.N) 89 | probs = softmax(probs) 90 | for a, p in zip(acts, probs): 91 | action_probs[a] = p 92 | action_probs = action_probs.flatten() 93 | 94 | if stochastically: 95 | # add Dirichlet Noise for exploration (for self-play training) 96 | epsilon = 0.25 97 | eta = 0.3 # Dirichlet noise 98 | i = np.random.choice( 99 | len(acts), 100 | p = (1 - epsilon) * probs + epsilon * np.random.dirichlet(eta * np.ones(len(probs))) 101 | ) 102 | action = acts[i] 103 | else: # deterministically, for competitive play 104 | action = max(self.rootnode.childs.values(), key=lambda c: c.N).action 105 | 106 | return action, action_probs 107 | 108 | def update_with_move(self, action, state): 109 | if action in self.rootnode.childs: 110 | self.rootnode = self.rootnode.childs[action] 111 | self.rootnode.parent = None 112 | else: 113 | self.rootnode = Node(player=state.player_just_moved) 114 | 115 | def playout(self, node, state): 116 | #======================================= 117 | # MCTS without neural network 118 | #======================================= 119 | # # Select & Expand 120 | # for i in range(self.playout_depth): 121 | # if node.childs == {}: 122 | # node.expand(state.actions()) 123 | # 124 | # node = node.select() 125 | # state.move(node.action) 126 | # 127 | # # Rollout 128 | # self.rollout(state) 129 | # 130 | # # Backpropagate 131 | # while node != None: 132 | # node.update(state.reward(node.player_just_moved)) 133 | # node = node.parent 134 | #======================================= 135 | 136 | 137 | # Select 138 | while 1: 139 | if node.childs == {}: 140 | break 141 | 142 | node = node.select() 143 | state.move(node.action) 144 | 145 | # Rollout 146 | v, a, p = self.evaluate_state(state) 147 | 148 | if state.is_end: 149 | v = -1 150 | else: 151 | # Expand 152 | node.expand(a, p) 153 | 154 | # Backpropagate 155 | node.back_propagate(v) 156 | 157 | 158 | def evaluate_state(self, state): 159 | x = state.nn_input.reshape((1, *state.nn_input.shape)) 160 | value, probs = self.f.pred(x) 161 | v = value[0, 0] 162 | a = state.actions() 163 | p = [] 164 | probs = probs.reshape((state.w, state.w)) 165 | for i, j in a: 166 | p.append(probs[i, j]) 167 | p = np.array(p) 168 | if p.sum() > 0: p /= p.sum() 169 | return v, a, p 170 | 171 | def rollout(self, state): 172 | while not state.is_end: 173 | state.move(random.choice(state.actions())) 174 | 175 | 
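A note on the -c.Q + c.U() term in Node.select: each node stores Q as the mean value from the viewpoint of the player to move at that node, and back_propagate negates the value at every step up the tree, so the parent (the player actually choosing the move) maximizes the negation of its child's Q plus the exploration bonus U.

To exercise mcts.py and gomoku.py without a trained Keras model, any object exposing a pred(x) method that returns a (value, probs) pair in the shapes evaluate_state expects will do. The following is a minimal sanity-check sketch; the UniformPolicy stub and the playout count of 50 are illustrative choices, not part of the repository:

import numpy as np
from gomoku import Gomoku
from mcts import MCTS

class UniformPolicy:
    # Illustrative stand-in for Simple_CNN / Residual_CNN: a zero value
    # estimate and uniform move priors, shaped like the (value, probs)
    # output that MCTS.evaluate_state reads from policy.pred(x).
    def pred(self, x):
        batch, w = x.shape[0], x.shape[-1]
        value = np.zeros((batch, 1))
        probs = np.full((batch, w * w), 1.0 / (w * w))
        return value, probs

game = Gomoku(8)                                  # fresh 8x8 board, player 1 (X) to move
player = MCTS(UniformPolicy(), playout_itermax=50)
player.set_rootnode(starting_player=1)
action, pi = player.get_move(game)                # pi: visit-count distribution over the 64 cells
print('suggested move:', action)

With uniform priors and a zero value head the search reduces to plain PUCT exploration driven by visit counts, which makes it a convenient way to check the tree logic before plugging in the trained networks from AlphaGomoku/models.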
-------------------------------------------------------------------------------- /AlphaGomoku/models/Residual_CNN_8x8_3000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Residual_CNN_8x8_3000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/models/Simple_CNN_19x19_3000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Simple_CNN_19x19_3000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/models/Simple_CNN_19x19_5000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Simple_CNN_19x19_5000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/models/Simple_CNN_8x8_3000.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/AlphaGomoku/models/Simple_CNN_8x8_3000.h5 -------------------------------------------------------------------------------- /AlphaGomoku/neural_network.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from keras.models import Sequential, Model, load_model 4 | from keras.layers import Input, Dense, Conv2D, Flatten, BatchNormalization, Activation, LeakyReLU, add 5 | from keras.optimizers import SGD, Adam 6 | from keras import regularizers 7 | from keras.callbacks import TensorBoard, Callback 8 | 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | import seaborn as sns 13 | 14 | import os 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 16 | 17 | class LossHistory(Callback): 18 | def __init__(self): 19 | self.losses = [] 20 | self.policy_head_losses = [] 21 | self.value_head_losses = [] 22 | 23 | def on_epoch_end(self, epoch, logs={}): 24 | self.losses.append(logs.get('loss')) 25 | self.policy_head_losses.append(logs.get('policy_head_loss')) 26 | self.value_head_losses.append(logs.get('value_head_loss')) 27 | 28 | def plot_loss(self, img_file): 29 | fig = plt.figure() 30 | ax = fig.add_subplot(1, 1, 1) 31 | ax.plot(self.losses) 32 | ax.plot(self.policy_head_losses) 33 | ax.plot(self.value_head_losses) 34 | plt.title('Model loss') 35 | plt.ylabel('loss') 36 | plt.xlabel('episode') 37 | plt.legend(['loss', 'policy_head_loss', 'value_head_loss'], loc='upper right') 38 | plt.savefig(img_file) 39 | plt.close(fig) 40 | 41 | class NetworkModel: 42 | def __init__(self): 43 | pass 44 | 45 | def train(self, states, targets): 46 | # model_log = TensorBoard(log_dir='./logs') 47 | return self.model.fit(states, targets, verbose=self.verbose, callbacks=[self.loss_history]) 48 | 49 | def pred(self, x): 50 | return self.model.predict(x) 51 | 52 | def load(self, name): 53 | return self.model.load_weights('models/{}.h5'.format(name)) 54 | 55 | def save(self, name): 56 | self.model.save_weights('models/{}.h5'.format(name)) 57 | 58 | def info(self): 59 | self.model.summary() 60 | 61 | 62 | class Residual_CNN(NetworkModel): 63 | 64 | def 
__init__(self, input_dim, output_dim): 65 | self.input_dim = input_dim 66 | self.output_dim = output_dim 67 | 68 | self.conv_layer_filters = 64 69 | self.conv_layer_kernel_size = (3, 3) 70 | self.residual_layer_num = 2 71 | self.value_head_hidden_layer_size = 20 72 | 73 | self.learning_rate = 0.1 74 | self.momentum = 0.9 75 | self.reg_const = 0.0001 76 | 77 | self.verbose = True 78 | 79 | self.model = self.build_model() 80 | self.loss_history = LossHistory() 81 | 82 | def build_model(self): 83 | """Construct a convolutional neural network with Resnet-style skip connections. 84 | 85 | Network Diagram: [value head] 86 | |---------------------------------| /---C---B---R---F---D---R---D---T 87 | I-----C-----B-----R---o---C-----B-----R-----C-----B-----M-----R--- ..... ---| 88 | \___________/ \___________________________________/ \---C---B---R---F---D---S [polich head] 89 | [Convolutional layer] [Residual layer] 90 | 91 | I - input 92 | B - BatchNormalization 93 | R - Rectifier non-linearity, LeakyReLU 94 | T - tanh 95 | C - Conv2D 96 | F - Flatten 97 | D - Dense 98 | M - merge, add 99 | S - Softmax 100 | O - output 101 | """ 102 | main_input = Input(shape=self.input_dim, name='main_input') 103 | 104 | x = self.conv_layer(main_input, self.conv_layer_filters, self.conv_layer_kernel_size) 105 | for _ in range(self.residual_layer_num): 106 | x = self.residual_layer(x, self.conv_layer_filters, self.conv_layer_kernel_size) 107 | 108 | vh = self.value_head(x) 109 | ph = self.policy_head(x) 110 | 111 | model = Model(inputs=main_input, outputs=[vh, ph]) 112 | model.compile( 113 | loss=['mean_squared_error', 'categorical_crossentropy'], 114 | optimizer=SGD(lr=self.learning_rate, momentum=self.momentum) 115 | ) 116 | 117 | return model 118 | 119 | def conv_layer(self, x, filters, kernel_size): 120 | conv = Conv2D( 121 | filters = filters, 122 | kernel_size = kernel_size, 123 | strides = (1, 1), 124 | padding = 'same', 125 | data_format = 'channels_first', 126 | use_bias = False, 127 | activation = 'linear', 128 | kernel_regularizer = regularizers.l2(self.reg_const) 129 | )(x) 130 | bn = BatchNormalization(axis=1)(conv) 131 | lrelu = LeakyReLU()(bn) 132 | return lrelu 133 | 134 | def residual_layer(self, x, filters, kernel_size): 135 | conv_1 = self.conv_layer(x, filters, kernel_size) 136 | conv_2 = Conv2D( 137 | filters = filters, 138 | kernel_size = kernel_size, 139 | strides = (1, 1), 140 | padding = 'same', 141 | data_format = 'channels_first', 142 | use_bias = False, 143 | activation = 'linear', 144 | kernel_regularizer = regularizers.l2(self.reg_const) 145 | )(conv_1) 146 | bn = BatchNormalization(axis=1)(conv_2) 147 | merge_layer = add([x, bn]) 148 | lrelu = LeakyReLU()(merge_layer) 149 | return lrelu 150 | 151 | def value_head(self, x): 152 | x = self.conv_layer(x, 1, (1, 1)) 153 | x = Flatten()(x) 154 | x = Dense( 155 | self.value_head_hidden_layer_size, 156 | use_bias = False, 157 | activation = 'linear', 158 | kernel_regularizer = regularizers.l2(self.reg_const) 159 | )(x) 160 | x = LeakyReLU()(x) 161 | x = Dense( 162 | 1, 163 | use_bias = False, 164 | activation = 'tanh', 165 | kernel_regularizer = regularizers.l2(self.reg_const), 166 | name = 'value_head' 167 | )(x) 168 | return x 169 | 170 | def policy_head(self, x): 171 | x = self.conv_layer(x, 2, (1, 1)) 172 | x = Flatten()(x) 173 | x = Dense( 174 | self.output_dim, 175 | use_bias = False, 176 | activation = 'softmax', 177 | kernel_regularizer = regularizers.l2(self.reg_const), 178 | name = 'policy_head' 179 | )(x) 180 | return x 181 | 182 
| 183 | class Simple_CNN(NetworkModel): 184 | def __init__(self, input_dim, output_dim): 185 | self.input_dim = input_dim 186 | self.output_dim = output_dim 187 | self.l2_const = 1e-4 188 | 189 | self.verbose = True 190 | 191 | self.model = self.build_model() 192 | self.loss_history = LossHistory() 193 | 194 | def build_model(self): 195 | """ 196 | Network Diagram: 197 | 2(1x1) 64 1 198 | 32(3x3) 64(3x3) 128(3x3) /-----C-----F-----D-----D-----T [value head] 199 | I-----C-----R-----C-----R-----C-----R-----| 200 | \_____________________________/ \-----C-----F-----D-----S [polich head] 201 | [Convolutional layer] 4(1x1) w^2 202 | 203 | I - input 204 | B - BatchNormalization 205 | R - ReLU 206 | T - tanh 207 | C - Conv2D 208 | F - Flatten 209 | D - Dense 210 | S - Softmax 211 | """ 212 | main_input = Input(shape=self.input_dim, name='main_input') 213 | 214 | x = self.conv_layer(main_input, 32, (3, 3)) 215 | x = self.conv_layer(x, 64, (3, 3)) 216 | x = self.conv_layer(x, 128, (3, 3)) 217 | 218 | vh = self.value_head(x) 219 | ph = self.policy_head(x) 220 | 221 | model = Model(main_input, [vh, ph]) 222 | model.compile( 223 | optimizer=Adam(), 224 | loss=['mean_squared_error', 'categorical_crossentropy'] 225 | ) 226 | return model 227 | 228 | def conv_layer(self, x, filters, kernel_size, padding='same'): 229 | conv = Conv2D( 230 | filters = filters, 231 | kernel_size = kernel_size, 232 | padding = padding, 233 | data_format = 'channels_first', 234 | activation = 'relu', 235 | kernel_regularizer = regularizers.l2(self.l2_const) 236 | )(x) 237 | return conv 238 | 239 | def value_head(self, x): 240 | x = self.conv_layer(x, 2, (1, 1), 'valid') 241 | x = Flatten()(x) 242 | x = Dense(64, kernel_regularizer=regularizers.l2(self.l2_const))(x) 243 | x = Dense( 244 | 1, 245 | kernel_regularizer = regularizers.l2(self.l2_const), 246 | activation = 'tanh', 247 | name = 'value_head' 248 | )(x) 249 | return x 250 | 251 | def policy_head(self, x): 252 | x = self.conv_layer(x, 4, (1, 1), 'valid') 253 | x = Flatten()(x) 254 | x = Dense( 255 | self.output_dim, 256 | kernel_regularizer = regularizers.l2(self.l2_const), 257 | activation = 'softmax', 258 | name = 'policy_head' 259 | )(x) 260 | return x 261 | 262 | -------------------------------------------------------------------------------- /DDDQN/Doom-Deadly-Corridor/deadly_corridor.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = deadly_corridor.wad 6 | 7 | # Skill 5 is reccomanded for the scenario to be a challenge. 
8 | doom_skill = 5 9 | 10 | # Rewards 11 | death_penalty = 100 12 | #living_reward = 0 13 | 14 | # Rendering options 15 | screen_resolution = RES_160X120 16 | screen_format = GRAY8 17 | render_hud = true 18 | render_crosshair = false 19 | render_weapon = true 20 | render_decals = false 21 | render_particles = false 22 | window_visible = true 23 | 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | MOVE_LEFT 30 | MOVE_RIGHT 31 | ATTACK 32 | MOVE_FORWARD 33 | MOVE_BACKWARD 34 | TURN_LEFT 35 | TURN_RIGHT 36 | } 37 | 38 | # Game variables that will be in the state 39 | available_game_variables = { HEALTH } 40 | 41 | mode = PLAYER 42 | 43 | 44 | -------------------------------------------------------------------------------- /DDDQN/Doom-Deadly-Corridor/deadly_corridor.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DDDQN/Doom-Deadly-Corridor/deadly_corridor.wad -------------------------------------------------------------------------------- /DDPG/Ant/DDPG_Ant-v2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DDPG/Ant/DDPG_Ant-v2.pth -------------------------------------------------------------------------------- /DDPG/Ant/DDPG_Ant.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import gym 5 | import time 6 | 7 | def logger_print(logger, key, with_min_and_max=False): 8 | if with_min_and_max: 9 | print(f'{key+":":13s} {np.mean(logger[key]):.4f} {np.min(logger[key]):.4f}(min) {np.max(logger[key]):.4f}(max) {np.std(logger[key]):.4f}(std)') 10 | else: 11 | print(f'{key+":":13s} {np.mean(logger[key]):.4f}') 12 | 13 | def get_parameter_number(net): 14 | total_num = sum(p.numel() for p in net.parameters()) 15 | trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad) 16 | return {'Total': total_num, 'Trainable': trainable_num} 17 | 18 | def weight_init(m): 19 | ''' 20 | Code from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 21 | Usage: 22 | model = Model() 23 | model.apply(weight_init) 24 | ''' 25 | if isinstance(m, nn.Linear): 26 | nn.init.xavier_normal_(m.weight.data) 27 | nn.init.normal_(m.bias.data) 28 | 29 | class ReplayBuffer: 30 | """ 31 | A simple FIFO experience replay buffer for DDPG agents. 
32 | """ 33 | def __init__(self, obs_dim, act_dim, size): 34 | self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32) 35 | self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32) 36 | self.acts_buf = np.zeros([size, act_dim], dtype=np.float32) 37 | self.rews_buf = np.zeros(size, dtype=np.float32) 38 | self.done_buf = np.zeros(size, dtype=np.float32) 39 | self.ptr, self.size, self.max_size = 0, 0, size 40 | 41 | def store(self, obs, act, rew, next_obs, done): 42 | self.obs1_buf[self.ptr] = obs 43 | self.obs2_buf[self.ptr] = next_obs 44 | self.acts_buf[self.ptr] = act 45 | self.rews_buf[self.ptr] = rew 46 | self.done_buf[self.ptr] = done 47 | self.ptr = (self.ptr + 1) % self.max_size 48 | self.size = min(self.size + 1, self.max_size) 49 | 50 | def sample_batch(self, batch_size=32): 51 | idxs = np.random.randint(0, self.size, size=batch_size) 52 | return dict(obs1=self.obs1_buf[idxs], 53 | obs2=self.obs2_buf[idxs], 54 | acts=self.acts_buf[idxs], 55 | rews=self.rews_buf[idxs], 56 | done=self.done_buf[idxs]) 57 | 58 | class MLP(nn.Module): 59 | def __init__(self, sizes, activation=nn.Tanh, output_activation=None): 60 | super().__init__() 61 | 62 | net = [] 63 | for i in range(len(sizes)-1): 64 | net.append(nn.Linear(sizes[i], sizes[i+1])) 65 | if i == len(sizes) - 2: 66 | if output_activation is not None: 67 | net.append(output_activation()) 68 | else: 69 | net.append(activation()) 70 | 71 | self.mlp = nn.Sequential(*net) 72 | 73 | def forward(self, x): 74 | return self.mlp(x) 75 | 76 | 77 | class Actor_Critic(nn.Module): 78 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.ReLU, output_activation=nn.Tanh, action_space=None): 79 | super().__init__() 80 | 81 | self.actor = MLP([obs_dim] + hidden_sizes + [act_dim], activation, output_activation) 82 | self.critic = MLP([obs_dim + act_dim] + hidden_sizes + [1], activation, None) 83 | 84 | """ 85 | Deep Deterministic Policy Gradient (DDPG) 86 | """ 87 | def ddpg( 88 | env_name, 89 | ac_kwargs=dict(), 90 | seed=0, 91 | steps_per_epoch=5000, 92 | epochs=100, 93 | replay_size=int(1e6), 94 | gamma=0.99, 95 | polyak=0.995, 96 | pi_lr=1e-3, 97 | q_lr=1e-3, 98 | batch_size=100, 99 | start_steps=10000, 100 | act_noise=0.1, 101 | max_ep_len=1000 102 | ): 103 | """ 104 | 105 | Args: 106 | env_fn : A function which creates a copy of the environment. 107 | The environment must satisfy the OpenAI Gym API. 108 | 109 | actor_critic: A function which takes in placeholder symbols 110 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 111 | outputs from the agent's Tensorflow computation graph: 112 | 113 | =========== ================ ====================================== 114 | Symbol Shape Description 115 | =========== ================ ====================================== 116 | ``pi`` (batch, act_dim) | Deterministically computes actions 117 | | from policy given states. 118 | ``q`` (batch,) | Gives the current estimate of Q* for 119 | | states in ``x_ph`` and actions in 120 | | ``a_ph``. 121 | ``q_pi`` (batch,) | Gives the composition of ``q`` and 122 | | ``pi`` for states in ``x_ph``: 123 | | q(x, pi(x)). 124 | =========== ================ ====================================== 125 | 126 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 127 | function you provided to DDPG. 128 | 129 | seed (int): Seed for random number generators. 130 | 131 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 132 | for the agent and the environment in each epoch. 
133 | 134 | epochs (int): Number of epochs to run and train agent. 135 | 136 | replay_size (int): Maximum length of replay buffer. 137 | 138 | gamma (float): Discount factor. (Always between 0 and 1.) 139 | 140 | polyak (float): Interpolation factor in polyak averaging for target 141 | networks. Target networks are updated towards main networks 142 | according to: 143 | 144 | .. math:: \\theta_{\\text{targ}} \\leftarrow 145 | \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta 146 | 147 | where :math:`\\rho` is polyak. (Always between 0 and 1, usually 148 | close to 1.) 149 | 150 | pi_lr (float): Learning rate for policy. 151 | 152 | q_lr (float): Learning rate for Q-networks. 153 | 154 | batch_size (int): Minibatch size for SGD. 155 | 156 | start_steps (int): Number of steps for uniform-random action selection, 157 | before running real policy. Helps exploration. 158 | 159 | act_noise (float): Stddev for Gaussian exploration noise added to 160 | policy at training time. (At test time, no noise is added.) 161 | 162 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 163 | 164 | """ 165 | print(locals()) 166 | 167 | torch.manual_seed(seed) 168 | np.random.seed(seed) 169 | if torch.cuda.is_available(): 170 | torch.cuda.manual_seed_all(seed) 171 | 172 | env = gym.make(env_name) 173 | test_env = gym.make(env_name) 174 | obs_dim = env.observation_space.shape[0] 175 | act_dim = env.action_space.shape[0] 176 | # Action limit for clamping: critically, assumes all dimensions share the same bound! 177 | act_limit = env.action_space.high[0] 178 | 179 | # Share information about action space with policy architecture 180 | ac_kwargs['action_space'] = env.action_space 181 | 182 | # Experience buffer 183 | replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) 184 | 185 | # Model 186 | main_ac = Actor_Critic(obs_dim, act_dim, **ac_kwargs) 187 | target_ac = Actor_Critic(obs_dim, act_dim, **ac_kwargs) 188 | print(main_ac) 189 | print(f'\nNumber of parameters: {get_parameter_number(main_ac)}\n') 190 | main_ac.apply(weight_init) 191 | 192 | pi_optimizer = torch.optim.Adam(main_ac.actor.parameters(), lr=pi_lr) 193 | q_optimizer = torch.optim.Adam(main_ac.critic.parameters(), lr=q_lr) 194 | mse_loss = nn.MSELoss() 195 | 196 | # copy main_ac nn parameters to target_ac 197 | for v_targ, v_main in zip(target_ac.parameters(), main_ac.parameters()): 198 | v_targ.data.copy_(v_main.data) 199 | 200 | # Main loop: collect experience in env and update/log each epoch 201 | t = 0 202 | start_time = time.time() 203 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 204 | max_avg_ret = -np.inf 205 | 206 | for epoch in range(epochs): 207 | logger = { 208 | 'LossQ': [], 209 | 'QVals': [], 210 | 'LossPi': [], 211 | 'EpRet': [], 212 | 'EpLen': [], 213 | 'TestEpRet': [], 214 | 'TestEpLen': [] 215 | } 216 | 217 | for _ in range(steps_per_epoch): 218 | """ 219 | Until start_steps have elapsed, randomly sample actions 220 | from a uniform distribution for better exploration. Afterwards, 221 | use the learned policy (with some noise, via act_noise). 
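            After start_steps, the actor output is scaled by act_limit, perturbed with
            zero-mean Gaussian noise of stddev act_noise, and clipped back into
            [-act_limit, act_limit] (see the branch just below).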
222 | """ 223 | if t > start_steps: 224 | with torch.no_grad(): 225 | pi = act_limit * main_ac.actor(torch.tensor(o, dtype=torch.float)) 226 | pi = pi.numpy() + act_noise * np.random.randn(act_dim) 227 | a = np.clip(pi, -act_limit, act_limit) 228 | else: 229 | a = env.action_space.sample() 230 | 231 | o2, r, d, _ = env.step(a) 232 | ep_ret += r 233 | ep_len += 1 234 | 235 | # Ignore the "done" signal if it comes from hitting the time 236 | # horizon (that is, when it's an artificial terminal signal 237 | # that isn't based on the agent's state) 238 | d = False if ep_len==max_ep_len else d 239 | 240 | # Store experience to replay buffer 241 | replay_buffer.store(o, a, r, o2, d) 242 | 243 | # Super critical, easy to overlook step: make sure to update 244 | # most recent observation! 245 | o = o2 246 | 247 | if d or (ep_len == max_ep_len): 248 | """ 249 | Perform all DDPG updates at the end of the trajectory, 250 | in accordance with tuning done by TD3 paper authors. 251 | """ 252 | for _ in range(ep_len): 253 | batch = replay_buffer.sample_batch(batch_size) 254 | obs1 = torch.tensor(batch['obs1'], dtype=torch.float) 255 | obs2 = torch.tensor(batch['obs2'], dtype=torch.float) 256 | acts = torch.tensor(batch['acts'], dtype=torch.float) 257 | rews = torch.tensor(batch['rews'], dtype=torch.float).unsqueeze(1) 258 | done = torch.tensor(batch['done'], dtype=torch.float).unsqueeze(1) 259 | 260 | # Q-learning update 261 | q = main_ac.critic(torch.cat([obs1, acts], dim=-1)) 262 | pi_targ = act_limit * target_ac.actor(obs2) 263 | q_pi_targ = target_ac.critic(torch.cat([obs2, pi_targ], dim=-1)) 264 | backup = rews + gamma * (1 - done) * q_pi_targ 265 | q_loss = mse_loss(q, backup.detach()) 266 | 267 | q_optimizer.zero_grad() 268 | q_loss.backward() 269 | q_optimizer.step() 270 | logger['LossQ'].append(q_loss.item()) 271 | logger['QVals'] += q.squeeze().tolist() 272 | 273 | # Policy update 274 | pi = act_limit * main_ac.actor(obs1) 275 | q_pi = main_ac.critic(torch.cat([obs1, pi], dim=-1)) 276 | pi_loss = -q_pi.mean() 277 | 278 | pi_optimizer.zero_grad() 279 | pi_loss.backward() 280 | pi_optimizer.step() 281 | 282 | logger['LossPi'].append(pi_loss.item()) 283 | 284 | # Target update 285 | for v_targ, v_main in zip(target_ac.parameters(), main_ac.parameters()): 286 | v_targ.data.copy_(polyak * v_targ.data + (1 - polyak) * v_main.data) 287 | 288 | logger['EpRet'].append(ep_ret) 289 | logger['EpLen'].append(ep_len) 290 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 291 | 292 | t += 1 293 | 294 | # Test the performance of the deterministic version of the agent. 
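        # Evaluation sketch: ten test episodes per epoch are rolled out with the current
        # main actor and no exploration noise; actions are clipped to [-act_limit, act_limit]
        # and the resulting returns/lengths are logged as TestEpRet / TestEpLen below.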
295 | with torch.no_grad(): 296 | for _ in range(10): 297 | ob, ret, done, test_ep_ret, test_ep_len = test_env.reset(), 0, False, 0, 0 298 | while not(done or (test_ep_len == max_ep_len)): 299 | # Take deterministic actions at test time withour noise 300 | pi = act_limit * main_ac.actor(torch.tensor(ob, dtype=torch.float)) 301 | act = np.clip(pi, -act_limit, act_limit) 302 | ob, ret, done, _ = test_env.step(act) 303 | test_ep_ret += ret 304 | test_ep_len += 1 305 | logger['TestEpRet'].append(test_ep_ret) 306 | logger['TestEpLen'].append(test_ep_len) 307 | 308 | # Log info about epoch 309 | print('-'*40) 310 | print(f'Epoch: {epoch}') 311 | print(f'TotalEnvInteracts: {t}') 312 | logger_print(logger, 'EpRet', True) 313 | logger_print(logger, 'EpLen') 314 | logger_print(logger, 'TestEpRet', True) 315 | logger_print(logger, 'TestEpLen') 316 | logger_print(logger, 'QVals', True) 317 | logger_print(logger, 'LossPi') 318 | logger_print(logger, 'LossQ') 319 | print(f'Time: {time.time()-start_time:.4f}s') 320 | print('-'*40+'\n') 321 | 322 | # Save model 323 | if np.mean(logger['EpRet']) > max_avg_ret: 324 | max_avg_ret = np.mean(logger['EpRet']) 325 | torch.save(main_ac.state_dict(), 'DDPG_{}.pth'.format(env_name)) 326 | 327 | env.close() 328 | 329 | if __name__ == '__main__': 330 | import argparse 331 | parser = argparse.ArgumentParser() 332 | parser.add_argument('--env', type=str, default='Ant-v2') 333 | parser.add_argument('--hid', type=int, default=300) 334 | parser.add_argument('--l', type=int, default=2) 335 | parser.add_argument('--gamma', type=float, default=0.99) 336 | parser.add_argument('--seed', '-s', type=int, default=0) 337 | parser.add_argument('--epochs', type=int, default=50) 338 | parser.add_argument('--exp_name', type=str, default='ddpg') 339 | args = parser.parse_args() 340 | 341 | ddpg( 342 | args.env, 343 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 344 | gamma=args.gamma, 345 | seed=args.seed, 346 | epochs=args.epochs 347 | ) -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/DQN_Atari_Space_Invaders.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import retro 4 | from skimage import transform 5 | from skimage.color import rgb2gray 6 | from collections import deque 7 | import random 8 | import sys 9 | import time 10 | 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | import os 15 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 16 | 17 | 18 | ########################################### 19 | # Constant 20 | stack_size = 4 21 | frame_size = (110, 84) 22 | # Global variables 23 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 24 | ########################################### 25 | 26 | def create_environment(): 27 | env = retro.make(game='SpaceInvaders-Atari2600') 28 | possible_actions = np.array(np.identity(env.action_space.n, dtype=np.int).tolist()) 29 | return env, possible_actions 30 | 31 | def test_environment(): 32 | env, possible_actions = create_environment() 33 | episodes = 1 34 | 35 | for _ in range(episodes): 36 | env.reset() 37 | done = False 38 | 39 | while not done: 40 | env.render() 41 | choice = random.randint(0, action_size - 1) 42 | action = possible_actions[choice] 43 | state, reward, done, info = env.step(action) 44 | 45 | env.close() 46 | 47 | def preprocess_frame(frame): 48 | gray = rgb2gray(frame) 49 | cropped_frame = gray[8:-12, 4:-12] 50 | 
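    # Note: rgb2gray already returns floats in [0, 1], so the division by 255 below
    # rescales the cropped frame to [0, 1/255] rather than normalizing it to [0, 1].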
normalized_frame = cropped_frame / 255.0 51 | preprocessed_frame = transform.resize(normalized_frame, frame_size) 52 | return preprocessed_frame 53 | 54 | def stack_frames(state, is_new_episode=False): 55 | global stacked_frames 56 | frame = preprocess_frame(state) 57 | 58 | if is_new_episode: 59 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 60 | 61 | for _ in range(stack_size): 62 | stacked_frames.append(frame) 63 | else: 64 | stacked_frames.append(frame) 65 | 66 | return np.stack(stacked_frames, axis=2) 67 | 68 | 69 | class DQNetwork: 70 | def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'): 71 | with tf.variable_scope(name): 72 | self.inputs = tf.placeholder(tf.float32, [None, *state_size], name='inputs') 73 | self.actions = tf.placeholder(tf.float32, [None, action_size], name='actions') 74 | self.target_q = tf.placeholder(tf.float32, [None], name='target_q') 75 | 76 | conv1 = tf.layers.conv2d( 77 | inputs = self.inputs, 78 | filters = 32, 79 | kernel_size = [8, 8], 80 | strides = [4, 4], 81 | padding = 'VALID', 82 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 83 | name = 'conv1' 84 | ) 85 | conv1_out = tf.nn.elu(conv1, name='conv1_out') 86 | 87 | conv2 = tf.layers.conv2d( 88 | inputs = conv1_out, 89 | filters = 64, 90 | kernel_size = [4, 4], 91 | strides = [2, 2], 92 | padding = 'VALID', 93 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 94 | name = 'conv2' 95 | ) 96 | conv2_out = tf.nn.elu(conv2, name='conv2_out') 97 | 98 | conv3 = tf.layers.conv2d( 99 | inputs = conv2_out, 100 | filters = 64, 101 | kernel_size = [3, 3], 102 | strides = [2, 2], 103 | padding = 'VALID', 104 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 105 | name = 'conv3' 106 | ) 107 | conv3_out = tf.nn.elu(conv3, name='conv3_out') 108 | 109 | flatten = tf.contrib.layers.flatten(conv3_out) 110 | fc = tf.layers.dense( 111 | inputs = flatten, 112 | units = 512, 113 | activation = tf.nn.elu, 114 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 115 | name = 'fc' 116 | ) 117 | self.output = tf.layers.dense( 118 | inputs = fc, 119 | units = action_size, 120 | activation = None, 121 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 122 | name = 'output' 123 | ) 124 | 125 | self.q = tf.reduce_sum(tf.multiply(self.output, self.actions)) 126 | self.loss = tf.reduce_mean(tf.square(self.target_q - self.q)) 127 | self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 128 | 129 | 130 | class Memory(): 131 | def __init__(self, max_size): 132 | self.buffer = deque(maxlen=max_size) 133 | 134 | def add(self, experience): 135 | self.buffer.append(experience) 136 | 137 | def sample(self, batch_size): 138 | buffer_size = len(self.buffer) 139 | index = np.random.choice( 140 | np.arange(buffer_size), 141 | size = batch_size, 142 | replace = False 143 | ) 144 | return [self.buffer[i] for i in index] 145 | 146 | 147 | def train(): 148 | env, possible_actions = create_environment() 149 | 150 | # set hyperparameters 151 | ########################################### 152 | state_size = [*frame_size, stack_size] 153 | action_size = env.action_space.n 154 | learning_rate = 0.00025 155 | total_episodes = 100 156 | check_step = 5 157 | max_steps = 50000 158 | batch_size = 64 159 | explore_start = 1.0 160 | explore_stop = 0.01 161 | decay_rate = 0.00001 162 | gamma = 0.9 163 | pretrain_length = batch_size 164 | memory_size = 1000000 165 | 
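    # explore_start / explore_stop / decay_rate parameterize the epsilon-greedy schedule
    # used in the training loop below:
    #   explore_probability = explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)
    # so exploration decays smoothly from 1.0 towards 0.01 as decay_step grows.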
########################################### 166 | 167 | 168 | # pre-populate train samples 169 | ########################################### 170 | memory = Memory(max_size=memory_size) 171 | state = env.reset() 172 | state = stack_frames(state, True) 173 | 174 | for i in range(pretrain_length): 175 | choice = random.randint(0, action_size - 1) 176 | action = possible_actions[choice] 177 | new_state, reward, done, _ = env.step(action) 178 | new_state = stack_frames(new_state) 179 | 180 | if done: 181 | new_state = np.zeros(state.shape) 182 | memory.add((state, action, reward, new_state, done)) 183 | state = env.reset() 184 | state = stack_frames(state, True) 185 | else: 186 | memory.add((state, action, reward, new_state, done)) 187 | state = new_state 188 | ########################################### 189 | 190 | # train DQN 191 | ########################################### 192 | tf.reset_default_graph() 193 | DQN = DQNetwork(state_size, action_size, learning_rate) 194 | 195 | writer = tf.summary.FileWriter('train_log') 196 | tf.summary.scalar('Loss', DQN.loss) 197 | write_op = tf.summary.merge_all() 198 | saver = tf.train.Saver() 199 | 200 | with tf.Session() as sess: 201 | sess.run(tf.global_variables_initializer()) 202 | 203 | decay_step = 0 204 | loss = None 205 | 206 | for episode in range(1, total_episodes+1): 207 | step = 0 208 | episode_rewards = [] 209 | 210 | state = env.reset() 211 | state = stack_frames(state, True) 212 | 213 | while step < max_steps: 214 | step += 1 215 | decay_step += 1 216 | 217 | exp_exp_tradeoff = np.random.rand() 218 | explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step) 219 | 220 | if explore_probability > exp_exp_tradeoff: 221 | choice = random.randint(0, action_size - 1) 222 | else: 223 | qs = sess.run(DQN.output, feed_dict={ 224 | DQN.inputs: state.reshape((1, *state.shape)) 225 | }) 226 | choice = np.argmax(qs) 227 | 228 | action = possible_actions[choice] 229 | new_state, reward, done, _ = env.step(action) 230 | 231 | env.render() 232 | episode_rewards.append(reward) 233 | 234 | if done: 235 | total_reward = np.sum(episode_rewards) 236 | 237 | new_state = np.zeros(frame_size) 238 | new_state = stack_frames(new_state) 239 | memory.add((state, action, reward, new_state, done)) 240 | 241 | print( 242 | '[*] Episode: {}, total reward: {}, explore p: {:.4f}, train loss: {:.4f}'.format( 243 | episode, total_reward, explore_probability, loss 244 | ) 245 | ) 246 | break 247 | else: 248 | new_state = stack_frames(new_state) 249 | memory.add((state, action, reward, new_state, done)) 250 | state = new_state 251 | 252 | # learning part 253 | ################ 254 | batch = memory.sample(batch_size) 255 | states_mb = np.array([b[0] for b in batch], ndmin=3) 256 | actions_mb = np.array([b[1] for b in batch]) 257 | rewards_mb = np.array([b[2] for b in batch]) 258 | new_states_mb = np.array([b[3] for b in batch], ndmin=3) 259 | dones_mb = np.array([b[4] for b in batch]) 260 | 261 | target_q_mb = [] 262 | new_state_q_mb = sess.run(DQN.output, feed_dict={ 263 | DQN.inputs: new_states_mb, 264 | }) 265 | 266 | for i in range(batch_size): 267 | is_done = dones_mb[i] 268 | if is_done: 269 | target_q_mb.append(rewards_mb[i]) 270 | else: 271 | t = rewards_mb[i] + gamma * np.max(new_state_q_mb) 272 | target_q_mb.append(t) 273 | 274 | target_q_mb = np.array(target_q_mb) 275 | 276 | loss, _ = sess.run([DQN.loss, DQN.optimizer], feed_dict={ 277 | DQN.inputs: states_mb, 278 | DQN.actions: actions_mb, 279 | DQN.target_q: target_q_mb 
280 | }) 281 | 282 | summary = sess.run(write_op, feed_dict={ 283 | DQN.inputs: states_mb, 284 | DQN.actions: actions_mb, 285 | DQN.target_q: target_q_mb 286 | }) 287 | writer.add_summary(summary, episode) 288 | writer.flush() 289 | ################ 290 | 291 | if episode % check_step == 0: 292 | save_path = saver.save(sess, './model/model.ckpt') 293 | print('[*] Model Saved:', save_path) 294 | 295 | print('[*] Train done') 296 | env.close() 297 | ########################################### 298 | 299 | 300 | def play(): 301 | env, possible_actions = create_environment() 302 | 303 | with tf.Session() as sess: 304 | total_rewards = 0 305 | 306 | state_size = [*frame_size, stack_size] 307 | action_size = env.action_space.n 308 | learning_rate = 0.00025 309 | DQN = DQNetwork(state_size, action_size, learning_rate) 310 | 311 | saver = tf.train.Saver() 312 | saver.restore(sess, './model/model.ckpt') 313 | 314 | # start game 315 | state = env.reset() 316 | state = stack_frames(state, True) 317 | done = False 318 | 319 | while not done: 320 | state_q = sess.run(DQN.output, feed_dict={ 321 | DQN.inputs: state.reshape((1, *state.shape)) 322 | }) 323 | choice = np.argmax(state_q) 324 | action = possible_actions[choice] 325 | new_state, reward, done, _ = env.step(action) 326 | 327 | env.render() 328 | total_rewards += reward 329 | state = stack_frames(new_state) 330 | 331 | print('[*] total score:', total_rewards) 332 | 333 | env.close() 334 | 335 | 336 | if __name__ == '__main__': 337 | if sys.argv[1] == '--train': 338 | train() 339 | elif sys.argv[1] == '--play': 340 | play() 341 | elif sys.argv[1] == '--test': 342 | test_environment() 343 | -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/Space Invaders (1983) (CCE) (C-820).bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/Space Invaders (1983) (CCE) (C-820).bin -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/model/model.ckpt.index -------------------------------------------------------------------------------- /DQN/Atari_Space_Invaders/model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/model/model.ckpt.meta -------------------------------------------------------------------------------- 
/DQN/Atari_Space_Invaders/train_log/events.out.tfevents.1530462157.MKK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Atari_Space_Invaders/train_log/events.out.tfevents.1530462157.MKK -------------------------------------------------------------------------------- /DQN/Doom/DQN_Doom.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from vizdoom import DoomGame 4 | import random 5 | import time 6 | from skimage import transform 7 | from collections import deque 8 | 9 | import os 10 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 11 | 12 | 13 | 14 | def create_environment(): 15 | game = DoomGame() 16 | game.load_config("basic.cfg") 17 | game.set_doom_scenario_path("basic.wad") 18 | game.init() 19 | 20 | left = [1, 0, 0] 21 | right = [0, 1, 0] 22 | shoot = [0, 0, 1] 23 | possible_actions = [left, right, shoot] 24 | return game, possible_actions 25 | 26 | def test_environment(): 27 | game, actions = create_environment() 28 | episodes = 1 29 | 30 | for _ in range(episodes): 31 | game.new_episode() 32 | 33 | while not game.is_episode_finished(): 34 | state = game.get_state() 35 | 36 | img = state.screen_buffer # 当前游戏画面, 2D array 37 | misc = state.game_variables # [50.] 38 | action = random.choice(actions) 39 | reward = game.make_action(action) 40 | print(action, 'reward:', reward) 41 | time.sleep(0.02) 42 | 43 | print('[*] Result:', game.get_total_reward()) 44 | time.sleep(2) 45 | 46 | game.close() 47 | 48 | 49 | def preprocess_frame(state): 50 | cropped_frame = state[30:-10, 30:-30] 51 | normalized_frame = cropped_frame / 255.0 52 | preprocessed_frame = transform.resize(normalized_frame, [84, 84]) 53 | return preprocessed_frame 54 | 55 | 56 | def stack_states(stacked_frames, state): 57 | frame = preprocess_frame(state) 58 | stacked_frames.append(frame) 59 | stacked_state = np.stack(stacked_frames, axis=2) 60 | return stacked_state 61 | 62 | 63 | class build_DQNetwork: 64 | def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'): 65 | self.state_size = state_size 66 | self.action_size = action_size 67 | self.learning_rate = learning_rate 68 | 69 | with tf.variable_scope(name): 70 | # 84x84x4 71 | self.inputs = tf.placeholder(tf.float32, [None, *state_size], name='inputs') 72 | self.actions = tf.placeholder(tf.float32, [None, action_size], name='actions') 73 | self.target_Q = tf.placeholder(tf.float32, [None], name='target') 74 | 75 | # 20x20x32 76 | self.conv1 = tf.layers.conv2d(inputs = self.inputs, 77 | filters = 32, 78 | kernel_size = [8, 8], 79 | strides = [4, 4], 80 | padding = 'VALID', 81 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 82 | name = 'conv1') 83 | self.conv1_batchnorm = tf.layers.batch_normalization(self.conv1, 84 | training = True, 85 | epsilon = 1e-5, 86 | name = 'batch_norm1') 87 | self.conv1_out = tf.nn.elu(self.conv1_batchnorm, name='conv1_out') 88 | 89 | # 9x9x64 90 | self.conv2 = tf.layers.conv2d(inputs = self.conv1_out, 91 | filters = 64, 92 | kernel_size = [4, 4], 93 | strides = [2, 2], 94 | padding = 'VALID', 95 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 96 | name = 'conv2') 97 | self.conv2_batchnorm = tf.layers.batch_normalization(self.conv2, 98 | training = True, 99 | epsilon = 1e-5, 100 | name = 'batch_norm2') 101 | self.conv2_out = tf.nn.elu(self.conv2_batchnorm, 
name='conv2_out') 102 | 103 | # 3x3x128 104 | self.conv3 = tf.layers.conv2d(inputs = self.conv2_out, 105 | filters = 128, 106 | kernel_size = [4, 4], 107 | strides = [2, 2], 108 | padding = 'VALID', 109 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 110 | name = 'conv3') 111 | self.conv3_batchnorm = tf.layers.batch_normalization(self.conv3, 112 | training = True, 113 | epsilon = 1e-5, 114 | name = 'batch_norm3') 115 | self.conv3_out = tf.nn.elu(self.conv3_batchnorm, name='conv3_out') 116 | 117 | # 1152 118 | self.flatten = tf.layers.flatten(self.conv3_out) 119 | # 512 120 | self.fc = tf.layers.dense(inputs = self.flatten, 121 | units = 512, 122 | activation = tf.nn.elu, 123 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 124 | name = 'fc1') 125 | # 3 126 | self.output = tf.layers.dense(inputs = self.fc, 127 | units = 3, 128 | activation = None, 129 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 130 | name = 'output') 131 | 132 | # Q is our predicted Q value 133 | self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions), axis=1) 134 | # # The loss is the difference between our predicted Q and the Q_target 135 | self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q)) 136 | self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss) 137 | 138 | 139 | class Memory(): 140 | def __init__(self, max_size): 141 | self.buffer = deque(maxlen=max_size) 142 | 143 | def add(self, experience): 144 | self.buffer.append(experience) 145 | 146 | def sample(self, batch_size): 147 | buffer_size = len(self.buffer) 148 | index = np.random.choice(np.arange(buffer_size), size=batch_size, replace=False) 149 | return [self.buffer[i] for i in index] 150 | 151 | 152 | def train(): 153 | game, possible_actions = create_environment() 154 | 155 | # Set Hyperparameters 156 | ##################### 157 | state_size = [84, 84, 4] 158 | action_size = game.get_available_buttons_size() 159 | learning_rate = 0.0002 160 | 161 | total_episodes = 5000 162 | max_steps = 100 163 | batch_size = 64 164 | 165 | explore_max = 1.0 166 | explore_min = 0.01 167 | decay_rate = 0.0001 168 | gamma = 0.99 169 | 170 | pretrain_length = batch_size 171 | memory_size = 50000 172 | stack_size = 4 173 | 174 | stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], 175 | maxlen=stack_size) 176 | memory = Memory(max_size=memory_size) 177 | ##################### 178 | 179 | 180 | # make pretrain samples 181 | ########################################### 182 | game.new_episode() 183 | 184 | for i in range(pretrain_length): 185 | if i == 0: 186 | state = game.get_state().screen_buffer 187 | state = stack_states(stacked_frames, state) 188 | 189 | action = random.choice(possible_actions) 190 | reward = game.make_action(action) 191 | done = game.is_episode_finished() 192 | 193 | if done: 194 | next_state = np.zeros(state.shape) 195 | memory.add((state, action, reward, next_state, done)) 196 | game.new_episode() 197 | else: 198 | next_state = game.get_state().screen_buffer 199 | next_state = stack_states(stacked_frames, next_state) 200 | memory.add((state, action, reward, next_state, done)) 201 | 202 | state = next_state 203 | ########################################### 204 | 205 | 206 | # train deep Q neural network 207 | ########################################### 208 | tf.reset_default_graph() 209 | DQNetwork = build_DQNetwork(state_size, action_size, learning_rate) 210 | 211 | writer = tf.summary.FileWriter('train_log') 212 | 
tf.summary.scalar('loss', DQNetwork.loss) 213 | saver = tf.train.Saver() 214 | 215 | rewards_list = [] 216 | decay_step = 0 217 | game.init() 218 | 219 | with tf.Session() as sess: 220 | sess.run(tf.global_variables_initializer()) 221 | 222 | for episode in range(total_episodes): 223 | game.new_episode() 224 | 225 | step = 0 226 | frame = game.get_state().screen_buffer 227 | state = stack_states(stacked_frames, frame) 228 | 229 | while step < max_steps: 230 | step += 1 231 | decay_step += 1 232 | 233 | exp_exp_tradeoff = np.random.rand() 234 | explore_probability = explore_min + (explore_max - explore_min) * np.exp(-decay_rate * decay_step) 235 | 236 | if explore_probability > exp_exp_tradeoff: 237 | action = random.choice(possible_actions) 238 | else: 239 | Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs: state.reshape(1, *state.shape)}) 240 | action = possible_actions[int(np.argmax(Qs))] 241 | 242 | reward = game.make_action(action) 243 | done = game.is_episode_finished() 244 | 245 | if done: 246 | next_state = np.zeros((84, 84), dtype=np.int) 247 | next_state = stack_states(stacked_frames, next_state) 248 | total_reward = game.get_total_reward() 249 | formated_str = 'Episode: {}, Total reward: {}, Training loss: {:.4f}, Explore P: {:.4f}' 250 | print(formated_str.format(episode, total_reward, loss, explore_probability)) 251 | 252 | rewards_list.append((episode, total_reward)) 253 | memory.add((state, action, reward, next_state, done)) 254 | step = max_steps 255 | else: 256 | next_state = game.get_state().screen_buffer 257 | next_state = stack_states(stacked_frames, next_state) 258 | memory.add((state, action, reward, next_state, done)) 259 | state = next_state 260 | 261 | # train DQNetwork == update Qtable 262 | batch = memory.sample(batch_size) 263 | states = np.array([each[0] for each in batch], ndmin=3) 264 | actions = np.array([each[1] for each in batch]) 265 | rewards = np.array([each[2] for each in batch]) 266 | next_states = np.array([each[3] for each in batch]) 267 | dones = np.array([each[4] for each in batch]) 268 | 269 | target_Qs_batch = [] 270 | target_Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs: next_states}) 271 | 272 | for i in range(batch_size): 273 | terminal = dones[i] 274 | 275 | if terminal: 276 | target_Qs_batch.append(rewards[i]) 277 | else: 278 | target = rewards[i] + gamma * np.max(target_Qs[i]) 279 | target_Qs_batch.append(target) 280 | 281 | targets = np.array([each for each in target_Qs_batch]) 282 | 283 | loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer], 284 | feed_dict={DQNetwork.inputs: states, 285 | DQNetwork.target_Q: targets, 286 | DQNetwork.actions: actions}) 287 | 288 | # Write TF Summaries 289 | summary = sess.run(tf.summary.merge_all(), 290 | feed_dict={DQNetwork.inputs: states, 291 | DQNetwork.target_Q: targets, 292 | DQNetwork.actions: actions}) 293 | writer.add_summary(summary, episode) 294 | writer.flush() 295 | 296 | if episode % 5 == 0: 297 | save_path = saver.save(sess, './model/model.ckpt') 298 | print('[*] Model Saved:', save_path) 299 | print('Train done') 300 | ########################################### 301 | 302 | 303 | def play(): 304 | with tf.Session() as sess: 305 | state_size = [84, 84, 4] 306 | action_size = 3 307 | learning_rate = 0.0002 308 | DQNetwork = build_DQNetwork(state_size, action_size, learning_rate) 309 | 310 | saver = tf.train.Saver() 311 | saver.restore(sess, "./model/model.ckpt") 312 | 313 | game, possible_actions = create_environment() 314 | totalScore = 0 315 | episodes = 10 
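        # Evaluation: the greedy action (argmax over the network's Q-values) is taken at
        # every step for `episodes` games, and the average total reward is printed at the end.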
316 | stack_size = 4 317 | stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], 318 | maxlen=stack_size) 319 | 320 | for i in range(episodes): 321 | game.new_episode() 322 | 323 | while not game.is_episode_finished(): 324 | frame = game.get_state().screen_buffer 325 | state = stack_states(stacked_frames, frame) 326 | 327 | Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs: state.reshape((1, *state.shape))}) 328 | action = possible_actions[int(np.argmax(Qs))] 329 | game.make_action(action) 330 | 331 | score = game.get_total_reward() 332 | print("Episode {} Score: {}".format(i, score)) 333 | totalScore += score 334 | 335 | print("[*] Average Score: ", totalScore / episodes) 336 | game.close() 337 | 338 | 339 | if __name__ == '__main__': 340 | import sys 341 | if sys.argv[1] == '--train': 342 | train() 343 | elif sys.argv[1] == '--play': 344 | play() 345 | 346 | -------------------------------------------------------------------------------- /DQN/Doom/basic.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = basic.wad 6 | doom_map = map01 7 | 8 | # Rewards 9 | living_reward = -1 10 | 11 | # Rendering options 12 | screen_resolution = RES_160X120 13 | screen_format = GRAY8 14 | render_hud = True 15 | render_crosshair = false 16 | render_weapon = true 17 | render_decals = true 18 | render_particles = true 19 | window_visible = true 20 | 21 | # make episodes start after 20 tics (after unholstering the gun) 22 | episode_start_time = 14 23 | 24 | # make episodes finish after 300 actions (tics) 25 | episode_timeout = 300 26 | 27 | # Available buttons 28 | available_buttons = 29 | { 30 | MOVE_LEFT 31 | MOVE_RIGHT 32 | ATTACK 33 | } 34 | 35 | # Game variables that will be in the state 36 | available_game_variables = { AMMO2} 37 | 38 | mode = PLAYER 39 | doom_skill = 5 40 | -------------------------------------------------------------------------------- /DQN/Doom/basic.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/basic.wad -------------------------------------------------------------------------------- /DQN/Doom/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- /DQN/Doom/model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /DQN/Doom/model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/model/model.ckpt.index -------------------------------------------------------------------------------- /DQN/Doom/model/model.ckpt.meta: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/model/model.ckpt.meta -------------------------------------------------------------------------------- /DQN/Doom/train_log/events.out.tfevents.1524621481.MKK: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/DQN/Doom/train_log/events.out.tfevents.1524621481.MKK -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MCTS/MCTS_Gomoku.py: -------------------------------------------------------------------------------- 1 | from math import * 2 | import random 3 | import numpy as np 4 | 5 | class GameState: 6 | def __init__(self): 7 | self.player_just_moved = 2 8 | 9 | def clone(self): 10 | st = GameState() 11 | st.player_just_moved = self.player_just_moved 12 | return st 13 | 14 | def move(self, action): 15 | self.player_just_moved = 3 - self.player_just_moved 16 | 17 | def actions(self): 18 | """ Get all possible moves from this state. 19 | """ 20 | 21 | def win(self, player): 22 | """ Get the game result from the viewpoint of player. 23 | """ 24 | 25 | def end(self): 26 | """ Whether the game is end or not 27 | """ 28 | 29 | def __repr__(self): 30 | pass 31 | 32 | class Gomoku(GameState): 33 | def __init__(self, w=8): # 15x15 34 | self.player_just_moved = 2 35 | self.board = [] # 0 = empty, 1 = player 1 (X), 2 = player 2 (O) 36 | self.w = w 37 | for y in range(w): 38 | self.board.append([0] * w) 39 | 40 | def clone(self): 41 | st = Gomoku() 42 | st.player_just_moved = self.player_just_moved 43 | st.board = [self.board[i][:] for i in range(self.w)] 44 | st.w = self.w 45 | return st 46 | 47 | def move(self, action): 48 | a, b = action 49 | assert 0 <= a <= self.w and 0 <= b <= self.w and self.board[a][b] == 0 50 | self.player_just_moved = 3 - self.player_just_moved 51 | self.board[a][b] = self.player_just_moved 52 | 53 | def actions(self): 54 | return [(i, j) for i in range(self.w) for j in range(self.w) if self.board[i][j] == 0] 55 | 56 | def check_five(self, i, j, player): 57 | if 2 <= i < self.w-2 and 2 <= j < self.w-2 and self.board[i-2][j-2] == self.board[i-1][j-1] == self.board[i][j] == self.board[i+1][j+1] == self.board[i+2][j+2] == player: 58 | return 1 59 | elif 2 <= j < self.w-2 and self.board[i][j-2] == self.board[i][j-1] == self.board[i][j] == self.board[i][j+1] == self.board[i][j+2] == player: 60 | return 1 61 | elif 2 <= i < self.w-2 and 2 <= j < self.w-2 and self.board[i+2][j-2] == self.board[i+1][j-1] == self.board[i][j] == self.board[i-1][j+1] == self.board[i-2][j+2] == player: 62 | return 1 63 | elif 2 <= i < self.w-2 and self.board[i-2][j] == self.board[i-1][j] == self.board[i][j] == self.board[i+1][j] == self.board[i+2][j] == player: 64 | return 1 65 | return 0 66 | 67 | def win(self, player): 68 | for i in range(self.w): 69 | for j in range(self.w): 70 | if self.check_five(i, j, player): 71 | return 1 72 | elif self.check_five(i, j, 3-player): 73 | return 0 74 | if self.actions() == []: return 0.5 75 | return -1 76 | 77 | def end(self): 78 | return self.win(1) >= 0 79 | 80 | def __repr__(self): 81 | row = '{:>2} ' + ' | '.join(['{}'] * self.w) + ' ' 82 | line = '\n ' + ('----' * self.w)[:-1] + '\n' 83 | s = ' ' + '%2d ' * self.w % tuple(range(self.w)) + '\n' 84 | s += line.join([row.format(i, *map(lambda j: [' ', 'X', 'O'][j], self.board[i])) for i in range(self.w)]) 85 | return s 86 | 87 | class Node: 88 | def __init__(self, action=None, parent=None, state=None): 89 | self.action = action 90 | self.parent = parent 91 | self.childs = [] 92 | self.W = 0 93 | self.N = 0 94 | self.untried_actions = state.actions() 95 | self.player_just_moved = state.player_just_moved 96 | 97 | def select(self): 98 | s = sorted(self.childs, key = lambda c: c.U())[-1] 99 | return s 100 | 101 | def add_child(self, a, s): 102 | n = Node(a, self, s) 103 | self.untried_actions.remove(a) 104 | self.childs.append(n) 105 | 
return n 106 | 107 | def update(self, result): 108 | self.N += 1 109 | self.W += result 110 | 111 | def U(self): 112 | if self.parent: 113 | return self.W / self.N + sqrt(2 * log(self.parent.N) / self.N) 114 | return 0 115 | 116 | def __repr__(self): 117 | return "[A: %s, U: %.2f, W/N: %.1f/%d, Untried: %s]" \ 118 | % (self.action, self.U(), self.W, self.N, self.untried_actions) 119 | 120 | def show_node_tree(self, indent=0): 121 | print("| " * indent + str(self)) 122 | 123 | for c in self.childs: 124 | c.show_node_tree(indent+1) 125 | 126 | def show_children_nodes(self): 127 | print('\n[*] Child Nodes') 128 | for c in self.childs: print(c) 129 | 130 | 131 | def UCT(rootstate, itermax, verbose=False): 132 | rootnode = Node(state=rootstate) 133 | 134 | for i in range(itermax): 135 | node = rootnode 136 | state = rootstate.clone() 137 | 138 | # Select 139 | while node.untried_actions == [] and node.childs != []: 140 | node = node.select() 141 | state.move(node.action) 142 | 143 | # Expand 144 | if node.untried_actions != []: 145 | action = random.choice(node.untried_actions) 146 | state.move(action) 147 | node = node.add_child(action, state) 148 | 149 | # Rollout 150 | while state.actions() != []: 151 | state.move(random.choice(state.actions())) 152 | 153 | # Backpropagate 154 | while node != None: 155 | node.update(state.win(node.player_just_moved)) 156 | node = node.parent 157 | 158 | if verbose: rootnode.show_node_tree() 159 | else: rootnode.show_children_nodes() 160 | 161 | return sorted(rootnode.childs, key = lambda c: c.N)[-1].action 162 | 163 | def random_play(game): 164 | return random.choice(game.actions()) 165 | 166 | def human_play(): 167 | t = input('[*] Your turn (i j): ') 168 | a, b = t.split(' ') 169 | i, j = int(a), int(b) 170 | return (i, j) 171 | 172 | def play_game(): 173 | game = Gomoku() 174 | 175 | while not game.end(): 176 | print(game) 177 | 178 | if game.player_just_moved == 1: 179 | # action = UCT(game, 1000) # Player O 180 | action = random_play(game) 181 | else: 182 | action = UCT(game, 10000) # Player X 183 | # action = human_play() 184 | 185 | game.move(action) 186 | print("[*] Player %s move: %s\n" % (['X', 'O'][game.player_just_moved-1], action)) 187 | 188 | print(game) 189 | r = game.win(game.player_just_moved) 190 | if r == 1: 191 | print("[*] Player %s win" % ['X', 'O'][game.player_just_moved-1]) 192 | elif r == 0: 193 | print("[*] Player %s win" % ['X', 'O'][2-game.player_just_moved]) 194 | else: 195 | print("[*] Player draw") 196 | 197 | if __name__ == "__main__": 198 | play_game() 199 | -------------------------------------------------------------------------------- /MCTS/MCTS_TicTacToe.py: -------------------------------------------------------------------------------- 1 | from math import * 2 | import random 3 | 4 | class Game: 5 | def __init__(self): 6 | self.player_just_moved = 2 7 | 8 | def clone(self): 9 | st = GameState() 10 | st.player_just_moved = self.player_just_moved 11 | return st 12 | 13 | def move(self, action): 14 | self.player_just_moved = 3 - self.player_just_moved 15 | 16 | def actions(self): 17 | """ Get all possible moves from this state. 18 | """ 19 | 20 | def win(self, player): 21 | """ Get the game result from the viewpoint of player. 
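            Returns 1 if player has won, 0 if the opponent has won, and 0.5 for a draw;
            while the game is still undecided the TicTacToe implementation below falls
            through and returns None.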
22 | """ 23 | 24 | def end(self): 25 | """ Whether the game is end or not 26 | """ 27 | 28 | def __repr__(self): 29 | pass 30 | 31 | class TicTacToe(Game): 32 | def __init__(self): 33 | self.player_just_moved = 2 34 | self.board = [0] * 9 # 0 = empty, 1 = player 1 (X), 2 = player 2 (O) 35 | 36 | def clone(self): 37 | st = TicTacToe() 38 | st.player_just_moved = self.player_just_moved 39 | st.board = self.board[:] 40 | return st 41 | 42 | def move(self, action): 43 | assert action >= 0 and action <= 8 and action == int(action) and self.board[action] == 0 44 | self.player_just_moved = 3 - self.player_just_moved 45 | self.board[action] = self.player_just_moved 46 | 47 | def actions(self): 48 | return [i for i in range(9) if self.board[i] == 0] 49 | 50 | def win(self, player): 51 | for (x,y,z) in [(0,1,2),(3,4,5),(6,7,8),(0,3,6),(1,4,7),(2,5,8),(0,4,8),(2,4,6)]: 52 | if self.board[x] == self.board[y] == self.board[z]: 53 | if self.board[x] == player: 54 | return 1 55 | else: 56 | return 0 57 | if self.actions() == []: return 0.5 # draw 58 | 59 | def end(self): 60 | return self.actions() == [] or self.win(1) == 1 or self.win(2) == 1 61 | 62 | def __repr__(self): 63 | line = '\n-----------\n' 64 | row = " {} | {} | {}" 65 | s = (row + line + row + line + row).format(*map(lambda i: [' ', 'X', 'O'][i], self.board)) 66 | return s 67 | 68 | class Node: 69 | def __init__(self, action=None, parent=None, state=None): 70 | self.action = action 71 | self.parent = parent 72 | self.childs = [] 73 | self.W = 0 74 | self.N = 0 75 | self.untried_actions = state.actions() 76 | self.player_just_moved = state.player_just_moved 77 | 78 | def select(self): 79 | s = sorted(self.childs, key = lambda c: c.U())[-1] 80 | return s 81 | 82 | def add_child(self, a, s): 83 | n = Node(a, self, s) 84 | self.untried_actions.remove(a) 85 | self.childs.append(n) 86 | return n 87 | 88 | def update(self, result): 89 | self.N += 1 90 | self.W += result 91 | 92 | def U(self): 93 | if self.parent: 94 | return self.W / self.N + sqrt(2 * log(self.parent.N) / self.N) 95 | return 0 96 | 97 | def __repr__(self): 98 | return "[A: %s, U: %.2f, W/N: %.1f/%d, Untried: %s]" \ 99 | % (self.action, self.U(), self.W, self.N, self.untried_actions) 100 | 101 | def show_node_tree(self, indent=0): 102 | print("| " * indent + str(self)) 103 | 104 | for c in self.childs: 105 | c.show_node_tree(indent+1) 106 | 107 | def show_children_nodes(self): 108 | print('\n[*] Child Nodes') 109 | for c in self.childs: print(c) 110 | 111 | 112 | def UCT(rootstate, itermax, verbose=False): 113 | rootnode = Node(state=rootstate) 114 | 115 | for i in range(itermax): 116 | node = rootnode 117 | state = rootstate.clone() 118 | 119 | # Select 120 | while node.untried_actions == [] and node.childs != []: 121 | node = node.select() 122 | state.move(node.action) 123 | 124 | # Expand 125 | if node.untried_actions != []: 126 | action = random.choice(node.untried_actions) 127 | state.move(action) 128 | node = node.add_child(action, state) 129 | 130 | # Rollout 131 | while state.actions() != []: 132 | state.move(random.choice(state.actions())) 133 | 134 | # Backpropagate 135 | while node != None: 136 | node.update(state.win(node.player_just_moved)) 137 | node = node.parent 138 | 139 | if verbose: rootnode.show_node_tree() 140 | else: rootnode.show_children_nodes() 141 | 142 | return sorted(rootnode.childs, key = lambda c: c.N)[-1].action 143 | 144 | def play_game(): 145 | game = TicTacToe() 146 | 147 | while not game.end(): 148 | print(game) 149 | 150 | if 
game.player_just_moved == 1: 151 | action = UCT(game, 1000) # Player O 152 | else: 153 | action = UCT(game, 100) # Player X 154 | 155 | game.move(action) 156 | print("[*] Player %s move: %d\n" % (['X', 'O'][game.player_just_moved-1], action)) 157 | 158 | print(game) 159 | r = game.win(game.player_just_moved) 160 | if r == 1: 161 | print("[*] Player %s win" % ['X', 'O'][game.player_just_moved-1]) 162 | elif r == 0: 163 | print("[*] Player %s win" % ['X', 'O'][2-game.player_just_moved]) 164 | else: 165 | print("[*] Player draw") 166 | 167 | if __name__ == "__main__": 168 | play_game() 169 | -------------------------------------------------------------------------------- /PG/Cartpole_pytorch/PG_CartPole-v0.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_pytorch/PG_CartPole-v0.pth -------------------------------------------------------------------------------- /PG/Cartpole_pytorch/PG_CartPole.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions import Categorical 4 | import numpy as np 5 | import gym 6 | from gym.spaces import Discrete, Box 7 | import argparse 8 | import random 9 | 10 | seed = 1 11 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 12 | torch.manual_seed(seed) 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | if torch.cuda.is_available(): 16 | torch.cuda.manual_seed_all(seed) 17 | DEBUG = False 18 | else: 19 | DEBUG = True 20 | 21 | def weight_init(m): 22 | ''' 23 | Code from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 24 | Usage: 25 | model = Model() 26 | model.apply(weight_init) 27 | ''' 28 | if isinstance(m, nn.Linear): 29 | nn.init.xavier_normal_(m.weight.data) 30 | nn.init.normal_(m.bias.data) 31 | 32 | def reward_to_go(rews): 33 | n = len(rews) 34 | rtgs = np.zeros_like(rews) 35 | for i in reversed(range(n)): 36 | rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0) 37 | return rtgs 38 | 39 | class MLP(nn.Module): 40 | def __init__(self, sizes, activation=nn.Tanh, output_activation=None): 41 | super().__init__() 42 | 43 | net = [] 44 | for i in range(len(sizes)-1): 45 | net.append(nn.Linear(sizes[i], sizes[i+1])) 46 | if i == len(sizes) - 2: 47 | if output_activation is not None: 48 | net.append(output_activation()) 49 | else: 50 | net.append(activation()) 51 | 52 | self.mlp = nn.Sequential( 53 | *net, 54 | nn.Softmax(dim=-1) 55 | ) 56 | 57 | def forward(self, x): 58 | return self.mlp(x) 59 | 60 | def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, 61 | epochs=50, batch_size=5000, render=False): 62 | 63 | # make environment, check spaces, get obs / act dims 64 | env = gym.make(env_name) 65 | assert isinstance(env.observation_space, Box), \ 66 | "This example only works for envs with continuous state spaces." 67 | assert isinstance(env.action_space, Discrete), \ 68 | "This example only works for envs with discrete action spaces." 69 | 70 | obs_dim = env.observation_space.shape[0] 71 | n_acts = env.action_space.n 72 | 73 | policy = MLP(sizes=[obs_dim]+hidden_sizes+[n_acts]) 74 | policy.apply(weight_init) 75 | optimizer = torch.optim.Adam(policy.parameters(), lr=lr) 76 | 77 | # for training policy 78 | def train_one_epoch(): 79 | # make some empty lists for logging. 
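        # (Reward-to-go policy gradient: every logged log pi(a_t|s_t) is later weighted by
        # the return from step t onward, computed by reward_to_go above, and the update
        # minimizes loss = -mean_t[ log pi(a_t|s_t) * sum_{t'>=t} r_{t'} ].)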
80 | batch_obs = [] # for observations 81 | batch_acts = [] # for actions 82 | batch_weights = [] # for reward-to-go weighting in policy gradient 83 | batch_rets = [] # for measuring episode returns 84 | batch_lens = [] # for measuring episode lengths 85 | 86 | # reset episode-specific variables 87 | obs = env.reset() # first obs comes from starting distribution 88 | done = False # signal from environment that episode is over 89 | ep_rews = [] # list for rewards accrued throughout ep 90 | 91 | # render first episode of each epoch 92 | finished_rendering_this_epoch = False 93 | 94 | # collect experience by acting in the environment with current policy 95 | policy.eval() 96 | while True: 97 | # rendering 98 | if (not finished_rendering_this_epoch) and render: 99 | env.render() 100 | 101 | # save obs 102 | batch_obs.append(obs.copy()) 103 | 104 | # act in the environment 105 | with torch.no_grad(): 106 | act_probs = policy(torch.tensor(obs, dtype=torch.float)) 107 | dist = Categorical(act_probs) 108 | act = dist.sample().item() 109 | 110 | obs, rew, done, _ = env.step(act) 111 | 112 | # save action, reward 113 | batch_acts.append(act) 114 | ep_rews.append(rew) 115 | 116 | if done: 117 | # if episode is over, record info about episode 118 | ep_ret, ep_len = sum(ep_rews), len(ep_rews) 119 | batch_rets.append(ep_ret) 120 | batch_lens.append(ep_len) 121 | 122 | # the weight for each logprob(a_t|s_t) is reward-to-go from t 123 | batch_weights += list(reward_to_go(ep_rews)) 124 | 125 | # reset episode-specific variables 126 | obs, done, ep_rews = env.reset(), False, [] 127 | 128 | # won't render again this epoch 129 | finished_rendering_this_epoch = True 130 | 131 | # end experience loop if we have enough of it 132 | if len(batch_obs) > batch_size: 133 | break 134 | 135 | # take a single policy gradient update step 136 | policy.train() 137 | batch_obs = torch.tensor(batch_obs, dtype=torch.float) 138 | batch_acts = torch.tensor(batch_acts) 139 | batch_weights = torch.tensor(batch_weights) 140 | 141 | batch_act_probs = policy(batch_obs) 142 | dist = Categorical(batch_act_probs) 143 | log_probs = dist.log_prob(batch_acts) 144 | loss = (- log_probs * batch_weights).mean() 145 | 146 | optimizer.zero_grad() 147 | loss.backward() 148 | optimizer.step() 149 | 150 | return loss, batch_rets, batch_lens 151 | 152 | # training loop 153 | max_avg_ret = 0 154 | for i in range(epochs): 155 | batch_loss, batch_rets, batch_lens = train_one_epoch() 156 | print(f'epoch: {i:2d} loss: {batch_loss:.3f} episode average rewards: {np.mean(batch_rets):.3f} episode average len: {np.mean(batch_lens):.3f}') 157 | 158 | if np.mean(batch_rets) > max_avg_ret: 159 | max_avg_ret = np.mean(batch_rets) 160 | torch.save(policy.state_dict(), 'PG_{}.pth'.format(env_name)) 161 | 162 | env.close() 163 | 164 | 165 | if __name__ == '__main__': 166 | parser = argparse.ArgumentParser() 167 | parser.add_argument('--env_name', '--env', type=str, default='CartPole-v0') 168 | parser.add_argument('--render', action='store_true') 169 | parser.add_argument('--lr', type=float, default=1e-2) 170 | parser.add_argument('--epochs', type=int, default=50) 171 | args = parser.parse_args() 172 | print('\nUsing reward-to-go formulation of policy gradient.\n') 173 | train(env_name=args.env_name, render=args.render, lr=args.lr, epochs=args.epochs) -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/PG_Cartpole.py: -------------------------------------------------------------------------------- 1 | import 
tensorflow as tf 2 | import numpy as np 3 | import gym 4 | import sys 5 | import time 6 | 7 | 8 | def create_environment(): 9 | env = gym.make('CartPole-v0') 10 | env = env.unwrapped 11 | env.seed(1) 12 | 13 | state = env.reset() 14 | state_size = len(state) 15 | action_size = env.action_space.n 16 | 17 | return env, state_size, action_size 18 | 19 | def test_environment(): 20 | env, _, _ = create_environment() 21 | episodes = 1 22 | 23 | for _ in range(episodes): 24 | print(env.reset()) 25 | env.render() 26 | total_rewards = 0 27 | done = False 28 | 29 | while not done: 30 | action = env.action_space.sample() 31 | state, reward, done, info = env.step(action) 32 | env.render() 33 | 34 | total_rewards += reward 35 | print('action:', action, 'reward:', reward) 36 | time.sleep(0.5) 37 | 38 | print('[*] Total Reward:',total_rewards) 39 | 40 | def discount_and_normalize_rewards(episode_rewards, gamma): 41 | discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float32) 42 | cumulative = 0.0 43 | for i in reversed(range(len(episode_rewards))): 44 | cumulative = cumulative * gamma + episode_rewards[i] 45 | discounted_episode_rewards[i] = cumulative 46 | 47 | mean = np.mean(discounted_episode_rewards) 48 | std = np.std(discounted_episode_rewards) 49 | discounted_episode_rewards = (discounted_episode_rewards - mean) / std 50 | 51 | return discounted_episode_rewards 52 | 53 | 54 | class PGNetwork(): 55 | 56 | def __init__(self, state_size, action_size, learning_rate, name='PGNetwork'): 57 | self.state_size = state_size 58 | self.action_size = action_size 59 | self.learning_rate = learning_rate 60 | 61 | with tf.name_scope(name): 62 | self.input_state = tf.placeholder(tf.float32, [None, state_size], name='input_state') 63 | self.input_action = tf.placeholder(tf.int32, [None, action_size], name='input_action') 64 | self.input_rewards = tf.placeholder(tf.float32, [None, ], name='input_rewards') 65 | self.input_mean_reward = tf.placeholder(tf.float32, name='input_mean_reward') 66 | 67 | fc1 = tf.contrib.layers.fully_connected( 68 | inputs = self.input_state, 69 | num_outputs = 10, 70 | activation_fn = tf.nn.relu, 71 | weights_initializer = tf.contrib.layers.xavier_initializer()) 72 | fc2 = tf.contrib.layers.fully_connected( 73 | inputs = fc1, 74 | num_outputs = action_size, 75 | activation_fn = tf.nn.relu, 76 | weights_initializer = tf.contrib.layers.xavier_initializer()) 77 | fc3 = tf.contrib.layers.fully_connected( 78 | inputs = fc2, 79 | num_outputs = action_size, 80 | activation_fn = None, 81 | weights_initializer = tf.contrib.layers.xavier_initializer()) 82 | 83 | self.output_action = tf.nn.softmax(fc3) 84 | neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc3, labels=self.input_action) 85 | self.loss = tf.reduce_mean(neg_log_prob * self.input_rewards) 86 | self.train = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 87 | 88 | 89 | 90 | def train(): 91 | env, state_size, action_size = create_environment() 92 | # Hyperparameters 93 | max_episodes = 10000 94 | learning_rate = 0.01 95 | gamma = 0.95 96 | 97 | tf.reset_default_graph() 98 | PG = PGNetwork(state_size, action_size, learning_rate) 99 | 100 | writer = tf.summary.FileWriter('PG_Cartpole_log') 101 | tf.summary.scalar('Loss', PG.loss) 102 | tf.summary.scalar('Reward mean', PG.input_mean_reward) 103 | write_op = tf.summary.merge_all() 104 | saver = tf.train.Saver() 105 | 106 | 107 | all_rewards = [] 108 | total_rewards = 0 109 | maximum_reward_recorded = 0 110 | episode_states, episode_actions, 
episode_rewards = [], [], [] 111 | 112 | with tf.Session() as sess: 113 | sess.run(tf.global_variables_initializer()) 114 | 115 | for episode in range(max_episodes): 116 | episode_rewards_sum = 0 117 | state = env.reset() 118 | env.render() 119 | done = False 120 | 121 | while not done: 122 | output_action = sess.run(PG.output_action, feed_dict={PG.input_state: state.reshape([1, 4])}) 123 | action = np.random.choice(range(action_size), p=output_action.ravel()) 124 | 125 | new_state, reward, done, info = env.step(action) 126 | env.render() 127 | 128 | episode_states.append(state) 129 | a = np.zeros(action_size) 130 | a[action] = 1 131 | episode_actions.append(a) 132 | episode_rewards.append(reward) 133 | 134 | state = new_state 135 | 136 | episode_rewards_sum = np.sum(episode_rewards) 137 | all_rewards.append(episode_rewards_sum) 138 | total_rewards = np.sum(all_rewards) 139 | mean_reward = np.divide(total_rewards, episode + 1) 140 | maximum_reward_recorded = np.amax(all_rewards) 141 | 142 | print('='*20) 143 | print('Episode:', episode) 144 | print('Reward:', episode_rewards_sum) 145 | print('Mean Reward:', mean_reward) 146 | print('Max reward so far:', maximum_reward_recorded) 147 | 148 | episode_rewards = discount_and_normalize_rewards(episode_rewards, gamma) 149 | loss, _ = sess.run([PG.loss, PG.train], feed_dict={ 150 | PG.input_state: np.vstack(np.array(episode_states)), 151 | PG.input_action: np.vstack(np.array(episode_actions)), 152 | PG.input_rewards: episode_rewards 153 | }) 154 | 155 | summary = sess.run(write_op, feed_dict={ 156 | PG.input_state: np.vstack(np.array(episode_states)), 157 | PG.input_action: np.vstack(np.array(episode_actions)), 158 | PG.input_rewards: episode_rewards, 159 | PG.input_mean_reward: mean_reward 160 | }) 161 | 162 | writer.add_summary(summary, episode) 163 | writer.flush() 164 | episode_states, episode_actions, episode_rewards = [], [], [] 165 | 166 | if episode % 5 == 0: 167 | save_path = saver.save(sess, './model/model.ckpt') 168 | print('[*] Model Saved:', save_path) 169 | 170 | print('Train done') 171 | 172 | def play(): 173 | env, state_size, action_size = create_environment() 174 | learning_rate = 0.01 175 | 176 | with tf.Session() as sess: 177 | PG = PGNetwork(state_size, action_size, learning_rate) 178 | saver = tf.train.Saver() 179 | saver.restore(sess, "./model/model.ckpt") 180 | 181 | state = env.reset() 182 | env.render() 183 | done = False 184 | episode_rewards = [] 185 | while not done: 186 | output_action = sess.run(PG.output_action, feed_dict={PG.input_state: state.reshape([1, 4])}) 187 | action = np.random.choice(range(action_size), p=output_action.ravel()) 188 | 189 | state, reward, done, info = env.step(action) 190 | env.render() 191 | episode_rewards.append(reward) 192 | 193 | episode_rewards_sum = np.sum(episode_rewards) 194 | print('Episode Rewards:', episode_rewards_sum) 195 | 196 | if __name__ == '__main__': 197 | if sys.argv[1] == '--train': 198 | train() 199 | elif sys.argv[1] == '--play': 200 | play() 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model.ckpt" 2 | all_model_checkpoint_paths: "model.ckpt" 3 | -------------------------------------------------------------------------------- 
/PG/Cartpole_tensorflow/model/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_tensorflow/model/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/model/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_tensorflow/model/model.ckpt.index -------------------------------------------------------------------------------- /PG/Cartpole_tensorflow/model/model.ckpt.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Cartpole_tensorflow/model/model.ckpt.meta -------------------------------------------------------------------------------- /PG/Doom-Deathmatch/PG_Doom_Deathmatch.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from vizdoom import DoomGame 4 | import random 5 | import time 6 | from skimage import transform 7 | from collections import deque 8 | import sys 9 | 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | import os 14 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 15 | 16 | 17 | ########################################### 18 | # Constant 19 | stack_size = 4 20 | frame_size = (100, 160) 21 | # Global variables 22 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 23 | ########################################### 24 | 25 | 26 | def create_environment(): 27 | game = DoomGame() 28 | game.load_config('defend_the_center.cfg') 29 | game.set_doom_scenario_path('defend_the_center.wad') 30 | 31 | game.init() 32 | possible_actions = np.identity(3, dtype=int).tolist() 33 | return game, possible_actions 34 | 35 | def test_environment(): 36 | game, possible_actions = create_environment() 37 | episodes = 1 38 | 39 | for _ in range(episodes): 40 | game.new_episode() 41 | 42 | while not game.is_episode_finished(): 43 | state = game.get_state() 44 | 45 | img = state.screen_buffer # the current game frame, 2D array 46 | misc = state.game_variables # [50.]
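            # screen_buffer is the raw grayscale frame (screen_format GRAY8,
            # RES_320X240 in defend_the_center.cfg); game_variables holds the
            # scenario's available_game_variables (AMMO2 and HEALTH in that cfg)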
47 | action = random.choice(possible_actions) 48 | reward = game.make_action(action) 49 | print(action, 'reward:', reward) 50 | time.sleep(0.02) 51 | 52 | print('[*] Result:', game.get_total_reward()) 53 | time.sleep(2) 54 | 55 | game.close() 56 | 57 | def preprocess_frame(frame): 58 | cropped_frame = frame[40:, :] 59 | normalized_frame = cropped_frame / 255.0 60 | preprocessed_frame = transform.resize(normalized_frame, frame_size) 61 | return preprocessed_frame 62 | 63 | def stack_frames(state, is_new_episode=False): 64 | global stacked_frames 65 | frame = preprocess_frame(state) 66 | 67 | if is_new_episode: 68 | stacked_frames = deque([np.zeros(frame_size) for _ in range(stack_size)], maxlen=stack_size) 69 | 70 | for _ in range(stack_size): 71 | stacked_frames.append(frame) 72 | else: 73 | stacked_frames.append(frame) 74 | 75 | return np.stack(stacked_frames, axis=2) 76 | 77 | def discount_and_normalize_rewards(episode_rewards, gamma): 78 | discounted_episode_rewards = np.zeros_like(episode_rewards, dtype=np.float32) 79 | cumulative = 0.0 80 | for i in reversed(range(len(episode_rewards))): 81 | cumulative = cumulative * gamma + episode_rewards[i] 82 | discounted_episode_rewards[i] = cumulative 83 | 84 | mean = np.mean(discounted_episode_rewards) 85 | std = np.std(discounted_episode_rewards) 86 | discounted_episode_rewards = (discounted_episode_rewards - mean) / std 87 | 88 | return discounted_episode_rewards 89 | 90 | 91 | class PGNetwork: 92 | def __init__(self, state_size, action_size, learning_rate=0.0001, name='PGNetwork'): 93 | with tf.variable_scope(name): 94 | self.inputs = tf.placeholder(tf.float32, [None, *state_size], name='inputs') 95 | self.actions = tf.placeholder(tf.float32, [None, action_size], name='actions') 96 | self.discounted_episode_rewards = tf.placeholder(tf.float32, [None, ], name='discounted_episode_rewards') 97 | self.mean_reward = tf.placeholder(tf.float32, name='mean_reward') 98 | 99 | conv1 = tf.layers.conv2d( 100 | inputs = self.inputs, 101 | filters = 32, 102 | kernel_size = [8, 8], 103 | strides = [4, 4], 104 | padding = 'VALID', 105 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 106 | name = 'conv1' 107 | ) 108 | conv1_batchnorm = tf.layers.batch_normalization( 109 | conv1, 110 | training = True, 111 | epsilon = 1e-5, 112 | name = 'conv1_batchnorm' 113 | ) 114 | conv1_out = tf.nn.elu(conv1_batchnorm, name='conv1_out') 115 | 116 | conv2 = tf.layers.conv2d( 117 | inputs = conv1_out, 118 | filters = 64, 119 | kernel_size = [4, 4], 120 | strides = [2, 2], 121 | padding = 'VALID', 122 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 123 | name = 'conv2' 124 | ) 125 | conv2_batchnorm = tf.layers.batch_normalization( 126 | conv2, 127 | training = True, 128 | epsilon = 1e-5, 129 | name = 'conv2_batchnorm' 130 | ) 131 | conv2_out = tf.nn.elu(conv2_batchnorm, name='conv2_out') 132 | 133 | conv3 = tf.layers.conv2d( 134 | inputs = conv2_out, 135 | filters = 128, 136 | kernel_size = [4, 4], 137 | strides = [2, 2], 138 | padding = 'VALID', 139 | kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(), 140 | name = 'conv3' 141 | ) 142 | conv3_batchnorm = tf.layers.batch_normalization( 143 | conv3, 144 | training = True, 145 | epsilon = 1e-5, 146 | name = 'conv3_batchnorm' 147 | ) 148 | conv3_out = tf.nn.elu(conv3_batchnorm, name='conv3_out') 149 | 150 | flatten = tf.layers.flatten(conv3_out) 151 | fc1 = tf.layers.dense( 152 | inputs = flatten, 153 | units = 512, 154 | activation = tf.nn.elu, 155 | 
kernel_initializer = tf.contrib.layers.xavier_initializer(), 156 | name = 'fc1' 157 | ) 158 | fc2 = tf.layers.dense( 159 | inputs = fc1, 160 | units = action_size, 161 | activation = None, 162 | kernel_initializer = tf.contrib.layers.xavier_initializer(), 163 | name = 'fc2' 164 | ) 165 | self.output = tf.nn.softmax(fc2) 166 | 167 | neg_log_prob = tf.nn.softmax_cross_entropy_with_logits_v2(logits=fc2, labels=self.actions) 168 | self.loss = tf.reduce_mean(neg_log_prob * self.discounted_episode_rewards) 169 | self.train = tf.train.RMSPropOptimizer(learning_rate).minimize(self.loss) 170 | 171 | 172 | def train(): 173 | game, possible_actions = create_environment() 174 | 175 | # set hyperparameters 176 | ########################################### 177 | state_size = [*frame_size, stack_size] 178 | action_size = game.get_available_buttons_size() 179 | learning_rate = 0.0001 180 | total_episodes = 5000 181 | batch_size = 1000 182 | gamma = 0.99 183 | check_step = 5 184 | ########################################### 185 | 186 | # train PG 187 | ########################################### 188 | tf.reset_default_graph() 189 | PG = PGNetwork(state_size, action_size, learning_rate) 190 | 191 | writer = tf.summary.FileWriter('train_log') 192 | tf.summary.scalar('Loss', PG.loss) 193 | tf.summary.scalar('Reward mean', PG.mean_reward) 194 | write_op = tf.summary.merge_all() 195 | saver = tf.train.Saver() 196 | 197 | all_rewards = [] 198 | total_rewards = 0 199 | maximum_reward_recorded = 0 200 | 201 | with tf.Session() as sess: 202 | sess.run(tf.global_variables_initializer()) 203 | 204 | for episode in range(1, total_episodes+1): 205 | episode_states, episode_actions, episode_rewards = [], [], [] 206 | 207 | game.new_episode() 208 | state = game.get_state().screen_buffer 209 | state = stack_frames(state, True) 210 | 211 | while not game.is_episode_finished(): 212 | state = game.get_state().screen_buffer 213 | state = stack_frames(state) 214 | 215 | action_prob = sess.run(PG.output, feed_dict={ 216 | PG.inputs: state.reshape((1, *state_size)) 217 | }) 218 | action = np.random.choice(range(action_size), p=action_prob.ravel()) 219 | action = possible_actions[action] 220 | reward = game.make_action(action) 221 | 222 | episode_states.append(state) 223 | episode_actions.append(action) 224 | episode_rewards.append(reward) 225 | 226 | episode_rewards_sum = np.sum(episode_rewards) 227 | all_rewards.append(episode_rewards_sum) 228 | total_rewards = np.sum(all_rewards) 229 | mean_reward = np.divide(total_rewards, episode + 1) 230 | maximum_reward_recorded = np.amax(all_rewards) 231 | 232 | 233 | episode_rewards = discount_and_normalize_rewards(episode_rewards, gamma) 234 | loss, _ = sess.run([PG.loss, PG.train], feed_dict={ 235 | PG.inputs: np.array(episode_states), 236 | PG.actions: np.array(episode_actions), 237 | PG.discounted_episode_rewards: episode_rewards 238 | }) 239 | 240 | summary = sess.run(write_op, feed_dict={ 241 | PG.inputs: np.array(episode_states), 242 | PG.actions: np.array(episode_actions), 243 | PG.discounted_episode_rewards: episode_rewards, 244 | PG.mean_reward: mean_reward 245 | }) 246 | 247 | writer.add_summary(summary, episode) 248 | writer.flush() 249 | 250 | print('='*30) 251 | print('[*] Episode:', episode) 252 | print('[*] Reward:', episode_rewards_sum) 253 | print('[*] Mean Reward:', mean_reward) 254 | print('[*] Max reward so far:', maximum_reward_recorded) 255 | print('[*] Loss:', loss) 256 | 257 | if episode % check_step == 0: 258 | save_path = saver.save(sess, './model/model.ckpt') 
259 | print('[*] Model Saved:', save_path) 260 | 261 | print('[*] Train done') 262 | game.close() 263 | ########################################### 264 | 265 | def play(): 266 | game, possible_actions = create_environment() 267 | 268 | state_size = [*frame_size, stack_size] 269 | action_size = game.get_available_buttons_size() 270 | PG = PGNetwork(state_size, action_size) 271 | 272 | with tf.Session() as sess: 273 | saver = tf.train.Saver() 274 | saver.restore(sess, "./model/model.ckpt") 275 | 276 | game.new_episode() 277 | frame = game.get_state().screen_buffer 278 | state = stack_frames(frame, True) 279 | 280 | while not game.is_episode_finished(): 281 | frame = game.get_state().screen_buffer 282 | state = stack_frames(frame) 283 | 284 | action_prob = sess.run(PG.output, feed_dict={ 285 | PG.inputs: state.reshape((1, *state_size)) 286 | }) 287 | action = np.random.choice(range(action_size), p=action_prob.ravel()) 288 | action = possible_actions[action] 289 | game.make_action(action) 290 | 291 | score = game.get_total_reward() 292 | print("[*] Score: ", score) 293 | 294 | game.close() 295 | 296 | if __name__ == '__main__': 297 | if sys.argv[1] == '--train': 298 | train() 299 | elif sys.argv[1] == '--play': 300 | play() 301 | elif sys.argv[1] == '--test': 302 | test_environment() 303 | -------------------------------------------------------------------------------- /PG/Doom-Deathmatch/defend_the_center.cfg: -------------------------------------------------------------------------------- 1 | # Lines starting with # are treated as comments (or with whitespaces+#). 2 | # It doesn't matter if you use capital letters or not. 3 | # It doesn't matter if you use underscore or camel notation for keys, e.g. episode_timeout is the same as episodeTimeout. 4 | 5 | doom_scenario_path = defend_the_center.wad 6 | 7 | # Rewards 8 | death_penalty = 1 9 | 10 | # Rendering options 11 | screen_resolution = RES_320X240 12 | screen_format = GRAY8 13 | render_hud = True 14 | render_crosshair = false 15 | render_weapon = true 16 | render_decals = false 17 | render_particles = false 18 | window_visible = true 19 | 20 | # make episodes start after 10 tics (after unholstering the gun) 21 | episode_start_time = 10 22 | 23 | # make episodes finish after 2100 actions (tics) 24 | episode_timeout = 2100 25 | 26 | # Available buttons 27 | available_buttons = 28 | { 29 | TURN_LEFT 30 | TURN_RIGHT 31 | ATTACK 32 | } 33 | 34 | # Game variables that will be in the state 35 | available_game_variables = { AMMO2 HEALTH } 36 | 37 | mode = PLAYER 38 | doom_skill = 3 39 | -------------------------------------------------------------------------------- /PG/Doom-Deathmatch/defend_the_center.wad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PG/Doom-Deathmatch/defend_the_center.wad -------------------------------------------------------------------------------- /PPO/HalfCheetah/PPO_HalfCheetah-v2.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Urinx/ReinforcementLearning/40c00b8297503e127c6c8134a8becffb81b676e4/PPO/HalfCheetah/PPO_HalfCheetah-v2.pth -------------------------------------------------------------------------------- /PPO/HalfCheetah/PPO_HalfCheetah.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.distributions import 
Categorical 4 | import numpy as np 5 | import gym 6 | from gym.spaces import Discrete, Box 7 | import time 8 | import random 9 | import scipy.signal 10 | 11 | def logger_print(logger, key, with_min_and_max=False): 12 | if with_min_and_max: 13 | print(f'{key+":":13s} {np.mean(logger[key]):.4f}\t{np.min(logger[key]):.4f}(min) {np.max(logger[key]):.4f}(max) {np.std(logger[key]):.4f}(std)') 14 | else: 15 | print(f'{key+":":13s} {np.mean(logger[key]):.4f}') 16 | 17 | def get_parameter_number(net): 18 | total_num = sum(p.numel() for p in net.parameters()) 19 | trainable_num = sum(p.numel() for p in net.parameters() if p.requires_grad) 20 | return {'Total': total_num, 'Trainable': trainable_num} 21 | 22 | def weight_init(m): 23 | ''' 24 | Code from https://gist.github.com/jeasinema/ed9236ce743c8efaf30fa2ff732749f5 25 | Usage: 26 | model = Model() 27 | model.apply(weight_init) 28 | ''' 29 | if isinstance(m, nn.Linear): 30 | nn.init.xavier_normal_(m.weight.data) 31 | nn.init.normal_(m.bias.data) 32 | 33 | def discount_cumsum(x, discount): 34 | """ 35 | magic from rllab for computing discounted cumulative sums of vectors. 36 | 37 | input: 38 | vector x: [x0, x1, x2] 39 | 40 | output: 41 | [x0 + discount * x1 + discount^2 * x2, x1 + discount * x2, x2] 42 | """ 43 | return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] 44 | 45 | class PPOBuffer: 46 | def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95): 47 | self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32) 48 | self.act_buf = np.zeros((size, act_dim), dtype=np.float32) 49 | self.adv_buf = np.zeros(size, dtype=np.float32) 50 | self.rew_buf = np.zeros(size, dtype=np.float32) 51 | self.ret_buf = np.zeros(size, dtype=np.float32) 52 | self.val_buf = np.zeros(size, dtype=np.float32) 53 | self.logp_buf = np.zeros(size, dtype=np.float32) 54 | self.gamma, self.lam = gamma, lam 55 | self.ptr, self.path_start_idx, self.max_size = 0, 0, size 56 | 57 | def store(self, obs, act, rew, val, logp): 58 | """ 59 | Append one timestep of agent-environment interaction to the buffer. 60 | """ 61 | assert self.ptr < self.max_size # buffer has to have room so you can store 62 | i = self.ptr 63 | self.obs_buf[i] = obs 64 | self.act_buf[i] = act 65 | self.rew_buf[i] = rew 66 | self.val_buf[i] = val 67 | self.logp_buf[i] = logp 68 | self.ptr += 1 69 | 70 | def finish_path(self, last_val=0): 71 | """ 72 | Call this at the end of a trajectory, or when one gets cut off 73 | by an epoch ending. This looks back in the buffer to where the 74 | trajectory started, and uses rewards and value estimates from 75 | the whole trajectory to compute advantage estimates with GAE-Lambda, 76 | as well as compute the rewards-to-go for each state, to use as 77 | the targets for the value function. 78 | 79 | The "last_val" argument should be 0 if the trajectory ended 80 | because the agent reached a terminal state (died), and otherwise 81 | should be V(s_T), the value function estimated for the last state. 82 | This allows us to bootstrap the reward-to-go calculation to account 83 | for timesteps beyond the arbitrary episode horizon (or epoch cutoff). 
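        In symbols, with discount factor gamma and GAE parameter lam, the code
        below computes delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and the
        advantage estimate A_t = sum_{l>=0} (gamma * lam)^l * delta_{t+l}, while
        the value-function target is the reward-to-go R_t = sum_{l>=0} gamma^l * r_{t+l}.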
84 | """ 85 | path_slice = slice(self.path_start_idx, self.ptr) 86 | rews = np.append(self.rew_buf[path_slice], last_val) 87 | vals = np.append(self.val_buf[path_slice], last_val) 88 | 89 | # the next two lines implement GAE-Lambda advantage calculation 90 | deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1] 91 | self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam) 92 | 93 | # the next line computes rewards-to-go, to be targets for the value function 94 | self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1] 95 | 96 | self.path_start_idx = self.ptr 97 | 98 | def get(self): 99 | """ 100 | Call this at the end of an epoch to get all of the data from 101 | the buffer, with advantages appropriately normalized (shifted to have 102 | mean zero and std one). Also, resets some pointers in the buffer. 103 | """ 104 | assert self.ptr == self.max_size # buffer has to be full before you can get 105 | self.ptr, self.path_start_idx = 0, 0 106 | # the next two lines implement the advantage normalization trick 107 | adv_mean, adv_std = np.mean(self.adv_buf), np.std(self.adv_buf) 108 | self.adv_buf = (self.adv_buf - adv_mean) / adv_std 109 | return [self.obs_buf, self.act_buf, self.adv_buf, self.ret_buf, self.logp_buf] 110 | 111 | class MLP(nn.Module): 112 | def __init__(self, sizes, activation=nn.Tanh, output_activation=None): 113 | super().__init__() 114 | 115 | net = [] 116 | for i in range(len(sizes)-1): 117 | net.append(nn.Linear(sizes[i], sizes[i+1])) 118 | if i == len(sizes) - 2: 119 | if output_activation is not None: 120 | net.append(output_activation()) 121 | else: 122 | net.append(activation()) 123 | 124 | self.mlp = nn.Sequential(*net) 125 | 126 | def forward(self, x): 127 | return self.mlp(x) 128 | 129 | class MLP_Categorical_Policy(nn.Module): 130 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.Tanh, output_activation=None): 131 | super().__init__() 132 | 133 | self.mlp = MLP([obs_dim] + hidden_sizes + [act_dim], activation, output_activation) 134 | self.softmax = nn.Softmax(dim=-1) 135 | 136 | def forward(self, x): 137 | x = self.mlp(x) 138 | p = self.softmax(x) 139 | dist = Categorical(p) 140 | a = dist.sample() 141 | log_p = dist.log_prob(a) 142 | return a.item(), log_p 143 | 144 | class MLP_Gaussian_Policy(nn.Module): 145 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.Tanh, output_activation=None): 146 | super().__init__() 147 | 148 | self.mlp = MLP([obs_dim] + hidden_sizes + [act_dim], activation, output_activation) 149 | self.pi = torch.tensor(np.pi, dtype=torch.float) 150 | 151 | def forward(self, x, a=None): 152 | mu = self.mlp(x) 153 | log_std = -0.5 * torch.ones(mu.shape[-1], dtype=torch.float) 154 | std = torch.exp(log_std) 155 | if not self.training: 156 | a = mu + torch.randn(mu.shape) * std 157 | # gaussian likelihood 158 | pre_sum = -0.5 * ( ((a-mu) / (torch.exp(log_std) + 1e-8))**2 + 2*log_std + torch.log(2*self.pi) ) 159 | logp = pre_sum.sum(dim=-1) 160 | return a, logp 161 | 162 | class Actor_Critic(nn.Module): 163 | def __init__(self, obs_dim, act_dim, hidden_sizes, activation=nn.Tanh, output_activation=None, action_space=None): 164 | super().__init__() 165 | 166 | if isinstance(action_space, Box): 167 | policy = MLP_Gaussian_Policy 168 | elif isinstance(action_space, Discrete): 169 | policy = MLP_Categorical_Policy 170 | 171 | self.actor = policy(obs_dim, act_dim, hidden_sizes, activation, output_activation) 172 | self.critic = MLP([obs_dim] + hidden_sizes + [1], activation, output_activation) 173 
| 174 | def forward(self, x, a=None): 175 | v = self.critic(x) 176 | if self.training: 177 | _, logp = self.actor(x, a) 178 | return logp, v 179 | else: 180 | a, logp = self.actor(x) 181 | return a, logp, v 182 | 183 | """ 184 | Proximal Policy Optimization (by clipping), with early stopping based on approximate KL 185 | """ 186 | def train( 187 | env_name, 188 | ac_kwargs=dict(), 189 | seed=0, 190 | steps_per_epoch=4000, 191 | epochs=50, 192 | gamma=0.99, 193 | clip_ratio=0.2, 194 | pi_lr=3e-4, 195 | vf_lr=1e-3, 196 | train_pi_iters=80, 197 | train_v_iters=80, 198 | lam=0.97, 199 | max_ep_len=1000, 200 | target_kl=0.01, 201 | save_freq=10 202 | ): 203 | """ 204 | 205 | Args: 206 | actor_critic: A function which takes in placeholder symbols 207 | for state, ``x_ph``, and action, ``a_ph``, and returns the main 208 | outputs from the agent's Tensorflow computation graph: 209 | 210 | =========== ================ ====================================== 211 | Symbol Shape Description 212 | =========== ================ ====================================== 213 | ``pi`` (batch, act_dim) | Samples actions from policy given 214 | | states. 215 | ``logp`` (batch,) | Gives log probability, according to 216 | | the policy, of taking actions ``a_ph`` 217 | | in states ``x_ph``. 218 | ``logp_pi`` (batch,) | Gives log probability, according to 219 | | the policy, of the action sampled by 220 | | ``pi``. 221 | ``v`` (batch,) | Gives the value estimate for states 222 | | in ``x_ph``. (Critical: make sure 223 | | to flatten this!) 224 | =========== ================ ====================================== 225 | 226 | ac_kwargs (dict): Any kwargs appropriate for the actor_critic 227 | function you provided to PPO. 228 | 229 | seed (int): Seed for random number generators. 230 | 231 | steps_per_epoch (int): Number of steps of interaction (state-action pairs) 232 | for the agent and the environment in each epoch. 233 | 234 | epochs (int): Number of epochs of interaction (equivalent to 235 | number of policy updates) to perform. 236 | 237 | gamma (float): Discount factor. (Always between 0 and 1.) 238 | 239 | clip_ratio (float): Hyperparameter for clipping in the policy objective. 240 | Roughly: how far can the new policy go from the old policy while 241 | still profiting (improving the objective function)? The new policy 242 | can still go farther than the clip_ratio says, but it doesn't help 243 | on the objective anymore. (Usually small, 0.1 to 0.3.) 244 | 245 | pi_lr (float): Learning rate for policy optimizer. 246 | 247 | vf_lr (float): Learning rate for value function optimizer. 248 | 249 | train_pi_iters (int): Maximum number of gradient descent steps to take 250 | on policy loss per epoch. (Early stopping may cause optimizer 251 | to take fewer than this.) 252 | 253 | train_v_iters (int): Number of gradient descent steps to take on 254 | value function per epoch. 255 | 256 | lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, 257 | close to 1.) 258 | 259 | max_ep_len (int): Maximum length of trajectory / episode / rollout. 260 | 261 | target_kl (float): Roughly what KL divergence we think is appropriate 262 | between new and old policies after an update. This will get used 263 | for early stopping. (Usually small, 0.01 or 0.05.) 264 | 265 | logger_kwargs (dict): Keyword args for EpochLogger. 266 | 267 | save_freq (int): How often (in terms of gap between epochs) to save 268 | the current policy and value function. 
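    The policy update below maximizes the PPO-Clip surrogate objective
        L(s, a) = min( r(theta) * A(s, a), clip(r(theta), 1 - clip_ratio, 1 + clip_ratio) * A(s, a) ),
    where r(theta) = pi_theta(a|s) / pi_theta_old(a|s) is the probability ratio,
    and policy gradient steps stop early once the sampled KL divergence between
    the new and old policy exceeds 1.5 * target_kl.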
269 | 270 | """ 271 | print(locals()) 272 | 273 | torch.manual_seed(seed) 274 | random.seed(seed) 275 | np.random.seed(seed) 276 | if torch.cuda.is_available(): 277 | torch.cuda.manual_seed_all(seed) 278 | 279 | env = gym.make(env_name) 280 | obs_dim = env.observation_space.shape[0] 281 | act_dim = env.action_space.shape[0] 282 | 283 | # Share information about action space with policy architecture 284 | ac_kwargs['action_space'] = env.action_space 285 | 286 | # Experience buffer 287 | local_steps_per_epoch = steps_per_epoch 288 | buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) 289 | 290 | # Model 291 | actor_critic = Actor_Critic(obs_dim, act_dim, **ac_kwargs) 292 | print(actor_critic) 293 | print(f'\nNumber of parameters: {get_parameter_number(actor_critic)}\n') 294 | actor_critic.apply(weight_init) 295 | actor_optimizer = torch.optim.Adam(actor_critic.actor.parameters(), lr=pi_lr) 296 | critic_optimizer = torch.optim.Adam(actor_critic.critic.parameters(), lr=vf_lr) 297 | 298 | start_time = time.time() 299 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 300 | 301 | # Main loop: collect experience in env and update/log each epoch 302 | max_avg_ret = -np.inf 303 | for epoch in range(epochs): 304 | 305 | logger = { 306 | 'VVals': [], 307 | 'EpRet': [], 308 | 'EpLen': [], 309 | 'StopIter': [], 310 | 'LossPi': [], 311 | 'LossV': [], 312 | 'KL': [], 313 | 'Entropy': [], 314 | 'ClipFrac': [], 315 | 'DeltaLossPi': [], 316 | 'DeltaLossV': [] 317 | } 318 | 319 | actor_critic.eval() 320 | with torch.no_grad(): 321 | for t in range(local_steps_per_epoch): 322 | a, logp, v = actor_critic(torch.tensor(o, dtype=torch.float)) 323 | # breakpoint() 324 | 325 | # save and log 326 | buf.store(o, a, r, v, logp) 327 | logger['VVals'].append(v) 328 | 329 | o, r, d, _ = env.step(a) 330 | ep_ret += r 331 | ep_len += 1 332 | 333 | terminal = d or (ep_len == max_ep_len) 334 | if terminal or (t==local_steps_per_epoch-1): 335 | # if trajectory didn't reach terminal state, bootstrap value target 336 | last_val = r if d else actor_critic(torch.tensor(o, dtype=torch.float))[-1].item() 337 | buf.finish_path(last_val) 338 | if terminal: 339 | # only save EpRet / EpLen if trajectory finished 340 | logger['EpRet'].append(ep_ret) 341 | logger['EpLen'].append(ep_len) 342 | o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 343 | 344 | # Perform PPO update! 
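        # Note on the update below: min_adv equals (1 + clip_ratio) * adv where adv >= 0
        # and (1 - clip_ratio) * adv where adv < 0, so torch.min(ratio * adv, min_adv)
        # is identical to min(ratio * adv, clip(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv),
        # i.e. the PPO-Clip surrogate, negated to form the policy loss pi_loss.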
345 | obs_buf, act_buf, adv_buf, ret_buf, logp_buf = buf.get() 346 | obs = torch.tensor(obs_buf, dtype=torch.float) 347 | acts = torch.tensor(act_buf, dtype=torch.float) 348 | logp_old = torch.tensor(logp_buf, dtype=torch.float) 349 | adv = torch.tensor(adv_buf, dtype=torch.float) 350 | ret = torch.tensor(ret_buf, dtype=torch.float) 351 | 352 | actor_critic.train() 353 | with torch.no_grad(): 354 | logp, v = actor_critic(obs, acts) 355 | ratio = torch.exp(logp - logp_old) # pi(a|s) / pi_old(a|s) 356 | min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv) 357 | 358 | pi_l_old= - torch.min(ratio * adv, min_adv).mean() 359 | v_l_old = ((ret - v)**2).mean() 360 | ent = (-logp).mean() # a sample estimate for entropy, also easy to compute 361 | 362 | # Training 363 | for i in range(train_pi_iters): 364 | _, logp = actor_critic.actor(obs, acts) 365 | ratio = torch.exp(logp - logp_old) # pi(a|s) / pi_old(a|s) 366 | min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv) 367 | pi_loss = - torch.min(ratio * adv, min_adv).mean() 368 | kl = (logp_old - logp).mean() # a sample estimate for KL-divergence, easy to compute 369 | 370 | actor_optimizer.zero_grad() 371 | pi_loss.backward() 372 | actor_optimizer.step() 373 | 374 | if kl > 1.5 * target_kl: 375 | # print('Early stopping at step %d due to reaching max kl.'%i) 376 | break 377 | 378 | logger['StopIter'].append(i) 379 | 380 | for _ in range(train_v_iters): 381 | v = actor_critic.critic(obs) 382 | v_loss = ((ret - v)**2).mean() 383 | 384 | critic_optimizer.zero_grad() 385 | v_loss.backward() 386 | critic_optimizer.step() 387 | 388 | # Log changes from update 389 | with torch.no_grad(): 390 | logp, v = actor_critic(obs, acts) 391 | ratio = torch.exp(logp - logp_old) # pi(a|s) / pi_old(a|s) 392 | min_adv = torch.where(adv>0, (1+clip_ratio)*adv, (1-clip_ratio)*adv) 393 | pi_l_new= - torch.min(ratio * adv, min_adv).mean() 394 | v_l_new = ((ret - v)**2).mean() 395 | kl = (logp_old - logp).mean() 396 | clipped = np.logical_or(ratio > (1+clip_ratio), ratio < (1-clip_ratio)) 397 | cf = clipped.float().mean() 398 | 399 | logger['LossPi'].append(pi_l_new) 400 | logger['LossV'].append(v_l_new) 401 | logger['KL'].append(kl) 402 | logger['Entropy'].append(ent) 403 | logger['ClipFrac'].append(cf) 404 | logger['DeltaLossPi'].append(pi_l_new - pi_l_old) 405 | logger['DeltaLossV'].append(v_l_new - v_l_old) 406 | 407 | # Log info about epoch 408 | print('-'*40) 409 | print(f'Epoch: {epoch}') 410 | print(f'TotalEnvInteracts: {(epoch+1)*steps_per_epoch}') 411 | logger_print(logger, 'EpRet', True) 412 | logger_print(logger, 'EpLen') 413 | logger_print(logger, 'VVals', True) 414 | logger_print(logger, 'LossPi') 415 | logger_print(logger, 'LossV') 416 | logger_print(logger, 'DeltaLossPi') 417 | logger_print(logger, 'DeltaLossV') 418 | logger_print(logger, 'Entropy') 419 | logger_print(logger, 'KL') 420 | logger_print(logger, 'ClipFrac') 421 | logger_print(logger, 'StopIter') 422 | print(f'Time: {time.time()-start_time:.4f}s') 423 | print('-'*40+'\n') 424 | 425 | # Save model 426 | if np.mean(logger['EpRet']) > max_avg_ret: 427 | max_avg_ret = np.mean(logger['EpRet']) 428 | torch.save(actor_critic.state_dict(), 'PPO_{}.pth'.format(env_name)) 429 | 430 | env.close() 431 | 432 | if __name__ == '__main__': 433 | import argparse 434 | parser = argparse.ArgumentParser() 435 | parser.add_argument('--env', type=str, default='HalfCheetah-v2') 436 | parser.add_argument('--hid', type=int, default=64) 437 | parser.add_argument('--l', type=int, default=2) 
438 | parser.add_argument('--gamma', type=float, default=0.99) 439 | parser.add_argument('--seed', '-s', type=int, default=0) 440 | parser.add_argument('--cpu', type=int, default=4) 441 | parser.add_argument('--steps', type=int, default=4000) 442 | parser.add_argument('--epochs', type=int, default=50) 443 | parser.add_argument('--exp_name', type=str, default='ppo') 444 | args = parser.parse_args() 445 | 446 | train( 447 | args.env, 448 | ac_kwargs=dict(hidden_sizes=[args.hid]*args.l), 449 | gamma=args.gamma, 450 | seed=args.seed, 451 | steps_per_epoch=args.steps, 452 | epochs=args.epochs 453 | ) -------------------------------------------------------------------------------- /QLearning/QLearning_FrozenLake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import random 4 | 5 | env = gym.make('FrozenLake-v0') 6 | action_size = env.action_space.n 7 | state_size = env.observation_space.n 8 | 9 | qtable = np.zeros((state_size, action_size)) 10 | 11 | total_episodes = 1000 12 | learning_rate = 0.8 13 | max_steps = 99 14 | gamma = 0.95 15 | 16 | epsilon = 1.0 17 | max_epsilon = 1.0 18 | min_epsilon = 0.01 19 | decay_rate = 0.01 20 | 21 | rewards = [] 22 | for episode in range(total_episodes): 23 | state = env.reset() 24 | total_rewards = 0 25 | 26 | for step in range(max_steps): 27 | exp_exp_tradeoff = random.uniform(0, 1) 28 | if exp_exp_tradeoff > epsilon: 29 | action = np.argmax(qtable[state]) 30 | else: 31 | action = env.action_space.sample() 32 | 33 | new_state, reward, done, info = env.step(action) 34 | qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state]) - qtable[state, action]) 35 | 36 | state = new_state 37 | total_rewards += reward 38 | if done: break 39 | 40 | epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode+1)) 41 | rewards.append(total_rewards) 42 | 43 | print('[*] episode {}, total reward {}, average score {}'.format(episode, total_rewards, sum(rewards)/(episode+1))) 44 | 45 | print(qtable) 46 | 47 | # Play the game 48 | 49 | for episode in range(1): 50 | state = env.reset() 51 | print('*'*20) 52 | print('EPISODE ', episode) 53 | 54 | for step in range(max_steps): 55 | env.render() 56 | action = np.argmax(qtable[state]) 57 | input() 58 | state, reward, done, info = env.step(action) 59 | if done: break 60 | 61 | env.close() 62 | 63 | -------------------------------------------------------------------------------- /QLearning/QLearning_Taxi_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | import random 4 | 5 | env = gym.make("Taxi-v2") 6 | 7 | action_size = env.action_space.n 8 | state_size = env.observation_space.n 9 | qtable = np.zeros((state_size, action_size)) 10 | 11 | # Hyperparameters 12 | total_episodes = 50000 13 | total_test_episodes = 100 14 | max_steps = 99 15 | learning_rate = 0.7 16 | gamma = 0.618 17 | epsilon = 1.0 18 | max_epsilon = 1.0 19 | min_epsilon = 0.01 20 | decay_rate = 0.01 21 | 22 | # Train 23 | for episode in range(total_episodes): 24 | state = env.reset() 25 | 26 | for step in range(max_steps): 27 | exp_exp_tradeoff = random.uniform(0, 1) 28 | if exp_exp_tradeoff > epsilon: 29 | action = np.argmax(qtable[state, :]) 30 | else: 31 | action = env.action_space.sample() 32 | 33 | new_state, reward, done, info = env.step(action) 34 | qtable[state, action] += learning_rate * (reward + gamma * np.max(qtable[new_state, 
:]) - qtable[state, action]) 35 | 36 | state = new_state 37 | if done: break 38 | 39 | epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * (episode+1)) 40 | 41 | 42 | # Play the Game 43 | rewards = [] 44 | for episode in range(total_test_episodes): 45 | state = env.reset() 46 | total_rewards = 0 47 | 48 | print('='*20) 49 | print("[*] Episode", episode) 50 | print('='*20) 51 | 52 | for step in range(max_steps): 53 | env.render() 54 | action = np.argmax(qtable[state, :]) 55 | state, reward, done, info = env.step(action) 56 | total_rewards += reward 57 | 58 | if done: 59 | rewards.append(total_rewards) 60 | print('[*] Score', total_rewards) 61 | break 62 | 63 | env.close() 64 | print('[*] Average Score: ' + str(sum(rewards) / total_test_episodes)) -------------------------------------------------------------------------------- /QLearning/QLearning_TicTacToe.py: -------------------------------------------------------------------------------- 1 | import game 2 | import numpy as np 3 | import random 4 | 5 | class RandomPlayer(): 6 | def __init__(self): 7 | self.name = 'Random' 8 | self.win_n = 0 9 | 10 | def action(self, state, actions): 11 | return random.choice(actions) 12 | 13 | def reward(self, reward, state): 14 | if reward == 1: 15 | self.win_n += 1 16 | 17 | def episode_end(self, episode): 18 | pass 19 | 20 | class QLearningPlayer(): 21 | def __init__(self): 22 | self.name = 'Q-Learning' 23 | self.q = {} 24 | self.init_q = 1 # "optimistic" 1.0 initial values 25 | self.lr = 0.3 26 | self.gamma = 0.9 27 | self.epsilon = 1.0 28 | self.max_epsilon = 1.0 29 | self.min_epsilon = 0.01 30 | self.decay_rate = 0.01 31 | self.action_n = 9 32 | self.win_n = 0 33 | 34 | self.last_state = (' ',) * 9 35 | self.last_action = -1 36 | 37 | def action(self, state, actions): 38 | state = tuple(state) 39 | self.last_state = state 40 | 41 | r = random.uniform(0, 1) 42 | if r > self.epsilon: 43 | if self.q.get(state): 44 | i = np.argmax([self.q[state][a] for a in actions]) 45 | action = actions[i] 46 | else: 47 | self.q[state] = [self.init_q] * self.action_n 48 | action = random.choice(actions) 49 | else: 50 | action = random.choice(actions) 51 | 52 | self.last_action = action 53 | return action 54 | 55 | def reward(self, reward, state): 56 | if self.last_action >= 0: 57 | if reward == 1: 58 | self.win_n += 1 59 | 60 | state = tuple(state) 61 | if self.q.get(self.last_state): 62 | q = self.q[self.last_state][self.last_action] 63 | else: 64 | self.q[self.last_state] = [self.init_q] * self.action_n 65 | q = self.init_q 66 | 67 | self.q[self.last_state][self.last_action] = q + self.lr * (reward + self.gamma * np.max(self.q.get(state, [self.init_q]*self.action_n)) - q) 68 | 69 | def episode_end(self, episode): 70 | # epsilon decay 71 | self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_rate*(episode+1)) 72 | 73 | def print_q(self): 74 | for k,v in self.q.items(): 75 | print(k,v) 76 | 77 | class HumanPlayer(): 78 | def __init__(self): 79 | self.name = 'Human' 80 | 81 | def action(self, state, actions): 82 | a = int(input('your move:')) - 1 83 | return a 84 | 85 | 86 | def train(trails_num, p1, p2, env): 87 | for episode in range(trails_num): 88 | 89 | state, win, done, info = env.reset(X=p1, O=p2) 90 | 91 | for (cur_player, oth_player) in env.player_turn(): 92 | #env.render() 93 | action = cur_player.action(state, env.action_space) 94 | state, win, done, info = env.step(action) 95 | 96 | if done: 97 | if win: 98 | cur_player.reward(1, state) 99 | 
oth_player.reward(-1, state) 100 | else: 101 | cur_player.reward(0.5, state) 102 | oth_player.reward(0.5, state) 103 | #env.render() 104 | break 105 | else: 106 | oth_player.reward(0, state) 107 | 108 | env.playerX.episode_end(episode) 109 | env.playerO.episode_end(episode) 110 | 111 | print('='*20) 112 | print('Train result - %d episodes' % trails_num) 113 | print('{} win rate: {}'.format(p1.name, p1.win_n / trails_num)) 114 | print('{} win rate: {}'.format(p2.name, p2.win_n / trails_num)) 115 | print('players draw rate: {}'.format((trails_num - p1.win_n - p2.win_n) / trails_num)) 116 | print('='*20) 117 | 118 | 119 | def play(p1, p2, env): 120 | while 1: 121 | state, win, done, info = env.reset(X=p1, O=p2) 122 | for (cp, op) in env.player_turn(): 123 | print() 124 | env.render() 125 | action = cp.action(state, env.action_space) 126 | state, win, done, info = env.step(action) 127 | if done: 128 | env.render() 129 | break 130 | 131 | if __name__ == '__main__': 132 | env = game.make('TicTacToe') 133 | p1 = QLearningPlayer() 134 | p2 = QLearningPlayer() 135 | p3 = HumanPlayer() 136 | p4 = RandomPlayer() 137 | 138 | train(100000, p1, p4, env) 139 | print() 140 | print('Human play') 141 | print() 142 | 143 | play(p1, p3, env) 144 | -------------------------------------------------------------------------------- /QLearning/game.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | def make(game_name): 4 | if game_name == 'TicTacToe': 5 | return TicTacToe() 6 | 7 | class TicTacToe(): 8 | 9 | def __init__(self): 10 | self.reset() 11 | 12 | def render(self): 13 | line = '\n-----------\n' 14 | row = " {} | {} | {}" 15 | print((row + line + row + line + row).format(*self.state)) 16 | print(self.info) 17 | 18 | def step(self, action): 19 | #print(action) 20 | self.state[action] = self.cur_player 21 | self.action_space.remove(action) 22 | 23 | self.check_end() 24 | if self.is_end: 25 | if self.is_win: 26 | self.info = 'player{} win!'.format(self.cur_player) 27 | else: 28 | self.info = 'players draw' 29 | else: 30 | self.info = 'player{} turn'.format(self.cur_player) 31 | return (self.state, self.is_win, self.is_end, self.info) 32 | 33 | def reset(self, X=None, O=None): 34 | self.state = [' '] * 9 35 | self.action_space = list(range(9)) 36 | self.is_end = False 37 | self.is_win = False 38 | self.info = 'new game' 39 | self.playerX = X 40 | self.playerO = O 41 | self.cur_player = random.choice(['O','X']) 42 | return (self.state, self.is_win, self.is_end, self.info) 43 | 44 | def player_turn(self): 45 | while 1: 46 | if self.cur_player == 'O': 47 | cur = self.playerO 48 | oth = self.playerX 49 | else: 50 | cur = self.playerX 51 | oth = self.playerO 52 | 53 | self.info = 'player{} turn'.format(self.cur_player) 54 | yield (cur, oth) 55 | 56 | self.cur_player = 'OX'.replace(self.cur_player, '') 57 | 58 | def check_end(self): 59 | for a,b,c in [(0,1,2), (3,4,5), (6,7,8), 60 | (0,3,6), (1,4,7), (2,5,8), 61 | (0,4,8), (2,4,6)]: 62 | if self.cur_player == self.state[a] == self.state[b] == self.state[c]: 63 | self.is_win = True 64 | self.is_end = True 65 | return 66 | 67 | if not any([s == ' ' for s in self.state]): 68 | self.is_win = False 69 | self.is_end = True 70 | return 71 | 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Reinforcement Learning 2 | ====================== 3 | 4 | Reinforcing Your Learning of 
Reinforcement Learning. 5 | 6 | These are some notes I took and some code I wrote while learning reinforcement learning. I created this GitHub project mainly so that we can learn from and communicate with each other, and also to make it easier for others to find material on reinforcement learning. As for why I am learning reinforcement learning: mainly because I want to apply the AlphaZero approach (Monte Carlo tree search combined with deep learning) to RNA structure prediction. I have already made some attempts, for example searching for the secondary structure folding path of an RNA molecule. 7 | 8 | The first book I read was [Reinforcement Learning: An Introduction (Second edition)](http://incompleteideas.net/book/bookdraft2017nov5.pdf) by Richard S. Sutton and Andrew G. Barto. 9 | 10 | While reading the book, I also wrote some simple code based on articles found online, listed in order below. 11 | 12 | 13 | Table of contents 14 | ================= 15 | 16 | * [Q-Learning](#q-learning) 17 | * [Frozen Lake Game](#frozen-lake-game) 18 | * [Tic Tac Toe](#tic-tac-toe) 19 | * [Taxi v2](#taxi-v2) 20 | * [Deep Q-Learning Network (DQN)](#deep-q-learning-network) 21 | * [Doom Game](#doom-game) 22 | * [Atari Space Invaders](#atari-space-invaders) 23 | * [Dueling Double DQN & Prioritized Experience Replay](#dueling-double-dqn-and-prioritized-experience-replay) 24 | * [Doom Deadly Corridor](#doom-deadly-corridor) 25 | * [Policy Gradients (PG)](#policy-gradients) 26 | * [CartPole Game](#cartpole-game) 27 | * [Doom Deathmatch](#doom-deathmatch) 28 | * [Advantage Actor Critic (A2C)](#advantage-actor-critic) 29 | * [Asynchronous Advantage Actor Critic (A3C)](#asynchronous-advantage-actor-critic) 30 | * [Proximal Policy Optimization (PPO)](#proximal-policy-optimization) 31 | * [Half Cheetah](#half-cheetah) 32 | * [Deep Deterministic Policy Gradient (DDPG)](#deep-deterministic-policy-gradient) 33 | * [Ant](#ant) 34 | * [AlphaGoZero Introduction](#alphagozero-introduction) 35 | * [Monte Carlo Tree Search (MCTS)](#monte-carlo-tree-search) 36 | * [Gomoku](#gomoku) 37 | * [AlphaGomoku](#alphagomoku) 38 | * [RNA Folding Path](#rna-folding-path) 39 | * [Atari Game Roms](#atari-game-roms) 40 | 41 | 42 | Q-Learning 43 | ========== 44 | 45 | **Bellman equation:** 46 |  47 | 48 | Frozen Lake Game 49 | ---------------- 50 | 51 |