├── .gitignore ├── Alpha Toe.pdf ├── LICENSE ├── README.md ├── common ├── __init__.py ├── base_game_spec.py ├── benchmark.py └── network_helpers.py ├── connect_4 ├── __init__.py ├── network.py ├── position_connect_4_min_max_depth_6 ├── supervised.py ├── train_historical.py └── train_vs_min_max.py ├── games ├── __init__.py ├── connect_4.py ├── tic_tac_toe.py └── tic_tac_toe_x.py ├── policy_gradient.py ├── policy_gradient_historical_competition.py ├── requirements.txt ├── techniques ├── __init__.py ├── create_positions_set.py ├── min_max.py ├── monte_carlo.py ├── monte_carlo_uct_with_value.py ├── train_policy_gradient.py ├── train_policy_gradient_historic.py ├── train_supervised.py └── train_value_network.py ├── tests ├── __init__.py ├── common │ ├── __init__.py │ └── test_network_helpers.py ├── games │ ├── __init__.py │ ├── test_connect_4.py │ └── test_tic_tac_toe_x.py └── techniques │ ├── __init__.py │ ├── test_create_positions_set.py │ ├── test_min_max.py │ ├── test_train_policy_gradient.py │ └── test_train_policy_gradient_historic.py ├── tic_tac_toe_5_4 ├── __init__.py ├── network.py ├── position_tic_tac_toe_5_4_min_max_depth_6 ├── supervised.py ├── train_historical.py └── train_vs_min_max.py └── value_network.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/Alpha Toe.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/Alpha Toe.pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2016 Daniel Slater
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AlphaToe
2 | Applying the deep learning techniques from AlphaGo to play tic-tac-toe
3 | 
4 | These are the code examples to accompany my talk; the slides are in Alpha Toe.pdf
5 | 
6 | As well as the slides, the file policy_gradient.py is a good starting point for the project. All networks are
7 | built using TensorFlow.
8 | 
9 | ## Setup
10 | 
11 | To get running, start by creating a virtualenv/conda env with TensorFlow installed.
Current instructions for this are
12 | at: https://www.tensorflow.org/versions/r0.11/get_started/os_setup.html#anaconda-installation
13 | 
14 | I've also found this useful: https://anaconda.org/jjhelmus/tensorflow
15 | 
16 | Then run the file policy_gradient.py
17 | 
18 | This has been tested with Python 2.7 and 3.5
--------------------------------------------------------------------------------
/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/common/__init__.py
--------------------------------------------------------------------------------
/common/base_game_spec.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import random
3 | from functools import reduce
4 | 
5 | 
6 | class BaseGameSpec(object):
7 |     def __init__(self):
8 |         """Abstract base class for the specification for running/training on a game.
9 | 
10 |         Examples:
11 |             spec = TicTacToeGameSpec()
12 |             result = spec.play_game(func_a, func_b)
13 |         """
14 |         raise NotImplementedError('This is an abstract base class')
15 | 
16 |     def new_board(self):
17 |         raise NotImplementedError()
18 | 
19 |     def apply_move(self, board_state, move, side):
20 |         raise NotImplementedError()
21 | 
22 |     def available_moves(self, board_state):
23 |         raise NotImplementedError()
24 | 
25 |     def has_winner(self, board_state):
26 |         raise NotImplementedError()
27 | 
28 |     def evaluate(self, board_state):
29 |         """An evaluation function for this game, gives an estimate of how good the board position is for the plus player.
30 |         There is no specific range for the values returned, they just need to be relative to each other.
31 | 
32 |         Args:
33 |             board_state (tuple): State of the board
34 | 
35 |         Returns:
36 |             number
37 |         """
38 |         raise NotImplementedError()
39 | 
40 |     def board_dimensions(self):
41 |         """Returns the dimensions of the board for this game
42 | 
43 |         Returns:
44 |             tuple of ints: one int for each dimension of the board, this will normally be 2 ints
45 |         """
46 |         raise NotImplementedError()
47 | 
48 |     def board_squares(self):
49 |         """The number of squares on the board. This can be used for the number of input nodes to a network.
50 | 
51 |         Returns:
52 |             int
53 |         """
54 |         return reduce(operator.mul, self.board_dimensions(), 1)
55 | 
56 |     def outputs(self):
57 |         """The number of moves that could be made in this kind of game, whether or not they are legal. For most games
58 |         this will be every single square on the board, but for connect 4 this is different. If we wanted to do chess in
59 |         the future this method may need to get a bit more complicated.
60 | 
61 |         Returns:
62 |             int
63 |         """
64 |         return self.board_squares()
65 | 
66 |     def flat_move_to_tuple(self, move_index):
67 |         """If the board is 2d then we return a tuple for where we moved to.
68 | e.g if the board is a 3x3 size and our move_index was 6 then 69 | this method will return (2, 0) 70 | 71 | Args: 72 | move_index (int): The index of the square we moved to 73 | 74 | Returns: 75 | tuple or int: For where we moved in board coordinates 76 | """ 77 | if len(self.board_dimensions()) == 1: 78 | return move_index 79 | 80 | board_x = self.board_dimensions()[0] 81 | return int(move_index / board_x), move_index % board_x 82 | 83 | def tuple_move_to_flat(self, tuple_move): 84 | """Does the inverse operation to flat_move_to_tuple 85 | 86 | Args: 87 | tuple_move (tuple): 88 | 89 | Returns: 90 | int : 91 | """ 92 | if len(self.board_dimensions()) == 1: 93 | return tuple_move[0] 94 | else: 95 | return tuple_move[0] * self.board_dimensions()[0] + tuple_move[1] 96 | 97 | def play_game(self, plus_player_func, minus_player_func, log=False, board_state=None): 98 | """Run a single game of until the end, using the provided function args to determine the moves for each 99 | player. 100 | 101 | Args: 102 | plus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 103 | current board_state and side this player is playing, and returns the move the player wants to play. 104 | minus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 105 | current board_state and side this player is playing, and returns the move the player wants to play. 106 | log (bool): If True progress is logged to console, defaults to False 107 | board_state: Optionally have the game start from this position, rather than from a new board 108 | 109 | Returns: 110 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 111 | """ 112 | board_state = board_state or self.new_board() 113 | player_turn = 1 114 | 115 | while True: 116 | _available_moves = list(self.available_moves(board_state)) 117 | 118 | if len(_available_moves) == 0: 119 | # draw 120 | if log: 121 | print("no moves left, game ended a draw") 122 | return 0. 
123 | if player_turn > 0: 124 | move = plus_player_func(board_state, 1) 125 | else: 126 | move = minus_player_func(board_state, -1) 127 | 128 | if move not in _available_moves: 129 | # if a player makes an invalid move the other player wins 130 | if log: 131 | print("illegal move ", move) 132 | return -player_turn 133 | 134 | board_state = self.apply_move(board_state, move, player_turn) 135 | if log: 136 | print(board_state) 137 | 138 | winner = self.has_winner(board_state) 139 | if winner != 0: 140 | if log: 141 | print("we have a winner, side: %s" % player_turn) 142 | return winner 143 | player_turn = -player_turn 144 | 145 | def get_random_player_func(self): 146 | """Return a function that makes moves for the current game by choosing a move randomly 147 | NOTE: this move returns the function that makes the random move so should be used like so: 148 | Examples: 149 | self.play_game(self.get_random_player_func(), self.get_random_player_func()) 150 | 151 | Returns: 152 | board_state, side (int) -> move : function that plays this game by making random moves 153 | """ 154 | return lambda board_state, side: random.choice(list(self.available_moves(board_state))) 155 | -------------------------------------------------------------------------------- /common/benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plays games against a variety of algorithms to see how good a network is 3 | """ 4 | import functools 5 | 6 | import tensorflow as tf 7 | 8 | from common.network_helpers import load_network, get_deterministic_network_move 9 | from techniques.min_max import min_max_alpha_beta 10 | from techniques.monte_carlo import monte_carlo_tree_search_uct 11 | 12 | 13 | def benchmark(game_spec, network_file_path, create_network_func, log_games=False, games_vs_random=500): 14 | """Plays games against a variety of algorithms to see how good a network is. Results are currently just 15 | printed to std out 16 | 17 | Args: 18 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 19 | create_network_func (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 20 | Method that creates the network we will train. 
21 |         network_file_path (str): path to the file with weights we want to load for this network
22 |         log_games (bool): If True print all positions from all games played
23 |         games_vs_random (int): Number of games to play vs random opponents
24 |     """
25 |     input_layer, output_layer, variables = create_network_func()
26 | 
27 |     with tf.Session() as session:
28 |         session.run(tf.global_variables_initializer())
29 |         load_network(session, variables, network_file_path)
30 | 
31 |         def make_move(board_state, side):
32 |             move = get_deterministic_network_move(session, input_layer, output_layer, board_state, side,
33 |                                                   valid_only=True, game_spec=game_spec)
34 |             return game_spec.flat_move_to_tuple(move.argmax())
35 | 
36 |         def min_max_move_func(board_state, side, depth):
37 |             return min_max_alpha_beta(game_spec, board_state, side, depth)[1]
38 | 
39 |         def monte_carlo_move_func(board_state, side):
40 |             return monte_carlo_tree_search_uct(game_spec, board_state, side, 100000)[1]
41 | 
42 |         results = []
43 |         for _ in range(int(games_vs_random / 2)):
44 |             result = game_spec.play_game(make_move,
45 |                                          game_spec.get_random_player_func(),
46 |                                          log=log_games)
47 |             results.append(result)
48 |             result = game_spec.play_game(
49 |                 game_spec.get_random_player_func(),
50 |                 make_move, log=log_games)
51 |             results.append(-result)
52 | 
53 |         print("*** results vs random = %s" % (sum(results),))
54 | 
55 |         results = []
56 |         for _ in range(1):
57 |             result = game_spec.play_game(make_move,
58 |                                          functools.partial(min_max_move_func, depth=2), log=log_games)
59 |             results.append(result)
60 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=2),
61 |                                          make_move, log=log_games)
62 |             results.append(-result)
63 | 
64 |         print("*** results vs min max depth 2 = %s" % (sum(results),))
65 | 
66 |         results = []
67 |         for _ in range(1):
68 |             result = game_spec.play_game(make_move,
69 |                                          functools.partial(min_max_move_func, depth=4), log=log_games)
70 |             results.append(result)
71 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=4),
72 |                                          make_move, log=log_games)
73 |             results.append(-result)
74 | 
75 |         print("*** results vs min max depth 4 = %s" % (sum(results),))
76 | 
77 |         results = []
78 |         for _ in range(1):
79 |             result = game_spec.play_game(make_move,
80 |                                          functools.partial(min_max_move_func, depth=6), log=log_games)
81 |             results.append(result)
82 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=6),
83 |                                          make_move, log=log_games)
84 |             results.append(-result)
85 | 
86 |         print("*** results vs min max depth 6 = %s" % (sum(results),))
87 | 
88 |         results = []
89 |         for _ in range(1):
90 |             result = game_spec.play_game(make_move,
91 |                                          functools.partial(min_max_move_func, depth=8), log=log_games)
92 |             results.append(result)
93 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=8),
94 |                                          make_move, log=log_games)
95 |             results.append(-result)
96 | 
97 |         print("*** results vs min max depth 8 = %s" % (sum(results),))
98 | 
99 |         results = []
100 |         for _ in range(1):
101 |             result = game_spec.play_game(make_move,
102 |                                          monte_carlo_move_func, log=log_games)
103 |             results.append(result)
104 |             result = game_spec.play_game(monte_carlo_move_func,
105 |                                          make_move, log=log_games)
106 |             results.append(-result)
107 | 
108 |         print("*** results vs monte carlo uct 100000 = %s" % (sum(results),))
109 | 
--------------------------------------------------------------------------------
/common/network_helpers.py:
--------------------------------------------------------------------------------
1 | import
operator 2 | import pickle 3 | from functools import reduce 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | def create_network(input_nodes, hidden_nodes, output_nodes=None, output_softmax=True): 10 | """Create a network with relu activations at each layer 11 | 12 | Args: 13 | output_nodes: (int): Number of output nodes, if None then number of input nodes is used 14 | input_nodes (int or tuple(int)): The size of the board this network will work on. The output layer will also be 15 | this size if not specified. Can be an int if 1d or a tuple of ints for a 2d+ dim board 16 | hidden_nodes ([int]): The number of hidden nodes in each hidden layer 17 | output_softmax (bool): If True softmax is used in the final layer, otherwise just use the activation with no 18 | non-linearity function 19 | 20 | Returns: 21 | (input_layer, output_layer, [variables]) : The final item in the tuple is a list containing all the parameters, 22 | wieghts and biases used in this network 23 | """ 24 | output_nodes = output_nodes or input_nodes 25 | 26 | variables = [] 27 | 28 | with tf.name_scope('network'): 29 | if isinstance(input_nodes, tuple): 30 | input_layer = tf.placeholder("float", (None,) + input_nodes) 31 | flat_size = reduce(operator.mul, input_nodes, 1) 32 | current_layer = tf.reshape(input_layer, (-1, flat_size)) 33 | else: 34 | input_layer = tf.placeholder("float", (None, input_nodes)) 35 | current_layer = input_layer 36 | 37 | for hidden_nodes in hidden_nodes: 38 | last_layer_nodes = int(current_layer.get_shape()[-1]) 39 | hidden_weights = tf.Variable( 40 | tf.truncated_normal((last_layer_nodes, hidden_nodes), stddev=1. / np.sqrt(last_layer_nodes)), 41 | name='weights') 42 | hidden_bias = tf.Variable(tf.constant(0.01, shape=(hidden_nodes,)), name='biases') 43 | 44 | variables.append(hidden_weights) 45 | variables.append(hidden_bias) 46 | 47 | current_layer = tf.nn.relu( 48 | tf.matmul(current_layer, hidden_weights) + hidden_bias) 49 | 50 | if isinstance(output_nodes, tuple): 51 | output_nodes = reduce(operator.mul, input_nodes, 1) 52 | 53 | # for some reason having output std divided by np.sqrt(output_nodes) massively outperforms np.sqrt(hidden_nodes) 54 | output_weights = tf.Variable( 55 | tf.truncated_normal((hidden_nodes, output_nodes), stddev=1. / np.sqrt(output_nodes)), name="output_weights") 56 | output_bias = tf.Variable(tf.constant(0.01, shape=(output_nodes,)), name="output_bias") 57 | 58 | variables.append(output_weights) 59 | variables.append(output_bias) 60 | 61 | output_layer = tf.matmul(current_layer, output_weights) + output_bias 62 | if output_softmax: 63 | output_layer = tf.nn.softmax(output_layer) 64 | 65 | return input_layer, output_layer, variables 66 | 67 | 68 | def save_network(session, tf_variables, file_path): 69 | """Save the given set of variables to the given file using the given session 70 | 71 | Args: 72 | session (tf.Session): session within which the variables has been initialised 73 | tf_variables (list of tf.Variable): list of variables which will be saved to the file 74 | file_path (str): path of the file we want to save to. 
75 | """ 76 | variable_values = session.run(tf_variables) 77 | with open(file_path, mode='wb') as f: 78 | pickle.dump(variable_values, f) 79 | 80 | 81 | def load_network(session, tf_variables, file_path): 82 | """Load the given set of variables from the given file using the given session 83 | 84 | Args: 85 | session (tf.Session): session within which the variables has been initialised 86 | tf_variables (list of tf.Variable): list of variables which will set up with the values saved to the file. List 87 | order matters, in must be the exact same order as was used to save and all of the same shape. 88 | file_path (str): path of the file we want to load from. 89 | """ 90 | with open(file_path, mode='rb') as f: 91 | variable_values = pickle.load(f) 92 | 93 | try: 94 | if len(variable_values) != len(tf_variables): 95 | raise ValueError("Network in file had different structure, variables in file: %s variables in memeory: %s" 96 | % (len(variable_values), len(tf_variables))) 97 | for value, tf_variable in zip(variable_values, tf_variables): 98 | session.run(tf_variable.assign(value)) 99 | except ValueError as ex: 100 | # TODO: maybe raise custom exception 101 | raise ValueError("""Tried to load network file %s with different architecture from the in memory network. 102 | Error was %s 103 | Either delete the network file to train a new network from scratch or change the in memory network to match that dimensions of the one in the file""" % (file_path, ex)) 104 | 105 | 106 | def invert_board_state(board_state): 107 | """Returns the board state inverted, so all 1 are replaced with -1 and visa-versa 108 | 109 | Args: 110 | board_state (tuple of tuple of ints): The board we want to invert 111 | 112 | Returns: 113 | (tuple of tuple of ints) The board state for the other player 114 | """ 115 | return tuple(tuple(-board_state[j][i] for i in range(len(board_state[0]))) for j in range(len(board_state))) 116 | 117 | 118 | def get_stochastic_network_move(session, input_layer, output_layer, board_state, side, 119 | valid_only=False, game_spec=None): 120 | """Choose a move for the given board_state using a stocastic policy. A move is selected using the values from the 121 | output_layer as a categorical probability distribution to select a single move 122 | 123 | Args: 124 | session (tf.Session): Session used to run this network 125 | input_layer (tf.Placeholder): Placeholder to the network used to feed in the board_state 126 | output_layer (tf.Tensor): Tensor that will output the probabilities of the moves, we expect this to be of 127 | dimesensions (None, board_squares) and the sum of values across the board_squares to be 1. 128 | board_state: The board_state we want to get the move for. 129 | side: The side that is making the move. 130 | 131 | Returns: 132 | (np.array) It's shape is (board_squares), and it is a 1 hot encoding for the move the network has chosen. 
133 | """ 134 | np_board_state = np.array(board_state) 135 | if side == -1: 136 | np_board_state = -np_board_state 137 | 138 | np_board_state = np_board_state.reshape(1, *input_layer.get_shape().as_list()[1:]) 139 | probability_of_actions = session.run(output_layer, 140 | feed_dict={input_layer: np_board_state})[0] 141 | 142 | if valid_only: 143 | available_moves = list(game_spec.available_moves(board_state)) 144 | if len(available_moves) == 1: 145 | move = np.zeros(game_spec.board_squares()) 146 | np.put(move, game_spec.tuple_move_to_flat(available_moves[0]), 1) 147 | return move 148 | available_moves_flat = [game_spec.tuple_move_to_flat(x) for x in available_moves] 149 | for i in range(game_spec.board_squares()): 150 | if i not in available_moves_flat: 151 | probability_of_actions[i] = 0. 152 | 153 | prob_mag = sum(probability_of_actions) 154 | if prob_mag != 0.: 155 | probability_of_actions /= sum(probability_of_actions) 156 | 157 | try: 158 | move = np.random.multinomial(1, probability_of_actions) 159 | except ValueError: 160 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 161 | # so need to reduce slightly to be a valid value 162 | move = np.random.multinomial(1, probability_of_actions / (1. + 1e-6)) 163 | 164 | return move 165 | 166 | 167 | def get_deterministic_network_move(session, input_layer, output_layer, board_state, side, valid_only=False, 168 | game_spec=None): 169 | """Choose a move for the given board_state using a deterministic policy. A move is selected using the values from 170 | the output_layer and selecting the move with the highest score. 171 | 172 | Args: 173 | session (tf.Session): Session used to run this network 174 | input_layer (tf.Placeholder): Placeholder to the network used to feed in the board_state 175 | output_layer (tf.Tensor): Tensor that will output the probabilities of the moves, we expect this to be of 176 | dimesensions (None, board_squares). 177 | board_state: The board_state we want to get the move for. 178 | side: The side that is making the move. 179 | 180 | Returns: 181 | (np.array) It's shape is (board_squares), and it is a 1 hot encoding for the move the network has chosen. 182 | """ 183 | np_board_state = np.array(board_state) 184 | np_board_state = np_board_state.reshape(1, *input_layer.get_shape().as_list()[1:]) 185 | if side == -1: 186 | np_board_state = -np_board_state 187 | 188 | probability_of_actions = session.run(output_layer, 189 | feed_dict={input_layer: np_board_state})[0] 190 | 191 | if valid_only: 192 | available_moves = game_spec.available_moves(board_state) 193 | available_moves_flat = [game_spec.tuple_move_to_flat(x) for x in available_moves] 194 | for i in range(game_spec.board_squares()): 195 | if i not in available_moves_flat: 196 | probability_of_actions[i] = 0 197 | 198 | move = np.argmax(probability_of_actions) 199 | one_hot = np.zeros(len(probability_of_actions)) 200 | one_hot[move] = 1. 
201 | return one_hot 202 | -------------------------------------------------------------------------------- /connect_4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/connect_4/__init__.py -------------------------------------------------------------------------------- /connect_4/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from games.connect_4 import Connect4GameSpec 4 | 5 | connect_4_game_spec = Connect4GameSpec() 6 | 7 | 8 | def create_convolutional_network(): 9 | input_layer = tf.input_layer = tf.placeholder("float", 10 | (None,) + connect_4_game_spec.board_dimensions() + (1,)) 11 | CONVOLUTIONS_LAYER_1 = 64 12 | CONVOLUTIONS_LAYER_2 = 64 13 | CONVOLUTIONS_LAYER_3 = 64 14 | CONVOLUTIONS_LAYER_4 = 64 15 | CONVOLUTIONS_LAYER_5 = 64 16 | FLAT_SIZE = 7 * 6 * CONVOLUTIONS_LAYER_2 17 | 18 | convolution_weights_1 = tf.Variable(tf.truncated_normal([3, 3, 1, CONVOLUTIONS_LAYER_1], stddev=0.01)) 19 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_1])) 20 | 21 | convolution_weights_2 = tf.Variable( 22 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_1, CONVOLUTIONS_LAYER_2], stddev=0.01)) 23 | convolution_bias_2 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 24 | 25 | convolution_weights_3 = tf.Variable( 26 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_2, CONVOLUTIONS_LAYER_3], stddev=0.01)) 27 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_3])) 28 | 29 | convolution_weights_4 = tf.Variable( 30 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_3, CONVOLUTIONS_LAYER_4], stddev=0.01)) 31 | convolution_bias_4 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_4])) 32 | 33 | convolution_weights_5 = tf.Variable( 34 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_4, CONVOLUTIONS_LAYER_5], stddev=0.01)) 35 | convolution_bias_5 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_5])) 36 | 37 | # feed_forward_weights_1 = tf.Variable(tf.truncated_normal([FLAT_SIZE, FLAT_HIDDEN_NODES], stddev=0.01)) 38 | # feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[FLAT_HIDDEN_NODES])) 39 | 40 | feed_forward_weights_2 = tf.Variable( 41 | tf.truncated_normal([FLAT_SIZE, connect_4_game_spec.outputs()], stddev=0.01)) 42 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[connect_4_game_spec.outputs()])) 43 | 44 | hidden_convolutional_layer_1 = tf.nn.relu( 45 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 1, 1, 1], padding="SAME") + convolution_bias_1) 46 | 47 | hidden_convolutional_layer_2 = tf.nn.relu( 48 | tf.nn.conv2d(hidden_convolutional_layer_1, convolution_weights_2, strides=[1, 1, 1, 1], 49 | padding="SAME") + convolution_bias_2) 50 | 51 | hidden_convolutional_layer_3 = tf.nn.relu( 52 | tf.nn.conv2d(hidden_convolutional_layer_2, convolution_weights_3, strides=[1, 1, 1, 1], 53 | padding="SAME") + convolution_bias_3) 54 | 55 | hidden_convolutional_layer_4 = tf.nn.relu( 56 | tf.nn.conv2d(hidden_convolutional_layer_3, convolution_weights_4, strides=[1, 1, 1, 1], 57 | padding="SAME") + convolution_bias_4) 58 | 59 | hidden_convolutional_layer_5 = tf.nn.relu( 60 | tf.nn.conv2d(hidden_convolutional_layer_4, convolution_weights_5, strides=[1, 1, 1, 1], 61 | padding="SAME") + convolution_bias_5) 62 | 63 | hidden_convolutional_layer_3_flat = 
tf.reshape(hidden_convolutional_layer_5, [-1, FLAT_SIZE]) 64 | 65 | # final_hidden_activations = tf.nn.relu( 66 | # tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 67 | 68 | output_layer = tf.nn.softmax(tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_2) + feed_forward_bias_2) 69 | 70 | return input_layer, output_layer, [convolution_weights_1, convolution_bias_1, 71 | convolution_weights_2, convolution_bias_2, 72 | convolution_weights_3, convolution_bias_3, 73 | convolution_weights_4, convolution_bias_4, 74 | convolution_weights_5, convolution_bias_5, 75 | # feed_forward_weights_1, feed_forward_bias_1, 76 | feed_forward_weights_2, feed_forward_bias_2] 77 | -------------------------------------------------------------------------------- /connect_4/position_connect_4_min_max_depth_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/connect_4/position_connect_4_min_max_depth_6 -------------------------------------------------------------------------------- /connect_4/supervised.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | from techniques.train_supervised import train_supervised 6 | from connect_4.network import connect_4_game_spec, create_convolutional_network 7 | 8 | with open("position_connect_4_min_max_depth_6", 'rb') as f: 9 | positions = pickle.load(f) 10 | 11 | # for now we need to reshape input for convolutions and one hot the move responses 12 | # this is the kind of stuff I need to clean up in overall design 13 | for i in range(len(positions)): 14 | one_hot = np.zeros(connect_4_game_spec.outputs()) 15 | np.put(one_hot, positions[i][1], 1) 16 | positions[i] = np.array(positions[i][0]).reshape(connect_4_game_spec.board_dimensions()[0], 17 | connect_4_game_spec.board_dimensions()[1], 18 | 1), one_hot 19 | 20 | # for convolutional_layers in [3, 4, 5, 6]: 21 | # for convolutional_channels in [48, 64, 80, 96]: 22 | 23 | train_supervised(connect_4_game_spec, create_convolutional_network, 'convolutional_net_l_c_5_f_1_other.p', 24 | positions, 25 | regularization_coefficent=1e-3, 26 | learn_rate=5e-5) -------------------------------------------------------------------------------- /connect_4/train_historical.py: -------------------------------------------------------------------------------- 1 | from connect_4.network import create_convolutional_network, connect_4_game_spec 2 | from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic 3 | 4 | 5 | train_policy_gradients_vs_historic(connect_4_game_spec, create_convolutional_network, 6 | 'convolutional_net_5_4_l_c_4_f_1_other_after_1.p', 7 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_2.p', 8 | number_of_games=50000, 9 | print_results_every=500, 10 | save_historic_every=8000) -------------------------------------------------------------------------------- /connect_4/train_vs_min_max.py: -------------------------------------------------------------------------------- 1 | from techniques.min_max import min_max_alpha_beta 2 | from techniques.train_policy_gradient import train_policy_gradients 3 | from connect_4.network import connect_4_game_spec, create_convolutional_network 4 | 5 | 6 | def min_max_move_func(board_state, side): 7 | return min_max_alpha_beta(connect_4_game_spec, board_state, side, 3)[1] 8 | 9 | 10 | 
train_policy_gradients(connect_4_game_spec, create_convolutional_network, 11 | 'convolutional_net_5_4_l_c_4_f_1_other_after.p', 12 | opponent_func=min_max_move_func, 13 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_vs_depth_3.p', 14 | number_of_games=5000, 15 | print_results_every=100) -------------------------------------------------------------------------------- /games/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/games/__init__.py -------------------------------------------------------------------------------- /games/connect_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of connect 4 on a board_width, board_height and winning length can be specified in relevant 3 | methods. Allowing you to play connect 5, 6, 7, etc. Defaults are board_width = 7, board_height = 6, winning_length = 4 4 | 5 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 6 | where each player plays. 7 | The board is represented by a board_width x board_height tuple of ints. A 0 means no player has played in a space, 1 8 | means player one has played there, -1 means the seconds player has played there. The apply_move method can be used to 9 | return a copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 10 | """ 11 | 12 | import random 13 | 14 | from common.base_game_spec import BaseGameSpec 15 | from games.tic_tac_toe_x import evaluate 16 | 17 | 18 | def _new_board(board_width, board_height): 19 | """Return a emprty tic-tac-toe board we can use for simulating a game. 20 | 21 | Args: 22 | board_width (int): The width of the board, a board_width * board_height board is created 23 | board_height (int): The height of the board, a board_width * board_height board is created 24 | 25 | Returns: 26 | board_width x board_height tuple of ints 27 | """ 28 | return tuple(tuple(0 for _ in range(board_height)) for _ in range(board_width)) 29 | 30 | 31 | def apply_move(board_state, move_x, side): 32 | """Returns a copy of the given board_state with the desired move applied. 33 | 34 | Args: 35 | board_state (2d tuple of int): The given board_state we want to apply the move to. 36 | move_x (int): Which column we are going to "drop" our piece in 37 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 38 | 39 | Returns: 40 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 41 | """ 42 | # find position in which move will settle 43 | move_y = 0 44 | for x in board_state[move_x]: 45 | if x == 0: 46 | break 47 | else: 48 | move_y += 1 49 | 50 | def get_tuples(): 51 | for i in range(len(board_state)): 52 | if move_x == i: 53 | temp = list(board_state[i]) 54 | temp[move_y] = side 55 | yield tuple(temp) 56 | else: 57 | yield board_state[i] 58 | 59 | return tuple(get_tuples()) 60 | 61 | 62 | def available_moves(board_state): 63 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 64 | pieces played. 65 | 66 | Args: 67 | board_state: The board_state we want to check for valid moves. 68 | 69 | Returns: 70 | Generator of int: All the valid moves that can be played in this position. 
71 | """ 72 | for x in range(len(board_state)): 73 | if any(y == 0 for y in board_state[x]): 74 | yield x 75 | 76 | 77 | def _has_winning_line(line, winning_length): 78 | count = 0 79 | last_side = 0 80 | for x in line: 81 | if x == last_side: 82 | count += 1 83 | if count == winning_length: 84 | return last_side 85 | else: 86 | count = 1 87 | last_side = x 88 | return 0 89 | 90 | 91 | def has_winner(board_state, winning_length=4): 92 | """Determine if a player has won on the given board_state. 93 | 94 | Args: 95 | board_state (2d tuple of int): The current board_state we want to evaluate. 96 | winning_length (int): The number of moves in a row needed for a win. 97 | 98 | Returns: 99 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 100 | """ 101 | board_width = len(board_state) 102 | board_height = len(board_state[0]) 103 | 104 | # check rows 105 | for x in range(board_width): 106 | winner = _has_winning_line(board_state[x], winning_length) 107 | if winner != 0: 108 | return winner 109 | # check columns 110 | for y in range(board_height): 111 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 112 | if winner != 0: 113 | return winner 114 | 115 | # check diagonals 116 | diagonals_start = -(board_width - winning_length) 117 | diagonals_end = (board_width - winning_length) 118 | for d in range(diagonals_start, diagonals_end+1): 119 | winner = _has_winning_line( 120 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 121 | winning_length) 122 | if winner != 0: 123 | return winner 124 | for d in range(diagonals_start, diagonals_end+1): 125 | winner = _has_winning_line( 126 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 127 | winning_length) 128 | if winner != 0: 129 | return winner 130 | 131 | return 0 # no one has won, return 0 for a draw 132 | 133 | 134 | def play_game(plus_player_func, minus_player_func, board_width=7, board_height=6, winning_length=4, log=False): 135 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 136 | player. 137 | 138 | Args: 139 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 140 | Function that takes the current board_state and side this player is playing, and returns the move the player 141 | wants to play. 142 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 143 | Function that takes the current board_state and side this player is playing, and returns the move the player 144 | wants to play. 145 | board_width (int): The width of the board, a board_width * board_height board is created 146 | board_height (int): The height of the board, a board_width * board_height board is created 147 | winning_length (int): The number of pieces in a row needed to win a game. 148 | log (bool): If True progress is logged to console, defaults to False 149 | 150 | Returns: 151 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 152 | """ 153 | board_state = _new_board(board_width, board_height) 154 | player_turn = 1 155 | 156 | while True: 157 | _avialable_moves = list(available_moves(board_state)) 158 | if len(_avialable_moves) == 0: 159 | # draw 160 | if log: 161 | print("no moves left, game ended a draw") 162 | return 0. 
163 | if player_turn > 0: 164 | move = plus_player_func(board_state, 1) 165 | else: 166 | move = minus_player_func(board_state, -1) 167 | 168 | if move not in _avialable_moves: 169 | # if a player makes an invalid move the other player wins 170 | if log: 171 | print("illegal move ", move) 172 | return -player_turn 173 | 174 | board_state = apply_move(board_state, move, player_turn) 175 | if log: 176 | print(board_state) 177 | 178 | winner = has_winner(board_state, winning_length) 179 | if winner != 0: 180 | if log: 181 | print("we have a winner, side: %s" % player_turn) 182 | return winner 183 | player_turn = -player_turn 184 | 185 | 186 | def random_player(board_state, _): 187 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 188 | valid moves in the current state. 189 | 190 | Args: 191 | board_state (2d tuple of int): The current state of the board 192 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 193 | 194 | Returns: 195 | (int, int): the move we want to play on the current board 196 | """ 197 | moves = list(available_moves(board_state)) 198 | return random.choice(moves) 199 | 200 | 201 | class Connect4GameSpec(BaseGameSpec): 202 | def __init__(self, board_width=7, board_height=6, winning_length=4): 203 | self._board_height = board_height 204 | self._board_width = board_width 205 | self._winning_length = winning_length 206 | self.available_moves = available_moves 207 | self.apply_move = apply_move 208 | 209 | def new_board(self): 210 | return _new_board(self._board_width, self._board_height) 211 | 212 | def has_winner(self, board_state): 213 | return has_winner(board_state, self._winning_length) 214 | 215 | def board_dimensions(self): 216 | return self._board_width, self._board_height 217 | 218 | def flat_move_to_tuple(self, move_index): 219 | return move_index 220 | 221 | def outputs(self): 222 | return self._board_width * self._board_height 223 | 224 | def evaluate(self, board_state): 225 | return evaluate(board_state, self._winning_length) 226 | 227 | if __name__ == '__main__': 228 | # example of playing a game 229 | play_game(random_player, random_player, log=True, board_width=7, board_height=6, winning_length=4) 230 | -------------------------------------------------------------------------------- /games/tic_tac_toe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a 3 by 3 board. 3 | Two players take turns making moves on squares of the board, the first to get 3 in a row, including diagonals, wins. If 4 | there are no valid moves left to make the game ends a draw. 5 | 6 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 7 | where each player plays. 8 | The board is represented by a 3 x 3 tuple of ints. A 0 means no player has played in a space, 1 means player one has 9 | played there, -1 means the seconds player has played there. The apply_move method can be used to return a copy of a 10 | given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 11 | """ 12 | import itertools 13 | import random 14 | 15 | from common.base_game_spec import BaseGameSpec 16 | from techniques.min_max import evaluate 17 | 18 | 19 | def _new_board(): 20 | """Return a emprty tic-tac-toe board we can use for simulating a game. 
21 | 22 | Returns: 23 | 3x3 tuple of ints 24 | """ 25 | return ((0, 0, 0), 26 | (0, 0, 0), 27 | (0, 0, 0)) 28 | 29 | 30 | def apply_move(board_state, move, side): 31 | """Returns a copy of the given board_state with the desired move applied. 32 | 33 | Args: 34 | board_state (3x3 tuple of int): The given board_state we want to apply the move to. 35 | move (int, int): The position we want to make the move in. 36 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 37 | 38 | Returns: 39 | (3x3 tuple of int): A copy of the board_state with the given move applied for the given side. 40 | """ 41 | move_x, move_y = move 42 | 43 | def get_tuples(): 44 | for x in range(3): 45 | if move_x == x: 46 | temp = list(board_state[x]) 47 | temp[move_y] = side 48 | yield tuple(temp) 49 | else: 50 | yield board_state[x] 51 | 52 | return tuple(get_tuples()) 53 | 54 | 55 | def available_moves(board_state): 56 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 57 | pieces played. 58 | 59 | Args: 60 | board_state: The board_state we want to check for valid moves. 61 | 62 | Returns: 63 | Generator of (int, int): All the valid moves that can be played in this position. 64 | """ 65 | for x, y in itertools.product(range(3), range(3)): 66 | if board_state[x][y] == 0: 67 | yield (x, y) 68 | 69 | 70 | def _has_3_in_a_line(line): 71 | return all(x == -1 for x in line) | all(x == 1 for x in line) 72 | 73 | 74 | def has_winner(board_state): 75 | """Determine if a player has won on the given board_state. 76 | 77 | Args: 78 | board_state (3x3 tuple of int): The current board_state we want to evaluate. 79 | 80 | Returns: 81 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 82 | """ 83 | # check rows 84 | for x in range(3): 85 | if _has_3_in_a_line(board_state[x]): 86 | return board_state[x][0] 87 | # check columns 88 | for y in range(3): 89 | if _has_3_in_a_line([i[y] for i in board_state]): 90 | return board_state[0][y] 91 | 92 | # check diagonals 93 | if _has_3_in_a_line([board_state[i][i] for i in range(3)]): 94 | return board_state[0][0] 95 | if _has_3_in_a_line([board_state[2 - i][i] for i in range(3)]): 96 | return board_state[0][2] 97 | 98 | return 0 # no one has won, return 0 for a draw 99 | 100 | 101 | def play_game(plus_player_func, minus_player_func, log=False): 102 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 103 | player. 104 | 105 | Args: 106 | plus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 107 | current board_state and side this player is playing, and returns the move the player wants to play. 108 | minus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 109 | current board_state and side this player is playing, and returns the move the player wants to play. 110 | log (bool): If True progress is logged to console, defaults to False 111 | 112 | Returns: 113 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 114 | """ 115 | board_state = _new_board() 116 | player_turn = 1 117 | 118 | while True: 119 | _available_moves = list(available_moves(board_state)) 120 | 121 | if len(_available_moves) == 0: 122 | # draw 123 | if log: 124 | print("no moves left, game ended a draw") 125 | return 0. 
126 | if player_turn > 0: 127 | move = plus_player_func(board_state, 1) 128 | else: 129 | move = minus_player_func(board_state, -1) 130 | 131 | if move not in _available_moves: 132 | # if a player makes an invalid move the other player wins 133 | if log: 134 | print("illegal move ", move) 135 | return -player_turn 136 | 137 | board_state = apply_move(board_state, move, player_turn) 138 | if log: 139 | print(board_state) 140 | 141 | winner = has_winner(board_state) 142 | if winner != 0: 143 | if log: 144 | print("we have a winner, side: %s" % player_turn) 145 | return winner 146 | player_turn = -player_turn 147 | 148 | 149 | def random_player(board_state, _): 150 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 151 | valid moves in the current state. 152 | 153 | Args: 154 | board_state (3x3 tuple of int): The current state of the board 155 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 156 | 157 | Returns: 158 | (int, int): the move we want to play on the current board 159 | """ 160 | moves = list(available_moves(board_state)) 161 | return random.choice(moves) 162 | 163 | 164 | class TicTacToeGameSpec(BaseGameSpec): 165 | def __init__(self): 166 | self.available_moves = available_moves 167 | self.has_winner = has_winner 168 | self.new_board = _new_board 169 | self.apply_move = apply_move 170 | self.evaluate = evaluate 171 | 172 | def board_dimensions(self): 173 | return 3, 3 174 | 175 | 176 | if __name__ == '__main__': 177 | # example of playing a game 178 | play_game(random_player, random_player, log=True) 179 | -------------------------------------------------------------------------------- /games/tic_tac_toe_x.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a board of any size with a specified number in a row for the win. This is 3 | similar to tic_tac_toe.py but all relevent moves are paramiterized by board_size arg that sets how big the board is and 4 | winning_length which determines how many in a row are needed to win. Defaults are 5 and 4. This allows you to play games 5 | in a more complex environment than standard tic-tac-toe. 6 | 7 | Two players take turns making moves on squares of the board, the first to get winning_length in a row, including 8 | diagonals, wins. If there are no valid moves left to make the game ends a draw. 9 | 10 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 11 | where each player plays. 12 | The board is represented by a board_size x board_size tuple of ints. A 0 means no player has played in a space, 1 means 13 | player one has played there, -1 means the seconds player has played there. The apply_move method can be used to return a 14 | copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 15 | """ 16 | import itertools 17 | import random 18 | 19 | from common.base_game_spec import BaseGameSpec 20 | 21 | 22 | def _new_board(board_size): 23 | """Return a emprty tic-tac-toe board we can use for simulating a game. 
24 | 25 | Args: 26 | board_size (int): The size of one side of the board, a board_size * board_size board is created 27 | 28 | Returns: 29 | board_size x board_size tuple of ints 30 | """ 31 | return tuple(tuple(0 for _ in range(board_size)) for _ in range(board_size)) 32 | 33 | 34 | def apply_move(board_state, move, side): 35 | """Returns a copy of the given board_state with the desired move applied. 36 | 37 | Args: 38 | board_state (2d tuple of int): The given board_state we want to apply the move to. 39 | move (int, int): The position we want to make the move in. 40 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 41 | 42 | Returns: 43 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 44 | """ 45 | move_x, move_y = move 46 | 47 | def get_tuples(): 48 | for x in range(len(board_state)): 49 | if move_x == x: 50 | temp = list(board_state[x]) 51 | temp[move_y] = side 52 | yield tuple(temp) 53 | else: 54 | yield board_state[x] 55 | 56 | return tuple(get_tuples()) 57 | 58 | 59 | def available_moves(board_state): 60 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 61 | pieces played. 62 | 63 | Args: 64 | board_state: The board_state we want to check for valid moves. 65 | 66 | Returns: 67 | Generator of (int, int): All the valid moves that can be played in this position. 68 | """ 69 | for x, y in itertools.product(range(len(board_state)), range(len(board_state[0]))): 70 | if board_state[x][y] == 0: 71 | yield (x, y) 72 | 73 | 74 | def _has_winning_line(line, winning_length): 75 | count = 0 76 | last_side = 0 77 | for x in line: 78 | if x == last_side: 79 | count += 1 80 | if count == winning_length: 81 | return last_side 82 | else: 83 | count = 1 84 | last_side = x 85 | return 0 86 | 87 | 88 | def has_winner(board_state, winning_length): 89 | """Determine if a player has won on the given board_state. 90 | 91 | Args: 92 | board_state (2d tuple of int): The current board_state we want to evaluate. 93 | winning_length (int): The number of moves in a row needed for a win. 94 | 95 | Returns: 96 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 
97 | """ 98 | board_width = len(board_state) 99 | board_height = len(board_state[0]) 100 | 101 | # check rows 102 | for x in range(board_width): 103 | winner = _has_winning_line(board_state[x], winning_length) 104 | if winner != 0: 105 | return winner 106 | # check columns 107 | for y in range(board_height): 108 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 109 | if winner != 0: 110 | return winner 111 | 112 | # check diagonals 113 | diagonals_start = -(board_width - winning_length) 114 | diagonals_end = (board_width - winning_length) 115 | for d in range(diagonals_start, diagonals_end + 1): 116 | winner = _has_winning_line( 117 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 118 | winning_length) 119 | if winner != 0: 120 | return winner 121 | for d in range(diagonals_start, diagonals_end + 1): 122 | winner = _has_winning_line( 123 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 124 | winning_length) 125 | if winner != 0: 126 | return winner 127 | 128 | return 0 # no one has won, return 0 for a draw 129 | 130 | 131 | def _evaluate_line(line, winning_length): 132 | count = 0 133 | last_side = 0 134 | score = 0 135 | neutrals = 0 136 | 137 | for x in line: 138 | if x == last_side: 139 | count += 1 140 | if count == winning_length and neutrals == 0: 141 | return 100000 * x # a side has already won here 142 | elif x == 0: # we could score here 143 | neutrals += 1 144 | elif x == -last_side: 145 | if neutrals + count >= winning_length: 146 | score += (count - 1) * last_side 147 | count = 1 148 | last_side = x 149 | neutrals = 0 150 | else: 151 | last_side = x 152 | count = 1 153 | 154 | if neutrals + count >= winning_length: 155 | score += (count - 1) * last_side 156 | 157 | return score 158 | 159 | 160 | def evaluate(board_state, winning_length): 161 | """An evaluation function for this game, gives an estimate of how good the board position is for the plus player. 162 | There is no specific range for the values returned, they just need to be relative to each other. 163 | 164 | Args: 165 | winning_length (int): The length needed to win a game 166 | board_state (tuple): State of the board 167 | 168 | Returns: 169 | number 170 | """ 171 | board_width = len(board_state) 172 | board_height = len(board_state[0]) 173 | 174 | score = 0 175 | 176 | # check rows 177 | for x in range(board_width): 178 | score += _evaluate_line(board_state[x], winning_length) 179 | # check columns 180 | for y in range(board_height): 181 | score += _evaluate_line((i[y] for i in board_state), winning_length) 182 | 183 | # check diagonals 184 | diagonals_start = -(board_width - winning_length) 185 | diagonals_end = (board_width - winning_length) 186 | for d in range(diagonals_start, diagonals_end + 1): 187 | score += _evaluate_line( 188 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 189 | winning_length) 190 | for d in range(diagonals_start, diagonals_end + 1): 191 | score += _evaluate_line( 192 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 193 | winning_length) 194 | 195 | return score 196 | 197 | 198 | def play_game(plus_player_func, minus_player_func, board_size=5, winning_length=4, log=False): 199 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 200 | player. 
201 | 202 | Args: 203 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 204 | Function that takes the current board_state and side this player is playing, and returns the move the player 205 | wants to play. 206 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 207 | Function that takes the current board_state and side this player is playing, and returns the move the player 208 | wants to play. 209 | board_size (int): The size of a single side of the board. Game is played on a board_size*board_size sized board 210 | winning_length (int): The number of pieces in a row needed to win a game. 211 | log (bool): If True progress is logged to console, defaults to False 212 | 213 | Returns: 214 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 215 | """ 216 | board_state = _new_board(board_size) 217 | player_turn = 1 218 | 219 | while True: 220 | _available_moves = list(available_moves(board_state)) 221 | if len(_available_moves) == 0: 222 | # draw 223 | if log: 224 | print("no moves left, game ended a draw") 225 | return 0. 226 | if player_turn > 0: 227 | move = plus_player_func(board_state, 1) 228 | else: 229 | move = minus_player_func(board_state, -1) 230 | 231 | if move not in _available_moves: 232 | # if a player makes an invalid move the other player wins 233 | if log: 234 | print("illegal move ", move) 235 | return -player_turn 236 | 237 | board_state = apply_move(board_state, move, player_turn) 238 | print(board_state) 239 | 240 | winner = has_winner(board_state, winning_length) 241 | if winner != 0: 242 | if log: 243 | print("we have a winner, side: %s" % player_turn) 244 | return winner 245 | player_turn = -player_turn 246 | 247 | 248 | def random_player(board_state, _): 249 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 250 | valid moves in the current state. 251 | 252 | Args: 253 | board_state (2d tuple of int): The current state of the board 254 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 255 | 256 | Returns: 257 | (int, int): the move we want to play on the current board 258 | """ 259 | moves = list(available_moves(board_state)) 260 | return random.choice(moves) 261 | 262 | 263 | class TicTacToeXGameSpec(BaseGameSpec): 264 | def __init__(self, board_size, winning_length): 265 | """ 266 | 267 | Args: 268 | board_size (int): The length of one side of the board, so the bard will have board_size*board_size total 269 | squares 270 | winning_length (int): The length in a row needed to win the game. 
Should be less than or equal to board_size 271 | """ 272 | if not isinstance(board_size, int): 273 | raise TypeError("board_size must be an int") 274 | if not isinstance(winning_length, int): 275 | raise TypeError("winning_length must be an int") 276 | if winning_length > board_size: 277 | raise ValueError("winning_length must be less than or equal to board_size") 278 | self._winning_length = winning_length 279 | self._board_size = board_size 280 | self.available_moves = available_moves 281 | self.apply_move = apply_move 282 | 283 | def new_board(self): 284 | return _new_board(self._board_size) 285 | 286 | def has_winner(self, board_state): 287 | return has_winner(board_state, self._winning_length) 288 | 289 | def board_dimensions(self): 290 | return self._board_size, self._board_size 291 | 292 | def evaluate(self, board_state): 293 | return evaluate(board_state, self._winning_length) 294 | 295 | 296 | if __name__ == '__main__': 297 | # example of playing a game 298 | play_game(random_player, random_player, log=True, board_size=10, winning_length=4) 299 | -------------------------------------------------------------------------------- /policy_gradient.py: -------------------------------------------------------------------------------- 1 | """ 2 | Builds and trains a neural network that uses policy gradients to learn to play Tic-Tac-Toe. 3 | 4 | The input to the network is a vector with a number for each space on the board. If the space has one of the networks 5 | pieces then the input vector has the value 1. -1 for the opponents space and 0 for no piece. 6 | 7 | The output of the network is a also of the size of the board with each number learning the probability that a move in 8 | that space is the best move. 9 | 10 | The network plays successive games randomly alternating between going first and second against an opponent that makes 11 | moves by randomly selecting a free space. The neural network does NOT initially have any way of knowing what is or is not 12 | a valid move, so initially it must learn the rules of the game. 13 | 14 | I have trained this version with success at 3x3 tic tac toe until it has a success rate in the region of 75% this maybe 15 | as good as it can do, because 3x3 tic-tac-toe is a theoretical draw, so the random opponent will often get lucky and 16 | force a draw. 17 | """ 18 | import functools 19 | 20 | from common.network_helpers import create_network 21 | from games.tic_tac_toe import TicTacToeGameSpec 22 | from techniques.train_policy_gradient import train_policy_gradients 23 | 24 | BATCH_SIZE = 100 # every how many games to do a parameter update? 
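# (The update itself lives in techniques/train_policy_gradient.py: every BATCH_SIZE games the stored
# board states, moves and rewards are replayed and the REINFORCE-style objective
# log(probability of the chosen move) * reward is maximised with the Adam optimizer.)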
25 | LEARN_RATE = 1e-4 26 | PRINT_RESULTS_EVERY_X = 1000 # every how many games to print the results 27 | NETWORK_FILE_PATH = None#'current_network.p' # path to save the network to 28 | NUMBER_OF_GAMES_TO_RUN = 1000000 29 | 30 | # to play a different game change this to another spec, e.g TicTacToeXGameSpec or ConnectXGameSpec, to get these to run 31 | # well may require tuning the hyper parameters a bit 32 | game_spec = TicTacToeGameSpec() 33 | 34 | create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100)) 35 | 36 | train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH, 37 | number_of_games=NUMBER_OF_GAMES_TO_RUN, 38 | batch_size=BATCH_SIZE, 39 | learn_rate=LEARN_RATE, 40 | print_results_every=PRINT_RESULTS_EVERY_X) 41 | -------------------------------------------------------------------------------- /policy_gradient_historical_competition.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the same as the policy_gradient.py network except that instead of playing against a random opponent. It plays 3 | against previous versions of itself. It is first created with the weights from the "current_network.p" file, if no file 4 | is found there random weights are used. It then creates a series of copies of itself and plays against them. 5 | After "SAVE_HISTORICAL_NETWORK_EVERY" games, it saves it's current weights into the weights of one of the historical 6 | networks. Over time the main network and the historical networks should improve. 7 | """ 8 | import collections 9 | import functools 10 | import os 11 | import random 12 | 13 | import numpy as np 14 | import tensorflow as tf 15 | 16 | from common.network_helpers import create_network, load_network, get_stochastic_network_move, \ 17 | save_network 18 | from games.tic_tac_toe import TicTacToeGameSpec 19 | from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic 20 | 21 | HIDDEN_NODES = (100, 100, 100) 22 | SAVE_HISTORICAL_NETWORK_EVERY = 10000 23 | game_spec = TicTacToeGameSpec() 24 | 25 | create_network_func = functools.partial(create_network, game_spec.board_squares(), HIDDEN_NODES) 26 | 27 | train_policy_gradients_vs_historic(game_spec, create_network_func, 28 | 'train_vs_historical.p', 29 | save_historic_every=SAVE_HISTORICAL_NETWORK_EVERY) 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow=0.7.1 2 | -------------------------------------------------------------------------------- /techniques/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/techniques/__init__.py -------------------------------------------------------------------------------- /techniques/create_positions_set.py: -------------------------------------------------------------------------------- 1 | """ 2 | For games like tic-tac-toe we are unlikely to be able to find databases of top level games. This file allows us 3 | generate sets games using good existing algorithms from which we can train our networks. 
Such as min max 4 | """ 5 | import pickle 6 | import random 7 | 8 | import zlib 9 | 10 | from common.network_helpers import invert_board_state 11 | from techniques.min_max import min_max_alpha_beta 12 | 13 | 14 | def create_positions_set(game_spec, number_of_positions, choose_move_func, compress=False): 15 | """Generate a set of positions. All positions are set to be from the point of view of the plus player. In order to 16 | aid breadth of search if a position that we have already calculated the best move for comes up twice we choose a 17 | random move. Moves chosen randomly are not stored in the returned set. 18 | 19 | Args: 20 | game_spec (common.BaseGameSpec): 21 | number_of_positions (int): We will simulate this many positions 22 | choose_move_func (): Function that picks the best move from a board position 23 | compress (bool): If True then we will compress all the state pairs we store to save on memory, use 24 | pickle.loads(zlib.decompress(item)) to uncompress 25 | 26 | Returns: 27 | {board_state, move} 28 | """ 29 | positions = {} 30 | random_player_func = game_spec.get_random_player_func() 31 | 32 | def store_move_pair(board_state, side): 33 | if side != 1: 34 | board_state_for_plus = invert_board_state(board_state) 35 | else: 36 | board_state_for_plus = board_state 37 | 38 | if compress: 39 | board_state_for_plus = zlib.compress(pickle.dumps(board_state_for_plus)) 40 | 41 | # if we have already seen this position then make a random move to increase position diversity 42 | if board_state_for_plus in positions: 43 | return random_player_func(board_state, side) 44 | else: 45 | move = choose_move_func(board_state, side) 46 | positions[board_state_for_plus] = move 47 | 48 | return move 49 | 50 | while number_of_positions > len(positions.keys()): 51 | game_spec.play_game(store_move_pair, store_move_pair) 52 | print(len(positions.keys())) 53 | 54 | return positions 55 | 56 | 57 | if __name__ == '__main__': 58 | # example usage 59 | from games.connect_4 import Connect4GameSpec 60 | 61 | game_spec = Connect4GameSpec() 62 | 63 | def choose_move_func(board_state, side): 64 | return min_max_alpha_beta(game_spec, board_state, side, 6)[1] 65 | 66 | positions = create_positions_set(game_spec, 10000, choose_move_func) 67 | 68 | positions_as_array = [(x, y) for x, y in positions.items()] 69 | random.shuffle(positions_as_array) 70 | 71 | with open('position_connect_4_min_max_depth_6', mode='wb') as f: 72 | pickle.dump(positions_as_array, f) 73 | 74 | print("created") -------------------------------------------------------------------------------- /techniques/min_max.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def _score_line(line): 5 | minus_count = line.count(-1) 6 | plus_count = line.count(1) 7 | if minus_count + plus_count < 3: 8 | if minus_count == 2: 9 | return -1 10 | elif plus_count == 2: 11 | return 1 12 | return 0 13 | 14 | 15 | def evaluate_tic_tac_toe(board_state): 16 | """Get a rough score for how good we think this board position is for the plus_player for the game tic-tac-toe. Does 17 | this based on number of 2 in row lines we have. 
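(A line counts towards the score when it holds exactly two pieces of one side and none of the other; see _score_line above.)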
18 | 19 | Args: 20 | board_state (3x3 tuple of int): The board state we are evaluating 21 | 22 | Returns: 23 | int: evaluated score for the position for the plus player, posative is good for the plus player, negative good 24 | for the minus player 25 | """ 26 | score = 0 27 | for x in range(3): 28 | score += _score_line(board_state[x]) 29 | for y in range(3): 30 | score += _score_line([i[y] for i in board_state]) 31 | 32 | # diagonals 33 | score += _score_line([board_state[i][i] for i in range(3)]) 34 | score += _score_line([board_state[2 - i][i] for i in range(3)]) 35 | 36 | return score 37 | 38 | 39 | def min_max(game_spec, board_state, side, max_depth, evaluation_func=None): 40 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 41 | move 42 | 43 | Args: 44 | game_spec (BaseGameSpec): The specification for the game we are evaluating 45 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player, If None then 46 | we will use the evaluation function from the game_spec 47 | board_state (3x3 tuple of int): The board state we are evaluating 48 | side (int): either +1 or -1 49 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 50 | position is. 51 | 52 | Returns: 53 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 54 | """ 55 | best_score = None 56 | best_score_move = None 57 | evaluation_func = evaluation_func or game_spec.evaluate 58 | 59 | moves = list(game_spec.available_moves(board_state)) 60 | if not moves: 61 | # this is a draw 62 | return 0, None 63 | 64 | for move in moves: 65 | new_board_state = game_spec.apply_move(board_state, move, side) 66 | winner = game_spec.has_winner(new_board_state) 67 | if winner != 0: 68 | return winner * 10000, move 69 | else: 70 | if max_depth <= 1: 71 | score = evaluation_func(new_board_state) 72 | else: 73 | score, _ = min_max(game_spec, new_board_state, -side, max_depth - 1, evaluation_func=evaluation_func) 74 | if side > 0: 75 | if best_score is None or score > best_score: 76 | best_score = score 77 | best_score_move = move 78 | else: 79 | if best_score is None or score < best_score: 80 | best_score = score 81 | best_score_move = move 82 | return best_score, best_score_move 83 | 84 | 85 | def min_max_alpha_beta(game_spec, board_state, side, max_depth, evaluation_func=None, alpha=-sys.float_info.max, 86 | beta=sys.float_info.max): 87 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 88 | move 89 | 90 | Args: 91 | game_spec (BaseGameSpec): The specification for the game we are evaluating 92 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player 93 | board_state (3x3 tuple of int): The board state we are evaluating 94 | side (int): either +1 or -1 95 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 96 | position is. 
97 | alpha (float): Used when this is called recursively, normally ignore 98 | beta (float): Used when this is called recursively, normally ignore 99 | 100 | Returns: 101 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 102 | """ 103 | evaluation_func = evaluation_func or game_spec.evaluate 104 | best_score_move = None 105 | moves = list(game_spec.available_moves(board_state)) 106 | if not moves: 107 | return 0, None 108 | 109 | for move in moves: 110 | new_board_state = game_spec.apply_move(board_state, move, side) 111 | winner = game_spec.has_winner(new_board_state) 112 | if winner != 0: 113 | return winner * 10000, move 114 | else: 115 | if max_depth <= 1: 116 | score = evaluation_func(new_board_state) 117 | else: 118 | score, _ = min_max_alpha_beta(game_spec, new_board_state, -side, max_depth - 1, evaluation_func, alpha, 119 | beta) 120 | 121 | if side > 0: 122 | if score > alpha: 123 | alpha = score 124 | best_score_move = move 125 | else: 126 | if score < beta: 127 | beta = score 128 | best_score_move = move 129 | if alpha >= beta: 130 | break 131 | 132 | return alpha if side > 0 else beta, best_score_move 133 | 134 | 135 | def min_max_player(board_state, side): 136 | return min_max(board_state, side, 5)[1] 137 | 138 | 139 | def evaluate(board_state): 140 | """Get a rough score for how good we think this board position is for the plus_player. Does this based on number of 141 | 2 in row lines we have. 142 | 143 | Args: 144 | board_state (3x3 tuple of int): The board state we are evaluating 145 | 146 | Returns: 147 | int: evaluated score for the position for the plus player, posative is good for the plus player, negative good 148 | for the minus player 149 | """ 150 | score = 0 151 | for x in range(len(board_state)): 152 | score += _score_line(board_state[x]) 153 | for y in range(len(board_state[0])): 154 | score += _score_line([i[y] for i in board_state]) 155 | 156 | # diagonals 157 | score += _score_line([board_state[i][i] for i in range(3)]) 158 | score += _score_line([board_state[2 - i][i] for i in range(3)]) 159 | 160 | return score 161 | -------------------------------------------------------------------------------- /techniques/monte_carlo.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import math 4 | 5 | 6 | def _monte_carlo_sample(game_spec, board_state, side): 7 | """Sample a single rollout from the current board_state and side. Moves are made to the current board_state until we 8 | reach a terminal state then the result and the first move made to get there is returned. 9 | 10 | Args: 11 | game_spec (BaseGameSpec): The specification for the game we are evaluating 12 | board_state (3x3 tuple of int): state of the board 13 | side (int): side currently to play. 
+1 for the plus player, -1 for the minus player 14 | 15 | Returns: 16 | (result(int), move(int,int)): The result from this rollout, +1 for a win for the plus player -1 for a win for 17 | the minus player, 0 for a draw 18 | """ 19 | result = game_spec.has_winner(board_state) 20 | if result != 0: 21 | return result, None 22 | moves = list(game_spec.available_moves(board_state)) 23 | if not moves: 24 | return 0, None 25 | 26 | # select a random move 27 | move = random.choice(moves) 28 | result, next_move = _monte_carlo_sample(game_spec, game_spec.apply_move(board_state, move, side), -side) 29 | return result, move 30 | 31 | 32 | def monte_carlo_tree_search(game_spec, board_state, side, number_of_samples): 33 | """Evaluate the best from the current board_state for the given side using monte carlo sampling. 34 | 35 | Args: 36 | game_spec (BaseGameSpec): The specification for the game we are evaluating 37 | board_state (3x3 tuple of int): state of the board 38 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 39 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 40 | better the estimation of the position 41 | 42 | Returns: 43 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 44 | """ 45 | move_wins = collections.defaultdict(int) 46 | move_samples = collections.defaultdict(int) 47 | for _ in range(number_of_samples): 48 | result, move = _monte_carlo_sample(game_spec, board_state, side) 49 | # store the result and a count of the number of times we have tried this move 50 | if result == side: 51 | move_wins[move] += 1 52 | move_samples[move] += 1 53 | 54 | # get the move with the best average result 55 | move = max(move_wins, key=lambda x: move_wins.get(x) / move_samples[move]) 56 | 57 | return move_wins[move] / move_samples[move], move 58 | 59 | 60 | def _upper_confidence_bounds(payout, samples_for_this_machine, log_total_samples): 61 | return payout / samples_for_this_machine + math.sqrt((2 * log_total_samples) / samples_for_this_machine) 62 | 63 | 64 | def monte_carlo_tree_search_uct(game_spec, board_state, side, number_of_samples): 65 | """Evaluate the best from the current board_state for the given side using monte carlo sampling with upper 66 | confidence bounds for trees. 67 | 68 | Args: 69 | game_spec (BaseGameSpec): The specification for the game we are evaluating 70 | board_state (3x3 tuple of int): state of the board 71 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 72 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 73 | better the estimation of the position 74 | 75 | Returns: 76 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 
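Note: once every child position reachable from the current state has been sampled at least once, moves during a rollout are picked by the UCB1 rule from _upper_confidence_bounds above (average payout of the child plus sqrt(2 * ln(total samples of the children) / samples of that child)); until then a move is chosen at random.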
77 | """ 78 | state_results = collections.defaultdict(float) 79 | state_samples = collections.defaultdict(float) 80 | 81 | for _ in range(number_of_samples): 82 | current_side = side 83 | current_board_state = board_state 84 | first_unvisited_node = True 85 | rollout_path = [] 86 | result = 0 87 | 88 | while result == 0: 89 | move_states = {move: game_spec.apply_move(current_board_state, move, current_side) 90 | for move in game_spec.available_moves(current_board_state)} 91 | 92 | if not move_states: 93 | result = 0 94 | break 95 | 96 | if all((state in state_samples) for _, state in move_states): 97 | log_total_samples = math.log(sum(state_samples[s] for s in move_states.values())) 98 | move, state = max(move_states, key=lambda _, s: _upper_confidence_bounds(state_results[s], 99 | state_samples[s], 100 | log_total_samples)) 101 | else: 102 | move = random.choice(list(move_states.keys())) 103 | 104 | current_board_state = move_states[move] 105 | 106 | if first_unvisited_node: 107 | rollout_path.append((current_board_state, current_side)) 108 | if current_board_state not in state_samples: 109 | first_unvisited_node = False 110 | 111 | current_side = -current_side 112 | 113 | result = game_spec.has_winner(current_board_state) 114 | 115 | for path_board_state, path_side in rollout_path: 116 | state_samples[path_board_state] += 1. 117 | result *= path_side 118 | # normalize results to be between 0 and 1 before this it between -1 and 1 119 | result /= 2. 120 | result += .5 121 | state_results[path_board_state] += result 122 | 123 | move_states = {move: game_spec.apply_move(board_state, move, side) for move in game_spec.available_moves(board_state)} 124 | 125 | move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]]) 126 | 127 | return state_results[move_states[move]] / state_samples[move_states[move]], move 128 | 129 | 130 | if __name__ == '__main__': 131 | from games.tic_tac_toe import TicTacToeGameSpec 132 | 133 | sample_board_state = ((1, 0, -1), 134 | (1, 0, 0), 135 | (0, -1, 0)) 136 | 137 | print(monte_carlo_tree_search_uct(TicTacToeGameSpec(), sample_board_state, -1, 10000)) 138 | -------------------------------------------------------------------------------- /techniques/monte_carlo_uct_with_value.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | 4 | import math 5 | 6 | from techniques.monte_carlo import _upper_confidence_bounds 7 | 8 | 9 | def monte_carlo_tree_search_uct_with_value(game_spec, board_state, side, number_of_samples, value_func, 10 | value_weighting): 11 | """Evaluate the best from the current board_state for the given side using monte carlo sampling with upper 12 | confidence bounds for trees. 13 | 14 | Args: 15 | game_spec (BaseGameSpec): The specification for the game we are evaluating 16 | board_state (3x3 tuple of int): state of the board 17 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 18 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 19 | better the estimation of the position 20 | value_func (board_state, side -> float): 21 | value_weighting (float): parameter to adjust how much priority we give to the value_func 22 | 23 | Returns: 24 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 
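Note: this behaves like monte_carlo_tree_search_uct except that when picking a move during a rollout the UCB1 score of each child is augmented with value_func's estimate for that position multiplied by value_weighting.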
25 | """ 26 | state_results = collections.defaultdict(float) 27 | state_samples = collections.defaultdict(float) 28 | state_values = collections.defaultdict(float) 29 | 30 | for _ in range(number_of_samples): 31 | current_side = side 32 | current_board_state = board_state 33 | first_unvisited_node = True 34 | rollout_path = [] 35 | result = 0 36 | 37 | while result == 0: 38 | move_states = {move: game_spec.apply_move(current_board_state, move, current_side) 39 | for move in game_spec.available_moves(current_board_state)} 40 | 41 | if not move_states: 42 | result = 0 43 | break 44 | 45 | if all((state in state_samples) for _, state in move_states): 46 | log_total_samples = math.log(sum(state_samples[s] for s in move_states.values())) 47 | move, state = max(move_states, key=lambda _, s: state_values[s] * value_weighting + 48 | _upper_confidence_bounds(state_results[s], 49 | state_samples[s], 50 | log_total_samples)) 51 | else: 52 | move = random.choice(list(move_states.keys())) 53 | 54 | current_board_state = move_states[move] 55 | 56 | if first_unvisited_node: 57 | rollout_path.append((current_board_state, current_side)) 58 | if current_board_state not in state_samples: 59 | state_values[current_board_state] = value_func(current_board_state) 60 | first_unvisited_node = False 61 | 62 | current_side = -current_side 63 | 64 | result = game_spec.has_winner(current_board_state) 65 | 66 | for path_board_state, path_side in rollout_path: 67 | state_samples[path_board_state] += 1. 68 | result *= path_side 69 | # normalize results to be between 0 and 1 before this it between -1 and 1 70 | result /= 2. 71 | result += .5 72 | state_results[path_board_state] += result 73 | 74 | move_states = {move: game_spec.apply_move(board_state, move, side) for move in 75 | game_spec.available_moves(board_state)} 76 | 77 | move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]]) 78 | 79 | return state_results[move_states[move]] / state_samples[move_states[move]], move 80 | -------------------------------------------------------------------------------- /techniques/train_policy_gradient.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from common.network_helpers import load_network, get_stochastic_network_move, save_network 9 | 10 | 11 | def train_policy_gradients(game_spec, 12 | create_network, 13 | network_file_path, 14 | save_network_file_path=None, 15 | opponent_func=None, 16 | number_of_games=10000, 17 | print_results_every=1000, 18 | learn_rate=1e-4, 19 | batch_size=100, 20 | randomize_first_player=True): 21 | """Train a network using policy gradients 22 | 23 | Args: 24 | save_network_file_path (str): Optionally specifiy a path to use for saving the network, if unset then 25 | the network_file_path param is used. 26 | opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an opponent playing 27 | randomly 28 | randomize_first_player (bool): If True we alternate between being the first and second player 29 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 30 | create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 31 | Method that creates the network we will train. 
32 | network_file_path (str): path to the file with weights we want to load for this network 33 | number_of_games (int): number of games to play before stopping 34 | print_results_every (int): Prints results to std out every x games, also saves the network 35 | learn_rate (float): 36 | batch_size (int): 37 | 38 | Returns: 39 | (variables used in the final network : list, win rate: float) 40 | """ 41 | save_network_file_path = save_network_file_path or network_file_path 42 | opponent_func = opponent_func or game_spec.get_random_player_func() 43 | reward_placeholder = tf.placeholder("float", shape=(None,)) 44 | actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.outputs())) 45 | 46 | input_layer, output_layer, variables = create_network() 47 | 48 | policy_gradient = tf.log( 49 | tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer), reduction_indices=1)) * reward_placeholder 50 | train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient) 51 | 52 | with tf.Session() as session: 53 | session.run(tf.global_variables_initializer()) 54 | 55 | if network_file_path and os.path.isfile(network_file_path): 56 | print("loading pre-existing network") 57 | load_network(session, variables, network_file_path) 58 | 59 | mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], [] 60 | results = collections.deque(maxlen=print_results_every) 61 | 62 | def make_training_move(board_state, side): 63 | mini_batch_board_states.append(np.ravel(board_state) * side) 64 | move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side) 65 | mini_batch_moves.append(move) 66 | return game_spec.flat_move_to_tuple(move.argmax()) 67 | 68 | for episode_number in range(1, number_of_games): 69 | # randomize if going first or second 70 | if (not randomize_first_player) or bool(random.getrandbits(1)): 71 | reward = game_spec.play_game(make_training_move, opponent_func) 72 | else: 73 | reward = -game_spec.play_game(opponent_func, make_training_move) 74 | 75 | results.append(reward) 76 | 77 | # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick 78 | last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards) 79 | 80 | reward /= float(last_game_length) 81 | 82 | mini_batch_rewards += ([reward] * last_game_length) 83 | 84 | if episode_number % batch_size == 0: 85 | normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards) 86 | 87 | rewards_std = np.std(normalized_rewards) 88 | if rewards_std != 0: 89 | normalized_rewards /= rewards_std 90 | else: 91 | print("warning: got mini batch std of 0.") 92 | 93 | np_mini_batch_board_states = np.array(mini_batch_board_states) \ 94 | .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:]) 95 | 96 | session.run(train_step, feed_dict={input_layer: np_mini_batch_board_states, 97 | reward_placeholder: normalized_rewards, 98 | actual_move_placeholder: mini_batch_moves}) 99 | 100 | # clear batches 101 | del mini_batch_board_states[:] 102 | del mini_batch_moves[:] 103 | del mini_batch_rewards[:] 104 | 105 | if episode_number % print_results_every == 0: 106 | print("episode: %s win_rate: %s" % (episode_number, _win_rate(print_results_every, results))) 107 | if network_file_path: 108 | save_network(session, variables, save_network_file_path) 109 | 110 | if network_file_path: 111 | save_network(session, variables, save_network_file_path) 112 | 113 | return variables, _win_rate(print_results_every, results) 114 | 115 | 116 | 
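# Example usage (a minimal sketch; see policy_gradient.py at the repo root for a complete script, the
# hyper-parameter values below are illustrative only):
#
#     import functools
#     from common.network_helpers import create_network
#     from games.tic_tac_toe import TicTacToeGameSpec
#
#     game_spec = TicTacToeGameSpec()
#     create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))
#     variables, win_rate = train_policy_gradients(game_spec, create_network_func, None,
#                                                  number_of_games=10000, batch_size=100,
#                                                  learn_rate=1e-4, print_results_every=1000)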
def _win_rate(print_results_every, results): 117 | return 0.5 + sum(results) / (print_results_every * 2.) 118 | -------------------------------------------------------------------------------- /techniques/train_policy_gradient_historic.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import functools 3 | import os 4 | import random 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from common.network_helpers import get_stochastic_network_move, load_network, save_network 10 | 11 | 12 | def train_policy_gradients_vs_historic(game_spec, create_network, network_file_path, 13 | save_network_file_path=None, 14 | number_of_historic_networks=8, 15 | save_historic_every=10000, 16 | historic_network_base_path='historic_network', 17 | number_of_games=100000, 18 | print_results_every=1000, 19 | learn_rate=1e-4, 20 | batch_size=100): 21 | """Train a network against itself and over time store new versions of itself to play against. 22 | 23 | Args: 24 | historic_network_base_path (str): Base path to save new historic networks to; a number for the network "slot" is 25 | appended to the end of this string. 26 | save_historic_every (int): We save a version of the learning network into one of the historic network 27 | "slots" every x number of games. We have number_of_historic_networks "slots". 28 | number_of_historic_networks (int): We keep this many old networks to play against 29 | save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then 30 | the network_file_path param is used. 31 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 32 | create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 33 | Method that creates the network we will train. 
34 | network_file_path (str): path to the file with weights we want to load for this network 35 | number_of_games (int): number of games to play before stopping 36 | print_results_every (int): Prints results to std out every x games, also saves the network 37 | learn_rate (float): 38 | batch_size (int): 39 | 40 | Returns: 41 | [tf.Vaiables] : trained variables used in the final network 42 | """ 43 | input_layer, output_layer, variables = create_network() 44 | 45 | reward_placeholder = tf.placeholder("float", shape=(None,)) 46 | actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares())) 47 | policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer) 48 | train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient) 49 | 50 | current_historical_index = 0 51 | historical_networks = [] 52 | 53 | mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], [] 54 | results = collections.deque(maxlen=print_results_every) 55 | 56 | for _ in range(number_of_historic_networks): 57 | historical_input_layer, historical_output_layer, historical_variables = create_network() 58 | historical_networks.append((historical_input_layer, historical_output_layer, historical_variables)) 59 | 60 | with tf.Session() as session: 61 | session.run(tf.global_variables_initializer()) 62 | 63 | def make_move_historical(histoical_network_index, board_state, side): 64 | net = historical_networks[histoical_network_index] 65 | move = get_stochastic_network_move(session, net[0], net[1], board_state, side, 66 | valid_only=True, game_spec=game_spec) 67 | return game_spec.flat_move_to_tuple(move.argmax()) 68 | 69 | def make_training_move(board_state, side): 70 | mini_batch_board_states.append(np.ravel(board_state) * side) 71 | move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side, 72 | valid_only=True, game_spec=game_spec) 73 | mini_batch_moves.append(move) 74 | return game_spec.flat_move_to_tuple(move.argmax()) 75 | 76 | if os.path.isfile(network_file_path): 77 | print("loading pre existing weights") 78 | load_network(session, variables, network_file_path) 79 | else: 80 | print("could not find previous weights so initialising randomly") 81 | 82 | for i in range(number_of_historic_networks): 83 | if os.path.isfile(historic_network_base_path + str(i) + '.p'): 84 | load_network(session, historical_networks[i][2], historic_network_base_path + str(i) + '.p') 85 | elif os.path.isfile(network_file_path): 86 | # if we can't load a historical file use the current network weights 87 | load_network(session, historical_networks[i][2], network_file_path) 88 | 89 | for episode_number in range(1, number_of_games): 90 | opponent_index = random.randint(0, number_of_historic_networks - 1) 91 | make_move_historical_for_index = functools.partial(make_move_historical, opponent_index) 92 | 93 | # randomize if going first or second 94 | if bool(random.getrandbits(1)): 95 | reward = game_spec.play_game(make_training_move, make_move_historical_for_index) 96 | else: 97 | reward = -game_spec.play_game(make_move_historical_for_index, make_training_move) 98 | 99 | results.append(reward) 100 | 101 | # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick 102 | last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards) 103 | 104 | reward /= float(last_game_length) 105 | 106 | mini_batch_rewards += ([reward] * last_game_length) 107 | 108 | 
episode_number += 1 109 | 110 | if episode_number % batch_size == 0: 111 | normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards) 112 | rewards_std = np.std(normalized_rewards) 113 | if rewards_std != 0: 114 | normalized_rewards /= rewards_std 115 | else: 116 | print("warning: got mini batch std of 0.") 117 | 118 | np_mini_batch_board_states = np.array(mini_batch_board_states) \ 119 | .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:]) 120 | 121 | session.run(train_step, feed_dict={input_layer: np_mini_batch_board_states, 122 | reward_placeholder: normalized_rewards, 123 | actual_move_placeholder: mini_batch_moves}) 124 | 125 | # clear batches 126 | del mini_batch_board_states[:] 127 | del mini_batch_moves[:] 128 | del mini_batch_rewards[:] 129 | 130 | if episode_number % print_results_every == 0: 131 | print("episode: %s average result: %s" % (episode_number, np.mean(results))) 132 | 133 | if episode_number % save_historic_every == 0: 134 | print("saving historical network %s", current_historical_index) 135 | save_network(session, variables, historic_network_base_path + str(current_historical_index) + '.p') 136 | load_network(session, historical_networks[current_historical_index][2], 137 | historic_network_base_path + str(current_historical_index) + '.p') 138 | 139 | # also save to the main network file 140 | save_network(session, variables, save_network_file_path or network_file_path) 141 | 142 | current_historical_index += 1 143 | current_historical_index %= number_of_historic_networks 144 | 145 | # save our final weights 146 | save_network(session, variables, save_network_file_path or network_file_path) 147 | 148 | return variables -------------------------------------------------------------------------------- /techniques/train_supervised.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import tensorflow as tf 5 | 6 | from common.network_helpers import save_network, load_network 7 | 8 | 9 | def train_supervised(game_spec, create_network, network_file_path, 10 | positions, 11 | test_set_ratio=0.4, 12 | regularization_coefficent=1e-5, 13 | batch_size=100, 14 | learn_rate=1e-4, 15 | stop_turns_without_improvement = 7): 16 | """Train a network using supervised learning using against a list of game positions and moves chosen. 17 | We stop after we have had stop_turns_without_improvement without an improvement in the test error. 18 | The test set is used as a validation set as well, will possibly improve this in the future to have a seperate test 19 | and validation set. 20 | 21 | Args: 22 | stop_turns_without_improvement (int): we stop training after this many iterations without any improvement in 23 | the test error. 24 | regularization_coefficent (float): amount to multiply the l2 regularizer by in the loss function 25 | test_set_ratio (float): portion of the data to divide into the test set, 26 | positions ([(board_state, move)]): list of tuples of board states and the moves chosen in those board_states 27 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 28 | create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 29 | Method that creates the network we will train. 
30 | network_file_path (str): path to the file with weights we want to load for this network 31 | learn_rate (float): 32 | batch_size (int): 33 | 34 | Returns: 35 | episode_number, train_error, train_accuracy, new_test_error, test_accuracy 36 | """ 37 | input_layer, output_layer, variables = create_network() 38 | 39 | test_set_count = int(len(positions) * test_set_ratio) 40 | train_set = positions[:-test_set_count] 41 | test_set = positions[-test_set_count:] 42 | 43 | actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs())) 44 | 45 | error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer)) 46 | 47 | regularizer = None 48 | for var in variables: 49 | if regularizer is None: 50 | regularizer = tf.nn.l2_loss(var) 51 | else: 52 | regularizer += tf.nn.l2_loss(var) 53 | 54 | loss = error + regularizer * regularization_coefficent 55 | 56 | train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(loss) 57 | 58 | correct_pred = tf.equal(tf.argmax(output_layer, 1), tf.argmax(actual_move_placeholder, 1)) 59 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 60 | 61 | with tf.Session() as session: 62 | session.run(tf.global_variables_initializer()) 63 | 64 | if os.path.isfile(network_file_path): 65 | print("loading existing network") 66 | load_network(session, variables, network_file_path) 67 | 68 | episode_number = 1 69 | turns_without_test_improvement = 0 70 | 71 | best_test_error, test_accuracy = session.run([error, accuracy], 72 | feed_dict={ 73 | input_layer: [x[0] for x in test_set], 74 | actual_move_placeholder: [x[1] for x in test_set]}) 75 | 76 | while True: 77 | random.shuffle(train_set) 78 | train_error = 0 79 | 80 | for start_index in range(0, len(train_set) - batch_size + 1, batch_size): 81 | mini_batch = train_set[start_index:start_index + batch_size] 82 | 83 | batch_error, _ = session.run([error, train_step], 84 | feed_dict={input_layer: [x[0] for x in mini_batch], 85 | actual_move_placeholder: [x[1] for x in mini_batch]}) 86 | train_error += batch_error 87 | 88 | new_test_error, test_accuracy = session.run([error, accuracy], 89 | feed_dict={input_layer: [x[0] for x in test_set], 90 | actual_move_placeholder: [x[1] for x in test_set]}) 91 | 92 | print("episode: %s train_error: %s test_error: %s test_acc: %s" % 93 | (episode_number, train_error, new_test_error, test_accuracy)) 94 | 95 | if new_test_error < best_test_error: 96 | best_test_error = new_test_error 97 | turns_without_test_improvement = 0 98 | else: 99 | turns_without_test_improvement += 1 100 | if turns_without_test_improvement > stop_turns_without_improvement: 101 | train_accuracy = session.run([accuracy], feed_dict={input_layer: [x[0] for x in train_set], 102 | actual_move_placeholder: [x[1] for x in 103 | train_set]}) 104 | 105 | print("test error not improving for %s turns, ending training" % (stop_turns_without_improvement, )) 106 | break 107 | 108 | episode_number += 1 109 | 110 | print("final episode: %s train_error: %s train acc: %s test_error: %s test_acc: %s" % 111 | (episode_number, train_error, train_accuracy, new_test_error, test_accuracy)) 112 | 113 | save_network(session, variables, network_file_path) 114 | 115 | return episode_number, train_error, train_accuracy, new_test_error, test_accuracy -------------------------------------------------------------------------------- /techniques/train_value_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import 
tensorflow as tf 6 | 7 | from common.network_helpers import create_network, load_network, get_deterministic_network_move, save_network 8 | 9 | 10 | # it would be good to have real board positions, but failing that just generate random ones 11 | def _generate_random_board_position(game_spec, random_move_range): 12 | while True: 13 | board_state = game_spec.new_board() 14 | number_moves = random.randint(*random_move_range) 15 | side = 1 16 | for _ in range(number_moves): 17 | board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))), 18 | side) 19 | if game_spec.has_winner(board_state) != 0: 20 | # start again if we hit an already winning position 21 | continue 22 | 23 | side = -side 24 | return board_state 25 | 26 | 27 | def train_value_network(game_spec, hidden_nodes_reinforcement, reinforcement_network_file_path, 28 | hidden_nodes_value, value_network_file_path, 29 | learn_rate=1e-4, 30 | batch_size=100, 31 | train_samples=10000, 32 | test_samples=8000): 33 | reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network( 34 | game_spec.board_squares(), 35 | hidden_nodes_reinforcement, 36 | game_spec.outputs()) 37 | 38 | value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), 39 | hidden_nodes_value, 40 | output_nodes=1, output_softmax=False) 41 | 42 | target_placeholder = tf.placeholder("float", (None, 1)) 43 | error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer)) 44 | 45 | train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(error) 46 | 47 | with tf.Session() as session: 48 | session.run(tf.global_variables_initializer()) 49 | 50 | load_network(session, reinforcement_variables, reinforcement_network_file_path) 51 | 52 | if os.path.isfile(value_network_file_path): 53 | print("loading previous version of value network") 54 | load_network(session, value_variables, value_network_file_path) 55 | 56 | def make_move(board_state, side): 57 | move = get_deterministic_network_move(session, reinforcement_input_layer, reinforcement_output_layer, 58 | board_state, side) 59 | 60 | return game_spec.flat_move_to_tuple(np.argmax(move)) 61 | 62 | board_states_training = {} 63 | board_states_test = [] 64 | episode_number = 0 65 | 66 | while len(board_states_training) < train_samples + test_samples: 67 | board_state = _generate_random_board_position(game_spec, (1, game_spec.board_squares() * 0.8)) 68 | board_state_flat = tuple(np.ravel(board_state)) 69 | 70 | # only accept the board_state if not already in the dict 71 | if board_state_flat not in board_states_training: 72 | result = game_spec.play_game(make_move, make_move, board_state=board_state) 73 | board_states_training[board_state_flat] = float(result) 74 | 75 | # take a random selection from training into a test set 76 | for _ in range(test_samples): 77 | sample = random.choice(board_states_training.keys()) 78 | board_states_test.append((sample, board_states_training[sample])) 79 | del board_states_training[sample] 80 | 81 | board_states_training = list(board_states_training.iteritems()) 82 | 83 | test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test], 84 | target_placeholder: [[x[1]] for x in board_states_test]}) 85 | 86 | while True: 87 | np.random.shuffle(board_states_training) 88 | train_error = 0 89 | 90 | for start_index in range(0, len(board_states_training) - batch_size + 1, batch_size): 91 | mini_batch = board_states_training[start_index:start_index 
+ batch_size] 92 | 93 | batch_error, _ = session.run([error, train_step], 94 | feed_dict={value_input_layer: [x[0] for x in mini_batch], 95 | target_placeholder: [[x[1]] for x in mini_batch]}) 96 | train_error += batch_error 97 | 98 | new_test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test], 99 | target_placeholder: [[x[1]] for x in board_states_test]}) 100 | 101 | print("episode: %s train_error: %s test_error: %s" % (episode_number, train_error, test_error)) 102 | 103 | if new_test_error > test_error: 104 | print("train error went up, stopping training") 105 | break 106 | 107 | test_error = new_test_error 108 | episode_number += 1 109 | 110 | save_network(session, value_variables, value_network_file_path) 111 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/__init__.py -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/common/__init__.py -------------------------------------------------------------------------------- /tests/common/test_network_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from common.network_helpers import create_network, save_network, load_network 8 | 9 | 10 | class TestNetworkHelpers(TestCase): 11 | def test_create_network(self): 12 | input_nodes = 20 13 | hidden_nodes = (50, 40, 30) 14 | input_layer, output_layer, variables = create_network(input_nodes, hidden_nodes) 15 | self.assertSequenceEqual(input_layer.get_shape().as_list(), [None, input_nodes]) 16 | self.assertSequenceEqual(output_layer.get_shape().as_list(), [None, input_nodes]) 17 | self.assertEqual(len(variables), (len(hidden_nodes) + 1) * 2) 18 | 19 | def test_create_network_with_2d_input(self): 20 | input_nodes = (5, 5) 21 | hidden_nodes = (50, 40, 30) 22 | input_layer, output_layer, variables = create_network(input_nodes, hidden_nodes) 23 | self.assertSequenceEqual(input_layer.get_shape().as_list(), [None, input_nodes[0], input_nodes[1]]) 24 | self.assertSequenceEqual(output_layer.get_shape().as_list(), [None, input_nodes[0] * input_nodes[1]]) 25 | self.assertEqual(len(variables), (len(hidden_nodes) + 1) * 2) 26 | 27 | def test_save_and_load_network(self): 28 | try: 29 | file_name = 'test.p' 30 | input_nodes = 20 31 | hidden_nodes = (50, 40, 30) 32 | _, _, variables1 = create_network(input_nodes, hidden_nodes) 33 | _, _, variables2 = create_network(input_nodes, hidden_nodes) 34 | 35 | with tf.Session() as session: 36 | session.run(tf.global_variables_initializer()) 37 | 38 | save_network(session, variables1, file_name) 39 | load_network(session, variables2, file_name) 40 | 41 | for var1, var2 in zip(variables1, variables2): 42 | np.testing.assert_array_almost_equal(session.run(var1), session.run(var2)) 43 | finally: 44 | try: 45 | os.remove(file_name) 46 | except OSError: 47 | pass 48 | 49 | def test_load_variables_into_network_of_wrong_size_gives_friendly_exception(self): 50 | try: 51 | file_name = 'test.p' 52 | input_nodes = 20 53 | 54 | 
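            # two networks whose hidden layers differ in size, so weights saved from the first cannot be loaded into the second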
_, _, variables1 = create_network(input_nodes, (30, )) 55 | _, _, variables2 = create_network(input_nodes, (40, )) 56 | 57 | with tf.Session() as session: 58 | session.run(tf.global_variables_initializer()) 59 | 60 | save_network(session, variables1, file_name) 61 | 62 | with self.assertRaises(ValueError): 63 | load_network(session, variables2, file_name) 64 | finally: 65 | try: 66 | os.remove(file_name) 67 | except OSError: 68 | pass -------------------------------------------------------------------------------- /tests/games/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/games/__init__.py -------------------------------------------------------------------------------- /tests/games/test_connect_4.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.connect_4 import has_winner, play_game, random_player 4 | 5 | 6 | class TestTicTacToeX(TestCase): 7 | def test_has_winner(self): 8 | board_state = ((0, 0, 0, 0, 0, 0), 9 | (0, 0, 0, 0, 0, 0), 10 | (0, 0, 0, 0, 0, 0), 11 | (1, 0, 0, 0, 0, 0), 12 | (0, 1, 0, 0, 0, 0), 13 | (0, 0, 1, 0, 0, 0), 14 | (0, 0, 0, 1, 0, 0)) 15 | 16 | self.assertEqual(1, has_winner(board_state), 4) 17 | 18 | board_state = ((0, 0, 0, 0, 1, 0), 19 | (0, 0, 0, 1, 0, 0), 20 | (0, 0, 1, 0, 0, 0), 21 | (0, 1, 0, 0, 0, 0), 22 | (0, 0, 0, 0, 0, 0), 23 | (0, 0, 0, 0, 0, 0), 24 | (0, 0, 0, 0, 0, 0)) 25 | 26 | self.assertEqual(1, has_winner(board_state), 4) 27 | 28 | def test_play_game(self): 29 | play_game(random_player, random_player) -------------------------------------------------------------------------------- /tests/games/test_tic_tac_toe_x.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe_x import has_winner, _has_winning_line, play_game, random_player, evaluate 4 | 5 | 6 | class TestTicTacToeX(TestCase): 7 | 8 | def test_has_winning_line(self): 9 | self.assertEqual(1, _has_winning_line((0, 1, 1, 1, 1), 4)) 10 | self.assertEqual(0, _has_winning_line((0, 1, -1, 1, 1), 4)) 11 | self.assertEqual(1, _has_winning_line((1, 1, 1, 1, 1, 0), 4)) 12 | self.assertEqual(1, _has_winning_line((1, 0, 1, 1, 1, 0), 3)) 13 | self.assertEqual(-1, _has_winning_line((-1, -1, -1, -1, 1), 4)) 14 | 15 | def test_has_winner(self): 16 | board_state = ((0, 0, 0, 0, 0), 17 | (0, -1, 0, 0, 0), 18 | (0, -1, 0, 0, 0), 19 | (0, -1, 0, 0, 0), 20 | (0, -1, 0, 0, 0)) 21 | self.assertEqual(-1, has_winner(board_state, 4)) 22 | 23 | board_state = ((0, 1, 0, 0, 0), 24 | (0, 0, 1, 0, 0), 25 | (0, 0, 0, 1, 0), 26 | (0, 0, 0, 0, 1), 27 | (0, 0, 0, 0, 0)) 28 | self.assertEqual(1, has_winner(board_state, 4)) 29 | 30 | board_state = ((0, 0, 0, 0, 0), 31 | (0, 0, 0, 0, 1), 32 | (0, 0, 0, 1, 0), 33 | (0, 0, 1, 0, 0), 34 | (0, 1, 0, 0, 0)) 35 | self.assertEqual(1, has_winner(board_state, 4)) 36 | 37 | board_state = ((0, 0, 0, -1, 0), 38 | (0, 0, -1, 0, 0), 39 | (0, -1, 0, 0, 0), 40 | (-1, 0, 0, 0, 0), 41 | (0, 0, 0, 0, 0)) 42 | self.assertEqual(-1, has_winner(board_state, 4)) 43 | 44 | def test_play_game(self): 45 | play_game(random_player, random_player) 46 | 47 | def test_has_evaluate(self): 48 | board_state = ((-1, -1, -1, 0, 0), 49 | (0, 0, 0, 0, 0), 50 | (0, -1, 0, 0, 0), 51 | (0, -1, 0, 0, 0), 52 | (0, -1, 0, 0, 0)) 53 | self.assertGreater(0, evaluate(board_state, 4)) 54 | 55 | 
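        # plus pieces building along a diagonal: the evaluation should come out in favour of the plus player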
board_state = ((0, 1, 0, 0, 0), 56 | (0, 0, 1, 0, 0), 57 | (0, 0, 0, 0, 0), 58 | (0, 0, 0, 0, 1), 59 | (0, 0, 0, 0, 0)) 60 | self.assertGreater(evaluate(board_state, 4), 0) 61 | 62 | board_state = ((0, 0, 0, 0, 0), 63 | (0, 0, 0, 0, 1), 64 | (0, 0, 0, 0, 0), 65 | (0, 0, 1, 0, 0), 66 | (0, 1, 1, 0, 0)) 67 | self.assertGreater(evaluate(board_state, 4), 0) 68 | 69 | board_state = ((0, 0, 0, -1, 0), 70 | (0, 0, 0, 0, 0), 71 | (0, 0, 0, 0, 0), 72 | (-1, 0, 0, 0, 0), 73 | (0, 0, 0, 0, 0)) 74 | self.assertGreater(0, evaluate(board_state, 4)) -------------------------------------------------------------------------------- /tests/techniques/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/techniques/__init__.py -------------------------------------------------------------------------------- /tests/techniques/test_create_positions_set.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe import TicTacToeGameSpec 4 | from techniques.create_positions_set import create_positions_set 5 | 6 | 7 | class TestCreatePositionsSet(TestCase): 8 | def setUp(self): 9 | self._game_spec = TicTacToeGameSpec() 10 | 11 | def test_create_positions(self): 12 | number_of_positions = 100 13 | positions = create_positions_set(self._game_spec, number_of_positions, self._game_spec.get_random_player_func()) 14 | 15 | self.assertGreater(len(positions), number_of_positions-1) -------------------------------------------------------------------------------- /tests/techniques/test_min_max.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe import TicTacToeGameSpec 4 | from techniques.min_max import min_max_alpha_beta, min_max 5 | 6 | 7 | class TestMinMax(TestCase): 8 | def setUp(self): 9 | self._game_spec = TicTacToeGameSpec() 10 | 11 | def test_basic_position(self): 12 | # the best move is 2, 2 forcing a win with pluses next move, both players should select it 13 | board_state = ((0, 0, 0), 14 | (-1, -1, 1), 15 | (1, 0, 0)) 16 | 17 | result_min_max = min_max(self._game_spec, board_state, 1, 8) 18 | result_min_max_alpha_beta = min_max_alpha_beta(self._game_spec, board_state, 1, 8) 19 | 20 | self.assertEqual(result_min_max[1], (2, 2)) 21 | self.assertEqual(result_min_max_alpha_beta[1], (2, 2)) 22 | 23 | def test_basic_position_for_minus_player(self): 24 | board_state = ((-1, 1, 0), 25 | (1, -1, 1), 26 | (1, 0, 0)) 27 | 28 | result_min_max = min_max(self._game_spec, board_state, -1, 8) 29 | result_min_max_alpha_beta = min_max_alpha_beta(self._game_spec, board_state, -1, 8) 30 | 31 | self.assertEqual(result_min_max[1], (2, 2)) 32 | self.assertEqual(result_min_max_alpha_beta[1], (2, 2)) 33 | -------------------------------------------------------------------------------- /tests/techniques/test_train_policy_gradient.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from unittest import TestCase 3 | 4 | from common.base_game_spec import BaseGameSpec 5 | from common.network_helpers import create_network 6 | from games.tic_tac_toe import TicTacToeGameSpec 7 | from games.tic_tac_toe_x import TicTacToeXGameSpec 8 | from techniques.train_policy_gradient import train_policy_gradients 9 | 10 | 11 | class _VerySimpleGameSpec(BaseGameSpec): 12 | def 
new_board(self): 13 | return [0, 0] 14 | 15 | def apply_move(self, board_state, move, side): 16 | board_state[move] = side 17 | return board_state 18 | 19 | def has_winner(self, board_state): 20 | return board_state[0] 21 | 22 | def __init__(self): 23 | pass 24 | 25 | def available_moves(self, board_state): 26 | return [i for i, x in enumerate(board_state) if x == 0] 27 | 28 | def board_dimensions(self): 29 | return 2, 30 | 31 | 32 | class TestTrainPolicyGradient(TestCase): 33 | def test_learn_simple_game(self): 34 | game_spec = _VerySimpleGameSpec() 35 | create_model_func = functools.partial(create_network, 2, (4,)) 36 | variables, win_rate = train_policy_gradients(game_spec, create_model_func, None, 37 | learn_rate=0.1, 38 | number_of_games=1000, print_results_every=100, 39 | batch_size=20, 40 | randomize_first_player=False) 41 | self.assertGreater(win_rate, 0.9) 42 | 43 | def test_tic_tac_toe(self): 44 | game_spec = TicTacToeGameSpec() 45 | create_model_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100,)) 46 | variables, win_rate = train_policy_gradients(game_spec, create_model_func, None, 47 | learn_rate=1e-4, 48 | number_of_games=60000, 49 | print_results_every=1000, 50 | batch_size=100, 51 | randomize_first_player=False) 52 | self.assertGreater(win_rate, 0.4) 53 | -------------------------------------------------------------------------------- /tests/techniques/test_train_policy_gradient_historic.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe import TicTacToeGameSpec 4 | from techniques.create_positions_set import create_positions_set 5 | 6 | 7 | class TestCreatePositionsSet(TestCase): 8 | def setUp(self): 9 | self._game_spec = TicTacToeGameSpec() 10 | 11 | def test_create_positions(self): 12 | number_of_positions = 100 13 | positions = create_positions_set(self._game_spec, number_of_positions, self._game_spec.get_random_player_func()) 14 | 15 | self.assertGreater(len(positions), number_of_positions-1) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tic_tac_toe_5_4/__init__.py -------------------------------------------------------------------------------- /tic_tac_toe_5_4/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from common.benchmark import benchmark 4 | from games.tic_tac_toe_x import TicTacToeXGameSpec 5 | 6 | tic_tac_toe_5_4_game_spec = TicTacToeXGameSpec(5, 4) 7 | 8 | 9 | def create_convolutional_network(): 10 | input_layer = tf.input_layer = tf.placeholder("float", 11 | (None,) + tic_tac_toe_5_4_game_spec.board_dimensions() + (1,)) 12 | CONVOLUTIONS_LAYER_1 = 64 13 | CONVOLUTIONS_LAYER_2 = 64 14 | CONVOLUTIONS_LAYER_3 = 64 15 | CONVOLUTIONS_LAYER_4 = 64 16 | CONVOLUTIONS_LAYER_5 = 64 17 | FLAT_SIZE = 5 * 5 * CONVOLUTIONS_LAYER_2 18 | FLAT_HIDDEN_NODES = 256 19 | 20 | convolution_weights_1 = tf.Variable(tf.truncated_normal([3, 3, 1, CONVOLUTIONS_LAYER_1], stddev=0.01)) 21 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_1])) 22 | 23 | convolution_weights_2 = tf.Variable( 24 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_1, CONVOLUTIONS_LAYER_2], stddev=0.01)) 25 | convolution_bias_2 = 
tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 26 | 27 | convolution_weights_3 = tf.Variable( 28 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_2, CONVOLUTIONS_LAYER_3], stddev=0.01)) 29 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_3])) 30 | 31 | convolution_weights_4 = tf.Variable( 32 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_3, CONVOLUTIONS_LAYER_4], stddev=0.01)) 33 | convolution_bias_4 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_4])) 34 | 35 | # convolution_weights_5 = tf.Variable( 36 | # tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_4, CONVOLUTIONS_LAYER_5], stddev=0.01)) 37 | # convolution_bias_5 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_5])) 38 | 39 | # feed_forward_weights_1 = tf.Variable(tf.truncated_normal([FLAT_SIZE, FLAT_HIDDEN_NODES], stddev=0.01)) 40 | # feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[FLAT_HIDDEN_NODES])) 41 | 42 | feed_forward_weights_2 = tf.Variable( 43 | tf.truncated_normal([FLAT_SIZE, tic_tac_toe_5_4_game_spec.outputs()], stddev=0.01)) 44 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[tic_tac_toe_5_4_game_spec.outputs()])) 45 | 46 | hidden_convolutional_layer_1 = tf.nn.relu( 47 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 1, 1, 1], padding="SAME") + convolution_bias_1) 48 | 49 | hidden_convolutional_layer_2 = tf.nn.relu( 50 | tf.nn.conv2d(hidden_convolutional_layer_1, convolution_weights_2, strides=[1, 1, 1, 1], 51 | padding="SAME") + convolution_bias_2) 52 | 53 | hidden_convolutional_layer_3 = tf.nn.relu( 54 | tf.nn.conv2d(hidden_convolutional_layer_2, convolution_weights_3, strides=[1, 1, 1, 1], 55 | padding="SAME") + convolution_bias_3) 56 | 57 | hidden_convolutional_layer_4 = tf.nn.relu( 58 | tf.nn.conv2d(hidden_convolutional_layer_3, convolution_weights_4, strides=[1, 1, 1, 1], 59 | padding="SAME") + convolution_bias_4) 60 | 61 | # hidden_convolutional_layer_5 = tf.nn.relu( 62 | # tf.nn.conv2d(hidden_convolutional_layer_4, convolution_weights_5, strides=[1, 1, 1, 1], 63 | # padding="SAME") + convolution_bias_5) 64 | 65 | hidden_convolutional_layer_3_flat = tf.reshape(hidden_convolutional_layer_4, [-1, FLAT_SIZE]) 66 | 67 | # final_hidden_activations = tf.nn.relu( 68 | # tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 69 | 70 | output_layer = tf.nn.softmax(tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_2) + feed_forward_bias_2) 71 | 72 | return input_layer, output_layer, [convolution_weights_1, convolution_bias_1, 73 | convolution_weights_2, convolution_bias_2, 74 | convolution_weights_3, convolution_bias_3, 75 | convolution_weights_4, convolution_bias_4, 76 | # convolution_weights_5, convolution_bias_5, 77 | # feed_forward_weights_1, feed_forward_bias_1, 78 | feed_forward_weights_2, feed_forward_bias_2] 79 | 80 | file_path = 'convolutional_net_5_4_l_c_4_f_1_other_fresh.p' 81 | 82 | benchmark(tic_tac_toe_5_4_game_spec, file_path, create_convolutional_network) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/position_tic_tac_toe_5_4_min_max_depth_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tic_tac_toe_5_4/position_tic_tac_toe_5_4_min_max_depth_6 -------------------------------------------------------------------------------- /tic_tac_toe_5_4/supervised.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | from techniques.train_supervised import train_supervised 6 | from tic_tac_toe_5_4.network import tic_tac_toe_5_4_game_spec, create_convolutional_network 7 | 8 | with open("position_tic_tac_toe_5_4_min_max_depth_6", 'rb') as f: 9 | positions = pickle.load(f) 10 | 11 | # for now we need to reshape input for convolutions and one-hot encode the move responses 12 | # this is the kind of stuff I need to clean up in the overall design 13 | for i in range(len(positions)): 14 | one_hot = np.zeros(tic_tac_toe_5_4_game_spec.outputs()) 15 | np.put(one_hot, tic_tac_toe_5_4_game_spec.tuple_move_to_flat(positions[i][1]), 1) 16 | positions[i] = np.array(positions[i][0]).reshape(tic_tac_toe_5_4_game_spec.board_dimensions()[0], 17 | tic_tac_toe_5_4_game_spec.board_dimensions()[0], 18 | 1), one_hot 19 | 20 | train_supervised(tic_tac_toe_5_4_game_spec, create_convolutional_network, 'convolutional_net_5_4_l_c_4_f_1_other.p', 21 | positions, regularization_coefficent=1e-4) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/train_historical.py: -------------------------------------------------------------------------------- 1 | from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic 2 | from tic_tac_toe_5_4.network import tic_tac_toe_5_4_game_spec, create_convolutional_network 3 | 4 | train_policy_gradients_vs_historic(tic_tac_toe_5_4_game_spec, create_convolutional_network, 5 | 'convolutional_net_5_4_l_c_4_f_1_other_after_1.p', 6 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_2.p', 7 | number_of_games=50000, 8 | print_results_every=500, 9 | save_historic_every=8000) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/train_vs_min_max.py: -------------------------------------------------------------------------------- 1 | from techniques.min_max import min_max_alpha_beta 2 | from techniques.train_policy_gradient import train_policy_gradients 3 | from tic_tac_toe_5_4.network import tic_tac_toe_5_4_game_spec, create_convolutional_network 4 | 5 | 6 | def min_max_move_func(board_state, side): 7 | return min_max_alpha_beta(tic_tac_toe_5_4_game_spec, board_state, side, 3)[1] 8 | 9 | 10 | train_policy_gradients(tic_tac_toe_5_4_game_spec, create_convolutional_network, 11 | 'convolutional_net_5_4_l_c_4_f_1_other_after.p', 12 | opponent_func=min_max_move_func, 13 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_vs_depth_3.p', 14 | number_of_games=5000, 15 | print_results_every=100) -------------------------------------------------------------------------------- /value_network.py: -------------------------------------------------------------------------------- 1 | """ 2 | After using reinforcement learning to train a network (e.g. policy_gradient.py) to play a game well, we then want to 3 | learn to estimate whether that network would win, lose or draw from a given position. 4 | 5 | Alpha Go used a database of real positions to get its predictions from; we don't have that for tic-tac-toe, so instead 6 | we generate some random game positions and train off of the results we get playing from those.
7 | """ 8 | import os 9 | import random 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | from common.network_helpers import create_network, load_network, save_network, \ 15 | get_deterministic_network_move 16 | from games.tic_tac_toe import TicTacToeGameSpec 17 | 18 | HIDDEN_NODES_VALUE = (100, 100, 100) 19 | HIDDEN_NODES_REINFORCEMENT = (100, 100, 100) 20 | BATCH_SIZE = 100 # every how many games to do a parameter update? 21 | LEARN_RATE = 1e-4 22 | REINFORCEMENT_NETWORK_PATH = 'current_network.p' 23 | VALUE_NETWORK_PATH = 'value_netowrk.p' 24 | TRAIN_SAMPLES = 10000 25 | TEST_SAMPLES = 10000 26 | 27 | # to play a different game change this to another spec, e.g TicTacToeXGameSpec or ConnectXGameSpec 28 | game_spec = TicTacToeGameSpec() 29 | 30 | NUMBER_RANDOM_RANGE = (1, game_spec.board_squares() * 0.8) 31 | 32 | 33 | # it would be good to have real board positions, but failing that just generate random ones 34 | def generate_random_board_position(): 35 | while True: 36 | board_state = game_spec.new_board() 37 | number_moves = random.randint(*NUMBER_RANDOM_RANGE) 38 | side = 1 39 | for _ in range(number_moves): 40 | board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))), 41 | side) 42 | if game_spec.has_winner(board_state) != 0: 43 | # start again if we hit an already winning position 44 | continue 45 | 46 | side = -side 47 | return board_state 48 | 49 | 50 | reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network( 51 | game_spec.board_squares(), 52 | HIDDEN_NODES_REINFORCEMENT, 53 | game_spec.outputs()) 54 | 55 | value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), HIDDEN_NODES_VALUE, 56 | output_nodes=1, output_softmax=False) 57 | 58 | target_placeholder = tf.placeholder("float", (None, 1)) 59 | error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer)) 60 | 61 | train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error) 62 | 63 | with tf.Session() as session: 64 | session.run(tf.global_variables_initializer()) 65 | 66 | load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH) 67 | 68 | if os.path.isfile(VALUE_NETWORK_PATH): 69 | print("loading previous version of value network") 70 | load_network(session, value_variables, VALUE_NETWORK_PATH) 71 | 72 | 73 | def make_move(board_state, side): 74 | move = get_deterministic_network_move(session, reinforcement_input_layer, reinforcement_output_layer, 75 | board_state, side) 76 | 77 | return game_spec.flat_move_to_tuple(np.argmax(move)) 78 | 79 | 80 | board_states_training = {} 81 | board_states_test = [] 82 | episode_number = 0 83 | 84 | while len(board_states_training) < TRAIN_SAMPLES + TEST_SAMPLES: 85 | board_state = generate_random_board_position() 86 | board_state_flat = tuple(np.ravel(board_state)) 87 | 88 | # only accept the board_state if not already in the dict 89 | if board_state_flat not in board_states_training: 90 | result = game_spec.play_game(make_move, make_move, board_state=board_state) 91 | board_states_training[board_state_flat] = float(result) 92 | 93 | # take a random selection from training into a test set 94 | for _ in range(TEST_SAMPLES): 95 | sample = random.choice(board_states_training.keys()) 96 | board_states_test.append((sample, board_states_training[sample])) 97 | del board_states_training[sample] 98 | 99 | board_states_training = list(board_states_training.iteritems()) 100 | 101 | test_error = session.run(error, 
feed_dict={value_input_layer: [x[0] for x in board_states_test], 102 | target_placeholder: [[x[1]] for x in board_states_test]}) 103 | 104 | while True: 105 | np.random.shuffle(board_states_training) 106 | train_error = 0 107 | 108 | for start_index in range(0, len(board_states_training) - BATCH_SIZE + 1, BATCH_SIZE): 109 | mini_batch = board_states_training[start_index:start_index + BATCH_SIZE] 110 | 111 | batch_error, _ = session.run([error, train_step], 112 | feed_dict={value_input_layer: [x[0] for x in mini_batch], 113 | target_placeholder: [[x[1]] for x in mini_batch]}) 114 | train_error += batch_error 115 | 116 | new_test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test], 117 | target_placeholder: [[x[1]] for x in board_states_test]}) 118 | 119 | print("episode: %s train_error: %s test_error: %s" % (episode_number, train_error, new_test_error)) 120 | 121 | if new_test_error > test_error: 122 | print("test error went up, stopping training") 123 | break 124 | 125 | test_error = new_test_error 126 | episode_number += 1 127 | 128 | save_network(session, value_variables, VALUE_NETWORK_PATH) 129 | --------------------------------------------------------------------------------
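Once value_network.py has run and saved its weights, the trained value estimate can be queried from other code (for example by a search such as monte_carlo_uct_with_value.py). Below is a minimal usage sketch, assuming the network was saved to 'value_network.p' with the same (100, 100, 100) hidden-layer sizes as above; create_network, load_network and TicTacToeGameSpec come from the repo, while the estimate_position helper and the example query are only illustrative.

import numpy as np
import tensorflow as tf

from common.network_helpers import create_network, load_network
from games.tic_tac_toe import TicTacToeGameSpec

game_spec = TicTacToeGameSpec()

# rebuild the same single-output architecture the value network was trained with
value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(), (100, 100, 100), output_nodes=1, output_softmax=False)


def estimate_position(session, board_state):
    # the network was trained on flattened boards, so flatten before feeding it in
    flat_board = np.ravel(board_state)
    return session.run(value_output_layer, feed_dict={value_input_layer: [flat_board]})[0][0]


with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    load_network(session, value_variables, 'value_network.p')  # assumes the training script above already ran

    # print a scalar estimate of the game result starting from the empty board
    print("estimated result from the empty board: %s" % estimate_position(session, game_spec.new_board()))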