├── .gitignore ├── Alpha Toe.pdf ├── LICENSE ├── README.md ├── common ├── __init__.py ├── base_game_spec.py ├── benchmark.py └── network_helpers.py ├── connect_4 ├── __init__.py ├── network.py ├── position_connect_4_min_max_depth_6 ├── supervised.py ├── train_historical.py └── train_vs_min_max.py ├── games ├── __init__.py ├── connect_4.py ├── tic_tac_toe.py └── tic_tac_toe_x.py ├── policy_gradient.py ├── policy_gradient_historical_competition.py ├── requirements.txt ├── techniques ├── __init__.py ├── create_positions_set.py ├── min_max.py ├── monte_carlo.py ├── monte_carlo_uct_with_value.py ├── train_policy_gradient.py ├── train_policy_gradient_historic.py ├── train_supervised.py └── train_value_network.py ├── tests ├── __init__.py ├── common │ ├── __init__.py │ └── test_network_helpers.py ├── games │ ├── __init__.py │ ├── test_connect_4.py │ └── test_tic_tac_toe_x.py └── techniques │ ├── __init__.py │ ├── test_create_positions_set.py │ ├── test_min_max.py │ ├── test_train_policy_gradient.py │ └── test_train_policy_gradient_historic.py ├── tic_tac_toe_5_4 ├── __init__.py ├── network.py ├── position_tic_tac_toe_5_4_min_max_depth_6 ├── supervised.py ├── train_historical.py └── train_vs_min_max.py └── value_network.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/Alpha Toe.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/Alpha Toe.pdf
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2016 Daniel Slater
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AlphaToe
2 | Applying the deep learning techniques from AlphaGo to play tic-tac-toe
3 | 
4 | These are the code examples to accompany my talk; the slides are in Alpha Toe.pdf
5 | 
6 | As well as the slides, the file policy_gradient.py is a good starting point for the project. All networks are
7 | built using TensorFlow.
8 | 
9 | ## Setup
10 | 
11 | To get running, start by creating a virtualenv/conda env with TensorFlow installed.
Current instructions for this are
12 | at: https://www.tensorflow.org/versions/r0.11/get_started/os_setup.html#anaconda-installation
13 | 
14 | I've also found this useful: https://anaconda.org/jjhelmus/tensorflow
15 | 
16 | Then run the file policy_gradient.py
17 | 
18 | This has been tested with Python 2.7 and 3.5
--------------------------------------------------------------------------------
/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/common/__init__.py
--------------------------------------------------------------------------------
/common/base_game_spec.py:
--------------------------------------------------------------------------------
1 | import operator
2 | import random
3 | from functools import reduce
4 | 
5 | 
6 | class BaseGameSpec(object):
7 |     def __init__(self):
8 |         """Abstract base class for the specification for running/training on a game.
9 | 
10 |         Examples:
11 |             spec = TicTacToeGameSpec()
12 |             result = spec.play_game(func_a, func_b)
13 |         """
14 |         raise NotImplementedError('This is an abstract base class')
15 | 
16 |     def new_board(self):
17 |         raise NotImplementedError()
18 | 
19 |     def apply_move(self, board_state, move, side):
20 |         raise NotImplementedError()
21 | 
22 |     def available_moves(self, board_state):
23 |         raise NotImplementedError()
24 | 
25 |     def has_winner(self, board_state):
26 |         raise NotImplementedError()
27 | 
28 |     def evaluate(self, board_state):
29 |         """An evaluation function for this game, gives an estimate of how good the board position is for the plus player.
30 |         There is no specific range for the values returned, they just need to be relative to each other.
31 | 
32 |         Args:
33 |             board_state (tuple): State of the board
34 | 
35 |         Returns:
36 |             number
37 |         """
38 |         raise NotImplementedError()
39 | 
40 |     def board_dimensions(self):
41 |         """Returns the dimensions of the board for this game
42 | 
43 |         Returns:
44 |             tuple of ints: one int for each dimension of the board, this will normally be 2 ints
45 |         """
46 |         raise NotImplementedError()
47 | 
48 |     def board_squares(self):
49 |         """The number of squares on the board. This can be used for the number of input nodes to a network.
50 | 
51 |         Returns:
52 |             int
53 |         """
54 |         return reduce(operator.mul, self.board_dimensions(), 1)
55 | 
56 |     def outputs(self):
57 |         """The number of moves that could be made in this kind of game, whether or not they are legal. For most games
58 |         this will be every single square on the board, but for connect 4 this is different. If we wanted to do chess in
59 |         the future this method may need to get a bit more complicated.
60 | 
61 |         Returns:
62 |             int
63 |         """
64 |         return self.board_squares()
65 | 
66 |     def flat_move_to_tuple(self, move_index):
67 |         """If the board is 2d then we return a tuple for where we moved to.
68 | e.g if the board is a 3x3 size and our move_index was 6 then 69 | this method will return (2, 0) 70 | 71 | Args: 72 | move_index (int): The index of the square we moved to 73 | 74 | Returns: 75 | tuple or int: For where we moved in board coordinates 76 | """ 77 | if len(self.board_dimensions()) == 1: 78 | return move_index 79 | 80 | board_x = self.board_dimensions()[0] 81 | return int(move_index / board_x), move_index % board_x 82 | 83 | def tuple_move_to_flat(self, tuple_move): 84 | """Does the inverse operation to flat_move_to_tuple 85 | 86 | Args: 87 | tuple_move (tuple): 88 | 89 | Returns: 90 | int : 91 | """ 92 | if len(self.board_dimensions()) == 1: 93 | return tuple_move[0] 94 | else: 95 | return tuple_move[0] * self.board_dimensions()[0] + tuple_move[1] 96 | 97 | def play_game(self, plus_player_func, minus_player_func, log=False, board_state=None): 98 | """Run a single game of until the end, using the provided function args to determine the moves for each 99 | player. 100 | 101 | Args: 102 | plus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 103 | current board_state and side this player is playing, and returns the move the player wants to play. 104 | minus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 105 | current board_state and side this player is playing, and returns the move the player wants to play. 106 | log (bool): If True progress is logged to console, defaults to False 107 | board_state: Optionally have the game start from this position, rather than from a new board 108 | 109 | Returns: 110 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 111 | """ 112 | board_state = board_state or self.new_board() 113 | player_turn = 1 114 | 115 | while True: 116 | _available_moves = list(self.available_moves(board_state)) 117 | 118 | if len(_available_moves) == 0: 119 | # draw 120 | if log: 121 | print("no moves left, game ended a draw") 122 | return 0. 
123 | if player_turn > 0: 124 | move = plus_player_func(board_state, 1) 125 | else: 126 | move = minus_player_func(board_state, -1) 127 | 128 | if move not in _available_moves: 129 | # if a player makes an invalid move the other player wins 130 | if log: 131 | print("illegal move ", move) 132 | return -player_turn 133 | 134 | board_state = self.apply_move(board_state, move, player_turn) 135 | if log: 136 | print(board_state) 137 | 138 | winner = self.has_winner(board_state) 139 | if winner != 0: 140 | if log: 141 | print("we have a winner, side: %s" % player_turn) 142 | return winner 143 | player_turn = -player_turn 144 | 145 | def get_random_player_func(self): 146 | """Return a function that makes moves for the current game by choosing a move randomly 147 | NOTE: this move returns the function that makes the random move so should be used like so: 148 | Examples: 149 | self.play_game(self.get_random_player_func(), self.get_random_player_func()) 150 | 151 | Returns: 152 | board_state, side (int) -> move : function that plays this game by making random moves 153 | """ 154 | return lambda board_state, side: random.choice(list(self.available_moves(board_state))) 155 | -------------------------------------------------------------------------------- /common/benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | Plays games against a variety of algorithms to see how good a network is 3 | """ 4 | import functools 5 | 6 | import tensorflow as tf 7 | 8 | from common.network_helpers import load_network, get_deterministic_network_move 9 | from techniques.min_max import min_max_alpha_beta 10 | from techniques.monte_carlo import monte_carlo_tree_search_uct 11 | 12 | 13 | def benchmark(game_spec, network_file_path, create_network_func, log_games=False, games_vs_random=500): 14 | """Plays games against a variety of algorithms to see how good a network is. Results are currently just 15 | printed to std out 16 | 17 | Args: 18 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 19 | create_network_func (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 20 | Method that creates the network we will train. 
21 |         network_file_path (str): path to the file with weights we want to load for this network
22 |         log_games (bool): If True print all positions from all games played
23 |         games_vs_random (int): Number of games to play vs random opponents
24 |     """
25 |     input_layer, output_layer, variables = create_network_func()
26 | 
27 |     with tf.Session() as session:
28 |         session.run(tf.global_variables_initializer())
29 |         load_network(session, variables, network_file_path)
30 | 
31 |         def make_move(board_state, side):
32 |             move = get_deterministic_network_move(session, input_layer, output_layer, board_state, side,
33 |                                                   valid_only=True, game_spec=game_spec)
34 |             return game_spec.flat_move_to_tuple(move.argmax())
35 | 
36 |         def min_max_move_func(board_state, side, depth):
37 |             return min_max_alpha_beta(game_spec, board_state, side, depth)[1]
38 | 
39 |         def monte_carlo_move_func(board_state, side):
40 |             return monte_carlo_tree_search_uct(game_spec, board_state, side, 100000)[1]
41 | 
42 |         results = []
43 |         for _ in range(int(games_vs_random / 2)):
44 |             result = game_spec.play_game(make_move,
45 |                                          game_spec.get_random_player_func(),
46 |                                          log=log_games)
47 |             results.append(result)
48 |             result = game_spec.play_game(
49 |                 game_spec.get_random_player_func(),
50 |                 make_move, log=log_games)
51 |             results.append(-result)
52 | 
53 |         print("*** results vs random = %s" % (sum(results),))
54 | 
55 |         results = []
56 |         for _ in range(1):
57 |             result = game_spec.play_game(make_move,
58 |                                          functools.partial(min_max_move_func, depth=2), log=log_games)
59 |             results.append(result)
60 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=2),
61 |                                          make_move, log=log_games)
62 |             results.append(-result)
63 | 
64 |         print("*** results vs min max depth 2 = %s" % (sum(results),))
65 | 
66 |         results = []
67 |         for _ in range(1):
68 |             result = game_spec.play_game(make_move,
69 |                                          functools.partial(min_max_move_func, depth=4), log=log_games)
70 |             results.append(result)
71 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=4),
72 |                                          make_move, log=log_games)
73 |             results.append(-result)
74 | 
75 |         print("*** results vs min max depth 4 = %s" % (sum(results),))
76 | 
77 |         results = []
78 |         for _ in range(1):
79 |             result = game_spec.play_game(make_move,
80 |                                          functools.partial(min_max_move_func, depth=6), log=log_games)
81 |             results.append(result)
82 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=6),
83 |                                          make_move, log=log_games)
84 |             results.append(-result)
85 | 
86 |         print("*** results vs min max depth 6 = %s" % (sum(results),))
87 | 
88 |         results = []
89 |         for _ in range(1):
90 |             result = game_spec.play_game(make_move,
91 |                                          functools.partial(min_max_move_func, depth=8), log=log_games)
92 |             results.append(result)
93 |             result = game_spec.play_game(functools.partial(min_max_move_func, depth=8),
94 |                                          make_move, log=log_games)
95 |             results.append(-result)
96 | 
97 |         print("*** results vs min max depth 8 = %s" % (sum(results),))
98 | 
99 |         results = []
100 |         for _ in range(1):
101 |             result = game_spec.play_game(make_move,
102 |                                          monte_carlo_move_func, log=log_games)
103 |             results.append(result)
104 |             result = game_spec.play_game(monte_carlo_move_func,
105 |                                          make_move, log=log_games)
106 |             results.append(-result)
107 | 
108 |         print("*** results vs monte carlo uct 100000 = %s" % (sum(results),))
109 | 
--------------------------------------------------------------------------------
/common/network_helpers.py:
--------------------------------------------------------------------------------
1 | import
operator 2 | import pickle 3 | from functools import reduce 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | 9 | def create_network(input_nodes, hidden_nodes, output_nodes=None, output_softmax=True): 10 | """Create a network with relu activations at each layer 11 | 12 | Args: 13 | output_nodes: (int): Number of output nodes, if None then number of input nodes is used 14 | input_nodes (int or tuple(int)): The size of the board this network will work on. The output layer will also be 15 | this size if not specified. Can be an int if 1d or a tuple of ints for a 2d+ dim board 16 | hidden_nodes ([int]): The number of hidden nodes in each hidden layer 17 | output_softmax (bool): If True softmax is used in the final layer, otherwise just use the activation with no 18 | non-linearity function 19 | 20 | Returns: 21 | (input_layer, output_layer, [variables]) : The final item in the tuple is a list containing all the parameters, 22 | wieghts and biases used in this network 23 | """ 24 | output_nodes = output_nodes or input_nodes 25 | 26 | variables = [] 27 | 28 | with tf.name_scope('network'): 29 | if isinstance(input_nodes, tuple): 30 | input_layer = tf.placeholder("float", (None,) + input_nodes) 31 | flat_size = reduce(operator.mul, input_nodes, 1) 32 | current_layer = tf.reshape(input_layer, (-1, flat_size)) 33 | else: 34 | input_layer = tf.placeholder("float", (None, input_nodes)) 35 | current_layer = input_layer 36 | 37 | for hidden_nodes in hidden_nodes: 38 | last_layer_nodes = int(current_layer.get_shape()[-1]) 39 | hidden_weights = tf.Variable( 40 | tf.truncated_normal((last_layer_nodes, hidden_nodes), stddev=1. / np.sqrt(last_layer_nodes)), 41 | name='weights') 42 | hidden_bias = tf.Variable(tf.constant(0.01, shape=(hidden_nodes,)), name='biases') 43 | 44 | variables.append(hidden_weights) 45 | variables.append(hidden_bias) 46 | 47 | current_layer = tf.nn.relu( 48 | tf.matmul(current_layer, hidden_weights) + hidden_bias) 49 | 50 | if isinstance(output_nodes, tuple): 51 | output_nodes = reduce(operator.mul, input_nodes, 1) 52 | 53 | # for some reason having output std divided by np.sqrt(output_nodes) massively outperforms np.sqrt(hidden_nodes) 54 | output_weights = tf.Variable( 55 | tf.truncated_normal((hidden_nodes, output_nodes), stddev=1. / np.sqrt(output_nodes)), name="output_weights") 56 | output_bias = tf.Variable(tf.constant(0.01, shape=(output_nodes,)), name="output_bias") 57 | 58 | variables.append(output_weights) 59 | variables.append(output_bias) 60 | 61 | output_layer = tf.matmul(current_layer, output_weights) + output_bias 62 | if output_softmax: 63 | output_layer = tf.nn.softmax(output_layer) 64 | 65 | return input_layer, output_layer, variables 66 | 67 | 68 | def save_network(session, tf_variables, file_path): 69 | """Save the given set of variables to the given file using the given session 70 | 71 | Args: 72 | session (tf.Session): session within which the variables has been initialised 73 | tf_variables (list of tf.Variable): list of variables which will be saved to the file 74 | file_path (str): path of the file we want to save to. 
75 | """ 76 | variable_values = session.run(tf_variables) 77 | with open(file_path, mode='wb') as f: 78 | pickle.dump(variable_values, f) 79 | 80 | 81 | def load_network(session, tf_variables, file_path): 82 | """Load the given set of variables from the given file using the given session 83 | 84 | Args: 85 | session (tf.Session): session within which the variables has been initialised 86 | tf_variables (list of tf.Variable): list of variables which will set up with the values saved to the file. List 87 | order matters, in must be the exact same order as was used to save and all of the same shape. 88 | file_path (str): path of the file we want to load from. 89 | """ 90 | with open(file_path, mode='rb') as f: 91 | variable_values = pickle.load(f) 92 | 93 | try: 94 | if len(variable_values) != len(tf_variables): 95 | raise ValueError("Network in file had different structure, variables in file: %s variables in memeory: %s" 96 | % (len(variable_values), len(tf_variables))) 97 | for value, tf_variable in zip(variable_values, tf_variables): 98 | session.run(tf_variable.assign(value)) 99 | except ValueError as ex: 100 | # TODO: maybe raise custom exception 101 | raise ValueError("""Tried to load network file %s with different architecture from the in memory network. 102 | Error was %s 103 | Either delete the network file to train a new network from scratch or change the in memory network to match that dimensions of the one in the file""" % (file_path, ex)) 104 | 105 | 106 | def invert_board_state(board_state): 107 | """Returns the board state inverted, so all 1 are replaced with -1 and visa-versa 108 | 109 | Args: 110 | board_state (tuple of tuple of ints): The board we want to invert 111 | 112 | Returns: 113 | (tuple of tuple of ints) The board state for the other player 114 | """ 115 | return tuple(tuple(-board_state[j][i] for i in range(len(board_state[0]))) for j in range(len(board_state))) 116 | 117 | 118 | def get_stochastic_network_move(session, input_layer, output_layer, board_state, side, 119 | valid_only=False, game_spec=None): 120 | """Choose a move for the given board_state using a stocastic policy. A move is selected using the values from the 121 | output_layer as a categorical probability distribution to select a single move 122 | 123 | Args: 124 | session (tf.Session): Session used to run this network 125 | input_layer (tf.Placeholder): Placeholder to the network used to feed in the board_state 126 | output_layer (tf.Tensor): Tensor that will output the probabilities of the moves, we expect this to be of 127 | dimesensions (None, board_squares) and the sum of values across the board_squares to be 1. 128 | board_state: The board_state we want to get the move for. 129 | side: The side that is making the move. 130 | 131 | Returns: 132 | (np.array) It's shape is (board_squares), and it is a 1 hot encoding for the move the network has chosen. 
133 | """ 134 | np_board_state = np.array(board_state) 135 | if side == -1: 136 | np_board_state = -np_board_state 137 | 138 | np_board_state = np_board_state.reshape(1, *input_layer.get_shape().as_list()[1:]) 139 | probability_of_actions = session.run(output_layer, 140 | feed_dict={input_layer: np_board_state})[0] 141 | 142 | if valid_only: 143 | available_moves = list(game_spec.available_moves(board_state)) 144 | if len(available_moves) == 1: 145 | move = np.zeros(game_spec.board_squares()) 146 | np.put(move, game_spec.tuple_move_to_flat(available_moves[0]), 1) 147 | return move 148 | available_moves_flat = [game_spec.tuple_move_to_flat(x) for x in available_moves] 149 | for i in range(game_spec.board_squares()): 150 | if i not in available_moves_flat: 151 | probability_of_actions[i] = 0. 152 | 153 | prob_mag = sum(probability_of_actions) 154 | if prob_mag != 0.: 155 | probability_of_actions /= sum(probability_of_actions) 156 | 157 | try: 158 | move = np.random.multinomial(1, probability_of_actions) 159 | except ValueError: 160 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 161 | # so need to reduce slightly to be a valid value 162 | move = np.random.multinomial(1, probability_of_actions / (1. + 1e-6)) 163 | 164 | return move 165 | 166 | 167 | def get_deterministic_network_move(session, input_layer, output_layer, board_state, side, valid_only=False, 168 | game_spec=None): 169 | """Choose a move for the given board_state using a deterministic policy. A move is selected using the values from 170 | the output_layer and selecting the move with the highest score. 171 | 172 | Args: 173 | session (tf.Session): Session used to run this network 174 | input_layer (tf.Placeholder): Placeholder to the network used to feed in the board_state 175 | output_layer (tf.Tensor): Tensor that will output the probabilities of the moves, we expect this to be of 176 | dimesensions (None, board_squares). 177 | board_state: The board_state we want to get the move for. 178 | side: The side that is making the move. 179 | 180 | Returns: 181 | (np.array) It's shape is (board_squares), and it is a 1 hot encoding for the move the network has chosen. 182 | """ 183 | np_board_state = np.array(board_state) 184 | np_board_state = np_board_state.reshape(1, *input_layer.get_shape().as_list()[1:]) 185 | if side == -1: 186 | np_board_state = -np_board_state 187 | 188 | probability_of_actions = session.run(output_layer, 189 | feed_dict={input_layer: np_board_state})[0] 190 | 191 | if valid_only: 192 | available_moves = game_spec.available_moves(board_state) 193 | available_moves_flat = [game_spec.tuple_move_to_flat(x) for x in available_moves] 194 | for i in range(game_spec.board_squares()): 195 | if i not in available_moves_flat: 196 | probability_of_actions[i] = 0 197 | 198 | move = np.argmax(probability_of_actions) 199 | one_hot = np.zeros(len(probability_of_actions)) 200 | one_hot[move] = 1. 
201 | return one_hot 202 | -------------------------------------------------------------------------------- /connect_4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/connect_4/__init__.py -------------------------------------------------------------------------------- /connect_4/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from games.connect_4 import Connect4GameSpec 4 | 5 | connect_4_game_spec = Connect4GameSpec() 6 | 7 | 8 | def create_convolutional_network(): 9 | input_layer = tf.input_layer = tf.placeholder("float", 10 | (None,) + connect_4_game_spec.board_dimensions() + (1,)) 11 | CONVOLUTIONS_LAYER_1 = 64 12 | CONVOLUTIONS_LAYER_2 = 64 13 | CONVOLUTIONS_LAYER_3 = 64 14 | CONVOLUTIONS_LAYER_4 = 64 15 | CONVOLUTIONS_LAYER_5 = 64 16 | FLAT_SIZE = 7 * 6 * CONVOLUTIONS_LAYER_2 17 | 18 | convolution_weights_1 = tf.Variable(tf.truncated_normal([3, 3, 1, CONVOLUTIONS_LAYER_1], stddev=0.01)) 19 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_1])) 20 | 21 | convolution_weights_2 = tf.Variable( 22 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_1, CONVOLUTIONS_LAYER_2], stddev=0.01)) 23 | convolution_bias_2 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 24 | 25 | convolution_weights_3 = tf.Variable( 26 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_2, CONVOLUTIONS_LAYER_3], stddev=0.01)) 27 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_3])) 28 | 29 | convolution_weights_4 = tf.Variable( 30 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_3, CONVOLUTIONS_LAYER_4], stddev=0.01)) 31 | convolution_bias_4 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_4])) 32 | 33 | convolution_weights_5 = tf.Variable( 34 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_4, CONVOLUTIONS_LAYER_5], stddev=0.01)) 35 | convolution_bias_5 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_5])) 36 | 37 | # feed_forward_weights_1 = tf.Variable(tf.truncated_normal([FLAT_SIZE, FLAT_HIDDEN_NODES], stddev=0.01)) 38 | # feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[FLAT_HIDDEN_NODES])) 39 | 40 | feed_forward_weights_2 = tf.Variable( 41 | tf.truncated_normal([FLAT_SIZE, connect_4_game_spec.outputs()], stddev=0.01)) 42 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[connect_4_game_spec.outputs()])) 43 | 44 | hidden_convolutional_layer_1 = tf.nn.relu( 45 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 1, 1, 1], padding="SAME") + convolution_bias_1) 46 | 47 | hidden_convolutional_layer_2 = tf.nn.relu( 48 | tf.nn.conv2d(hidden_convolutional_layer_1, convolution_weights_2, strides=[1, 1, 1, 1], 49 | padding="SAME") + convolution_bias_2) 50 | 51 | hidden_convolutional_layer_3 = tf.nn.relu( 52 | tf.nn.conv2d(hidden_convolutional_layer_2, convolution_weights_3, strides=[1, 1, 1, 1], 53 | padding="SAME") + convolution_bias_3) 54 | 55 | hidden_convolutional_layer_4 = tf.nn.relu( 56 | tf.nn.conv2d(hidden_convolutional_layer_3, convolution_weights_4, strides=[1, 1, 1, 1], 57 | padding="SAME") + convolution_bias_4) 58 | 59 | hidden_convolutional_layer_5 = tf.nn.relu( 60 | tf.nn.conv2d(hidden_convolutional_layer_4, convolution_weights_5, strides=[1, 1, 1, 1], 61 | padding="SAME") + convolution_bias_5) 62 | 63 | hidden_convolutional_layer_3_flat = 
tf.reshape(hidden_convolutional_layer_5, [-1, FLAT_SIZE]) 64 | 65 | # final_hidden_activations = tf.nn.relu( 66 | # tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 67 | 68 | output_layer = tf.nn.softmax(tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_2) + feed_forward_bias_2) 69 | 70 | return input_layer, output_layer, [convolution_weights_1, convolution_bias_1, 71 | convolution_weights_2, convolution_bias_2, 72 | convolution_weights_3, convolution_bias_3, 73 | convolution_weights_4, convolution_bias_4, 74 | convolution_weights_5, convolution_bias_5, 75 | # feed_forward_weights_1, feed_forward_bias_1, 76 | feed_forward_weights_2, feed_forward_bias_2] 77 | -------------------------------------------------------------------------------- /connect_4/position_connect_4_min_max_depth_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/connect_4/position_connect_4_min_max_depth_6 -------------------------------------------------------------------------------- /connect_4/supervised.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | from techniques.train_supervised import train_supervised 6 | from connect_4.network import connect_4_game_spec, create_convolutional_network 7 | 8 | with open("position_connect_4_min_max_depth_6", 'rb') as f: 9 | positions = pickle.load(f) 10 | 11 | # for now we need to reshape input for convolutions and one hot the move responses 12 | # this is the kind of stuff I need to clean up in overall design 13 | for i in range(len(positions)): 14 | one_hot = np.zeros(connect_4_game_spec.outputs()) 15 | np.put(one_hot, positions[i][1], 1) 16 | positions[i] = np.array(positions[i][0]).reshape(connect_4_game_spec.board_dimensions()[0], 17 | connect_4_game_spec.board_dimensions()[1], 18 | 1), one_hot 19 | 20 | # for convolutional_layers in [3, 4, 5, 6]: 21 | # for convolutional_channels in [48, 64, 80, 96]: 22 | 23 | train_supervised(connect_4_game_spec, create_convolutional_network, 'convolutional_net_l_c_5_f_1_other.p', 24 | positions, 25 | regularization_coefficent=1e-3, 26 | learn_rate=5e-5) -------------------------------------------------------------------------------- /connect_4/train_historical.py: -------------------------------------------------------------------------------- 1 | from connect_4.network import create_convolutional_network, connect_4_game_spec 2 | from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic 3 | 4 | 5 | train_policy_gradients_vs_historic(connect_4_game_spec, create_convolutional_network, 6 | 'convolutional_net_5_4_l_c_4_f_1_other_after_1.p', 7 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_2.p', 8 | number_of_games=50000, 9 | print_results_every=500, 10 | save_historic_every=8000) -------------------------------------------------------------------------------- /connect_4/train_vs_min_max.py: -------------------------------------------------------------------------------- 1 | from techniques.min_max import min_max_alpha_beta 2 | from techniques.train_policy_gradient import train_policy_gradients 3 | from connect_4.network import connect_4_game_spec, create_convolutional_network 4 | 5 | 6 | def min_max_move_func(board_state, side): 7 | return min_max_alpha_beta(connect_4_game_spec, board_state, side, 3)[1] 8 | 9 | 10 | 
train_policy_gradients(connect_4_game_spec, create_convolutional_network, 11 | 'convolutional_net_5_4_l_c_4_f_1_other_after.p', 12 | opponent_func=min_max_move_func, 13 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_vs_depth_3.p', 14 | number_of_games=5000, 15 | print_results_every=100) -------------------------------------------------------------------------------- /games/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/games/__init__.py -------------------------------------------------------------------------------- /games/connect_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of connect 4 on a board_width, board_height and winning length can be specified in relevant 3 | methods. Allowing you to play connect 5, 6, 7, etc. Defaults are board_width = 7, board_height = 6, winning_length = 4 4 | 5 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 6 | where each player plays. 7 | The board is represented by a board_width x board_height tuple of ints. A 0 means no player has played in a space, 1 8 | means player one has played there, -1 means the seconds player has played there. The apply_move method can be used to 9 | return a copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 10 | """ 11 | 12 | import random 13 | 14 | from common.base_game_spec import BaseGameSpec 15 | from games.tic_tac_toe_x import evaluate 16 | 17 | 18 | def _new_board(board_width, board_height): 19 | """Return a emprty tic-tac-toe board we can use for simulating a game. 20 | 21 | Args: 22 | board_width (int): The width of the board, a board_width * board_height board is created 23 | board_height (int): The height of the board, a board_width * board_height board is created 24 | 25 | Returns: 26 | board_width x board_height tuple of ints 27 | """ 28 | return tuple(tuple(0 for _ in range(board_height)) for _ in range(board_width)) 29 | 30 | 31 | def apply_move(board_state, move_x, side): 32 | """Returns a copy of the given board_state with the desired move applied. 33 | 34 | Args: 35 | board_state (2d tuple of int): The given board_state we want to apply the move to. 36 | move_x (int): Which column we are going to "drop" our piece in 37 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 38 | 39 | Returns: 40 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 41 | """ 42 | # find position in which move will settle 43 | move_y = 0 44 | for x in board_state[move_x]: 45 | if x == 0: 46 | break 47 | else: 48 | move_y += 1 49 | 50 | def get_tuples(): 51 | for i in range(len(board_state)): 52 | if move_x == i: 53 | temp = list(board_state[i]) 54 | temp[move_y] = side 55 | yield tuple(temp) 56 | else: 57 | yield board_state[i] 58 | 59 | return tuple(get_tuples()) 60 | 61 | 62 | def available_moves(board_state): 63 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 64 | pieces played. 65 | 66 | Args: 67 | board_state: The board_state we want to check for valid moves. 68 | 69 | Returns: 70 | Generator of int: All the valid moves that can be played in this position. 
71 | """ 72 | for x in range(len(board_state)): 73 | if any(y == 0 for y in board_state[x]): 74 | yield x 75 | 76 | 77 | def _has_winning_line(line, winning_length): 78 | count = 0 79 | last_side = 0 80 | for x in line: 81 | if x == last_side: 82 | count += 1 83 | if count == winning_length: 84 | return last_side 85 | else: 86 | count = 1 87 | last_side = x 88 | return 0 89 | 90 | 91 | def has_winner(board_state, winning_length=4): 92 | """Determine if a player has won on the given board_state. 93 | 94 | Args: 95 | board_state (2d tuple of int): The current board_state we want to evaluate. 96 | winning_length (int): The number of moves in a row needed for a win. 97 | 98 | Returns: 99 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 100 | """ 101 | board_width = len(board_state) 102 | board_height = len(board_state[0]) 103 | 104 | # check rows 105 | for x in range(board_width): 106 | winner = _has_winning_line(board_state[x], winning_length) 107 | if winner != 0: 108 | return winner 109 | # check columns 110 | for y in range(board_height): 111 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 112 | if winner != 0: 113 | return winner 114 | 115 | # check diagonals 116 | diagonals_start = -(board_width - winning_length) 117 | diagonals_end = (board_width - winning_length) 118 | for d in range(diagonals_start, diagonals_end+1): 119 | winner = _has_winning_line( 120 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 121 | winning_length) 122 | if winner != 0: 123 | return winner 124 | for d in range(diagonals_start, diagonals_end+1): 125 | winner = _has_winning_line( 126 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 127 | winning_length) 128 | if winner != 0: 129 | return winner 130 | 131 | return 0 # no one has won, return 0 for a draw 132 | 133 | 134 | def play_game(plus_player_func, minus_player_func, board_width=7, board_height=6, winning_length=4, log=False): 135 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 136 | player. 137 | 138 | Args: 139 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 140 | Function that takes the current board_state and side this player is playing, and returns the move the player 141 | wants to play. 142 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 143 | Function that takes the current board_state and side this player is playing, and returns the move the player 144 | wants to play. 145 | board_width (int): The width of the board, a board_width * board_height board is created 146 | board_height (int): The height of the board, a board_width * board_height board is created 147 | winning_length (int): The number of pieces in a row needed to win a game. 148 | log (bool): If True progress is logged to console, defaults to False 149 | 150 | Returns: 151 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 152 | """ 153 | board_state = _new_board(board_width, board_height) 154 | player_turn = 1 155 | 156 | while True: 157 | _avialable_moves = list(available_moves(board_state)) 158 | if len(_avialable_moves) == 0: 159 | # draw 160 | if log: 161 | print("no moves left, game ended a draw") 162 | return 0. 
163 | if player_turn > 0: 164 | move = plus_player_func(board_state, 1) 165 | else: 166 | move = minus_player_func(board_state, -1) 167 | 168 | if move not in _avialable_moves: 169 | # if a player makes an invalid move the other player wins 170 | if log: 171 | print("illegal move ", move) 172 | return -player_turn 173 | 174 | board_state = apply_move(board_state, move, player_turn) 175 | if log: 176 | print(board_state) 177 | 178 | winner = has_winner(board_state, winning_length) 179 | if winner != 0: 180 | if log: 181 | print("we have a winner, side: %s" % player_turn) 182 | return winner 183 | player_turn = -player_turn 184 | 185 | 186 | def random_player(board_state, _): 187 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 188 | valid moves in the current state. 189 | 190 | Args: 191 | board_state (2d tuple of int): The current state of the board 192 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 193 | 194 | Returns: 195 | (int, int): the move we want to play on the current board 196 | """ 197 | moves = list(available_moves(board_state)) 198 | return random.choice(moves) 199 | 200 | 201 | class Connect4GameSpec(BaseGameSpec): 202 | def __init__(self, board_width=7, board_height=6, winning_length=4): 203 | self._board_height = board_height 204 | self._board_width = board_width 205 | self._winning_length = winning_length 206 | self.available_moves = available_moves 207 | self.apply_move = apply_move 208 | 209 | def new_board(self): 210 | return _new_board(self._board_width, self._board_height) 211 | 212 | def has_winner(self, board_state): 213 | return has_winner(board_state, self._winning_length) 214 | 215 | def board_dimensions(self): 216 | return self._board_width, self._board_height 217 | 218 | def flat_move_to_tuple(self, move_index): 219 | return move_index 220 | 221 | def outputs(self): 222 | return self._board_width * self._board_height 223 | 224 | def evaluate(self, board_state): 225 | return evaluate(board_state, self._winning_length) 226 | 227 | if __name__ == '__main__': 228 | # example of playing a game 229 | play_game(random_player, random_player, log=True, board_width=7, board_height=6, winning_length=4) 230 | -------------------------------------------------------------------------------- /games/tic_tac_toe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a 3 by 3 board. 3 | Two players take turns making moves on squares of the board, the first to get 3 in a row, including diagonals, wins. If 4 | there are no valid moves left to make the game ends a draw. 5 | 6 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 7 | where each player plays. 8 | The board is represented by a 3 x 3 tuple of ints. A 0 means no player has played in a space, 1 means player one has 9 | played there, -1 means the seconds player has played there. The apply_move method can be used to return a copy of a 10 | given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 11 | """ 12 | import itertools 13 | import random 14 | 15 | from common.base_game_spec import BaseGameSpec 16 | from techniques.min_max import evaluate 17 | 18 | 19 | def _new_board(): 20 | """Return a emprty tic-tac-toe board we can use for simulating a game. 
21 | 22 | Returns: 23 | 3x3 tuple of ints 24 | """ 25 | return ((0, 0, 0), 26 | (0, 0, 0), 27 | (0, 0, 0)) 28 | 29 | 30 | def apply_move(board_state, move, side): 31 | """Returns a copy of the given board_state with the desired move applied. 32 | 33 | Args: 34 | board_state (3x3 tuple of int): The given board_state we want to apply the move to. 35 | move (int, int): The position we want to make the move in. 36 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 37 | 38 | Returns: 39 | (3x3 tuple of int): A copy of the board_state with the given move applied for the given side. 40 | """ 41 | move_x, move_y = move 42 | 43 | def get_tuples(): 44 | for x in range(3): 45 | if move_x == x: 46 | temp = list(board_state[x]) 47 | temp[move_y] = side 48 | yield tuple(temp) 49 | else: 50 | yield board_state[x] 51 | 52 | return tuple(get_tuples()) 53 | 54 | 55 | def available_moves(board_state): 56 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 57 | pieces played. 58 | 59 | Args: 60 | board_state: The board_state we want to check for valid moves. 61 | 62 | Returns: 63 | Generator of (int, int): All the valid moves that can be played in this position. 64 | """ 65 | for x, y in itertools.product(range(3), range(3)): 66 | if board_state[x][y] == 0: 67 | yield (x, y) 68 | 69 | 70 | def _has_3_in_a_line(line): 71 | return all(x == -1 for x in line) | all(x == 1 for x in line) 72 | 73 | 74 | def has_winner(board_state): 75 | """Determine if a player has won on the given board_state. 76 | 77 | Args: 78 | board_state (3x3 tuple of int): The current board_state we want to evaluate. 79 | 80 | Returns: 81 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 82 | """ 83 | # check rows 84 | for x in range(3): 85 | if _has_3_in_a_line(board_state[x]): 86 | return board_state[x][0] 87 | # check columns 88 | for y in range(3): 89 | if _has_3_in_a_line([i[y] for i in board_state]): 90 | return board_state[0][y] 91 | 92 | # check diagonals 93 | if _has_3_in_a_line([board_state[i][i] for i in range(3)]): 94 | return board_state[0][0] 95 | if _has_3_in_a_line([board_state[2 - i][i] for i in range(3)]): 96 | return board_state[0][2] 97 | 98 | return 0 # no one has won, return 0 for a draw 99 | 100 | 101 | def play_game(plus_player_func, minus_player_func, log=False): 102 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 103 | player. 104 | 105 | Args: 106 | plus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 107 | current board_state and side this player is playing, and returns the move the player wants to play. 108 | minus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 109 | current board_state and side this player is playing, and returns the move the player wants to play. 110 | log (bool): If True progress is logged to console, defaults to False 111 | 112 | Returns: 113 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 114 | """ 115 | board_state = _new_board() 116 | player_turn = 1 117 | 118 | while True: 119 | _available_moves = list(available_moves(board_state)) 120 | 121 | if len(_available_moves) == 0: 122 | # draw 123 | if log: 124 | print("no moves left, game ended a draw") 125 | return 0. 
126 | if player_turn > 0: 127 | move = plus_player_func(board_state, 1) 128 | else: 129 | move = minus_player_func(board_state, -1) 130 | 131 | if move not in _available_moves: 132 | # if a player makes an invalid move the other player wins 133 | if log: 134 | print("illegal move ", move) 135 | return -player_turn 136 | 137 | board_state = apply_move(board_state, move, player_turn) 138 | if log: 139 | print(board_state) 140 | 141 | winner = has_winner(board_state) 142 | if winner != 0: 143 | if log: 144 | print("we have a winner, side: %s" % player_turn) 145 | return winner 146 | player_turn = -player_turn 147 | 148 | 149 | def random_player(board_state, _): 150 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 151 | valid moves in the current state. 152 | 153 | Args: 154 | board_state (3x3 tuple of int): The current state of the board 155 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 156 | 157 | Returns: 158 | (int, int): the move we want to play on the current board 159 | """ 160 | moves = list(available_moves(board_state)) 161 | return random.choice(moves) 162 | 163 | 164 | class TicTacToeGameSpec(BaseGameSpec): 165 | def __init__(self): 166 | self.available_moves = available_moves 167 | self.has_winner = has_winner 168 | self.new_board = _new_board 169 | self.apply_move = apply_move 170 | self.evaluate = evaluate 171 | 172 | def board_dimensions(self): 173 | return 3, 3 174 | 175 | 176 | if __name__ == '__main__': 177 | # example of playing a game 178 | play_game(random_player, random_player, log=True) 179 | -------------------------------------------------------------------------------- /games/tic_tac_toe_x.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a board of any size with a specified number in a row for the win. This is 3 | similar to tic_tac_toe.py but all relevent moves are paramiterized by board_size arg that sets how big the board is and 4 | winning_length which determines how many in a row are needed to win. Defaults are 5 and 4. This allows you to play games 5 | in a more complex environment than standard tic-tac-toe. 6 | 7 | Two players take turns making moves on squares of the board, the first to get winning_length in a row, including 8 | diagonals, wins. If there are no valid moves left to make the game ends a draw. 9 | 10 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 11 | where each player plays. 12 | The board is represented by a board_size x board_size tuple of ints. A 0 means no player has played in a space, 1 means 13 | player one has played there, -1 means the seconds player has played there. The apply_move method can be used to return a 14 | copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 15 | """ 16 | import itertools 17 | import random 18 | 19 | from common.base_game_spec import BaseGameSpec 20 | 21 | 22 | def _new_board(board_size): 23 | """Return a emprty tic-tac-toe board we can use for simulating a game. 
24 | 25 | Args: 26 | board_size (int): The size of one side of the board, a board_size * board_size board is created 27 | 28 | Returns: 29 | board_size x board_size tuple of ints 30 | """ 31 | return tuple(tuple(0 for _ in range(board_size)) for _ in range(board_size)) 32 | 33 | 34 | def apply_move(board_state, move, side): 35 | """Returns a copy of the given board_state with the desired move applied. 36 | 37 | Args: 38 | board_state (2d tuple of int): The given board_state we want to apply the move to. 39 | move (int, int): The position we want to make the move in. 40 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 41 | 42 | Returns: 43 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 44 | """ 45 | move_x, move_y = move 46 | 47 | def get_tuples(): 48 | for x in range(len(board_state)): 49 | if move_x == x: 50 | temp = list(board_state[x]) 51 | temp[move_y] = side 52 | yield tuple(temp) 53 | else: 54 | yield board_state[x] 55 | 56 | return tuple(get_tuples()) 57 | 58 | 59 | def available_moves(board_state): 60 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 61 | pieces played. 62 | 63 | Args: 64 | board_state: The board_state we want to check for valid moves. 65 | 66 | Returns: 67 | Generator of (int, int): All the valid moves that can be played in this position. 68 | """ 69 | for x, y in itertools.product(range(len(board_state)), range(len(board_state[0]))): 70 | if board_state[x][y] == 0: 71 | yield (x, y) 72 | 73 | 74 | def _has_winning_line(line, winning_length): 75 | count = 0 76 | last_side = 0 77 | for x in line: 78 | if x == last_side: 79 | count += 1 80 | if count == winning_length: 81 | return last_side 82 | else: 83 | count = 1 84 | last_side = x 85 | return 0 86 | 87 | 88 | def has_winner(board_state, winning_length): 89 | """Determine if a player has won on the given board_state. 90 | 91 | Args: 92 | board_state (2d tuple of int): The current board_state we want to evaluate. 93 | winning_length (int): The number of moves in a row needed for a win. 94 | 95 | Returns: 96 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 
97 | """ 98 | board_width = len(board_state) 99 | board_height = len(board_state[0]) 100 | 101 | # check rows 102 | for x in range(board_width): 103 | winner = _has_winning_line(board_state[x], winning_length) 104 | if winner != 0: 105 | return winner 106 | # check columns 107 | for y in range(board_height): 108 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 109 | if winner != 0: 110 | return winner 111 | 112 | # check diagonals 113 | diagonals_start = -(board_width - winning_length) 114 | diagonals_end = (board_width - winning_length) 115 | for d in range(diagonals_start, diagonals_end + 1): 116 | winner = _has_winning_line( 117 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 118 | winning_length) 119 | if winner != 0: 120 | return winner 121 | for d in range(diagonals_start, diagonals_end + 1): 122 | winner = _has_winning_line( 123 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 124 | winning_length) 125 | if winner != 0: 126 | return winner 127 | 128 | return 0 # no one has won, return 0 for a draw 129 | 130 | 131 | def _evaluate_line(line, winning_length): 132 | count = 0 133 | last_side = 0 134 | score = 0 135 | neutrals = 0 136 | 137 | for x in line: 138 | if x == last_side: 139 | count += 1 140 | if count == winning_length and neutrals == 0: 141 | return 100000 * x # a side has already won here 142 | elif x == 0: # we could score here 143 | neutrals += 1 144 | elif x == -last_side: 145 | if neutrals + count >= winning_length: 146 | score += (count - 1) * last_side 147 | count = 1 148 | last_side = x 149 | neutrals = 0 150 | else: 151 | last_side = x 152 | count = 1 153 | 154 | if neutrals + count >= winning_length: 155 | score += (count - 1) * last_side 156 | 157 | return score 158 | 159 | 160 | def evaluate(board_state, winning_length): 161 | """An evaluation function for this game, gives an estimate of how good the board position is for the plus player. 162 | There is no specific range for the values returned, they just need to be relative to each other. 163 | 164 | Args: 165 | winning_length (int): The length needed to win a game 166 | board_state (tuple): State of the board 167 | 168 | Returns: 169 | number 170 | """ 171 | board_width = len(board_state) 172 | board_height = len(board_state[0]) 173 | 174 | score = 0 175 | 176 | # check rows 177 | for x in range(board_width): 178 | score += _evaluate_line(board_state[x], winning_length) 179 | # check columns 180 | for y in range(board_height): 181 | score += _evaluate_line((i[y] for i in board_state), winning_length) 182 | 183 | # check diagonals 184 | diagonals_start = -(board_width - winning_length) 185 | diagonals_end = (board_width - winning_length) 186 | for d in range(diagonals_start, diagonals_end + 1): 187 | score += _evaluate_line( 188 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 189 | winning_length) 190 | for d in range(diagonals_start, diagonals_end + 1): 191 | score += _evaluate_line( 192 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 193 | winning_length) 194 | 195 | return score 196 | 197 | 198 | def play_game(plus_player_func, minus_player_func, board_size=5, winning_length=4, log=False): 199 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 200 | player. 
201 | 202 | Args: 203 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 204 | Function that takes the current board_state and side this player is playing, and returns the move the player 205 | wants to play. 206 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 207 | Function that takes the current board_state and side this player is playing, and returns the move the player 208 | wants to play. 209 | board_size (int): The size of a single side of the board. Game is played on a board_size*board_size sized board 210 | winning_length (int): The number of pieces in a row needed to win a game. 211 | log (bool): If True progress is logged to console, defaults to False 212 | 213 | Returns: 214 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 215 | """ 216 | board_state = _new_board(board_size) 217 | player_turn = 1 218 | 219 | while True: 220 | _available_moves = list(available_moves(board_state)) 221 | if len(_available_moves) == 0: 222 | # draw 223 | if log: 224 | print("no moves left, game ended a draw") 225 | return 0. 226 | if player_turn > 0: 227 | move = plus_player_func(board_state, 1) 228 | else: 229 | move = minus_player_func(board_state, -1) 230 | 231 | if move not in _available_moves: 232 | # if a player makes an invalid move the other player wins 233 | if log: 234 | print("illegal move ", move) 235 | return -player_turn 236 | 237 | board_state = apply_move(board_state, move, player_turn) 238 | print(board_state) 239 | 240 | winner = has_winner(board_state, winning_length) 241 | if winner != 0: 242 | if log: 243 | print("we have a winner, side: %s" % player_turn) 244 | return winner 245 | player_turn = -player_turn 246 | 247 | 248 | def random_player(board_state, _): 249 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 250 | valid moves in the current state. 251 | 252 | Args: 253 | board_state (2d tuple of int): The current state of the board 254 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 255 | 256 | Returns: 257 | (int, int): the move we want to play on the current board 258 | """ 259 | moves = list(available_moves(board_state)) 260 | return random.choice(moves) 261 | 262 | 263 | class TicTacToeXGameSpec(BaseGameSpec): 264 | def __init__(self, board_size, winning_length): 265 | """ 266 | 267 | Args: 268 | board_size (int): The length of one side of the board, so the bard will have board_size*board_size total 269 | squares 270 | winning_length (int): The length in a row needed to win the game. 
Should be less than or equal to board_size 271 | """ 272 | if not isinstance(board_size, int): 273 | raise TypeError("board_size must be an int") 274 | if not isinstance(winning_length, int): 275 | raise TypeError("winning_length must be an int") 276 | if winning_length > board_size: 277 | raise ValueError("winning_length must be less than or equal to board_size") 278 | self._winning_length = winning_length 279 | self._board_size = board_size 280 | self.available_moves = available_moves 281 | self.apply_move = apply_move 282 | 283 | def new_board(self): 284 | return _new_board(self._board_size) 285 | 286 | def has_winner(self, board_state): 287 | return has_winner(board_state, self._winning_length) 288 | 289 | def board_dimensions(self): 290 | return self._board_size, self._board_size 291 | 292 | def evaluate(self, board_state): 293 | return evaluate(board_state, self._winning_length) 294 | 295 | 296 | if __name__ == '__main__': 297 | # example of playing a game 298 | play_game(random_player, random_player, log=True, board_size=10, winning_length=4) 299 | -------------------------------------------------------------------------------- /policy_gradient.py: -------------------------------------------------------------------------------- 1 | """ 2 | Builds and trains a neural network that uses policy gradients to learn to play Tic-Tac-Toe. 3 | 4 | The input to the network is a vector with a number for each space on the board. If the space has one of the networks 5 | pieces then the input vector has the value 1. -1 for the opponents space and 0 for no piece. 6 | 7 | The output of the network is a also of the size of the board with each number learning the probability that a move in 8 | that space is the best move. 9 | 10 | The network plays successive games randomly alternating between going first and second against an opponent that makes 11 | moves by randomly selecting a free space. The neural network does NOT initially have any way of knowing what is or is not 12 | a valid move, so initially it must learn the rules of the game. 13 | 14 | I have trained this version with success at 3x3 tic tac toe until it has a success rate in the region of 75% this maybe 15 | as good as it can do, because 3x3 tic-tac-toe is a theoretical draw, so the random opponent will often get lucky and 16 | force a draw. 17 | """ 18 | import functools 19 | 20 | from common.network_helpers import create_network 21 | from games.tic_tac_toe import TicTacToeGameSpec 22 | from techniques.train_policy_gradient import train_policy_gradients 23 | 24 | BATCH_SIZE = 100 # every how many games to do a parameter update? 
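# (The update itself lives in techniques/train_policy_gradient.py: every BATCH_SIZE games the stored
# board states, moves and rewards are replayed and the REINFORCE-style objective
# log(probability of the chosen move) * reward is maximised with the Adam optimizer.)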
25 | LEARN_RATE = 1e-4 26 | PRINT_RESULTS_EVERY_X = 1000 # every how many games to print the results 27 | NETWORK_FILE_PATH = None#'current_network.p' # path to save the network to 28 | NUMBER_OF_GAMES_TO_RUN = 1000000 29 | 30 | # to play a different game change this to another spec, e.g TicTacToeXGameSpec or ConnectXGameSpec, to get these to run 31 | # well may require tuning the hyper parameters a bit 32 | game_spec = TicTacToeGameSpec() 33 | 34 | create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100)) 35 | 36 | train_policy_gradients(game_spec, create_network_func, NETWORK_FILE_PATH, 37 | number_of_games=NUMBER_OF_GAMES_TO_RUN, 38 | batch_size=BATCH_SIZE, 39 | learn_rate=LEARN_RATE, 40 | print_results_every=PRINT_RESULTS_EVERY_X) 41 | -------------------------------------------------------------------------------- /policy_gradient_historical_competition.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is the same as the policy_gradient.py network except that instead of playing against a random opponent. It plays 3 | against previous versions of itself. It is first created with the weights from the "current_network.p" file, if no file 4 | is found there random weights are used. It then creates a series of copies of itself and plays against them. 5 | After "SAVE_HISTORICAL_NETWORK_EVERY" games, it saves it's current weights into the weights of one of the historical 6 | networks. Over time the main network and the historical networks should improve. 7 | """ 8 | import collections 9 | import functools 10 | import os 11 | import random 12 | 13 | import numpy as np 14 | import tensorflow as tf 15 | 16 | from common.network_helpers import create_network, load_network, get_stochastic_network_move, \ 17 | save_network 18 | from games.tic_tac_toe import TicTacToeGameSpec 19 | from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic 20 | 21 | HIDDEN_NODES = (100, 100, 100) 22 | SAVE_HISTORICAL_NETWORK_EVERY = 10000 23 | game_spec = TicTacToeGameSpec() 24 | 25 | create_network_func = functools.partial(create_network, game_spec.board_squares(), HIDDEN_NODES) 26 | 27 | train_policy_gradients_vs_historic(game_spec, create_network_func, 28 | 'train_vs_historical.p', 29 | save_historic_every=SAVE_HISTORICAL_NETWORK_EVERY) 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow=0.7.1 2 | -------------------------------------------------------------------------------- /techniques/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/techniques/__init__.py -------------------------------------------------------------------------------- /techniques/create_positions_set.py: -------------------------------------------------------------------------------- 1 | """ 2 | For games like tic-tac-toe we are unlikely to be able to find databases of top level games. This file allows us 3 | generate sets games using good existing algorithms from which we can train our networks. 
Such as min max 4 | """ 5 | import pickle 6 | import random 7 | 8 | import zlib 9 | 10 | from common.network_helpers import invert_board_state 11 | from techniques.min_max import min_max_alpha_beta 12 | 13 | 14 | def create_positions_set(game_spec, number_of_positions, choose_move_func, compress=False): 15 | """Generate a set of positions. All positions are set to be from the point of view of the plus player. In order to 16 | aid breadth of search if a position that we have already calculated the best move for comes up twice we choose a 17 | random move. Moves chosen randomly are not stored in the returned set. 18 | 19 | Args: 20 | game_spec (common.BaseGameSpec): 21 | number_of_positions (int): We will simulate this many positions 22 | choose_move_func (): Function that picks the best move from a board position 23 | compress (bool): If True then we will compress all the state pairs we store to save on memory, use 24 | pickle.loads(zlib.decompress(item)) to uncompress 25 | 26 | Returns: 27 | {board_state, move} 28 | """ 29 | positions = {} 30 | random_player_func = game_spec.get_random_player_func() 31 | 32 | def store_move_pair(board_state, side): 33 | if side != 1: 34 | board_state_for_plus = invert_board_state(board_state) 35 | else: 36 | board_state_for_plus = board_state 37 | 38 | if compress: 39 | board_state_for_plus = zlib.compress(pickle.dumps(board_state_for_plus)) 40 | 41 | # if we have already seen this position then make a random move to increase position diversity 42 | if board_state_for_plus in positions: 43 | return random_player_func(board_state, side) 44 | else: 45 | move = choose_move_func(board_state, side) 46 | positions[board_state_for_plus] = move 47 | 48 | return move 49 | 50 | while number_of_positions > len(positions.keys()): 51 | game_spec.play_game(store_move_pair, store_move_pair) 52 | print(len(positions.keys())) 53 | 54 | return positions 55 | 56 | 57 | if __name__ == '__main__': 58 | # example usage 59 | from games.connect_4 import Connect4GameSpec 60 | 61 | game_spec = Connect4GameSpec() 62 | 63 | def choose_move_func(board_state, side): 64 | return min_max_alpha_beta(game_spec, board_state, side, 6)[1] 65 | 66 | positions = create_positions_set(game_spec, 10000, choose_move_func) 67 | 68 | positions_as_array = [(x, y) for x, y in positions.items()] 69 | random.shuffle(positions_as_array) 70 | 71 | with open('position_connect_4_min_max_depth_6', mode='wb') as f: 72 | pickle.dump(positions_as_array, f) 73 | 74 | print("created") -------------------------------------------------------------------------------- /techniques/min_max.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def _score_line(line): 5 | minus_count = line.count(-1) 6 | plus_count = line.count(1) 7 | if minus_count + plus_count < 3: 8 | if minus_count == 2: 9 | return -1 10 | elif plus_count == 2: 11 | return 1 12 | return 0 13 | 14 | 15 | def evaluate_tic_tac_toe(board_state): 16 | """Get a rough score for how good we think this board position is for the plus_player for the game tic-tac-toe. Does 17 | this based on number of 2 in row lines we have. 
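(A line counts towards the score when it holds exactly two pieces of one side and none of the other; see _score_line above.)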
18 | 19 | Args: 20 | board_state (3x3 tuple of int): The board state we are evaluating 21 | 22 | Returns: 23 | int: evaluated score for the position for the plus player, posative is good for the plus player, negative good 24 | for the minus player 25 | """ 26 | score = 0 27 | for x in range(3): 28 | score += _score_line(board_state[x]) 29 | for y in range(3): 30 | score += _score_line([i[y] for i in board_state]) 31 | 32 | # diagonals 33 | score += _score_line([board_state[i][i] for i in range(3)]) 34 | score += _score_line([board_state[2 - i][i] for i in range(3)]) 35 | 36 | return score 37 | 38 | 39 | def min_max(game_spec, board_state, side, max_depth, evaluation_func=None): 40 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 41 | move 42 | 43 | Args: 44 | game_spec (BaseGameSpec): The specification for the game we are evaluating 45 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player, If None then 46 | we will use the evaluation function from the game_spec 47 | board_state (3x3 tuple of int): The board state we are evaluating 48 | side (int): either +1 or -1 49 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 50 | position is. 51 | 52 | Returns: 53 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 54 | """ 55 | best_score = None 56 | best_score_move = None 57 | evaluation_func = evaluation_func or game_spec.evaluate 58 | 59 | moves = list(game_spec.available_moves(board_state)) 60 | if not moves: 61 | # this is a draw 62 | return 0, None 63 | 64 | for move in moves: 65 | new_board_state = game_spec.apply_move(board_state, move, side) 66 | winner = game_spec.has_winner(new_board_state) 67 | if winner != 0: 68 | return winner * 10000, move 69 | else: 70 | if max_depth <= 1: 71 | score = evaluation_func(new_board_state) 72 | else: 73 | score, _ = min_max(game_spec, new_board_state, -side, max_depth - 1, evaluation_func=evaluation_func) 74 | if side > 0: 75 | if best_score is None or score > best_score: 76 | best_score = score 77 | best_score_move = move 78 | else: 79 | if best_score is None or score < best_score: 80 | best_score = score 81 | best_score_move = move 82 | return best_score, best_score_move 83 | 84 | 85 | def min_max_alpha_beta(game_spec, board_state, side, max_depth, evaluation_func=None, alpha=-sys.float_info.max, 86 | beta=sys.float_info.max): 87 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 88 | move 89 | 90 | Args: 91 | game_spec (BaseGameSpec): The specification for the game we are evaluating 92 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player 93 | board_state (3x3 tuple of int): The board state we are evaluating 94 | side (int): either +1 or -1 95 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 96 | position is. 
97 | alpha (float): Used when this is called recursively, normally ignore 98 | beta (float): Used when this is called recursively, normally ignore 99 | 100 | Returns: 101 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 102 | """ 103 | evaluation_func = evaluation_func or game_spec.evaluate 104 | best_score_move = None 105 | moves = list(game_spec.available_moves(board_state)) 106 | if not moves: 107 | return 0, None 108 | 109 | for move in moves: 110 | new_board_state = game_spec.apply_move(board_state, move, side) 111 | winner = game_spec.has_winner(new_board_state) 112 | if winner != 0: 113 | return winner * 10000, move 114 | else: 115 | if max_depth <= 1: 116 | score = evaluation_func(new_board_state) 117 | else: 118 | score, _ = min_max_alpha_beta(game_spec, new_board_state, -side, max_depth - 1, evaluation_func, alpha, 119 | beta) 120 | 121 | if side > 0: 122 | if score > alpha: 123 | alpha = score 124 | best_score_move = move 125 | else: 126 | if score < beta: 127 | beta = score 128 | best_score_move = move 129 | if alpha >= beta: 130 | break 131 | 132 | return alpha if side > 0 else beta, best_score_move 133 | 134 | 135 | def min_max_player(board_state, side): 136 | return min_max(board_state, side, 5)[1] 137 | 138 | 139 | def evaluate(board_state): 140 | """Get a rough score for how good we think this board position is for the plus_player. Does this based on number of 141 | 2 in row lines we have. 142 | 143 | Args: 144 | board_state (3x3 tuple of int): The board state we are evaluating 145 | 146 | Returns: 147 | int: evaluated score for the position for the plus player, posative is good for the plus player, negative good 148 | for the minus player 149 | """ 150 | score = 0 151 | for x in range(len(board_state)): 152 | score += _score_line(board_state[x]) 153 | for y in range(len(board_state[0])): 154 | score += _score_line([i[y] for i in board_state]) 155 | 156 | # diagonals 157 | score += _score_line([board_state[i][i] for i in range(3)]) 158 | score += _score_line([board_state[2 - i][i] for i in range(3)]) 159 | 160 | return score 161 | -------------------------------------------------------------------------------- /techniques/monte_carlo.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import math 4 | 5 | 6 | def _monte_carlo_sample(game_spec, board_state, side): 7 | """Sample a single rollout from the current board_state and side. Moves are made to the current board_state until we 8 | reach a terminal state then the result and the first move made to get there is returned. 9 | 10 | Args: 11 | game_spec (BaseGameSpec): The specification for the game we are evaluating 12 | board_state (3x3 tuple of int): state of the board 13 | side (int): side currently to play. 
+1 for the plus player, -1 for the minus player 14 | 15 | Returns: 16 | (result(int), move(int,int)): The result from this rollout, +1 for a win for the plus player -1 for a win for 17 | the minus player, 0 for a draw 18 | """ 19 | result = game_spec.has_winner(board_state) 20 | if result != 0: 21 | return result, None 22 | moves = list(game_spec.available_moves(board_state)) 23 | if not moves: 24 | return 0, None 25 | 26 | # select a random move 27 | move = random.choice(moves) 28 | result, next_move = _monte_carlo_sample(game_spec, game_spec.apply_move(board_state, move, side), -side) 29 | return result, move 30 | 31 | 32 | def monte_carlo_tree_search(game_spec, board_state, side, number_of_samples): 33 | """Evaluate the best from the current board_state for the given side using monte carlo sampling. 34 | 35 | Args: 36 | game_spec (BaseGameSpec): The specification for the game we are evaluating 37 | board_state (3x3 tuple of int): state of the board 38 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 39 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 40 | better the estimation of the position 41 | 42 | Returns: 43 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 44 | """ 45 | move_wins = collections.defaultdict(int) 46 | move_samples = collections.defaultdict(int) 47 | for _ in range(number_of_samples): 48 | result, move = _monte_carlo_sample(game_spec, board_state, side) 49 | # store the result and a count of the number of times we have tried this move 50 | if result == side: 51 | move_wins[move] += 1 52 | move_samples[move] += 1 53 | 54 | # get the move with the best average result 55 | move = max(move_wins, key=lambda x: move_wins.get(x) / move_samples[move]) 56 | 57 | return move_wins[move] / move_samples[move], move 58 | 59 | 60 | def _upper_confidence_bounds(payout, samples_for_this_machine, log_total_samples): 61 | return payout / samples_for_this_machine + math.sqrt((2 * log_total_samples) / samples_for_this_machine) 62 | 63 | 64 | def monte_carlo_tree_search_uct(game_spec, board_state, side, number_of_samples): 65 | """Evaluate the best from the current board_state for the given side using monte carlo sampling with upper 66 | confidence bounds for trees. 67 | 68 | Args: 69 | game_spec (BaseGameSpec): The specification for the game we are evaluating 70 | board_state (3x3 tuple of int): state of the board 71 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 72 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 73 | better the estimation of the position 74 | 75 | Returns: 76 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 
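Note: once every child position reachable from the current state has been sampled at least once, moves during a rollout are picked by the UCB1 rule from _upper_confidence_bounds above (average payout of the child plus sqrt(2 * ln(total samples of the children) / samples of that child)); until then a move is chosen at random.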
77 | """ 78 | state_results = collections.defaultdict(float) 79 | state_samples = collections.defaultdict(float) 80 | 81 | for _ in range(number_of_samples): 82 | current_side = side 83 | current_board_state = board_state 84 | first_unvisited_node = True 85 | rollout_path = [] 86 | result = 0 87 | 88 | while result == 0: 89 | move_states = {move: game_spec.apply_move(current_board_state, move, current_side) 90 | for move in game_spec.available_moves(current_board_state)} 91 | 92 | if not move_states: 93 | result = 0 94 | break 95 | 96 | if all((state in state_samples) for _, state in move_states): 97 | log_total_samples = math.log(sum(state_samples[s] for s in move_states.values())) 98 | move, state = max(move_states, key=lambda _, s: _upper_confidence_bounds(state_results[s], 99 | state_samples[s], 100 | log_total_samples)) 101 | else: 102 | move = random.choice(list(move_states.keys())) 103 | 104 | current_board_state = move_states[move] 105 | 106 | if first_unvisited_node: 107 | rollout_path.append((current_board_state, current_side)) 108 | if current_board_state not in state_samples: 109 | first_unvisited_node = False 110 | 111 | current_side = -current_side 112 | 113 | result = game_spec.has_winner(current_board_state) 114 | 115 | for path_board_state, path_side in rollout_path: 116 | state_samples[path_board_state] += 1. 117 | result *= path_side 118 | # normalize results to be between 0 and 1 before this it between -1 and 1 119 | result /= 2. 120 | result += .5 121 | state_results[path_board_state] += result 122 | 123 | move_states = {move: game_spec.apply_move(board_state, move, side) for move in game_spec.available_moves(board_state)} 124 | 125 | move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]]) 126 | 127 | return state_results[move_states[move]] / state_samples[move_states[move]], move 128 | 129 | 130 | if __name__ == '__main__': 131 | from games.tic_tac_toe import TicTacToeGameSpec 132 | 133 | sample_board_state = ((1, 0, -1), 134 | (1, 0, 0), 135 | (0, -1, 0)) 136 | 137 | print(monte_carlo_tree_search_uct(TicTacToeGameSpec(), sample_board_state, -1, 10000)) 138 | -------------------------------------------------------------------------------- /techniques/monte_carlo_uct_with_value.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | 4 | import math 5 | 6 | from techniques.monte_carlo import _upper_confidence_bounds 7 | 8 | 9 | def monte_carlo_tree_search_uct_with_value(game_spec, board_state, side, number_of_samples, value_func, 10 | value_weighting): 11 | """Evaluate the best from the current board_state for the given side using monte carlo sampling with upper 12 | confidence bounds for trees. 13 | 14 | Args: 15 | game_spec (BaseGameSpec): The specification for the game we are evaluating 16 | board_state (3x3 tuple of int): state of the board 17 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 18 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 19 | better the estimation of the position 20 | value_func (board_state, side -> float): 21 | value_weighting (float): parameter to adjust how much priority we give to the value_func 22 | 23 | Returns: 24 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 
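Note: this behaves like monte_carlo_tree_search_uct except that when picking a move during a rollout the UCB1 score of each child is augmented with value_func's estimate for that position multiplied by value_weighting.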
25 | """ 26 | state_results = collections.defaultdict(float) 27 | state_samples = collections.defaultdict(float) 28 | state_values = collections.defaultdict(float) 29 | 30 | for _ in range(number_of_samples): 31 | current_side = side 32 | current_board_state = board_state 33 | first_unvisited_node = True 34 | rollout_path = [] 35 | result = 0 36 | 37 | while result == 0: 38 | move_states = {move: game_spec.apply_move(current_board_state, move, current_side) 39 | for move in game_spec.available_moves(current_board_state)} 40 | 41 | if not move_states: 42 | result = 0 43 | break 44 | 45 | if all((state in state_samples) for _, state in move_states): 46 | log_total_samples = math.log(sum(state_samples[s] for s in move_states.values())) 47 | move, state = max(move_states, key=lambda _, s: state_values[s] * value_weighting + 48 | _upper_confidence_bounds(state_results[s], 49 | state_samples[s], 50 | log_total_samples)) 51 | else: 52 | move = random.choice(list(move_states.keys())) 53 | 54 | current_board_state = move_states[move] 55 | 56 | if first_unvisited_node: 57 | rollout_path.append((current_board_state, current_side)) 58 | if current_board_state not in state_samples: 59 | state_values[current_board_state] = value_func(current_board_state) 60 | first_unvisited_node = False 61 | 62 | current_side = -current_side 63 | 64 | result = game_spec.has_winner(current_board_state) 65 | 66 | for path_board_state, path_side in rollout_path: 67 | state_samples[path_board_state] += 1. 68 | result *= path_side 69 | # normalize results to be between 0 and 1 before this it between -1 and 1 70 | result /= 2. 71 | result += .5 72 | state_results[path_board_state] += result 73 | 74 | move_states = {move: game_spec.apply_move(board_state, move, side) for move in 75 | game_spec.available_moves(board_state)} 76 | 77 | move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]]) 78 | 79 | return state_results[move_states[move]] / state_samples[move_states[move]], move 80 | -------------------------------------------------------------------------------- /techniques/train_policy_gradient.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import os 3 | import random 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | from common.network_helpers import load_network, get_stochastic_network_move, save_network 9 | 10 | 11 | def train_policy_gradients(game_spec, 12 | create_network, 13 | network_file_path, 14 | save_network_file_path=None, 15 | opponent_func=None, 16 | number_of_games=10000, 17 | print_results_every=1000, 18 | learn_rate=1e-4, 19 | batch_size=100, 20 | randomize_first_player=True): 21 | """Train a network using policy gradients 22 | 23 | Args: 24 | save_network_file_path (str): Optionally specifiy a path to use for saving the network, if unset then 25 | the network_file_path param is used. 26 | opponent_func (board_state, side) -> move: Function for the opponent, if unset we use an opponent playing 27 | randomly 28 | randomize_first_player (bool): If True we alternate between being the first and second player 29 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 30 | create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 31 | Method that creates the network we will train. 
32 | network_file_path (str): path to the file with weights we want to load for this network 33 | number_of_games (int): number of games to play before stopping 34 | print_results_every (int): Prints results to std out every x games, also saves the network 35 | learn_rate (float): 36 | batch_size (int): 37 | 38 | Returns: 39 | (variables used in the final network : list, win rate: float) 40 | """ 41 | save_network_file_path = save_network_file_path or network_file_path 42 | opponent_func = opponent_func or game_spec.get_random_player_func() 43 | reward_placeholder = tf.placeholder("float", shape=(None,)) 44 | actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.outputs())) 45 | 46 | input_layer, output_layer, variables = create_network() 47 | 48 | policy_gradient = tf.log( 49 | tf.reduce_sum(tf.multiply(actual_move_placeholder, output_layer), reduction_indices=1)) * reward_placeholder 50 | train_step = tf.train.AdamOptimizer(learn_rate).minimize(-policy_gradient) 51 | 52 | with tf.Session() as session: 53 | session.run(tf.global_variables_initializer()) 54 | 55 | if network_file_path and os.path.isfile(network_file_path): 56 | print("loading pre-existing network") 57 | load_network(session, variables, network_file_path) 58 | 59 | mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], [] 60 | results = collections.deque(maxlen=print_results_every) 61 | 62 | def make_training_move(board_state, side): 63 | mini_batch_board_states.append(np.ravel(board_state) * side) 64 | move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side) 65 | mini_batch_moves.append(move) 66 | return game_spec.flat_move_to_tuple(move.argmax()) 67 | 68 | for episode_number in range(1, number_of_games): 69 | # randomize if going first or second 70 | if (not randomize_first_player) or bool(random.getrandbits(1)): 71 | reward = game_spec.play_game(make_training_move, opponent_func) 72 | else: 73 | reward = -game_spec.play_game(opponent_func, make_training_move) 74 | 75 | results.append(reward) 76 | 77 | # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick 78 | last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards) 79 | 80 | reward /= float(last_game_length) 81 | 82 | mini_batch_rewards += ([reward] * last_game_length) 83 | 84 | if episode_number % batch_size == 0: 85 | normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards) 86 | 87 | rewards_std = np.std(normalized_rewards) 88 | if rewards_std != 0: 89 | normalized_rewards /= rewards_std 90 | else: 91 | print("warning: got mini batch std of 0.") 92 | 93 | np_mini_batch_board_states = np.array(mini_batch_board_states) \ 94 | .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:]) 95 | 96 | session.run(train_step, feed_dict={input_layer: np_mini_batch_board_states, 97 | reward_placeholder: normalized_rewards, 98 | actual_move_placeholder: mini_batch_moves}) 99 | 100 | # clear batches 101 | del mini_batch_board_states[:] 102 | del mini_batch_moves[:] 103 | del mini_batch_rewards[:] 104 | 105 | if episode_number % print_results_every == 0: 106 | print("episode: %s win_rate: %s" % (episode_number, _win_rate(print_results_every, results))) 107 | if network_file_path: 108 | save_network(session, variables, save_network_file_path) 109 | 110 | if network_file_path: 111 | save_network(session, variables, save_network_file_path) 112 | 113 | return variables, _win_rate(print_results_every, results) 114 | 115 | 116 | 
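# Example usage (a minimal sketch; see policy_gradient.py at the repo root for a complete script, the
# hyper-parameter values below are illustrative only):
#
#     import functools
#     from common.network_helpers import create_network
#     from games.tic_tac_toe import TicTacToeGameSpec
#
#     game_spec = TicTacToeGameSpec()
#     create_network_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100))
#     variables, win_rate = train_policy_gradients(game_spec, create_network_func, None,
#                                                  number_of_games=10000, batch_size=100,
#                                                  learn_rate=1e-4, print_results_every=1000)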
def _win_rate(print_results_every, results): 117 | return 0.5 + sum(results) / (print_results_every * 2.) 118 | -------------------------------------------------------------------------------- /techniques/train_policy_gradient_historic.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import functools 3 | import os 4 | import random 5 | 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from common.network_helpers import get_stochastic_network_move, load_network, save_network 10 | 11 | 12 | def train_policy_gradients_vs_historic(game_spec, create_network, network_file_path, 13 | save_network_file_path=None, 14 | number_of_historic_networks=8, 15 | save_historic_every=10000, 16 | historic_network_base_path='historic_network', 17 | number_of_games=100000, 18 | print_results_every=1000, 19 | learn_rate=1e-4, 20 | batch_size=100): 21 | """Train a network against itself and over time store new versions of itself to play against. 22 | 23 | Args: 24 | historic_network_base_path (str): Base path to save new historic networks to; a number for the network "slot" is 25 | appended to the end of this string. 26 | save_historic_every (int): We save a version of the learning network into one of the historic network 27 | "slots" every x number of games. We have number_of_historic_networks "slots". 28 | number_of_historic_networks (int): We keep this many old networks to play against 29 | save_network_file_path (str): Optionally specify a path to use for saving the network, if unset then 30 | the network_file_path param is used. 31 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 32 | create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 33 | Method that creates the network we will train. 
34 | network_file_path (str): path to the file with weights we want to load for this network 35 | number_of_games (int): number of games to play before stopping 36 | print_results_every (int): Prints results to std out every x games, also saves the network 37 | learn_rate (float): 38 | batch_size (int): 39 | 40 | Returns: 41 | [tf.Vaiables] : trained variables used in the final network 42 | """ 43 | input_layer, output_layer, variables = create_network() 44 | 45 | reward_placeholder = tf.placeholder("float", shape=(None,)) 46 | actual_move_placeholder = tf.placeholder("float", shape=(None, game_spec.board_squares())) 47 | policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer) 48 | train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(-policy_gradient) 49 | 50 | current_historical_index = 0 51 | historical_networks = [] 52 | 53 | mini_batch_board_states, mini_batch_moves, mini_batch_rewards = [], [], [] 54 | results = collections.deque(maxlen=print_results_every) 55 | 56 | for _ in range(number_of_historic_networks): 57 | historical_input_layer, historical_output_layer, historical_variables = create_network() 58 | historical_networks.append((historical_input_layer, historical_output_layer, historical_variables)) 59 | 60 | with tf.Session() as session: 61 | session.run(tf.global_variables_initializer()) 62 | 63 | def make_move_historical(histoical_network_index, board_state, side): 64 | net = historical_networks[histoical_network_index] 65 | move = get_stochastic_network_move(session, net[0], net[1], board_state, side, 66 | valid_only=True, game_spec=game_spec) 67 | return game_spec.flat_move_to_tuple(move.argmax()) 68 | 69 | def make_training_move(board_state, side): 70 | mini_batch_board_states.append(np.ravel(board_state) * side) 71 | move = get_stochastic_network_move(session, input_layer, output_layer, board_state, side, 72 | valid_only=True, game_spec=game_spec) 73 | mini_batch_moves.append(move) 74 | return game_spec.flat_move_to_tuple(move.argmax()) 75 | 76 | if os.path.isfile(network_file_path): 77 | print("loading pre existing weights") 78 | load_network(session, variables, network_file_path) 79 | else: 80 | print("could not find previous weights so initialising randomly") 81 | 82 | for i in range(number_of_historic_networks): 83 | if os.path.isfile(historic_network_base_path + str(i) + '.p'): 84 | load_network(session, historical_networks[i][2], historic_network_base_path + str(i) + '.p') 85 | elif os.path.isfile(network_file_path): 86 | # if we can't load a historical file use the current network weights 87 | load_network(session, historical_networks[i][2], network_file_path) 88 | 89 | for episode_number in range(1, number_of_games): 90 | opponent_index = random.randint(0, number_of_historic_networks - 1) 91 | make_move_historical_for_index = functools.partial(make_move_historical, opponent_index) 92 | 93 | # randomize if going first or second 94 | if bool(random.getrandbits(1)): 95 | reward = game_spec.play_game(make_training_move, make_move_historical_for_index) 96 | else: 97 | reward = -game_spec.play_game(make_move_historical_for_index, make_training_move) 98 | 99 | results.append(reward) 100 | 101 | # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick 102 | last_game_length = len(mini_batch_board_states) - len(mini_batch_rewards) 103 | 104 | reward /= float(last_game_length) 105 | 106 | mini_batch_rewards += ([reward] * last_game_length) 107 | 108 | 
episode_number += 1 109 | 110 | if episode_number % batch_size == 0: 111 | normalized_rewards = mini_batch_rewards - np.mean(mini_batch_rewards) 112 | rewards_std = np.std(normalized_rewards) 113 | if rewards_std != 0: 114 | normalized_rewards /= rewards_std 115 | else: 116 | print("warning: got mini batch std of 0.") 117 | 118 | np_mini_batch_board_states = np.array(mini_batch_board_states) \ 119 | .reshape(len(mini_batch_rewards), *input_layer.get_shape().as_list()[1:]) 120 | 121 | session.run(train_step, feed_dict={input_layer: np_mini_batch_board_states, 122 | reward_placeholder: normalized_rewards, 123 | actual_move_placeholder: mini_batch_moves}) 124 | 125 | # clear batches 126 | del mini_batch_board_states[:] 127 | del mini_batch_moves[:] 128 | del mini_batch_rewards[:] 129 | 130 | if episode_number % print_results_every == 0: 131 | print("episode: %s average result: %s" % (episode_number, np.mean(results))) 132 | 133 | if episode_number % save_historic_every == 0: 134 | print("saving historical network %s", current_historical_index) 135 | save_network(session, variables, historic_network_base_path + str(current_historical_index) + '.p') 136 | load_network(session, historical_networks[current_historical_index][2], 137 | historic_network_base_path + str(current_historical_index) + '.p') 138 | 139 | # also save to the main network file 140 | save_network(session, variables, save_network_file_path or network_file_path) 141 | 142 | current_historical_index += 1 143 | current_historical_index %= number_of_historic_networks 144 | 145 | # save our final weights 146 | save_network(session, variables, save_network_file_path or network_file_path) 147 | 148 | return variables -------------------------------------------------------------------------------- /techniques/train_supervised.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import tensorflow as tf 5 | 6 | from common.network_helpers import save_network, load_network 7 | 8 | 9 | def train_supervised(game_spec, create_network, network_file_path, 10 | positions, 11 | test_set_ratio=0.4, 12 | regularization_coefficent=1e-5, 13 | batch_size=100, 14 | learn_rate=1e-4, 15 | stop_turns_without_improvement = 7): 16 | """Train a network using supervised learning using against a list of game positions and moves chosen. 17 | We stop after we have had stop_turns_without_improvement without an improvement in the test error. 18 | The test set is used as a validation set as well, will possibly improve this in the future to have a seperate test 19 | and validation set. 20 | 21 | Args: 22 | stop_turns_without_improvement (int): we stop training after this many iterations without any improvement in 23 | the test error. 24 | regularization_coefficent (float): amount to multiply the l2 regularizer by in the loss function 25 | test_set_ratio (float): portion of the data to divide into the test set, 26 | positions ([(board_state, move)]): list of tuples of board states and the moves chosen in those board_states 27 | game_spec (games.base_game_spec.BaseGameSpec): The game we are playing 28 | create_network (->(input_layer : tf.placeholder, output_layer : tf.placeholder, variables : [tf.Variable])): 29 | Method that creates the network we will train. 
30 | network_file_path (str): path to the file with weights we want to load for this network 31 | learn_rate (float): 32 | batch_size (int): 33 | 34 | Returns: 35 | episode_number, train_error, train_accuracy, new_test_error, test_accuracy 36 | """ 37 | input_layer, output_layer, variables = create_network() 38 | 39 | test_set_count = int(len(positions) * test_set_ratio) 40 | train_set = positions[:-test_set_count] 41 | test_set = positions[-test_set_count:] 42 | 43 | actual_move_placeholder = tf.placeholder("float", (None, game_spec.outputs())) 44 | 45 | error = tf.reduce_sum(tf.square(actual_move_placeholder - output_layer)) 46 | 47 | regularizer = None 48 | for var in variables: 49 | if regularizer is None: 50 | regularizer = tf.nn.l2_loss(var) 51 | else: 52 | regularizer += tf.nn.l2_loss(var) 53 | 54 | loss = error + regularizer * regularization_coefficent 55 | 56 | train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(loss) 57 | 58 | correct_pred = tf.equal(tf.argmax(output_layer, 1), tf.argmax(actual_move_placeholder, 1)) 59 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 60 | 61 | with tf.Session() as session: 62 | session.run(tf.global_variables_initializer()) 63 | 64 | if os.path.isfile(network_file_path): 65 | print("loading existing network") 66 | load_network(session, variables, network_file_path) 67 | 68 | episode_number = 1 69 | turns_without_test_improvement = 0 70 | 71 | best_test_error, test_accuracy = session.run([error, accuracy], 72 | feed_dict={ 73 | input_layer: [x[0] for x in test_set], 74 | actual_move_placeholder: [x[1] for x in test_set]}) 75 | 76 | while True: 77 | random.shuffle(train_set) 78 | train_error = 0 79 | 80 | for start_index in range(0, len(train_set) - batch_size + 1, batch_size): 81 | mini_batch = train_set[start_index:start_index + batch_size] 82 | 83 | batch_error, _ = session.run([error, train_step], 84 | feed_dict={input_layer: [x[0] for x in mini_batch], 85 | actual_move_placeholder: [x[1] for x in mini_batch]}) 86 | train_error += batch_error 87 | 88 | new_test_error, test_accuracy = session.run([error, accuracy], 89 | feed_dict={input_layer: [x[0] for x in test_set], 90 | actual_move_placeholder: [x[1] for x in test_set]}) 91 | 92 | print("episode: %s train_error: %s test_error: %s test_acc: %s" % 93 | (episode_number, train_error, new_test_error, test_accuracy)) 94 | 95 | if new_test_error < best_test_error: 96 | best_test_error = new_test_error 97 | turns_without_test_improvement = 0 98 | else: 99 | turns_without_test_improvement += 1 100 | if turns_without_test_improvement > stop_turns_without_improvement: 101 | train_accuracy = session.run([accuracy], feed_dict={input_layer: [x[0] for x in train_set], 102 | actual_move_placeholder: [x[1] for x in 103 | train_set]}) 104 | 105 | print("test error not improving for %s turns, ending training" % (stop_turns_without_improvement, )) 106 | break 107 | 108 | episode_number += 1 109 | 110 | print("final episode: %s train_error: %s train acc: %s test_error: %s test_acc: %s" % 111 | (episode_number, train_error, train_accuracy, new_test_error, test_accuracy)) 112 | 113 | save_network(session, variables, network_file_path) 114 | 115 | return episode_number, train_error, train_accuracy, new_test_error, test_accuracy -------------------------------------------------------------------------------- /techniques/train_value_network.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | import 
tensorflow as tf 6 | 7 | from common.network_helpers import create_network, load_network, get_deterministic_network_move, save_network 8 | 9 | 10 | # it would be good to have real board positions, but failing that just generate random ones 11 | def _generate_random_board_position(game_spec, random_move_range): 12 | while True: 13 | board_state = game_spec.new_board() 14 | number_moves = random.randint(*random_move_range) 15 | side = 1 16 | for _ in range(number_moves): 17 | board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))), 18 | side) 19 | if game_spec.has_winner(board_state) != 0: 20 | # start again if we hit an already winning position 21 | continue 22 | 23 | side = -side 24 | return board_state 25 | 26 | 27 | def train_value_network(game_spec, hidden_nodes_reinforcement, reinforcement_network_file_path, 28 | hidden_nodes_value, value_network_file_path, 29 | learn_rate=1e-4, 30 | batch_size=100, 31 | train_samples=10000, 32 | test_samples=8000): 33 | reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network( 34 | game_spec.board_squares(), 35 | hidden_nodes_reinforcement, 36 | game_spec.outputs()) 37 | 38 | value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), 39 | hidden_nodes_value, 40 | output_nodes=1, output_softmax=False) 41 | 42 | target_placeholder = tf.placeholder("float", (None, 1)) 43 | error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer)) 44 | 45 | train_step = tf.train.RMSPropOptimizer(learn_rate).minimize(error) 46 | 47 | with tf.Session() as session: 48 | session.run(tf.global_variables_initializer()) 49 | 50 | load_network(session, reinforcement_variables, reinforcement_network_file_path) 51 | 52 | if os.path.isfile(value_network_file_path): 53 | print("loading previous version of value network") 54 | load_network(session, value_variables, value_network_file_path) 55 | 56 | def make_move(board_state, side): 57 | move = get_deterministic_network_move(session, reinforcement_input_layer, reinforcement_output_layer, 58 | board_state, side) 59 | 60 | return game_spec.flat_move_to_tuple(np.argmax(move)) 61 | 62 | board_states_training = {} 63 | board_states_test = [] 64 | episode_number = 0 65 | 66 | while len(board_states_training) < train_samples + test_samples: 67 | board_state = _generate_random_board_position(game_spec, (1, game_spec.board_squares() * 0.8)) 68 | board_state_flat = tuple(np.ravel(board_state)) 69 | 70 | # only accept the board_state if not already in the dict 71 | if board_state_flat not in board_states_training: 72 | result = game_spec.play_game(make_move, make_move, board_state=board_state) 73 | board_states_training[board_state_flat] = float(result) 74 | 75 | # take a random selection from training into a test set 76 | for _ in range(test_samples): 77 | sample = random.choice(board_states_training.keys()) 78 | board_states_test.append((sample, board_states_training[sample])) 79 | del board_states_training[sample] 80 | 81 | board_states_training = list(board_states_training.iteritems()) 82 | 83 | test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test], 84 | target_placeholder: [[x[1]] for x in board_states_test]}) 85 | 86 | while True: 87 | np.random.shuffle(board_states_training) 88 | train_error = 0 89 | 90 | for start_index in range(0, len(board_states_training) - batch_size + 1, batch_size): 91 | mini_batch = board_states_training[start_index:start_index 
+ batch_size] 92 | 93 | batch_error, _ = session.run([error, train_step], 94 | feed_dict={value_input_layer: [x[0] for x in mini_batch], 95 | target_placeholder: [[x[1]] for x in mini_batch]}) 96 | train_error += batch_error 97 | 98 | new_test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test], 99 | target_placeholder: [[x[1]] for x in board_states_test]}) 100 | 101 | print("episode: %s train_error: %s test_error: %s" % (episode_number, train_error, test_error)) 102 | 103 | if new_test_error > test_error: 104 | print("train error went up, stopping training") 105 | break 106 | 107 | test_error = new_test_error 108 | episode_number += 1 109 | 110 | save_network(session, value_variables, value_network_file_path) 111 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/__init__.py -------------------------------------------------------------------------------- /tests/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/common/__init__.py -------------------------------------------------------------------------------- /tests/common/test_network_helpers.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | 7 | from common.network_helpers import create_network, save_network, load_network 8 | 9 | 10 | class TestNetworkHelpers(TestCase): 11 | def test_create_network(self): 12 | input_nodes = 20 13 | hidden_nodes = (50, 40, 30) 14 | input_layer, output_layer, variables = create_network(input_nodes, hidden_nodes) 15 | self.assertSequenceEqual(input_layer.get_shape().as_list(), [None, input_nodes]) 16 | self.assertSequenceEqual(output_layer.get_shape().as_list(), [None, input_nodes]) 17 | self.assertEqual(len(variables), (len(hidden_nodes) + 1) * 2) 18 | 19 | def test_create_network_with_2d_input(self): 20 | input_nodes = (5, 5) 21 | hidden_nodes = (50, 40, 30) 22 | input_layer, output_layer, variables = create_network(input_nodes, hidden_nodes) 23 | self.assertSequenceEqual(input_layer.get_shape().as_list(), [None, input_nodes[0], input_nodes[1]]) 24 | self.assertSequenceEqual(output_layer.get_shape().as_list(), [None, input_nodes[0] * input_nodes[1]]) 25 | self.assertEqual(len(variables), (len(hidden_nodes) + 1) * 2) 26 | 27 | def test_save_and_load_network(self): 28 | try: 29 | file_name = 'test.p' 30 | input_nodes = 20 31 | hidden_nodes = (50, 40, 30) 32 | _, _, variables1 = create_network(input_nodes, hidden_nodes) 33 | _, _, variables2 = create_network(input_nodes, hidden_nodes) 34 | 35 | with tf.Session() as session: 36 | session.run(tf.global_variables_initializer()) 37 | 38 | save_network(session, variables1, file_name) 39 | load_network(session, variables2, file_name) 40 | 41 | for var1, var2 in zip(variables1, variables2): 42 | np.testing.assert_array_almost_equal(session.run(var1), session.run(var2)) 43 | finally: 44 | try: 45 | os.remove(file_name) 46 | except OSError: 47 | pass 48 | 49 | def test_load_variables_into_network_of_wrong_size_gives_friendly_exception(self): 50 | try: 51 | file_name = 'test.p' 52 | input_nodes = 20 53 | 54 | 
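            # two networks whose hidden layers differ in size, so weights saved from the first cannot be loaded into the second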
_, _, variables1 = create_network(input_nodes, (30, )) 55 | _, _, variables2 = create_network(input_nodes, (40, )) 56 | 57 | with tf.Session() as session: 58 | session.run(tf.global_variables_initializer()) 59 | 60 | save_network(session, variables1, file_name) 61 | 62 | with self.assertRaises(ValueError): 63 | load_network(session, variables2, file_name) 64 | finally: 65 | try: 66 | os.remove(file_name) 67 | except OSError: 68 | pass -------------------------------------------------------------------------------- /tests/games/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/games/__init__.py -------------------------------------------------------------------------------- /tests/games/test_connect_4.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.connect_4 import has_winner, play_game, random_player 4 | 5 | 6 | class TestTicTacToeX(TestCase): 7 | def test_has_winner(self): 8 | board_state = ((0, 0, 0, 0, 0, 0), 9 | (0, 0, 0, 0, 0, 0), 10 | (0, 0, 0, 0, 0, 0), 11 | (1, 0, 0, 0, 0, 0), 12 | (0, 1, 0, 0, 0, 0), 13 | (0, 0, 1, 0, 0, 0), 14 | (0, 0, 0, 1, 0, 0)) 15 | 16 | self.assertEqual(1, has_winner(board_state), 4) 17 | 18 | board_state = ((0, 0, 0, 0, 1, 0), 19 | (0, 0, 0, 1, 0, 0), 20 | (0, 0, 1, 0, 0, 0), 21 | (0, 1, 0, 0, 0, 0), 22 | (0, 0, 0, 0, 0, 0), 23 | (0, 0, 0, 0, 0, 0), 24 | (0, 0, 0, 0, 0, 0)) 25 | 26 | self.assertEqual(1, has_winner(board_state), 4) 27 | 28 | def test_play_game(self): 29 | play_game(random_player, random_player) -------------------------------------------------------------------------------- /tests/games/test_tic_tac_toe_x.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe_x import has_winner, _has_winning_line, play_game, random_player, evaluate 4 | 5 | 6 | class TestTicTacToeX(TestCase): 7 | 8 | def test_has_winning_line(self): 9 | self.assertEqual(1, _has_winning_line((0, 1, 1, 1, 1), 4)) 10 | self.assertEqual(0, _has_winning_line((0, 1, -1, 1, 1), 4)) 11 | self.assertEqual(1, _has_winning_line((1, 1, 1, 1, 1, 0), 4)) 12 | self.assertEqual(1, _has_winning_line((1, 0, 1, 1, 1, 0), 3)) 13 | self.assertEqual(-1, _has_winning_line((-1, -1, -1, -1, 1), 4)) 14 | 15 | def test_has_winner(self): 16 | board_state = ((0, 0, 0, 0, 0), 17 | (0, -1, 0, 0, 0), 18 | (0, -1, 0, 0, 0), 19 | (0, -1, 0, 0, 0), 20 | (0, -1, 0, 0, 0)) 21 | self.assertEqual(-1, has_winner(board_state, 4)) 22 | 23 | board_state = ((0, 1, 0, 0, 0), 24 | (0, 0, 1, 0, 0), 25 | (0, 0, 0, 1, 0), 26 | (0, 0, 0, 0, 1), 27 | (0, 0, 0, 0, 0)) 28 | self.assertEqual(1, has_winner(board_state, 4)) 29 | 30 | board_state = ((0, 0, 0, 0, 0), 31 | (0, 0, 0, 0, 1), 32 | (0, 0, 0, 1, 0), 33 | (0, 0, 1, 0, 0), 34 | (0, 1, 0, 0, 0)) 35 | self.assertEqual(1, has_winner(board_state, 4)) 36 | 37 | board_state = ((0, 0, 0, -1, 0), 38 | (0, 0, -1, 0, 0), 39 | (0, -1, 0, 0, 0), 40 | (-1, 0, 0, 0, 0), 41 | (0, 0, 0, 0, 0)) 42 | self.assertEqual(-1, has_winner(board_state, 4)) 43 | 44 | def test_play_game(self): 45 | play_game(random_player, random_player) 46 | 47 | def test_has_evaluate(self): 48 | board_state = ((-1, -1, -1, 0, 0), 49 | (0, 0, 0, 0, 0), 50 | (0, -1, 0, 0, 0), 51 | (0, -1, 0, 0, 0), 52 | (0, -1, 0, 0, 0)) 53 | self.assertGreater(0, evaluate(board_state, 4)) 54 | 55 | 
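        # plus pieces building along a diagonal: the evaluation should come out in favour of the plus player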
board_state = ((0, 1, 0, 0, 0), 56 | (0, 0, 1, 0, 0), 57 | (0, 0, 0, 0, 0), 58 | (0, 0, 0, 0, 1), 59 | (0, 0, 0, 0, 0)) 60 | self.assertGreater(evaluate(board_state, 4), 0) 61 | 62 | board_state = ((0, 0, 0, 0, 0), 63 | (0, 0, 0, 0, 1), 64 | (0, 0, 0, 0, 0), 65 | (0, 0, 1, 0, 0), 66 | (0, 1, 1, 0, 0)) 67 | self.assertGreater(evaluate(board_state, 4), 0) 68 | 69 | board_state = ((0, 0, 0, -1, 0), 70 | (0, 0, 0, 0, 0), 71 | (0, 0, 0, 0, 0), 72 | (-1, 0, 0, 0, 0), 73 | (0, 0, 0, 0, 0)) 74 | self.assertGreater(0, evaluate(board_state, 4)) -------------------------------------------------------------------------------- /tests/techniques/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tests/techniques/__init__.py -------------------------------------------------------------------------------- /tests/techniques/test_create_positions_set.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe import TicTacToeGameSpec 4 | from techniques.create_positions_set import create_positions_set 5 | 6 | 7 | class TestCreatePositionsSet(TestCase): 8 | def setUp(self): 9 | self._game_spec = TicTacToeGameSpec() 10 | 11 | def test_create_positions(self): 12 | number_of_positions = 100 13 | positions = create_positions_set(self._game_spec, number_of_positions, self._game_spec.get_random_player_func()) 14 | 15 | self.assertGreater(len(positions), number_of_positions-1) -------------------------------------------------------------------------------- /tests/techniques/test_min_max.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe import TicTacToeGameSpec 4 | from techniques.min_max import min_max_alpha_beta, min_max 5 | 6 | 7 | class TestMinMax(TestCase): 8 | def setUp(self): 9 | self._game_spec = TicTacToeGameSpec() 10 | 11 | def test_basic_position(self): 12 | # the best move is 2, 2 forcing a win with pluses next move, both players should select it 13 | board_state = ((0, 0, 0), 14 | (-1, -1, 1), 15 | (1, 0, 0)) 16 | 17 | result_min_max = min_max(self._game_spec, board_state, 1, 8) 18 | result_min_max_alpha_beta = min_max_alpha_beta(self._game_spec, board_state, 1, 8) 19 | 20 | self.assertEqual(result_min_max[1], (2, 2)) 21 | self.assertEqual(result_min_max_alpha_beta[1], (2, 2)) 22 | 23 | def test_basic_position_for_minus_player(self): 24 | board_state = ((-1, 1, 0), 25 | (1, -1, 1), 26 | (1, 0, 0)) 27 | 28 | result_min_max = min_max(self._game_spec, board_state, -1, 8) 29 | result_min_max_alpha_beta = min_max_alpha_beta(self._game_spec, board_state, -1, 8) 30 | 31 | self.assertEqual(result_min_max[1], (2, 2)) 32 | self.assertEqual(result_min_max_alpha_beta[1], (2, 2)) 33 | -------------------------------------------------------------------------------- /tests/techniques/test_train_policy_gradient.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from unittest import TestCase 3 | 4 | from common.base_game_spec import BaseGameSpec 5 | from common.network_helpers import create_network 6 | from games.tic_tac_toe import TicTacToeGameSpec 7 | from games.tic_tac_toe_x import TicTacToeXGameSpec 8 | from techniques.train_policy_gradient import train_policy_gradients 9 | 10 | 11 | class _VerySimpleGameSpec(BaseGameSpec): 12 | def 
new_board(self): 13 | return [0, 0] 14 | 15 | def apply_move(self, board_state, move, side): 16 | board_state[move] = side 17 | return board_state 18 | 19 | def has_winner(self, board_state): 20 | return board_state[0] 21 | 22 | def __init__(self): 23 | pass 24 | 25 | def available_moves(self, board_state): 26 | return [i for i, x in enumerate(board_state) if x == 0] 27 | 28 | def board_dimensions(self): 29 | return 2, 30 | 31 | 32 | class TestTrainPolicyGradient(TestCase): 33 | def test_learn_simple_game(self): 34 | game_spec = _VerySimpleGameSpec() 35 | create_model_func = functools.partial(create_network, 2, (4,)) 36 | variables, win_rate = train_policy_gradients(game_spec, create_model_func, None, 37 | learn_rate=0.1, 38 | number_of_games=1000, print_results_every=100, 39 | batch_size=20, 40 | randomize_first_player=False) 41 | self.assertGreater(win_rate, 0.9) 42 | 43 | def test_tic_tac_toe(self): 44 | game_spec = TicTacToeGameSpec() 45 | create_model_func = functools.partial(create_network, game_spec.board_squares(), (100, 100, 100,)) 46 | variables, win_rate = train_policy_gradients(game_spec, create_model_func, None, 47 | learn_rate=1e-4, 48 | number_of_games=60000, 49 | print_results_every=1000, 50 | batch_size=100, 51 | randomize_first_player=False) 52 | self.assertGreater(win_rate, 0.4) 53 | -------------------------------------------------------------------------------- /tests/techniques/test_train_policy_gradient_historic.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from games.tic_tac_toe import TicTacToeGameSpec 4 | from techniques.create_positions_set import create_positions_set 5 | 6 | 7 | class TestCreatePositionsSet(TestCase): 8 | def setUp(self): 9 | self._game_spec = TicTacToeGameSpec() 10 | 11 | def test_create_positions(self): 12 | number_of_positions = 100 13 | positions = create_positions_set(self._game_spec, number_of_positions, self._game_spec.get_random_player_func()) 14 | 15 | self.assertGreater(len(positions), number_of_positions-1) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tic_tac_toe_5_4/__init__.py -------------------------------------------------------------------------------- /tic_tac_toe_5_4/network.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from common.benchmark import benchmark 4 | from games.tic_tac_toe_x import TicTacToeXGameSpec 5 | 6 | tic_tac_toe_5_4_game_spec = TicTacToeXGameSpec(5, 4) 7 | 8 | 9 | def create_convolutional_network(): 10 | input_layer = tf.input_layer = tf.placeholder("float", 11 | (None,) + tic_tac_toe_5_4_game_spec.board_dimensions() + (1,)) 12 | CONVOLUTIONS_LAYER_1 = 64 13 | CONVOLUTIONS_LAYER_2 = 64 14 | CONVOLUTIONS_LAYER_3 = 64 15 | CONVOLUTIONS_LAYER_4 = 64 16 | CONVOLUTIONS_LAYER_5 = 64 17 | FLAT_SIZE = 5 * 5 * CONVOLUTIONS_LAYER_2 18 | FLAT_HIDDEN_NODES = 256 19 | 20 | convolution_weights_1 = tf.Variable(tf.truncated_normal([3, 3, 1, CONVOLUTIONS_LAYER_1], stddev=0.01)) 21 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_1])) 22 | 23 | convolution_weights_2 = tf.Variable( 24 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_1, CONVOLUTIONS_LAYER_2], stddev=0.01)) 25 | convolution_bias_2 = 
tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 26 | 27 | convolution_weights_3 = tf.Variable( 28 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_2, CONVOLUTIONS_LAYER_3], stddev=0.01)) 29 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_3])) 30 | 31 | convolution_weights_4 = tf.Variable( 32 | tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_3, CONVOLUTIONS_LAYER_4], stddev=0.01)) 33 | convolution_bias_4 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_4])) 34 | 35 | # convolution_weights_5 = tf.Variable( 36 | # tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_4, CONVOLUTIONS_LAYER_5], stddev=0.01)) 37 | # convolution_bias_5 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_5])) 38 | 39 | # feed_forward_weights_1 = tf.Variable(tf.truncated_normal([FLAT_SIZE, FLAT_HIDDEN_NODES], stddev=0.01)) 40 | # feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[FLAT_HIDDEN_NODES])) 41 | 42 | feed_forward_weights_2 = tf.Variable( 43 | tf.truncated_normal([FLAT_SIZE, tic_tac_toe_5_4_game_spec.outputs()], stddev=0.01)) 44 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[tic_tac_toe_5_4_game_spec.outputs()])) 45 | 46 | hidden_convolutional_layer_1 = tf.nn.relu( 47 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 1, 1, 1], padding="SAME") + convolution_bias_1) 48 | 49 | hidden_convolutional_layer_2 = tf.nn.relu( 50 | tf.nn.conv2d(hidden_convolutional_layer_1, convolution_weights_2, strides=[1, 1, 1, 1], 51 | padding="SAME") + convolution_bias_2) 52 | 53 | hidden_convolutional_layer_3 = tf.nn.relu( 54 | tf.nn.conv2d(hidden_convolutional_layer_2, convolution_weights_3, strides=[1, 1, 1, 1], 55 | padding="SAME") + convolution_bias_3) 56 | 57 | hidden_convolutional_layer_4 = tf.nn.relu( 58 | tf.nn.conv2d(hidden_convolutional_layer_3, convolution_weights_4, strides=[1, 1, 1, 1], 59 | padding="SAME") + convolution_bias_4) 60 | 61 | # hidden_convolutional_layer_5 = tf.nn.relu( 62 | # tf.nn.conv2d(hidden_convolutional_layer_4, convolution_weights_5, strides=[1, 1, 1, 1], 63 | # padding="SAME") + convolution_bias_5) 64 | 65 | hidden_convolutional_layer_3_flat = tf.reshape(hidden_convolutional_layer_4, [-1, FLAT_SIZE]) 66 | 67 | # final_hidden_activations = tf.nn.relu( 68 | # tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 69 | 70 | output_layer = tf.nn.softmax(tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_2) + feed_forward_bias_2) 71 | 72 | return input_layer, output_layer, [convolution_weights_1, convolution_bias_1, 73 | convolution_weights_2, convolution_bias_2, 74 | convolution_weights_3, convolution_bias_3, 75 | convolution_weights_4, convolution_bias_4, 76 | # convolution_weights_5, convolution_bias_5, 77 | # feed_forward_weights_1, feed_forward_bias_1, 78 | feed_forward_weights_2, feed_forward_bias_2] 79 | 80 | file_path = 'convolutional_net_5_4_l_c_4_f_1_other_fresh.p' 81 | 82 | benchmark(tic_tac_toe_5_4_game_spec, file_path, create_convolutional_network) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/position_tic_tac_toe_5_4_min_max_depth_6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/AlphaToe/1220f4f883dbbd7ac1d84092bdaf04ca18a4dbc2/tic_tac_toe_5_4/position_tic_tac_toe_5_4_min_max_depth_6 -------------------------------------------------------------------------------- /tic_tac_toe_5_4/supervised.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | from techniques.train_supervised import train_supervised 6 | from tic_tac_toe_5_4.network import tic_tac_toe_5_4_game_spec, create_convolutional_network 7 | 8 | with open("position_tic_tac_toe_5_4_min_max_depth_6", 'rb') as f: 9 | positions = pickle.load(f) 10 | 11 | # for now we need to reshape input for convolutions and one-hot encode the move responses 12 | # this is the kind of stuff I need to clean up in the overall design 13 | for i in range(len(positions)): 14 | one_hot = np.zeros(tic_tac_toe_5_4_game_spec.outputs()) 15 | np.put(one_hot, tic_tac_toe_5_4_game_spec.tuple_move_to_flat(positions[i][1]), 1) 16 | positions[i] = np.array(positions[i][0]).reshape(tic_tac_toe_5_4_game_spec.board_dimensions()[0], 17 | tic_tac_toe_5_4_game_spec.board_dimensions()[0], 18 | 1), one_hot 19 | 20 | train_supervised(tic_tac_toe_5_4_game_spec, create_convolutional_network, 'convolutional_net_5_4_l_c_4_f_1_other.p', 21 | positions, regularization_coefficent=1e-4) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/train_historical.py: -------------------------------------------------------------------------------- 1 | from techniques.train_policy_gradient_historic import train_policy_gradients_vs_historic 2 | from tic_tac_toe_5_4.network import tic_tac_toe_5_4_game_spec, create_convolutional_network 3 | 4 | train_policy_gradients_vs_historic(tic_tac_toe_5_4_game_spec, create_convolutional_network, 5 | 'convolutional_net_5_4_l_c_4_f_1_other_after_1.p', 6 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_2.p', 7 | number_of_games=50000, 8 | print_results_every=500, 9 | save_historic_every=8000) -------------------------------------------------------------------------------- /tic_tac_toe_5_4/train_vs_min_max.py: -------------------------------------------------------------------------------- 1 | from techniques.min_max import min_max_alpha_beta 2 | from techniques.train_policy_gradient import train_policy_gradients 3 | from tic_tac_toe_5_4.network import tic_tac_toe_5_4_game_spec, create_convolutional_network 4 | 5 | 6 | def min_max_move_func(board_state, side): 7 | return min_max_alpha_beta(tic_tac_toe_5_4_game_spec, board_state, side, 3)[1] 8 | 9 | 10 | train_policy_gradients(tic_tac_toe_5_4_game_spec, create_convolutional_network, 11 | 'convolutional_net_5_4_l_c_4_f_1_other_after.p', 12 | opponent_func=min_max_move_func, 13 | save_network_file_path='convolutional_net_5_4_l_c_4_f_1_other_after_vs_depth_3.p', 14 | number_of_games=5000, 15 | print_results_every=100) -------------------------------------------------------------------------------- /value_network.py: -------------------------------------------------------------------------------- 1 | """ 2 | After using reinforcement learning to train a network (e.g. policy_gradient.py) to play a game well, we then want to 3 | learn to estimate whether that network would win, lose or draw from a given position. 4 | 5 | Alpha Go used a database of real positions to get its predictions from; we don't have that for tic-tac-toe, so instead 6 | we generate some random game positions and train off of the results we get playing from those.
7 | """ 8 | import os 9 | import random 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | from common.network_helpers import create_network, load_network, save_network, \ 15 | get_deterministic_network_move 16 | from games.tic_tac_toe import TicTacToeGameSpec 17 | 18 | HIDDEN_NODES_VALUE = (100, 100, 100) 19 | HIDDEN_NODES_REINFORCEMENT = (100, 100, 100) 20 | BATCH_SIZE = 100 # every how many games to do a parameter update? 21 | LEARN_RATE = 1e-4 22 | REINFORCEMENT_NETWORK_PATH = 'current_network.p' 23 | VALUE_NETWORK_PATH = 'value_netowrk.p' 24 | TRAIN_SAMPLES = 10000 25 | TEST_SAMPLES = 10000 26 | 27 | # to play a different game change this to another spec, e.g TicTacToeXGameSpec or ConnectXGameSpec 28 | game_spec = TicTacToeGameSpec() 29 | 30 | NUMBER_RANDOM_RANGE = (1, game_spec.board_squares() * 0.8) 31 | 32 | 33 | # it would be good to have real board positions, but failing that just generate random ones 34 | def generate_random_board_position(): 35 | while True: 36 | board_state = game_spec.new_board() 37 | number_moves = random.randint(*NUMBER_RANDOM_RANGE) 38 | side = 1 39 | for _ in range(number_moves): 40 | board_state = game_spec.apply_move(board_state, random.choice(list(game_spec.available_moves(board_state))), 41 | side) 42 | if game_spec.has_winner(board_state) != 0: 43 | # start again if we hit an already winning position 44 | continue 45 | 46 | side = -side 47 | return board_state 48 | 49 | 50 | reinforcement_input_layer, reinforcement_output_layer, reinforcement_variables = create_network( 51 | game_spec.board_squares(), 52 | HIDDEN_NODES_REINFORCEMENT, 53 | game_spec.outputs()) 54 | 55 | value_input_layer, value_output_layer, value_variables = create_network(game_spec.board_squares(), HIDDEN_NODES_VALUE, 56 | output_nodes=1, output_softmax=False) 57 | 58 | target_placeholder = tf.placeholder("float", (None, 1)) 59 | error = tf.reduce_sum(tf.square(target_placeholder - value_output_layer)) 60 | 61 | train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(error) 62 | 63 | with tf.Session() as session: 64 | session.run(tf.global_variables_initializer()) 65 | 66 | load_network(session, reinforcement_variables, REINFORCEMENT_NETWORK_PATH) 67 | 68 | if os.path.isfile(VALUE_NETWORK_PATH): 69 | print("loading previous version of value network") 70 | load_network(session, value_variables, VALUE_NETWORK_PATH) 71 | 72 | 73 | def make_move(board_state, side): 74 | move = get_deterministic_network_move(session, reinforcement_input_layer, reinforcement_output_layer, 75 | board_state, side) 76 | 77 | return game_spec.flat_move_to_tuple(np.argmax(move)) 78 | 79 | 80 | board_states_training = {} 81 | board_states_test = [] 82 | episode_number = 0 83 | 84 | while len(board_states_training) < TRAIN_SAMPLES + TEST_SAMPLES: 85 | board_state = generate_random_board_position() 86 | board_state_flat = tuple(np.ravel(board_state)) 87 | 88 | # only accept the board_state if not already in the dict 89 | if board_state_flat not in board_states_training: 90 | result = game_spec.play_game(make_move, make_move, board_state=board_state) 91 | board_states_training[board_state_flat] = float(result) 92 | 93 | # take a random selection from training into a test set 94 | for _ in range(TEST_SAMPLES): 95 | sample = random.choice(board_states_training.keys()) 96 | board_states_test.append((sample, board_states_training[sample])) 97 | del board_states_training[sample] 98 | 99 | board_states_training = list(board_states_training.iteritems()) 100 | 101 | test_error = session.run(error, 
feed_dict={value_input_layer: [x[0] for x in board_states_test], 102 | target_placeholder: [[x[1]] for x in board_states_test]}) 103 | 104 | while True: 105 | np.random.shuffle(board_states_training) 106 | train_error = 0 107 | 108 | for start_index in range(0, len(board_states_training) - BATCH_SIZE + 1, BATCH_SIZE): 109 | mini_batch = board_states_training[start_index:start_index + BATCH_SIZE] 110 | 111 | batch_error, _ = session.run([error, train_step], 112 | feed_dict={value_input_layer: [x[0] for x in mini_batch], 113 | target_placeholder: [[x[1]] for x in mini_batch]}) 114 | train_error += batch_error 115 | 116 | new_test_error = session.run(error, feed_dict={value_input_layer: [x[0] for x in board_states_test], 117 | target_placeholder: [[x[1]] for x in board_states_test]}) 118 | 119 | print("episode: %s train_error: %s test_error: %s" % (episode_number, train_error, new_test_error)) 120 | 121 | if new_test_error > test_error: 122 | print("test error went up, stopping training") 123 | break 124 | 125 | test_error = new_test_error 126 | episode_number += 1 127 | 128 | save_network(session, value_variables, VALUE_NETWORK_PATH) 129 | --------------------------------------------------------------------------------
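Once value_network.py has run and saved its weights, the trained value estimate can be queried from other code (for example by a search such as monte_carlo_uct_with_value.py). Below is a minimal usage sketch, assuming the network was saved to 'value_network.p' with the same (100, 100, 100) hidden-layer sizes as above; create_network, load_network and TicTacToeGameSpec come from the repo, while the estimate_position helper and the example query are only illustrative.

import numpy as np
import tensorflow as tf

from common.network_helpers import create_network, load_network
from games.tic_tac_toe import TicTacToeGameSpec

game_spec = TicTacToeGameSpec()

# rebuild the same single-output architecture the value network was trained with
value_input_layer, value_output_layer, value_variables = create_network(
    game_spec.board_squares(), (100, 100, 100), output_nodes=1, output_softmax=False)


def estimate_position(session, board_state):
    # the network was trained on flattened boards, so flatten before feeding it in
    flat_board = np.ravel(board_state)
    return session.run(value_output_layer, feed_dict={value_input_layer: [flat_board]})[0][0]


with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    load_network(session, value_variables, 'value_network.p')  # assumes the training script above already ran

    # print a scalar estimate of the game result starting from the empty board
    print("estimated result from the empty board: %s" % estimate_position(session, game_spec.new_board()))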