├── Chapter 01 ├── Chapter1_ex1_v2.py ├── Chapter1_ex2_v2.py └── Chapter1_ex3_v2.py ├── Chapter 06 └── language model │ ├── data_processing.py │ ├── data_reader.py │ └── model.py ├── LICENSE ├── Chapter 05 ├── astro_chapter5.py ├── mnist_chapter5_example.py ├── mnist_chapter5_example_convolution.py └── cifar_chapter5_example_convolution.py ├── Chapter 03 └── mnist_chapter3_example.py ├── Chapter 08 ├── q_learning_1d.py ├── q_learning_1d_terminal.py ├── deep_q_cart_pole.py ├── actor_critic_baseline_cart_pole.py ├── actor_critic_advantage_cart_pole.py ├── deep_q_breakout.py └── deep_q_pong.py ├── Chapter 04 └── restricted_boltzmann_machine.py ├── README.md ├── Chapter 07 ├── policy_gradient.py ├── min_max.py ├── monte_carlo.py ├── tic_tac_toe.py ├── connect_4.py └── tic_tac_toe_x.py └── Chapter 02 └── Ch2Example.py /Chapter 01/Chapter1_ex1_v2.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network.multilayer_perceptron import MLPClassifier 2 | from sklearn import datasets 3 | from sklearn.metrics import accuracy_score 4 | 5 | iris = datasets.load_iris() 6 | data = iris.data 7 | labels = iris.target 8 | 9 | # We add max_iter=1000 becaue the default is max_iter=200 and 10 | # it is not enough for full convergence 11 | mlp = MLPClassifier(random_state=1, max_iter=1000) 12 | mlp.fit(data, labels) 13 | 14 | pred = mlp.predict(data) 15 | 16 | print() 17 | print('Accuracy: %.2f' % accuracy_score(labels, pred)) 18 | -------------------------------------------------------------------------------- /Chapter 06/language model/data_processing.py: -------------------------------------------------------------------------------- 1 | """Process text file for language model training.""" 2 | from __future__ import print_function, division 3 | 4 | import re 5 | import codecs 6 | 7 | 8 | filepath = 'war_and_peace.txt' # in 9 | out_file = 'wap.txt' # out 10 | 11 | # Regexes used to clean up the text 12 | NEW_LINE_IN_PARAGRAPH_REGEX = re.compile(r'(\S)\n(\S)') 13 | MULTIPLE_NEWLINES_REGEX = re.compile(r'(\n)(\n)+') 14 | 15 | # Read text as string 16 | with codecs.open(filepath, encoding='utf-8', mode='r') as f_input: 17 | book_str = f_input.read() 18 | 19 | # Cleanup 20 | book_str = NEW_LINE_IN_PARAGRAPH_REGEX.sub('\g<1> \g<2>', book_str) 21 | book_str = MULTIPLE_NEWLINES_REGEX.sub('\n\n', book_str) 22 | 23 | # Write proccessed text to file 24 | with codecs.open(out_file, encoding='utf-8', mode='w')as f_output: 25 | f_output.write(book_str) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter 05/astro_chapter5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy 4 | import theano 5 | import matplotlib.pyplot as plt 6 | import theano.tensor as T 7 | from theano.tensor.nnet import conv 8 | import skimage.data 9 | 10 | import matplotlib.cm as cm 11 | 12 | depth = 4 13 | filter_shape = (3, 3) 14 | 15 | input = T.tensor4(name='input') 16 | 17 | w_shape = (depth, 3, filter_shape[0], filter_shape[1]) 18 | dist = numpy.random.uniform(-0.2, 0.2, size=w_shape) 19 | W = theano.shared(numpy.asarray(dist, dtype=input.dtype), name = 'W') 20 | conv_output = conv.conv2d(input, W) 21 | output = T.nnet.sigmoid(conv_output) 22 | f = theano.function([input], output) 23 | 24 | astronaut = skimage.data.astronaut() 25 | img = numpy.asarray(astronaut, dtype='float32') / 255 26 | filtered_img = f(img.transpose(2, 0, 1).reshape(1, 3, 512, 512)) 27 | 28 | 29 | plt.axis('off') 30 | plt.imshow(img) 31 | plt.show() 32 | for img in range(depth): 33 | fig = plt.figure() 34 | plt.axis( 'off') 35 | plt.imshow(filtered_img[0, img, :, :, ], cmap = cm.gray) 36 | plt.show() 37 | 38 | filename = "astro" + str(img) 39 | fig.savefig(filename, bbox_inches='tight') 40 | 41 | 42 | -------------------------------------------------------------------------------- /Chapter 05/mnist_chapter5_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from keras.datasets import mnist 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Activation 6 | from keras.utils import np_utils 7 | 8 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 9 | 10 | X_train = X_train.reshape(60000, 784) 11 | X_test = X_test.reshape(10000, 784) 12 | X_train = X_train.astype('float32') 13 | X_test = X_test.astype('float32') 14 | X_train /= 255 15 | X_test /= 255 16 | 17 | 18 | classes = 10 19 | Y_train = np_utils.to_categorical(Y_train, classes) 20 | Y_test = np_utils.to_categorical(Y_test, classes) 21 | 22 | input_size = 784 23 | batch_size = 100 24 | hidden_neurons = 400 25 | epochs = 30 26 | 27 | model = Sequential() 28 | model.add(Dense(hidden_neurons, input_dim=input_size)) 29 | model.add(Activation('relu')) 30 | model.add(Dense(classes, input_dim=hidden_neurons)) 31 | model.add(Activation('softmax')) 32 | 33 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adadelta') 34 | 35 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1) 36 | 37 | score = model.evaluate(X_test, Y_test, verbose=1) 38 | print('Test accuracy:', score[1]) 39 | 40 | -------------------------------------------------------------------------------- /Chapter 01/Chapter1_ex2_v2.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network.multilayer_perceptron import MLPClassifier 2 | from sklearn 
import datasets 3 | from sklearn.metrics import accuracy_score 4 | 5 | # Since the book came out, the cross_validation method has been moved to 6 | # the model_selection library from the cross_validation library 7 | #from sklearn.cross_validation import train_test_split 8 | from sklearn.model_selection import train_test_split 9 | 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | iris = datasets.load_iris() 13 | data = iris.data 14 | labels = iris.target 15 | 16 | data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.5, random_state=1) 17 | 18 | scaler = StandardScaler() 19 | scaler.fit(data) 20 | data_train_std = scaler.transform(data_train) 21 | data_test_std = scaler.transform(data_test) 22 | 23 | data_train = data_train_std 24 | data_test = data_test_std 25 | 26 | # We add max_iter=1000 becaue the default is max_iter=200 and 27 | # it is not enough for full convergence 28 | mlp = MLPClassifier(random_state=1, max_iter=1000) 29 | mlp.fit(data, labels) 30 | mlp.fit(data_train, labels_train) 31 | pred = mlp.predict(data_test) 32 | 33 | print() 34 | print('Misclassified samples: %d' % (labels_test != pred).sum()) 35 | print('Accuracy: %.2f' % accuracy_score(labels_test, pred)) 36 | -------------------------------------------------------------------------------- /Chapter 03/mnist_chapter3_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from keras.datasets import mnist 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Activation 6 | from keras.utils import np_utils 7 | 8 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 9 | 10 | X_train = X_train.reshape(60000, 784) 11 | X_test = X_test.reshape(10000, 784) 12 | 13 | classes = 10 14 | Y_train = np_utils.to_categorical(Y_train, classes) 15 | Y_test = np_utils.to_categorical(Y_test, classes) 16 | 17 | input_size = 784 18 | batch_size = 100 19 | hidden_neurons = 100 20 | epochs = 30 21 | 22 | model = Sequential() 23 | model.add(Dense(hidden_neurons, input_dim=input_size)) 24 | model.add(Activation('sigmoid')) 25 | model.add(Dense(classes, input_dim=hidden_neurons)) 26 | model.add(Activation('softmax')) 27 | 28 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='sgd') 29 | 30 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1) 31 | 32 | score = model.evaluate(X_test, Y_test, verbose=1) 33 | print('Test accuracy:', score[1]) 34 | 35 | 36 | weights = model.layers[0].get_weights() 37 | 38 | import matplotlib.pyplot as plt 39 | import matplotlib.cm as cm 40 | import numpy 41 | 42 | fig = plt.figure() 43 | 44 | w = weights[0].T 45 | for neuron in range(hidden_neurons): 46 | ax = fig.add_subplot(10, 10, neuron+1) 47 | ax.axis("off") 48 | ax.imshow(numpy.reshape(w[neuron], (28, 28)), cmap = cm.Greys_r) 49 | 50 | plt.savefig("neuron_images.png", dpi=300) 51 | plt.show() 52 | -------------------------------------------------------------------------------- /Chapter 05/mnist_chapter5_example_convolution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | np.random.seed(0) #for reproducibility 5 | 6 | from keras.datasets import mnist 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation 9 | from keras.layers import Convolution2D, MaxPooling2D 10 | from keras.layers import Dropout, Flatten 11 | 12 | from 
keras.utils import np_utils 13 | 14 | input_size = 784 15 | batch_size = 100 16 | hidden_neurons = 200 17 | classes = 10 18 | epochs = 8 19 | 20 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 21 | 22 | X_train = X_train.reshape(60000, 28, 28, 1) 23 | X_test = X_test.reshape(10000, 28, 28, 1) 24 | 25 | X_train = X_train.astype('float32') 26 | X_test = X_test.astype('float32') 27 | X_train /= 255 28 | X_test /= 255 29 | 30 | Y_train = np_utils.to_categorical(Y_train, classes) 31 | Y_test = np_utils.to_categorical(Y_test, classes) 32 | 33 | model = Sequential() 34 | model.add(Convolution2D(32, (3, 3), input_shape=(28, 28, 1))) 35 | model.add(Activation('relu')) 36 | model.add(Convolution2D(32, (3, 3))) 37 | model.add(Activation('relu')) 38 | model.add(MaxPooling2D(pool_size=(2, 2))) 39 | model.add(Dropout(0.25)) 40 | 41 | model.add(Flatten()) 42 | 43 | model.add(Dense(hidden_neurons)) 44 | model.add(Activation('relu')) 45 | model.add(Dense(classes)) 46 | model.add(Activation('softmax')) 47 | 48 | 49 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adadelta') 50 | 51 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, validation_split = 0.1, verbose=1) 52 | 53 | score = model.evaluate(X_test, Y_test, verbose=1) 54 | print('Test accuracy:', score[1]) 55 | 56 | -------------------------------------------------------------------------------- /Chapter 08/q_learning_1d.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | states = [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] 5 | NUM_STATES = len(states) 6 | NUM_ACTIONS = 2 7 | DISCOUNT_FACTOR = 0.5 8 | 9 | 10 | def one_hot_state(index): 11 | array = np.zeros(NUM_STATES) 12 | array[index] = 1. 
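    # the one-hot vector is the network input: all zeros except a 1 at the agent's current position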
13 | return array 14 | 15 | 16 | session = tf.Session() 17 | state = tf.placeholder("float", [None, NUM_STATES]) 18 | targets = tf.placeholder("float", [None, NUM_ACTIONS]) 19 | 20 | weights = tf.Variable(tf.constant(0., shape=[NUM_STATES, NUM_ACTIONS])) 21 | 22 | output = tf.matmul(state, weights) 23 | 24 | loss = tf.reduce_mean(tf.square(output - targets)) 25 | train_operation = tf.train.GradientDescentOptimizer(1.).minimize(loss) 26 | 27 | session.run(tf.initialize_all_variables()) 28 | 29 | for _ in range(50): 30 | state_batch = [] 31 | rewards_batch = [] 32 | 33 | for state_index in range(NUM_STATES): 34 | state_batch.append(one_hot_state(state_index)) 35 | 36 | minus_action_index = (state_index - 1) % NUM_STATES 37 | plus_action_index = (state_index + 1) % NUM_STATES 38 | 39 | minus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(minus_action_index)]}) 40 | plus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(plus_action_index)]}) 41 | 42 | minus_action_q_value = DISCOUNT_FACTOR * (states[minus_action_index] + np.max(minus_action_state_reward)) 43 | plus_action_q_value = DISCOUNT_FACTOR * (states[plus_action_index] + np.max(plus_action_state_reward)) 44 | 45 | action_rewards = [minus_action_q_value, plus_action_q_value] 46 | rewards_batch.append(action_rewards) 47 | 48 | session.run(train_operation, feed_dict={ 49 | state: state_batch, 50 | targets: rewards_batch}) 51 | 52 | print([states[x] + np.max(session.run(output, feed_dict={state: [one_hot_state(x)]})) 53 | for x in range(NUM_STATES)]) 54 | -------------------------------------------------------------------------------- /Chapter 05/cifar_chapter5_example_convolution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy 4 | 5 | from keras.datasets import cifar10 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Activation 8 | from keras.layers import Convolution2D, MaxPooling2D, Flatten 9 | from keras.layers import Dropout 10 | from keras.utils import np_utils 11 | 12 | batch_size = 100 13 | hidden_neurons = 200 14 | classes = 10 15 | epochs = 20 16 | 17 | (X_train, Y_train), (X_test, Y_test) = cifar10.load_data() 18 | 19 | 20 | Y_train = np_utils.to_categorical(Y_train, classes) 21 | Y_test = np_utils.to_categorical(Y_test, classes) 22 | 23 | model = Sequential() 24 | model.add(Convolution2D(32, (3, 3), input_shape=(32, 32, 3))) 25 | model.add(Activation('relu')) 26 | model.add(Convolution2D(32, (3, 3))) 27 | model.add(Activation('relu')) 28 | model.add(MaxPooling2D(pool_size=(2, 2))) 29 | model.add(Dropout(0.25)) 30 | 31 | model.add(Convolution2D(64, (3, 3))) 32 | model.add(Activation('relu')) 33 | model.add(Convolution2D(64, (3, 3))) 34 | model.add(Activation('relu')) 35 | model.add(MaxPooling2D(pool_size=(2, 2))) 36 | model.add(Dropout(0.25)) 37 | 38 | model.add(Flatten()) 39 | 40 | model.add(Dense(hidden_neurons)) 41 | model.add(Activation('relu')) 42 | model.add(Dropout(0.5)) 43 | model.add(Dense(classes)) 44 | model.add(Activation('softmax')) 45 | 46 | 47 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adadelta') 48 | 49 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, validation_split = 0.1, verbose=1) 50 | 51 | score = model.evaluate(X_test, Y_test, verbose=1) 52 | print('Test accuracy:', score[1]) 53 | 54 | numpy.set_printoptions(threshold='nan') 55 | index = 0 56 | for layer in model.layers: 57 | 
filename = "conv_layer_" + str(index) 58 | f1 = open(filename, 'w+') 59 | f1.write(repr(layer.get_weights())) 60 | f1.close() 61 | print (filename + " has been opened and closed") 62 | index = index+1 63 | -------------------------------------------------------------------------------- /Chapter 04/restricted_boltzmann_machine.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.examples.tutorials.mnist import input_data 3 | 4 | VISIBLE_NODES = 784 5 | HIDDEN_NODES = 400 6 | LEARNING_RATE = 0.01 7 | 8 | mnist = input_data.read_data_sets("MNIST_data/") 9 | 10 | input_placeholder = tf.placeholder("float", shape=(None, VISIBLE_NODES)) 11 | 12 | weights = tf.Variable(tf.random_normal((VISIBLE_NODES, HIDDEN_NODES), mean=0.0, stddev=1. / VISIBLE_NODES)) 13 | hidden_bias = tf.Variable(tf.zeros([HIDDEN_NODES])) 14 | visible_bias = tf.Variable(tf.zeros([VISIBLE_NODES])) 15 | 16 | hidden_activation = tf.nn.sigmoid(tf.matmul(input_placeholder, weights) + hidden_bias) 17 | visible_reconstruction = tf.nn.sigmoid(tf.matmul(hidden_activation, tf.transpose(weights)) + visible_bias) 18 | 19 | final_hidden_activation = tf.nn.sigmoid(tf.matmul(visible_reconstruction, weights) + hidden_bias) 20 | 21 | positive_phase = tf.matmul(tf.transpose(input_placeholder), hidden_activation) 22 | negative_phase = tf.matmul(tf.transpose(visible_reconstruction), final_hidden_activation) 23 | 24 | weight_update = weights.assign_add(LEARNING_RATE * (positive_phase - negative_phase)) 25 | visible_bias_update = visible_bias.assign_add(LEARNING_RATE * 26 | tf.reduce_mean(input_placeholder - visible_reconstruction, 0)) 27 | hidden_bias_update = hidden_bias.assign_add(LEARNING_RATE * 28 | tf.reduce_mean(hidden_activation - final_hidden_activation, 0)) 29 | 30 | train_op = tf.group(weight_update, visible_bias_update, hidden_bias_update) 31 | 32 | loss_op = tf.reduce_sum(tf.square(input_placeholder - visible_reconstruction)) 33 | 34 | session = tf.Session() 35 | 36 | session.run(tf.initialize_all_variables()) 37 | 38 | current_epochs = 0 39 | 40 | for i in range(20): 41 | total_loss = 0 42 | while mnist.train.epochs_completed == current_epochs: 43 | batch_inputs, batch_labels = mnist.train.next_batch(100) 44 | _, reconstruction_loss = session.run([train_op, loss_op], feed_dict={input_placeholder: batch_inputs}) 45 | total_loss += reconstruction_loss 46 | 47 | print("epochs %s loss %s" % (current_epochs, reconstruction_loss)) 48 | current_epochs = mnist.train.epochs_completed 49 | -------------------------------------------------------------------------------- /Chapter 08/q_learning_1d_terminal.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | states = [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] 5 | terminal = [False, False, False, False, True, False, False, False, False, False] 6 | NUM_STATES = len(states) 7 | NUM_ACTIONS = 2 8 | DISCOUNT_FACTOR = 0.5 9 | 10 | 11 | def one_hot_state(index): 12 | array = np.zeros(NUM_STATES) 13 | array[index] = 1. 
14 | return array 15 | 16 | 17 | session = tf.Session() 18 | state = tf.placeholder("float", [None, NUM_STATES]) 19 | targets = tf.placeholder("float", [None, NUM_ACTIONS]) 20 | 21 | weights = tf.Variable(tf.constant(0., shape=[NUM_STATES, NUM_ACTIONS])) 22 | 23 | output = tf.matmul(state, weights) 24 | 25 | loss = tf.reduce_mean(tf.square(output - targets)) 26 | train_operation = tf.train.GradientDescentOptimizer(1.).minimize(loss) 27 | 28 | session.run(tf.initialize_all_variables()) 29 | 30 | for _ in range(50): 31 | state_batch = [] 32 | rewards_batch = [] 33 | 34 | for state_index in range(NUM_STATES): 35 | state_batch.append(one_hot_state(state_index)) 36 | 37 | minus_action_index = (state_index - 1) % NUM_STATES 38 | plus_action_index = (state_index + 1) % NUM_STATES 39 | 40 | if terminal[minus_action_index]: 41 | minus_action_q_value = DISCOUNT_FACTOR * states[minus_action_index] 42 | else: 43 | minus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(minus_action_index)]}) 44 | minus_action_q_value = DISCOUNT_FACTOR * (states[minus_action_index] + np.max(minus_action_state_reward)) 45 | 46 | if terminal[plus_action_index]: 47 | plus_action_q_value = DISCOUNT_FACTOR * states[plus_action_index] 48 | else: 49 | plus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(plus_action_index)]}) 50 | plus_action_q_value = DISCOUNT_FACTOR * (states[plus_action_index] + np.max(plus_action_state_reward)) 51 | 52 | action_rewards = [minus_action_q_value, plus_action_q_value] 53 | rewards_batch.append(action_rewards) 54 | 55 | session.run(train_operation, feed_dict={ 56 | state: state_batch, 57 | targets: rewards_batch}) 58 | 59 | print([states[x] + (1-float(terminal[x]))*np.max(session.run(output, feed_dict={state: [one_hot_state(x)]})) 60 | for x in range(NUM_STATES)]) 61 | -------------------------------------------------------------------------------- /Chapter 06/language model/data_reader.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | from six.moves import range 4 | import codecs 5 | import numpy as np 6 | 7 | 8 | class DataReader(object): 9 | """Data reader used for training language model.""" 10 | def __init__(self, filepath, batch_length, batch_size): 11 | self.batch_length = batch_length 12 | self.batch_size = batch_size 13 | # Read data into string 14 | with codecs.open(filepath, encoding='utf-8', mode='r') as f: 15 | self.data_str = f.read() 16 | self.data_length = len(self.data_str) 17 | print('data_length: ', self.data_length) 18 | # Create a list of characters, indices are class indices for softmax 19 | char_set = set() 20 | for ch in self.data_str: 21 | char_set.add(ch) 22 | self.char_list = sorted(list(char_set)) 23 | print('char_list: ', len(self.char_list), self.char_list) 24 | # Create reverse mapping to look up the index based on the character 25 | self.char_dict = {val: idx for idx, val in enumerate(self.char_list)} 26 | print('char_dict: ', self.char_dict) 27 | # Initalise random start indices 28 | self.reset_indices() 29 | 30 | def reset_indices(self): 31 | self.start_idxs = np.random.random_integers( 32 | 0, self.data_length, self.batch_size) 33 | 34 | def get_sample(self, start_idx, length): 35 | # Get a sample and wrap around the data string 36 | return [self.char_dict[self.data_str[i % self.data_length]] 37 | for i in range(start_idx, start_idx+length)] 38 | 39 | def get_input_target_sample(self, start_idx): 40 | sample = 
self.get_sample(start_idx, self.batch_length+1) 41 | inpt = sample[0:self.batch_length] 42 | trgt = sample[1:self.batch_length+1] 43 | return inpt, trgt 44 | 45 | def get_batch(self, start_idxs): 46 | input_batch = np.zeros((self.batch_size, self.batch_length), 47 | dtype=np.int32) 48 | target_batch = np.zeros((self.batch_size, self.batch_length), 49 | dtype=np.int32) 50 | for i, start_idx in enumerate(start_idxs): 51 | inpt, trgt = self.get_input_target_sample(start_idx) 52 | input_batch[i, :] = inpt 53 | target_batch[i, :] = trgt 54 | return input_batch, target_batch 55 | 56 | def __iter__(self): 57 | while True: 58 | input_batch, target_batch = self.get_batch(self.start_idxs) 59 | self.start_idxs = ( 60 | self.start_idxs + self.batch_length) % self.data_length 61 | yield input_batch, target_batch 62 | 63 | 64 | def main(): 65 | filepath = './wap.txt' 66 | batch_length = 10 67 | batch_size = 2 68 | reader = DataReader(filepath, batch_length, batch_size) 69 | s = 'As in the question of astronomy then, so in the question of history now,' 70 | print([reader.char_dict[c] for c in s]) 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Python Deep Learning 5 | This is the code repository for [Python Deep Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-deep-learning?utm_source=github&utm_medium=repository&utm_campaign=9781786464453), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | With an increasing interest in AI around the world, deep learning has attracted a great deal of public attention. Every day, deep learning algorithms are used broadly across different industries. 8 | 9 | The book will give you all the practical information available on the subject, including the best practices, using real-world use cases. You will learn to recognize and extract information to increase predictive accuracy and optimize results. 10 | 11 | Starting with a quick recap of important machine learning concepts, the book will delve straight into deep learning principles using Sci-kit learn. Moving ahead, you will learn to use the latest open source libraries such as Theano, Keras, Google's TensorFlow, and H20. Use this guide to uncover the difficulties of pattern recognition, scaling data with greater accuracy and discussing deep learning algorithms and techniques. 12 | ## Instructions and Navigation 13 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 
14 | 15 | 16 | 17 | The code will look like the following: 18 | ``` 19 | mlp.fit(data_train, labels_train) 20 | pred = mlp.predict(data_test) 21 | print('Misclassified samples: %d' % (labels_test != pred).sum()) 22 | from sklearn.metrics import accuracy_score print('Accuracy: %.2f' % accuracy_score(labels_test, pred)) 23 | ``` 24 | 25 | 26 | 27 | ## Related Products 28 | * [Python: Deeper Insights into Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-deeper-insights-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781787128576) 29 | 30 | * [Deep Learning with Keras](https://www.packtpub.com/big-data-and-business-intelligence/deep-learning-keras?utm_source=github&utm_medium=repository&utm_campaign=9781787128422) 31 | 32 | * [Deep Learning with Hadoop](https://www.packtpub.com/big-data-and-business-intelligence/deep-learning-hadoop?utm_source=github&utm_medium=repository&utm_campaign=9781787124769) 33 | 34 | * [Python Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781783555130) 35 | 36 | ### Suggestions and Feedback 37 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 38 | ### Download a free PDF 39 | 40 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
41 |

https://packt.link/free-ebook/9781789348460

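### A note on library versions

The scripts were written against the library versions available when the book was published, and a few import paths have since moved (the Chapter 1 files already note the `cross_validation` to `model_selection` change in scikit-learn). As a minimal sketch, assuming a recent scikit-learn release, the Chapter 1 workflow can be written with the public import paths only:

```
from sklearn.neural_network import MLPClassifier  # public path; the private multilayer_perceptron module was removed in newer releases
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
data_train, data_test, labels_train, labels_test = train_test_split(
    iris.data, iris.target, test_size=0.5, random_state=1)

mlp = MLPClassifier(random_state=1, max_iter=1000)
mlp.fit(data_train, labels_train)
pred = mlp.predict(data_test)
print('Misclassified samples: %d' % (labels_test != pred).sum())
print('Accuracy: %.2f' % accuracy_score(labels_test, pred))
```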
-------------------------------------------------------------------------------- /Chapter 01/Chapter1_ex3_v2.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network.multilayer_perceptron import MLPClassifier 2 | from sklearn import datasets 3 | 4 | # Since the book came out, the cross_validation method has been moved to 5 | # the model_selection library from the cross_validation library 6 | #from sklearn.cross_validation import train_test_split 7 | from sklearn.model_selection import train_test_split 8 | 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.metrics import accuracy_score 11 | 12 | import numpy 13 | from matplotlib.colors import ListedColormap 14 | import matplotlib.pyplot as plt 15 | 16 | #Apply standardization 17 | standardised = True 18 | 19 | M = {0:"sepal length", 1:"sepal width", 2:"petal length", 3:"petal width"} 20 | 21 | #Choose two features 22 | x=1 #1 corresponds to the sepal width 23 | y=3 #3 corresponds to the petal width 24 | 25 | iris = datasets.load_iris() 26 | data = iris.data[:,[x,y]] 27 | 28 | labels = iris.target 29 | 30 | X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.5, random_state=1) 31 | 32 | reg = StandardScaler() 33 | reg.fit(data) 34 | X_train_std = reg.transform(X_train) 35 | X_test_std = reg.transform(X_test) 36 | 37 | if (standardised == False): 38 | X_train_std = X_train 39 | X_test_std = X_test 40 | 41 | # We add max_iter=1000 becaue the default is max_iter=200 and 42 | # it is not enough for full convergence 43 | mlp = MLPClassifier(random_state=1, max_iter=1000) 44 | mlp.fit(X_train_std, y_train) 45 | 46 | y_pred = mlp.predict(X_test_std) 47 | print('Misclassified samples: %d' % (y_test != y_pred).sum()) 48 | 49 | print('Accuracy: %.2f' % accuracy_score(y_test, y_pred)) 50 | 51 | 52 | def plot_decision_regions(data, labels, classifier, resolution=0.01): 53 | markers = ('s', '*', '^') 54 | colors = ('blue', 'green', 'red') 55 | cmap = ListedColormap(colors) 56 | # plot the decision surface 57 | x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1 58 | y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1 59 | 60 | x, y = numpy.meshgrid(numpy.arange(x_min, x_max, resolution), numpy.arange(y_min, y_max, resolution)) 61 | Z = classifier.predict(numpy.array([x.ravel(), y.ravel()]).T) 62 | Z = Z.reshape(x.shape) 63 | 64 | plt.pcolormesh(x, y, Z, cmap=cmap) 65 | plt.xlim(x.min(), x.max()) 66 | plt.ylim(y.min(), y.max()) 67 | 68 | colors = ('yellow', 'white', 'black') 69 | #cmap = ListedColormap(colors) 70 | #plot the data 71 | classes = ["setosa", "versicolor", "verginica"] 72 | for index, cl in enumerate(numpy.unique(labels)): 73 | plt.scatter(data[labels == cl, 0], data[labels == cl, 1], c=cmap(index), marker=markers[index], edgecolor="black", alpha=1.0, s=50, label=classes[index]) 74 | 75 | X_combined_std = numpy.vstack((X_train_std, X_test_std)) 76 | y_combined = numpy.hstack((y_train, y_test)) 77 | plot_decision_regions(X_combined_std, y_combined, classifier=mlp) 78 | 79 | if (standardised == False): 80 | xString = M[x] + " [not standardized]" 81 | yString = M[y] + " [not standardized]" 82 | else: 83 | xString = M[x] + " [standardized]" 84 | yString = M[y] + " [standardized]" 85 | 86 | plt.xlabel(xString) 87 | plt.ylabel(yString) 88 | plt.legend(loc='upper left') 89 | plt.show() -------------------------------------------------------------------------------- /Chapter 07/policy_gradient.py: 
-------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from tic_tac_toe import play_game, random_player 6 | 7 | HIDDEN_NODES = (100, 100, 100) # number of hidden layer neurons 8 | INPUT_NODES = 3 * 3 # board size 9 | BATCH_SIZE = 100 # every how many games to do a parameter update? 10 | LEARN_RATE = 1e-4 11 | OUTPUT_NODES = INPUT_NODES 12 | PRINT_RESULTS_EVERY_X = 1000 # every how many games to print the results 13 | 14 | input_placeholder = tf.placeholder("float", shape=(None, INPUT_NODES)) 15 | reward_placeholder = tf.placeholder("float", shape=(None,)) 16 | actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES)) 17 | 18 | hidden_weights_1 = tf.Variable(tf.truncated_normal((INPUT_NODES, HIDDEN_NODES[0]), stddev=1. / np.sqrt(INPUT_NODES))) 19 | hidden_weights_2 = tf.Variable( 20 | tf.truncated_normal((HIDDEN_NODES[0], HIDDEN_NODES[1]), stddev=1. / np.sqrt(HIDDEN_NODES[0]))) 21 | hidden_weights_3 = tf.Variable( 22 | tf.truncated_normal((HIDDEN_NODES[1], HIDDEN_NODES[2]), stddev=1. / np.sqrt(HIDDEN_NODES[1]))) 23 | output_weights = tf.Variable(tf.truncated_normal((HIDDEN_NODES[-1], OUTPUT_NODES), stddev=1. / np.sqrt(OUTPUT_NODES))) 24 | 25 | hidden_layer_1 = tf.nn.relu( 26 | tf.matmul(input_placeholder, hidden_weights_1) + tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[0],)))) 27 | hidden_layer_2 = tf.nn.relu( 28 | tf.matmul(hidden_layer_1, hidden_weights_2) + tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[1],)))) 29 | hidden_layer_3 = tf.nn.relu( 30 | tf.matmul(hidden_layer_2, hidden_weights_3) + tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[2],)))) 31 | output_layer = tf.nn.softmax( 32 | tf.matmul(hidden_layer_3, output_weights) + tf.Variable(tf.constant(0.01, shape=(OUTPUT_NODES,)))) 33 | 34 | policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer) 35 | train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient) 36 | 37 | sess = tf.Session() 38 | sess.run(tf.initialize_all_variables()) 39 | 40 | board_states, actual_moves, rewards = [], [], [] 41 | episode_number = 1 42 | results = collections.deque() 43 | 44 | 45 | def make_move(board_state, side): 46 | board_state_flat = np.ravel(board_state) 47 | board_states.append(board_state_flat) 48 | probability_of_actions = sess.run(output_layer, feed_dict={input_placeholder: [board_state_flat]})[0] 49 | 50 | try: 51 | move = np.random.multinomial(1, probability_of_actions) 52 | except ValueError: 53 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 
54 | # so need to reduce slightly to be a valid value 55 | move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-7)) 56 | 57 | actual_moves.append(move) 58 | 59 | move_index = move.argmax() 60 | return move_index / 3, move_index % 3 61 | 62 | 63 | while True: 64 | reward = play_game(make_move, random_player) 65 | 66 | results.append(reward) 67 | if len(results) > PRINT_RESULTS_EVERY_X: 68 | results.popleft() 69 | 70 | last_game_length = len(board_states) - len(rewards) 71 | 72 | # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick 73 | reward /= float(last_game_length) 74 | 75 | rewards += ([reward] * last_game_length) 76 | 77 | episode_number += 1 78 | 79 | if episode_number % BATCH_SIZE == 0: 80 | normalized_rewards = rewards - np.mean(rewards) 81 | normalized_rewards /= np.std(normalized_rewards) 82 | 83 | sess.run(train_step, feed_dict={input_placeholder: board_states, 84 | reward_placeholder: normalized_rewards, 85 | actual_move_placeholder: actual_moves}) 86 | 87 | # clear batches 88 | del board_states[:] 89 | del actual_moves[:] 90 | del rewards[:] 91 | 92 | if episode_number % PRINT_RESULTS_EVERY_X == 0: 93 | print("episode: %s win_rate: %s" % (episode_number, 0.5 + sum(results) / (PRINT_RESULTS_EVERY_X * 2.))) 94 | -------------------------------------------------------------------------------- /Chapter 07/min_max.py: -------------------------------------------------------------------------------- 1 | from tic_tac_toe import available_moves, apply_move, has_winner 2 | import sys 3 | 4 | 5 | def _score_line(line): 6 | minus_count = line.count(-1) 7 | plus_count = line.count(1) 8 | if minus_count + plus_count < 3: 9 | if minus_count == 2: 10 | return -1 11 | elif plus_count == 2: 12 | return 1 13 | return 0 14 | 15 | 16 | def evaluate(board_state): 17 | """Get a rough score for how good we think this board position is for the plus_player. Does this based on number of 18 | 2 in row lines we have. 19 | 20 | Args: 21 | board_state (3x3 tuple of int): The board state we are evaluating 22 | 23 | Returns: 24 | int: evaluated score for the position for the plus player, posative is good for the plus player, negative good 25 | for the minus player 26 | """ 27 | score = 0 28 | for x in range(3): 29 | score += _score_line(board_state[x]) 30 | for y in range(3): 31 | score += _score_line([i[y] for i in board_state]) 32 | 33 | # diagonals 34 | score += _score_line([board_state[i][i] for i in range(3)]) 35 | score += _score_line([board_state[2 - i][i] for i in range(3)]) 36 | 37 | return score 38 | 39 | 40 | def min_max(board_state, side, max_depth, evaluation_func=evaluate): 41 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 42 | move 43 | 44 | Args: 45 | board_state (3x3 tuple of int): The board state we are evaluating 46 | side (int): either +1 or -1 47 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 48 | position is. 
49 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player 50 | 51 | Returns: 52 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 53 | """ 54 | best_score = None 55 | best_score_move = None 56 | 57 | moves = list(available_moves(board_state)) 58 | if not moves: 59 | # this is a draw 60 | return 0, None 61 | 62 | for move in moves: 63 | new_board_state = apply_move(board_state, move, side) 64 | winner = has_winner(new_board_state) 65 | if winner != 0: 66 | return winner * 10000, move 67 | else: 68 | if max_depth <= 1: 69 | score = evaluation_func(new_board_state) 70 | else: 71 | score, _ = min_max(new_board_state, -side, max_depth - 1) 72 | if side > 0: 73 | if best_score is None or score > best_score: 74 | best_score = score 75 | best_score_move = move 76 | else: 77 | if best_score is None or score < best_score: 78 | best_score = score 79 | best_score_move = move 80 | return best_score, best_score_move 81 | 82 | 83 | def min_max_alpha_beta(board_state, side, max_depth, evaluation_func=evaluate, alpha=-sys.float_info.max, 84 | beta=sys.float_info.max): 85 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 86 | move 87 | 88 | Args: 89 | board_state (3x3 tuple of int): The board state we are evaluating 90 | side (int): either +1 or -1 91 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 92 | position is. 93 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player 94 | alpha (float): Used when this is called recursively, normally ignore 95 | beta (float): Used when this is called recursively, normally ignore 96 | 97 | Returns: 98 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 99 | """ 100 | best_score_move = None 101 | moves = list(available_moves(board_state)) 102 | if not moves: 103 | return 0, None 104 | 105 | for move in moves: 106 | new_board_state = apply_move(board_state, move, side) 107 | winner = has_winner(new_board_state) 108 | if winner != 0: 109 | return winner * 10000, move 110 | else: 111 | if max_depth <= 1: 112 | score = evaluation_func(new_board_state) 113 | else: 114 | score, _ = min_max_alpha_beta(new_board_state, -side, max_depth - 1, alpha, beta) 115 | 116 | if side > 0: 117 | if score > alpha: 118 | alpha = score 119 | best_score_move = move 120 | else: 121 | if score < beta: 122 | beta = score 123 | best_score_move = move 124 | if alpha >= beta: 125 | break 126 | 127 | return alpha if side > 0 else beta, best_score_move 128 | 129 | 130 | def min_max_player(board_state, side): 131 | return min_max(board_state, side, 5)[1] 132 | -------------------------------------------------------------------------------- /Chapter 08/deep_q_cart_pole.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import random 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | 9 | env = gym.make('CartPole-v0') 10 | 11 | ACTIONS_COUNT = 2 12 | FUTURE_REWARD_DISCOUNT = 0.9 13 | OBSERVATION_STEPS = 5000. # time steps to observe before training 14 | EXPLORE_STEPS = 15000. 
# frames over which to anneal epsilon 15 | INITIAL_RANDOM_ACTION_PROB = 1.0 # starting chance of an action being random 16 | FINAL_RANDOM_ACTION_PROB = 0.0 # final chance of an action being random 17 | MEMORY_SIZE = 20000 # number of observations to remember 18 | MINI_BATCH_SIZE = 100 # size of mini batches 19 | OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5) 20 | LEARN_RATE = 1e-3 21 | STORE_SCORES_LEN = 100. 22 | INPUT_NODES = env.observation_space.shape[0] 23 | HIDDEN_NODES = 20 24 | 25 | session = tf.Session() 26 | 27 | feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, HIDDEN_NODES], stddev=0.01)) 28 | feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[HIDDEN_NODES])) 29 | 30 | feed_forward_weights_2 = tf.Variable(tf.truncated_normal([HIDDEN_NODES, ACTIONS_COUNT], stddev=0.01)) 31 | feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[ACTIONS_COUNT])) 32 | 33 | input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 34 | hidden_layer = tf.nn.tanh(tf.matmul(input_placeholder, feed_forward_weights_1) + feed_forward_bias_1) 35 | output_layer = tf.matmul(hidden_layer, feed_forward_weights_2) + feed_forward_bias_2 36 | 37 | action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT]) 38 | target_placeholder = tf.placeholder("float", [None]) 39 | 40 | readout_action = tf.reduce_sum(tf.mul(output_layer, action_placeholder), reduction_indices=1) 41 | 42 | cost = tf.reduce_mean(tf.square(target_placeholder - readout_action)) 43 | train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost) 44 | 45 | observations = deque(maxlen=MEMORY_SIZE) 46 | scores = deque(maxlen=STORE_SCORES_LEN) 47 | 48 | # set the first action to do nothing 49 | last_action = np.zeros(ACTIONS_COUNT) 50 | last_action[1] = 1 51 | 52 | probability_of_random_action = INITIAL_RANDOM_ACTION_PROB 53 | time = 0 54 | 55 | session.run(tf.initialize_all_variables()) 56 | 57 | 58 | def choose_next_action(state): 59 | new_action = np.zeros([ACTIONS_COUNT]) 60 | 61 | if random.random() <= probability_of_random_action: 62 | # choose an action randomly 63 | action_index = random.randrange(ACTIONS_COUNT) 64 | else: 65 | # choose an action given our state 66 | action_values = session.run(output_layer, feed_dict={input_placeholder: [state]})[0] 67 | # we will take the highest value action 68 | action_index = np.argmax(action_values) 69 | 70 | new_action[action_index] = 1 71 | return new_action 72 | 73 | 74 | def train(): 75 | # sample a mini_batch to train on 76 | mini_batch = random.sample(observations, MINI_BATCH_SIZE) 77 | 78 | # get the batch variables 79 | previous_states = [d[OBS_LAST_STATE_INDEX] for d in mini_batch] 80 | actions = [d[OBS_ACTION_INDEX] for d in mini_batch] 81 | rewards = [d[OBS_REWARD_INDEX] for d in mini_batch] 82 | current_states = [d[OBS_CURRENT_STATE_INDEX] for d in mini_batch] 83 | agents_expected_reward = [] 84 | # this gives us the agents expected reward for each action we might take 85 | agents_reward_per_action = session.run(output_layer, feed_dict={input_placeholder: current_states}) 86 | for i in range(len(mini_batch)): 87 | if mini_batch[i][OBS_TERMINAL_INDEX]: 88 | # this was a terminal frame so there is no future reward... 
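            # the Q-learning target is just the observed reward here; in the non-terminal branch below it is reward + FUTURE_REWARD_DISCOUNT * max_a Q(next_state, a)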
89 | agents_expected_reward.append(rewards[i]) 90 | else: 91 | agents_expected_reward.append( 92 | rewards[i] + FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i])) 93 | 94 | # learn that these actions in these states lead to this reward 95 | session.run(train_operation, feed_dict={ 96 | input_placeholder: previous_states, 97 | action_placeholder: actions, 98 | target_placeholder: agents_expected_reward}) 99 | 100 | 101 | last_state = env.reset() 102 | total_reward = 0 103 | 104 | while True: 105 | env.render() 106 | last_action = choose_next_action(last_state) 107 | current_state, reward, terminal, info = env.step(np.argmax(last_action)) 108 | total_reward += reward 109 | 110 | if terminal: 111 | reward = -1. 112 | scores.append(total_reward) 113 | 114 | print("Time: %s random_action_prob: %s reward %s scores differential %s" % 115 | (time, probability_of_random_action, total_reward, 116 | np.mean(scores))) 117 | total_reward = 0 118 | 119 | # store the transition in previous_observations 120 | observations.append((last_state, last_action, reward, current_state, terminal)) 121 | 122 | # only train if done observing 123 | if len(observations) > OBSERVATION_STEPS: 124 | train() 125 | time += 1 126 | 127 | # update the old values 128 | if terminal: 129 | last_state = env.reset() 130 | else: 131 | last_state = current_state 132 | 133 | # gradually reduce the probability of a random action 134 | if probability_of_random_action > FINAL_RANDOM_ACTION_PROB \ 135 | and len(observations) > OBSERVATION_STEPS: 136 | probability_of_random_action -= \ 137 | (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS 138 | -------------------------------------------------------------------------------- /Chapter 07/monte_carlo.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import math 4 | from tic_tac_toe import has_winner, available_moves, apply_move 5 | 6 | 7 | def monte_carlo_sample(board_state, side): 8 | """Sample a single rollout from the current board_state and side. Moves are made to the current board_state until we 9 | reach a terminal state then the result and the first move made to get there is returned. 10 | 11 | Args: 12 | board_state (3x3 tuple of int): state of the board 13 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 14 | 15 | Returns: 16 | (result(int), move(int,int)): The result from this rollout, +1 for a win for the plus player -1 for a win for 17 | the minus player, 0 for a draw 18 | """ 19 | result = has_winner(board_state) 20 | if result != 0: 21 | return result, None 22 | moves = list(available_moves(board_state)) 23 | if not moves: 24 | return 0, None 25 | 26 | # select a random move 27 | move = random.choice(moves) 28 | result, next_move = monte_carlo_sample(apply_move(board_state, move, side), -side) 29 | return result, move 30 | 31 | 32 | def monte_carlo_tree_search(board_state, side, number_of_samples): 33 | """Evaluate the best from the current board_state for the given side using monte carlo sampling. 34 | 35 | Args: 36 | board_state (3x3 tuple of int): state of the board 37 | side (int): side currently to play. 
+1 for the plus player, -1 for the minus player 38 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 39 | better the estimation of the position 40 | 41 | Returns: 42 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 43 | """ 44 | move_wins = collections.defaultdict(int) 45 | move_samples = collections.defaultdict(int) 46 | for _ in range(number_of_samples): 47 | result, move = monte_carlo_sample(board_state, side) 48 | # store the result and a count of the number of times we have tried this move 49 | if result == side: 50 | move_wins[move] += 1 51 | move_samples[move] += 1 52 | 53 | # get the move with the best average result 54 | move = max(move_wins, key=lambda x: move_wins.get(x) / move_samples[move]) 55 | 56 | return move_wins[move] / move_samples[move], move 57 | 58 | 59 | def _upper_confidence_bounds(payout, samples_for_this_machine, log_total_samples): 60 | return payout / samples_for_this_machine + math.sqrt((2 * log_total_samples) / samples_for_this_machine) 61 | 62 | 63 | def monte_carlo_tree_search_uct(board_state, side, number_of_samples): 64 | """Evaluate the best from the current board_state for the given side using monte carlo sampling with upper 65 | confidence bounds for trees. 66 | 67 | Args: 68 | board_state (3x3 tuple of int): state of the board 69 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 70 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 71 | better the estimation of the position 72 | 73 | Returns: 74 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 75 | """ 76 | state_results = collections.defaultdict(float) 77 | state_samples = collections.defaultdict(float) 78 | 79 | for _ in range(number_of_samples): 80 | current_side = side 81 | current_board_state = board_state 82 | first_unvisited_node = True 83 | rollout_path = [] 84 | result = 0 85 | 86 | while result == 0: 87 | move_states = {move: apply_move(current_board_state, move, current_side) 88 | for move in available_moves(current_board_state)} 89 | 90 | if not move_states: 91 | result = 0 92 | break 93 | 94 | if all((state in state_samples) for _, state in move_states): 95 | log_total_samples = math.log(sum(state_samples[s] for s in move_states.values())) 96 | move, state = max(move_states, key=lambda _, s: _upper_confidence_bounds(state_results[s], 97 | state_samples[s], 98 | log_total_samples)) 99 | else: 100 | move = random.choice(list(move_states.keys())) 101 | 102 | current_board_state = move_states[move] 103 | 104 | if first_unvisited_node: 105 | rollout_path.append((current_board_state, current_side)) 106 | if current_board_state not in state_samples: 107 | first_unvisited_node = False 108 | 109 | current_side = -current_side 110 | 111 | result = has_winner(current_board_state) 112 | 113 | for path_board_state, path_side in rollout_path: 114 | state_samples[path_board_state] += 1. 115 | result *= path_side 116 | # normalize results to be between 0 and 1 before this it between -1 and 1 117 | result /= 2. 
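        # (UCB1 assumes payouts in [0, 1], so a loss maps to 0, a draw to 0.5 and a win to 1 after the shift on the next line)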
118 | result += .5 119 | state_results[path_board_state] += result 120 | 121 | move_states = {move: apply_move(board_state, move, side) for move in available_moves(board_state)} 122 | 123 | move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]]) 124 | 125 | return state_results[move_states[move]] / state_samples[move_states[move]], move 126 | 127 | 128 | if __name__ == '__main__': 129 | board_state = ((1, 0, -1), 130 | (1, 0, 0), 131 | (0, -1, 0)) 132 | 133 | print(monte_carlo_tree_search_uct(board_state, -1, 10000)) 134 | -------------------------------------------------------------------------------- /Chapter 02/Ch2Example.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from matplotlib.colors import ListedColormap 3 | import matplotlib.pyplot as plt 4 | 5 | def tanh(x): 6 | return (1.0 - numpy.exp(-2*x))/(1.0 + numpy.exp(-2*x)) 7 | 8 | def tanh_derivative(x): 9 | return (1 + tanh(x))*(1 - tanh(x)) 10 | 11 | class NeuralNetwork: 12 | #network consists of a list of integers, indicating 13 | #the number of neurons in each layer 14 | def __init__(self, net_arch): 15 | numpy.random.seed(0) 16 | self.activity = tanh 17 | self.activity_derivative = tanh_derivative 18 | self.layers = len(net_arch) 19 | self.steps_per_epoch = 1000 20 | self.arch = net_arch 21 | 22 | self.weights = [] 23 | #range of weight values (-1,1) 24 | for layer in range(len(net_arch) - 1): 25 | w = 2*numpy.random.rand(net_arch[layer] + 1, net_arch[layer+1]) - 1 26 | self.weights.append(w) 27 | 28 | def fit(self, data, labels, learning_rate=0.1, epochs=10): 29 | #Add bias units to the input layer 30 | ones = numpy.ones((1, data.shape[0])) 31 | Z = numpy.concatenate((ones.T, data), axis=1) 32 | training = epochs*self.steps_per_epoch 33 | 34 | 35 | for k in range(training): 36 | if k % self.steps_per_epoch == 0: 37 | #print ('epochs:', k/self.steps_per_epoch) 38 | print('epochs: {}'.format(k/self.steps_per_epoch)) 39 | for s in data: 40 | print(s, self.predict(s)) 41 | 42 | sample = numpy.random.randint(data.shape[0]) 43 | y = [Z[sample]] 44 | 45 | for i in range(len(self.weights)-1): 46 | activation = numpy.dot(y[i], self.weights[i]) 47 | activity = self.activity(activation) 48 | #add the bias for the next layer 49 | activity = numpy.concatenate((numpy.ones(1), numpy.array(activity))) 50 | y.append(activity) 51 | 52 | #last layer 53 | activation = numpy.dot(y[-1], self.weights[-1]) 54 | activity = self.activity(activation) 55 | y.append(activity) 56 | 57 | #error for the output layer 58 | error = labels[sample] - y[-1] 59 | delta_vec = [error * self.activity_derivative(y[-1])] 60 | 61 | #we need to begin from the back from the next to last layer 62 | for i in range(self.layers-2, 0, -1): 63 | #delta_vec [1].dot(self.weights[i][1:].T) 64 | error = delta_vec[-1].dot(self.weights[i][1:].T) 65 | error = error*self.activity_derivative(y[i][1:]) 66 | delta_vec.append(error) 67 | 68 | # reverse 69 | # [level3(output)->level2(hidden)] => [level2(hidden)->level3(output)] 70 | delta_vec.reverse() 71 | 72 | # backpropagation 73 | # 1. Multiply its output delta and input activation 74 | # to get the gradient of the weight. 75 | # 2. 
Subtract a ratio (percentage) of the gradient from the weight 76 | for i in range(len(self.weights)): 77 | layer = y[i].reshape(1, self.arch[i]+1) 78 | 79 | delta = delta_vec[i].reshape(1, self.arch[i+1]) 80 | self.weights[i] += learning_rate * layer.T.dot(delta) 81 | 82 | def predict(self, x): 83 | val = numpy.concatenate((numpy.ones(1).T, numpy.array(x))) 84 | for i in range(0, len(self.weights)): 85 | val = self.activity(numpy.dot(val, self.weights[i])) 86 | val = numpy.concatenate((numpy.ones(1).T, numpy.array(val))) 87 | 88 | return val[1] 89 | 90 | def plot_decision_regions(self, X, y, points=200): 91 | markers = ('o', '^') 92 | colors = ('red', 'blue') 93 | cmap = ListedColormap(colors) 94 | # plot the decision surface 95 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 96 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 97 | 98 | resolution = max(x1_max - x1_min, x2_max - x2_min)/float(points) 99 | #resolution = 0.01 100 | 101 | xx1, xx2 = numpy.meshgrid(numpy.arange(x1_min, x1_max, resolution), numpy.arange(x2_min, x2_max, resolution)) 102 | input = numpy.array([xx1.ravel(), xx2.ravel()]).T 103 | Z = numpy.empty(0) 104 | for i in range(input.shape[0]): 105 | val = self.predict(numpy.array(input[i])) 106 | if val < 0.5: val = 0 107 | if val >= 0.5: val = 1 108 | Z = numpy.append(Z, val) 109 | 110 | Z = Z.reshape(xx1.shape) 111 | 112 | plt.pcolormesh(xx1, xx2, Z, cmap=cmap) 113 | plt.xlim(xx1.min(), xx1.max()) 114 | plt.ylim(xx2.min(), xx2.max()) 115 | # plot all samples 116 | 117 | classes = ["False", "True"] 118 | for idx, cl in enumerate(numpy.unique(y)): 119 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=1.0, c=cmap(idx), marker=markers[idx], s=80, label=classes[idx]) 120 | 121 | plt.xlabel('x-axis') 122 | plt.ylabel('y-axis') 123 | plt.legend(loc='upper left') 124 | plt.show() 125 | 126 | if __name__ == '__main__': 127 | 128 | nn = NeuralNetwork([2,2,1]) 129 | 130 | X = numpy.array([[0, 0], 131 | [0, 1], 132 | [1, 0], 133 | [1, 1]]) 134 | 135 | y = numpy.array([0, 1, 1, 0]) 136 | 137 | nn.fit(X, y, epochs=10) 138 | 139 | print "Final prediction" 140 | for s in X: 141 | print(s, nn.predict(s)) 142 | 143 | nn.plot_decision_regions(X, y) 144 | -------------------------------------------------------------------------------- /Chapter 08/actor_critic_baseline_cart_pole.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import pickle 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | 9 | env = gym.make('CartPole-v0') 10 | 11 | ACTIONS_COUNT = 2 12 | FUTURE_REWARD_DISCOUNT = 0.9 13 | LEARN_RATE_ACTOR = 0.01 14 | LEARN_RATE_CRITIC = 0.01 15 | STORE_SCORES_LEN = 5 16 | GAMES_PER_TRAINING = 3 17 | INPUT_NODES = env.observation_space.shape[0] 18 | 19 | ACTOR_HIDDEN = 20 20 | 21 | session = tf.Session() 22 | 23 | actor_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, ACTOR_HIDDEN], stddev=0.01)) 24 | actor_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[ACTOR_HIDDEN])) 25 | 26 | actor_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([ACTOR_HIDDEN, ACTIONS_COUNT], stddev=0.01)) 27 | actor_feed_forward_bias_2 = tf.Variable(tf.constant(0.1, shape=[ACTIONS_COUNT])) 28 | 29 | actor_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 30 | actor_hidden_layer = tf.nn.tanh( 31 | tf.matmul(actor_input_placeholder, actor_feed_forward_weights_1) + actor_feed_forward_bias_1) 32 | 
actor_output_layer = tf.nn.softmax( 33 | tf.matmul(actor_hidden_layer, actor_feed_forward_weights_2) + actor_feed_forward_bias_2) 34 | 35 | actor_action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT]) 36 | actor_advantage_placeholder = tf.placeholder("float", [None, 1]) 37 | 38 | policy_gradient = tf.reduce_mean(actor_advantage_placeholder * actor_action_placeholder * tf.log(actor_output_layer)) 39 | actor_train_operation = tf.train.AdamOptimizer(LEARN_RATE_ACTOR).minimize(-policy_gradient) 40 | 41 | CRITIC_HIDDEN = 20 42 | 43 | critic_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, CRITIC_HIDDEN], stddev=0.01)) 44 | critic_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[CRITIC_HIDDEN])) 45 | 46 | critic_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([CRITIC_HIDDEN, 1], stddev=0.01)) 47 | critic_feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[1])) 48 | 49 | critic_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 50 | critic_hidden_layer = tf.nn.tanh( 51 | tf.matmul(critic_input_placeholder, critic_feed_forward_weights_1) + critic_feed_forward_bias_1) 52 | critic_output_layer = tf.matmul(critic_hidden_layer, critic_feed_forward_weights_2) + critic_feed_forward_bias_2 53 | 54 | critic_target_placeholder = tf.placeholder("float", [None, 1]) 55 | 56 | critic_cost = tf.reduce_mean(tf.square(critic_target_placeholder - critic_output_layer)) 57 | critic_train_operation = tf.train.AdamOptimizer(LEARN_RATE_CRITIC).minimize(critic_cost) 58 | 59 | critic_advantages = critic_target_placeholder - critic_output_layer 60 | 61 | scores = deque(maxlen=STORE_SCORES_LEN) 62 | 63 | # set the first action to do nothing 64 | last_action = np.zeros(ACTIONS_COUNT) 65 | last_action[1] = 1 66 | 67 | time = 0 68 | 69 | session.run(tf.initialize_all_variables()) 70 | 71 | 72 | def choose_next_action(state): 73 | probability_of_actions = session.run(actor_output_layer, feed_dict={actor_input_placeholder: [state]})[0] 74 | try: 75 | move = np.random.multinomial(1, probability_of_actions) 76 | except ValueError: 77 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 
78 | # so need to reduce slightly to be a valid value 79 | move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-6)) 80 | return move 81 | 82 | 83 | def train(states, actions_taken, advantages): 84 | # learn that these actions in these states lead to this reward 85 | session.run(actor_train_operation, feed_dict={ 86 | actor_input_placeholder: states, 87 | actor_action_placeholder: actions_taken, 88 | actor_advantage_placeholder: advantages}) 89 | 90 | 91 | last_state = env.reset() 92 | total_reward = 0 93 | current_game_observations = [] 94 | current_game_rewards = [] 95 | current_game_actions = [] 96 | 97 | episode_observation = [] 98 | episode_rewards = [] 99 | episode_actions = [] 100 | games = 0 101 | 102 | critic_costs = deque(maxlen=100) 103 | 104 | while True: 105 | env.render() 106 | last_action = choose_next_action(last_state) 107 | current_state, reward, terminal, info = env.step(np.argmax(last_action)) 108 | total_reward += reward 109 | 110 | if terminal: 111 | reward = -.10 112 | 113 | current_game_observations.append(last_state) 114 | current_game_rewards.append(reward) 115 | current_game_actions.append(last_action) 116 | 117 | if terminal: 118 | games += 1 119 | scores.append(total_reward) 120 | 121 | # get temporal difference values for critic 122 | cumulative_reward = 0 123 | for i in reversed(range(len(current_game_observations))): 124 | cumulative_reward = current_game_rewards[i] + FUTURE_REWARD_DISCOUNT * cumulative_reward 125 | current_game_rewards[i] = [cumulative_reward] 126 | 127 | _, cost, advantages = session.run([critic_train_operation, critic_cost, critic_advantages], { 128 | critic_input_placeholder: current_game_observations, 129 | critic_target_placeholder: current_game_rewards}) 130 | 131 | critic_costs.append(cost) 132 | 133 | print("Game: %s reward %s average scores %s critic cost %s" % 134 | (games, total_reward, 135 | np.mean(scores), np.mean(critic_costs))) 136 | 137 | episode_observation.extend(current_game_observations) 138 | episode_actions.extend(current_game_actions) 139 | episode_rewards.extend(advantages) 140 | 141 | total_reward = 0 142 | current_game_observations = [] 143 | current_game_rewards = [] 144 | current_game_actions = [] 145 | 146 | if games % GAMES_PER_TRAINING == 0: 147 | train(episode_observation, episode_actions, episode_rewards) 148 | 149 | episode_observation = [] 150 | episode_actions = [] 151 | episode_rewards = [] 152 | 153 | time += 1 154 | 155 | # update the old values 156 | if terminal: 157 | last_state = env.reset() 158 | else: 159 | last_state = current_state 160 | -------------------------------------------------------------------------------- /Chapter 07/tic_tac_toe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a 3 by 3 board. 3 | Two players take turns making moves on squares of the board, the first to get 3 in a row, including diagonals, wins. If 4 | there are no valid moves left to make the game ends a draw. 5 | 6 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 7 | where each player plays. 8 | The board is represented by a 3 x 3 tuple of ints. A 0 means no player has played in a space, 1 means player one has 9 | played there, -1 means the seconds player has played there. The apply_move method can be used to return a copy of a 10 | given state with a given move applied. 
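For instance, a minimal sketch of that usage (assuming this module is importable as tic_tac_toe):

from tic_tac_toe import apply_move, has_winner

board = ((0, 0, 0), (0, 0, 0), (0, 0, 0))  # empty board
board = apply_move(board, (0, 0), 1)       # player one takes the top-left square
board = apply_move(board, (1, 1), -1)      # player two takes the centre
print(board)                               # ((1, 0, 0), (0, -1, 0), (0, 0, 0))
print(has_winner(board))                   # 0 - no one has three in a line yet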
This can be useful for doing min-max or monte carlo sampling. 11 | """ 12 | import random 13 | import itertools 14 | 15 | 16 | def _new_board(): 17 | """Return a emprty tic-tac-toe board we can use for simulating a game. 18 | 19 | Returns: 20 | 3x3 tuple of ints 21 | """ 22 | return ((0, 0, 0), 23 | (0, 0, 0), 24 | (0, 0, 0)) 25 | 26 | 27 | def apply_move(board_state, move, side): 28 | """Returns a copy of the given board_state with the desired move applied. 29 | 30 | Args: 31 | board_state (3x3 tuple of int): The given board_state we want to apply the move to. 32 | move (int, int): The position we want to make the move in. 33 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 34 | 35 | Returns: 36 | (3x3 tuple of int): A copy of the board_state with the given move applied for the given side. 37 | """ 38 | move_x, move_y = move 39 | 40 | def get_tuples(): 41 | for x in range(3): 42 | if move_x == x: 43 | temp = list(board_state[x]) 44 | temp[move_y] = side 45 | yield tuple(temp) 46 | else: 47 | yield board_state[x] 48 | 49 | return tuple(get_tuples()) 50 | 51 | 52 | def available_moves(board_state): 53 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 54 | pieces played. 55 | 56 | Args: 57 | board_state: The board_state we want to check for valid moves. 58 | 59 | Returns: 60 | Generator of (int, int): All the valid moves that can be played in this position. 61 | """ 62 | for x, y in itertools.product(range(3), range(3)): 63 | if board_state[x][y] == 0: 64 | yield (x, y) 65 | 66 | 67 | def _has_3_in_a_line(line): 68 | return all(x == -1 for x in line) | all(x == 1 for x in line) 69 | 70 | 71 | def has_winner(board_state): 72 | """Determine if a player has won on the given board_state. 73 | 74 | Args: 75 | board_state (3x3 tuple of int): The current board_state we want to evaluate. 76 | 77 | Returns: 78 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 79 | """ 80 | # check rows 81 | for x in range(3): 82 | if _has_3_in_a_line(board_state[x]): 83 | return board_state[x][0] 84 | # check columns 85 | for y in range(3): 86 | if _has_3_in_a_line([i[y] for i in board_state]): 87 | return board_state[0][y] 88 | 89 | # check diagonals 90 | if _has_3_in_a_line([board_state[i][i] for i in range(3)]): 91 | return board_state[0][0] 92 | if _has_3_in_a_line([board_state[2 - i][i] for i in range(3)]): 93 | return board_state[0][2] 94 | 95 | return 0 # no one has won, return 0 for a draw 96 | 97 | 98 | def play_game(plus_player_func, minus_player_func, log=False): 99 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 100 | player. 101 | 102 | Args: 103 | plus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 104 | current board_state and side this player is playing, and returns the move the player wants to play. 105 | minus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 106 | current board_state and side this player is playing, and returns the move the player wants to play. 
107 | log (bool): If True progress is logged to console, defaults to False 108 | 109 | Returns: 110 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 111 | """ 112 | board_state = _new_board() 113 | player_turn = 1 114 | 115 | while True: 116 | _available_moves = list(available_moves(board_state)) 117 | 118 | if len(_available_moves) == 0: 119 | # draw 120 | if log: 121 | print("no moves left, game ended a draw") 122 | return 0. 123 | if player_turn > 0: 124 | move = plus_player_func(board_state, 1) 125 | else: 126 | move = minus_player_func(board_state, -1) 127 | 128 | if move not in _available_moves: 129 | # if a player makes an invalid move the other player wins 130 | if log: 131 | print("illegal move ", move) 132 | return -player_turn 133 | 134 | board_state = apply_move(board_state, move, player_turn) 135 | if log: 136 | print(board_state) 137 | 138 | winner = has_winner(board_state) 139 | if winner != 0: 140 | if log: 141 | print("we have a winner, side: %s" % player_turn) 142 | return winner 143 | player_turn = -player_turn 144 | 145 | 146 | def random_player(board_state, _): 147 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 148 | valid moves in the current state. 149 | 150 | Args: 151 | board_state (3x3 tuple of int): The current state of the board 152 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 153 | 154 | Returns: 155 | (int, int): the move we want to play on the current board 156 | """ 157 | moves = list(available_moves(board_state)) 158 | return random.choice(moves) 159 | 160 | 161 | if __name__ == '__main__': 162 | # example of playing a game 163 | play_game(random_player, random_player, log=True) -------------------------------------------------------------------------------- /Chapter 08/actor_critic_advantage_cart_pole.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import pickle 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | env = gym.make('CartPole-v0') 11 | 12 | ACTIONS_COUNT = 2 13 | FUTURE_REWARD_DISCOUNT = 0.9 14 | LEARN_RATE_ACTOR = 0.01 15 | LEARN_RATE_CRITIC = 0.01 16 | STORE_SCORES_LEN = 5 17 | GAMES_PER_TRAINING = 3 18 | INPUT_NODES = env.observation_space.shape[0] 19 | 20 | ACTOR_HIDDEN = 20 21 | CRITIC_HIDDEN = 20 22 | 23 | session = tf.Session() 24 | 25 | actor_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, ACTOR_HIDDEN], stddev=0.01)) 26 | actor_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[ACTOR_HIDDEN])) 27 | 28 | actor_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([ACTOR_HIDDEN, ACTIONS_COUNT], stddev=0.01)) 29 | actor_feed_forward_bias_2 = tf.Variable(tf.constant(0.1, shape=[ACTIONS_COUNT])) 30 | 31 | actor_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 32 | actor_hidden_layer = tf.nn.tanh( 33 | tf.matmul(actor_input_placeholder, actor_feed_forward_weights_1) + actor_feed_forward_bias_1) 34 | actor_output_layer = tf.nn.softmax( 35 | tf.matmul(actor_hidden_layer, actor_feed_forward_weights_2) + actor_feed_forward_bias_2) 36 | 37 | actor_action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT]) 38 | actor_advantage_placeholder = tf.placeholder("float", [None, 1]) 39 | 40 | policy_gradient = 
tf.reduce_mean(actor_advantage_placeholder * actor_action_placeholder * tf.log(actor_output_layer)) 41 | actor_train_operation = tf.train.AdamOptimizer(LEARN_RATE_ACTOR).minimize(-policy_gradient) 42 | 43 | critic_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, CRITIC_HIDDEN], stddev=0.01)) 44 | critic_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[CRITIC_HIDDEN])) 45 | 46 | critic_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([CRITIC_HIDDEN, 1], stddev=0.01)) 47 | critic_feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[1])) 48 | 49 | critic_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 50 | critic_hidden_layer = tf.nn.tanh( 51 | tf.matmul(critic_input_placeholder, critic_feed_forward_weights_1) + critic_feed_forward_bias_1) 52 | critic_output_layer = tf.matmul(critic_hidden_layer, critic_feed_forward_weights_2) + critic_feed_forward_bias_2 53 | 54 | critic_target_placeholder = tf.placeholder("float", [None, 1]) 55 | 56 | critic_cost = tf.reduce_mean(tf.square(critic_target_placeholder - critic_output_layer)) 57 | critic_train_operation = tf.train.AdamOptimizer(LEARN_RATE_CRITIC).minimize(critic_cost) 58 | 59 | critic_baseline = critic_target_placeholder - critic_output_layer 60 | 61 | scores = deque(maxlen=STORE_SCORES_LEN) 62 | 63 | # set the first action to do nothing 64 | last_action = np.zeros(ACTIONS_COUNT) 65 | last_action[1] = 1 66 | 67 | time = 0 68 | 69 | session.run(tf.initialize_all_variables()) 70 | 71 | 72 | def choose_next_action(state): 73 | probability_of_actions = session.run(actor_output_layer, feed_dict={actor_input_placeholder: [state]})[0] 74 | try: 75 | move = np.random.multinomial(1, probability_of_actions) 76 | except ValueError: 77 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 
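Stepping back to the policy_gradient op defined near the top of this script, a minimal numpy sketch (with toy numbers, not taken from a real run) of the quantity it computes:

import numpy as np

probs = np.array([[0.7, 0.3],    # actor softmax outputs for a toy batch of two states
                  [0.2, 0.8]])
actions = np.array([[1.0, 0.0],  # one-hot encoding of the actions actually taken
                    [0.0, 1.0]])
advantages = np.array([[1.5],    # advantage estimate for each state
                       [-0.5]])

# The one-hot mask zeroes out every log-probability except the chosen action's, so this
# is the mean of advantage * log pi(action | state), spread over batch * action entries,
# matching tf.reduce_mean over the same product.
objective = np.mean(advantages * actions * np.log(probs))
print(objective)  # about -0.106

The training op then minimises the negative of this quantity, i.e. performs gradient ascent on it.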
78 | # so need to reduce slightly to be a valid value 79 | move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-6)) 80 | return move 81 | 82 | 83 | def train(states, actions_taken, advantages): 84 | # learn that these actions in these states lead to this reward 85 | session.run(actor_train_operation, feed_dict={ 86 | actor_input_placeholder: states, 87 | actor_action_placeholder: actions_taken, 88 | actor_advantage_placeholder: advantages}) 89 | 90 | 91 | last_state = env.reset() 92 | total_reward = 0 93 | current_game_observations = [] 94 | current_game_rewards = [] 95 | current_game_actions = [] 96 | 97 | episode_observation = [] 98 | episode_rewards = [] 99 | episode_actions = [] 100 | games = 0 101 | plot_x = [] 102 | plot_y = [] 103 | 104 | critic_costs = deque(maxlen=10) 105 | 106 | 107 | while True: 108 | env.render() 109 | last_action = choose_next_action(last_state) 110 | current_state, reward, terminal, info = env.step(np.argmax(last_action)) 111 | total_reward += reward 112 | 113 | if terminal: 114 | reward = -.10 115 | else: 116 | reward = 0.1 117 | 118 | current_game_observations.append(last_state) 119 | current_game_rewards.append(reward) 120 | current_game_actions.append(last_action) 121 | 122 | if terminal: 123 | games += 1 124 | scores.append(total_reward) 125 | 126 | if games % STORE_SCORES_LEN == 0: 127 | plot_x.append(games) 128 | plot_y.append(np.mean(scores)) 129 | 130 | # get temporal difference values for critic 131 | cumulative_reward = 0 132 | for i in reversed(range(len(current_game_observations))): 133 | cumulative_reward = current_game_rewards[i] + FUTURE_REWARD_DISCOUNT * cumulative_reward 134 | current_game_rewards[i] = [cumulative_reward] 135 | 136 | values_t = session.run(critic_output_layer, { 137 | critic_input_placeholder: current_game_observations}) 138 | advantages = [] 139 | 140 | for i in range(len(current_game_observations) - 1): 141 | advantages.append([current_game_rewards[i][0] + FUTURE_REWARD_DISCOUNT*values_t[i+1][0] - values_t[i][0]]) 142 | 143 | advantages.append([current_game_rewards[-1][0]-values_t[-1][0]]) 144 | 145 | _, cost = session.run([critic_train_operation, critic_cost], { 146 | critic_input_placeholder: current_game_observations, 147 | critic_target_placeholder: current_game_rewards}) 148 | 149 | critic_costs.append(cost) 150 | 151 | print("Game: %s reward %s average scores %s critic cost %s" % 152 | (games, total_reward, 153 | np.mean(scores), np.mean(critic_costs))) 154 | 155 | episode_observation.extend(current_game_observations) 156 | episode_actions.extend(current_game_actions) 157 | episode_rewards.extend(advantages) 158 | 159 | total_reward = 0 160 | current_game_observations = [] 161 | current_game_rewards = [] 162 | current_game_actions = [] 163 | 164 | if games % GAMES_PER_TRAINING == 0: 165 | episode_rewards = np.array(episode_rewards) 166 | normalized_rewards = episode_rewards - np.mean(episode_rewards) 167 | normalized_rewards /= np.std(normalized_rewards) 168 | 169 | train(episode_observation, episode_actions, normalized_rewards) 170 | 171 | episode_observation = [] 172 | episode_actions = [] 173 | episode_rewards = [] 174 | 175 | time += 1 176 | 177 | # update the old values 178 | if terminal: 179 | last_state = env.reset() 180 | else: 181 | last_state = current_state -------------------------------------------------------------------------------- /Chapter 07/connect_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for 
running a game of connect 4 on a board_width, board_height and winning length can be specified in relevant 3 | methods. Allowing you to play connect 5, 6, 7, etc. Defaults are board_width = 7, board_height = 6, winning_length = 4 4 | 5 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 6 | where each player plays. 7 | The board is represented by a board_width x board_height tuple of ints. A 0 means no player has played in a space, 1 8 | means player one has played there, -1 means the seconds player has played there. The apply_move method can be used to 9 | return a copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 10 | """ 11 | 12 | import random 13 | 14 | 15 | def _new_board(board_width, board_height): 16 | """Return a emprty tic-tac-toe board we can use for simulating a game. 17 | 18 | Args: 19 | board_width (int): The width of the board, a board_width * board_height board is created 20 | board_height (int): The height of the board, a board_width * board_height board is created 21 | 22 | Returns: 23 | board_width x board_height tuple of ints 24 | """ 25 | return tuple(tuple(0 for _ in range(board_height)) for _ in range(board_width)) 26 | 27 | 28 | def apply_move(board_state, move_x, side): 29 | """Returns a copy of the given board_state with the desired move applied. 30 | 31 | Args: 32 | board_state (2d tuple of int): The given board_state we want to apply the move to. 33 | move_x (int): Which column we are going to "drop" our piece in 34 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 35 | 36 | Returns: 37 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 38 | """ 39 | # find position in which move will settle 40 | move_y = 0 41 | for x in board_state[move_x]: 42 | if x == 0: 43 | break 44 | else: 45 | move_y += 1 46 | 47 | def get_tuples(): 48 | for i in range(len(board_state)): 49 | if move_x == i: 50 | temp = list(board_state[i]) 51 | temp[move_y] = side 52 | yield tuple(temp) 53 | else: 54 | yield board_state[i] 55 | 56 | return tuple(get_tuples()) 57 | 58 | 59 | def available_moves(board_state): 60 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 61 | pieces played. 62 | 63 | Args: 64 | board_state: The board_state we want to check for valid moves. 65 | 66 | Returns: 67 | Generator of int: All the valid moves that can be played in this position. 68 | """ 69 | for x in range(len(board_state)): 70 | if any(y == 0 for y in board_state[x]): 71 | yield x 72 | 73 | 74 | def _has_winning_line(line, winning_length): 75 | count = 0 76 | last_side = 0 77 | for x in line: 78 | if x == last_side: 79 | count += 1 80 | if count == winning_length: 81 | return last_side 82 | else: 83 | count = 1 84 | last_side = x 85 | return 0 86 | 87 | 88 | def has_winner(board_state, winning_length=4): 89 | """Determine if a player has won on the given board_state. 90 | 91 | Args: 92 | board_state (2d tuple of int): The current board_state we want to evaluate. 93 | winning_length (int): The number of moves in a row needed for a win. 94 | 95 | Returns: 96 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 
97 | """ 98 | board_width = len(board_state) 99 | board_height = len(board_state[0]) 100 | 101 | # check rows 102 | for x in range(board_width): 103 | winner = _has_winning_line(board_state[x], winning_length) 104 | if winner != 0: 105 | return winner 106 | # check columns 107 | for y in range(board_height): 108 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 109 | if winner != 0: 110 | return winner 111 | 112 | # check diagonals 113 | diagonals_start = -(board_width - winning_length) 114 | diagonals_end = (board_width - winning_length) 115 | for d in range(diagonals_start, diagonals_end): 116 | winner = _has_winning_line( 117 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 118 | winning_length) 119 | if winner != 0: 120 | return winner 121 | for d in range(diagonals_start, diagonals_end): 122 | winner = _has_winning_line( 123 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 124 | winning_length) 125 | if winner != 0: 126 | return winner 127 | 128 | return 0 # no one has won, return 0 for a draw 129 | 130 | 131 | def play_game(plus_player_func, minus_player_func, board_width=7, board_height=6, winning_length=4, log=False): 132 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 133 | player. 134 | 135 | Args: 136 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 137 | Function that takes the current board_state and side this player is playing, and returns the move the player 138 | wants to play. 139 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 140 | Function that takes the current board_state and side this player is playing, and returns the move the player 141 | wants to play. 142 | board_width (int): The width of the board, a board_width * board_height board is created 143 | board_height (int): The height of the board, a board_width * board_height board is created 144 | winning_length (int): The number of pieces in a row needed to win a game. 145 | log (bool): If True progress is logged to console, defaults to False 146 | 147 | Returns: 148 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 149 | """ 150 | board_state = _new_board(board_width, board_height) 151 | player_turn = 1 152 | 153 | while True: 154 | _avialable_moves = list(available_moves(board_state)) 155 | if len(_avialable_moves) == 0: 156 | # draw 157 | if log: 158 | print("no moves left, game ended a draw") 159 | return 0. 160 | if player_turn > 0: 161 | move = plus_player_func(board_state, 1) 162 | else: 163 | move = minus_player_func(board_state, -1) 164 | 165 | if move not in _avialable_moves: 166 | # if a player makes an invalid move the other player wins 167 | if log: 168 | print("illegal move ", move) 169 | return -player_turn 170 | 171 | board_state = apply_move(board_state, move, player_turn) 172 | if log: 173 | print(board_state) 174 | 175 | winner = has_winner(board_state, winning_length) 176 | if winner != 0: 177 | if log: 178 | print("we have a winner, side: %s" % player_turn) 179 | return winner 180 | player_turn = -player_turn 181 | 182 | 183 | def random_player(board_state, _): 184 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 185 | valid moves in the current state. 
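As a quick standalone sketch of the drop mechanics and win detection defined above (assuming this module is importable as connect_4):

from connect_4 import _new_board, apply_move, has_winner

board = _new_board(7, 6)
for _ in range(4):        # player one drops four pieces into column 3
    board = apply_move(board, 3, 1)

print(board[3])                              # (1, 1, 1, 1, 0, 0) - pieces stack from index 0 upwards
print(has_winner(board, winning_length=4))   # 1 - four in a vertical line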
186 | 187 | Args: 188 | board_state (2d tuple of int): The current state of the board 189 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 190 | 191 | Returns: 192 | (int, int): the move we want to play on the current board 193 | """ 194 | moves = list(available_moves(board_state)) 195 | return random.choice(moves) 196 | 197 | 198 | if __name__ == '__main__': 199 | # example of playing a game 200 | play_game(random_player, random_player, log=True, board_width=7, board_height=6, winning_length=4) 201 | -------------------------------------------------------------------------------- /Chapter 07/tic_tac_toe_x.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a board of any size with a specified number in a row for the win. This is 3 | similar to tic_tac_toe.py but all relevent moves are paramiterized by board_size arg that sets how big the board is and 4 | winning_length which determines how many in a row are needed to win. Defaults are 5 and 4. This allows you to play games 5 | in a more complex environment than standard tic-tac-toe. 6 | 7 | Two players take turns making moves on squares of the board, the first to get winning_length in a row, including 8 | diagonals, wins. If there are no valid moves left to make the game ends a draw. 9 | 10 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 11 | where each player plays. 12 | The board is represented by a board_size x board_size tuple of ints. A 0 means no player has played in a space, 1 means 13 | player one has played there, -1 means the seconds player has played there. The apply_move method can be used to return a 14 | copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 15 | """ 16 | import random 17 | import itertools 18 | 19 | 20 | def _new_board(board_size): 21 | """Return a emprty tic-tac-toe board we can use for simulating a game. 22 | 23 | Args: 24 | board_size (int): The size of one side of the board, a board_size * board_size board is created 25 | 26 | Returns: 27 | board_size x board_size tuple of ints 28 | """ 29 | return tuple(tuple(0 for _ in range(board_size)) for _ in range(board_size)) 30 | 31 | 32 | def apply_move(board_state, move, side): 33 | """Returns a copy of the given board_state with the desired move applied. 34 | 35 | Args: 36 | board_state (2d tuple of int): The given board_state we want to apply the move to. 37 | move (int, int): The position we want to make the move in. 38 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 39 | 40 | Returns: 41 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 42 | """ 43 | move_x, move_y = move 44 | 45 | def get_tuples(): 46 | for x in range(len(board_state)): 47 | if move_x == x: 48 | temp = list(board_state[x]) 49 | temp[move_y] = side 50 | yield tuple(temp) 51 | else: 52 | yield board_state[x] 53 | 54 | return tuple(get_tuples()) 55 | 56 | 57 | def available_moves(board_state): 58 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 59 | pieces played. 60 | 61 | Args: 62 | board_state: The board_state we want to check for valid moves. 
63 | 64 | Returns: 65 | Generator of (int, int): All the valid moves that can be played in this position. 66 | """ 67 | for x, y in itertools.product(range(len(board_state)), range(len(board_state[0]))): 68 | if board_state[x][y] == 0: 69 | yield (x, y) 70 | 71 | 72 | def _has_winning_line(line, winning_length): 73 | count = 0 74 | last_side = 0 75 | for x in line: 76 | if x == last_side: 77 | count += 1 78 | if count == winning_length: 79 | return last_side 80 | else: 81 | count = 1 82 | last_side = x 83 | return 0 84 | 85 | 86 | def has_winner(board_state, winning_length): 87 | """Determine if a player has won on the given board_state. 88 | 89 | Args: 90 | board_state (2d tuple of int): The current board_state we want to evaluate. 91 | winning_length (int): The number of moves in a row needed for a win. 92 | 93 | Returns: 94 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 95 | """ 96 | board_width = len(board_state) 97 | board_height = len(board_state[0]) 98 | 99 | # check rows 100 | for x in range(board_width): 101 | winner = _has_winning_line(board_state[x], winning_length) 102 | if winner != 0: 103 | return winner 104 | # check columns 105 | for y in range(board_height): 106 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 107 | if winner != 0: 108 | return winner 109 | 110 | # check diagonals 111 | diagonals_start = -(board_width - winning_length) 112 | diagonals_end = (board_width - winning_length) 113 | for d in range(diagonals_start, diagonals_end+1): 114 | winner = _has_winning_line( 115 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 116 | winning_length) 117 | if winner != 0: 118 | return winner 119 | for d in range(diagonals_start, diagonals_end+1): 120 | winner = _has_winning_line( 121 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 122 | winning_length) 123 | if winner != 0: 124 | return winner 125 | 126 | return 0 # no one has won, return 0 for a draw 127 | 128 | 129 | def play_game(plus_player_func, minus_player_func, board_size=5, winning_length=4, log=False): 130 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 131 | player. 132 | 133 | Args: 134 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 135 | Function that takes the current board_state and side this player is playing, and returns the move the player 136 | wants to play. 137 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 138 | Function that takes the current board_state and side this player is playing, and returns the move the player 139 | wants to play. 140 | board_size (int): The size of a single side of the board. Game is played on a board_size*board_size sized board 141 | winning_length (int): The number of pieces in a row needed to win a game. 142 | log (bool): If True progress is logged to console, defaults to False 143 | 144 | Returns: 145 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 146 | """ 147 | board_state = _new_board(board_size) 148 | player_turn = 1 149 | 150 | while True: 151 | _available_moves = list(available_moves(board_state)) 152 | if len(_available_moves) == 0: 153 | # draw 154 | if log: 155 | print("no moves left, game ended a draw") 156 | return 0. 
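A standalone sketch of the generalised win detection above on a 5x5 board (assuming this module is importable as tic_tac_toe_x):

from tic_tac_toe_x import _new_board, apply_move, has_winner

board = _new_board(5)
for i in range(4):        # four in a row along the main diagonal
    board = apply_move(board, (i, i), 1)

print(has_winner(board, winning_length=4))   # 1 - player one wins
print(has_winner(board, winning_length=5))   # 0 - five in a row would be needed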
157 | if player_turn > 0: 158 | move = plus_player_func(board_state, 1) 159 | else: 160 | move = minus_player_func(board_state, -1) 161 | 162 | if move not in _available_moves: 163 | # if a player makes an invalid move the other player wins 164 | if log: 165 | print("illegal move ", move) 166 | return -player_turn 167 | 168 | board_state = apply_move(board_state, move, player_turn) 169 | print(board_state) 170 | 171 | winner = has_winner(board_state, winning_length) 172 | if winner != 0: 173 | if log: 174 | print("we have a winner, side: %s" % player_turn) 175 | return winner 176 | player_turn = -player_turn 177 | 178 | 179 | def random_player(board_state, _): 180 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 181 | valid moves in the current state. 182 | 183 | Args: 184 | board_state (2d tuple of int): The current state of the board 185 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 186 | 187 | Returns: 188 | (int, int): the move we want to play on the current board 189 | """ 190 | moves = list(available_moves(board_state)) 191 | return random.choice(moves) 192 | 193 | 194 | if __name__ == '__main__': 195 | # example of playing a game 196 | play_game(random_player, random_player, log=True, board_size=10, winning_length=4) 197 | -------------------------------------------------------------------------------- /Chapter 08/deep_q_breakout.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import pickle 3 | import random 4 | from collections import deque 5 | 6 | import tensorflow as tf 7 | import gym 8 | import numpy as np 9 | import os 10 | 11 | import zlib 12 | 13 | resume = True 14 | CHECKPOINT_PATH = 'deep_q_breakout_path' 15 | ACTIONS_COUNT = 3 16 | SCREEN_WIDTH, SCREEN_HEIGHT = (72, 84) 17 | FUTURE_REWARD_DISCOUNT = 0.99 18 | OBSERVATION_STEPS = 100000. # time steps to observe before training 19 | EXPLORE_STEPS = 2000000. 
# frames over which to anneal epsilon 20 | INITIAL_RANDOM_ACTION_PROB = 1.0 # starting chance of an action being random 21 | FINAL_RANDOM_ACTION_PROB = 0.05 # final chance of an action being random 22 | MEMORY_SIZE = 800000 # number of observations to remember 23 | MINI_BATCH_SIZE = 128 # size of mini batches 24 | STATE_FRAMES = 2 # number of frames to store in the state 25 | OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5) 26 | SAVE_EVERY_X_STEPS = 20000 27 | LEARN_RATE = 1e-4 28 | STORE_SCORES_LEN = 100 29 | verbose_logging = True 30 | 31 | 32 | def _create_network(): 33 | CONVOLUTIONS_LAYER_1 = 32 34 | CONVOLUTIONS_LAYER_2 = 64 35 | CONVOLUTIONS_LAYER_3 = 64 36 | FLAT_SIZE = 11*9*CONVOLUTIONS_LAYER_3 37 | FLAT_HIDDEN_NODES = 512 38 | 39 | # network weights 40 | convolution_weights_1 = tf.Variable(tf.truncated_normal([8, 8, STATE_FRAMES, CONVOLUTIONS_LAYER_1], stddev=0.01)) 41 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_1])) 42 | 43 | convolution_weights_2 = tf.Variable(tf.truncated_normal([4, 4, CONVOLUTIONS_LAYER_1, CONVOLUTIONS_LAYER_2], stddev=0.01)) 44 | convolution_bias_2 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 45 | 46 | convolution_weights_3 = tf.Variable(tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_2, CONVOLUTIONS_LAYER_3], stddev=0.01)) 47 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 48 | 49 | feed_forward_weights_1 = tf.Variable(tf.truncated_normal([FLAT_SIZE, FLAT_HIDDEN_NODES], stddev=0.01)) 50 | feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[FLAT_HIDDEN_NODES])) 51 | 52 | feed_forward_weights_2 = tf.Variable(tf.truncated_normal([FLAT_HIDDEN_NODES, ACTIONS_COUNT], stddev=0.01)) 53 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[ACTIONS_COUNT])) 54 | 55 | input_layer = tf.placeholder("float", [None, SCREEN_HEIGHT, SCREEN_WIDTH, 56 | STATE_FRAMES]) 57 | 58 | hidden_convolutional_layer_1 = tf.nn.relu( 59 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 4, 4, 1], padding="SAME") + convolution_bias_1) 60 | 61 | hidden_convolutional_layer_2 = tf.nn.relu( 62 | tf.nn.conv2d(hidden_convolutional_layer_1, convolution_weights_2, strides=[1, 2, 2, 1], 63 | padding="SAME") + convolution_bias_2) 64 | 65 | hidden_convolutional_layer_3 = tf.nn.relu( 66 | tf.nn.conv2d(hidden_convolutional_layer_2, convolution_weights_3, strides=[1, 1, 1, 1], 67 | padding="SAME") + convolution_bias_3) 68 | 69 | hidden_convolutional_layer_3_flat = tf.reshape(hidden_convolutional_layer_3, [-1, FLAT_SIZE]) 70 | 71 | final_hidden_activations = tf.nn.relu( 72 | tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 73 | 74 | output_layer = tf.matmul(final_hidden_activations, feed_forward_weights_2) + feed_forward_bias_2 75 | 76 | return input_layer, output_layer 77 | 78 | 79 | _session = tf.Session() 80 | _input_layer, _output_layer = _create_network() 81 | 82 | _action = tf.placeholder("float", [None, ACTIONS_COUNT]) 83 | _target = tf.placeholder("float", [None]) 84 | 85 | readout_action = tf.reduce_sum(tf.mul(_output_layer, _action), reduction_indices=1) 86 | 87 | cost = tf.reduce_mean(tf.square(_target - readout_action)) 88 | _train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost) 89 | 90 | _observations = deque(maxlen=MEMORY_SIZE) 91 | _last_scores = deque(maxlen=STORE_SCORES_LEN) 92 | 93 | # set the first action to do nothing 94 | _last_action = 
np.zeros(ACTIONS_COUNT) 95 | _last_action[1] = 1 96 | 97 | _last_state = None 98 | _probability_of_random_action = INITIAL_RANDOM_ACTION_PROB 99 | _time = 0 100 | 101 | _session.run(tf.initialize_all_variables()) 102 | 103 | saver = tf.train.Saver() 104 | 105 | if not os.path.exists(CHECKPOINT_PATH): 106 | os.mkdir(CHECKPOINT_PATH) 107 | 108 | if resume: 109 | checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_PATH) 110 | if checkpoint: 111 | saver.restore(_session, checkpoint.model_checkpoint_path) 112 | 113 | 114 | def _choose_next_action(state): 115 | new_action = np.zeros([ACTIONS_COUNT]) 116 | 117 | if random.random() <= _probability_of_random_action: 118 | # choose an action randomly 119 | action_index = random.randrange(ACTIONS_COUNT) 120 | else: 121 | # choose an action given our last state 122 | readout_t = _session.run(_output_layer, feed_dict={_input_layer: [state]})[0] 123 | if verbose_logging: 124 | print("Action Q-Values are %s" % readout_t) 125 | action_index = np.argmax(readout_t) 126 | 127 | new_action[action_index] = 1 128 | return new_action 129 | 130 | 131 | def pre_process(screen_image): 132 | """ change the 210x160x3 uint8 frame into 84x72 float """ 133 | screen_image = screen_image[32:-10, 8:-8] # crop 134 | screen_image = screen_image[::2, ::2, 0] # downsample by factor of 2 135 | screen_image[screen_image != 0] = 1 # set everything is either black:0 or white:1 136 | return screen_image.astype(np.float) 137 | 138 | 139 | def _key_presses_from_action(action_set): 140 | if action_set[0] == 1: 141 | return 1 142 | elif action_set[1] == 1: 143 | return 2 144 | elif action_set[2] == 1: 145 | return 3 146 | raise Exception("Unexpected action") 147 | 148 | 149 | def _train(): 150 | # sample a mini_batch to train on 151 | mini_batch_compressed = random.sample(_observations, MINI_BATCH_SIZE) 152 | mini_batch = [pickle.loads(zlib.decompress(comp_item)) for comp_item in mini_batch_compressed] 153 | 154 | # get the batch variables 155 | previous_states = [d[OBS_LAST_STATE_INDEX] for d in mini_batch] 156 | actions = [d[OBS_ACTION_INDEX] for d in mini_batch] 157 | rewards = [d[OBS_REWARD_INDEX] for d in mini_batch] 158 | current_states = [d[OBS_CURRENT_STATE_INDEX] for d in mini_batch] 159 | agents_expected_reward = [] 160 | # this gives us the agents expected reward for each action we might take 161 | agents_reward_per_action = _session.run(_output_layer, feed_dict={_input_layer: current_states}) 162 | for i in range(len(mini_batch)): 163 | if mini_batch[i][OBS_TERMINAL_INDEX]: 164 | # this was a terminal frame so there is no future reward... 
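# As a worked illustration of the target being built here (numbers made up): with
# FUTURE_REWARD_DISCOUNT = 0.99, a non-terminal transition with reward 1.0 whose next
# state has predicted Q-values [0.2, 1.5, 0.7] gets the target
# 1.0 + 0.99 * max(0.2, 1.5, 0.7) = 1.0 + 0.99 * 1.5 = 2.485,
# while a terminal transition with reward -1.0 keeps just its reward of -1.0.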
165 | agents_expected_reward.append(rewards[i]) 166 | else: 167 | agents_expected_reward.append( 168 | rewards[i] + FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i])) 169 | 170 | # learn that these actions in these states lead to this reward 171 | _session.run(_train_operation, feed_dict={ 172 | _input_layer: previous_states, 173 | _action: actions, 174 | _target: agents_expected_reward}) 175 | 176 | # save checkpoints for later 177 | if _time % SAVE_EVERY_X_STEPS == 0: 178 | saver.save(_session, CHECKPOINT_PATH + '/network', global_step=_time) 179 | 180 | 181 | env = gym.make("Breakout-v0") 182 | observation = env.reset() 183 | reward = 0 184 | score_pre_game = 0 185 | 186 | while True: 187 | env.render() 188 | 189 | observation, reward, terminal, info = env.step(_key_presses_from_action(_last_action)) 190 | score_pre_game += reward 191 | 192 | screen_binary = pre_process(observation) 193 | 194 | # first frame must be handled differently 195 | if _last_state is None: 196 | # the _last_state will contain the image data from the last self.STATE_FRAMES frames 197 | _last_state = np.stack(tuple(screen_binary for _ in range(STATE_FRAMES)), axis=2) 198 | else: 199 | screen_binary = np.reshape(screen_binary, 200 | (SCREEN_HEIGHT, SCREEN_WIDTH, 1)) 201 | current_state = np.append(_last_state[:, :, 1:], screen_binary, axis=2) 202 | 203 | _observations.append( 204 | zlib.compress(pickle.dumps((_last_state, _last_action, reward, current_state, terminal), 2), 2)) 205 | 206 | # only train if done observing 207 | if len(_observations) > OBSERVATION_STEPS: 208 | _train() 209 | _time += 1 210 | 211 | if terminal: 212 | _last_scores.append(score_pre_game) 213 | score_pre_game = 0 214 | env.reset() 215 | _last_state = None 216 | else: 217 | # update the old values 218 | _last_state = current_state 219 | _last_action = _choose_next_action(_last_state) 220 | 221 | # gradually reduce the probability of a random action 222 | if _probability_of_random_action > FINAL_RANDOM_ACTION_PROB \ 223 | and len(_observations) > OBSERVATION_STEPS: 224 | _probability_of_random_action -= \ 225 | (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS 226 | 227 | print("Time: %s random_action_prob: %s reward %s scores differential %s" % 228 | (_time, _probability_of_random_action, reward, 229 | np.mean(_last_scores))) 230 | -------------------------------------------------------------------------------- /Chapter 08/deep_q_pong.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import random 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | import os 9 | 10 | resume = True 11 | CHECKPOINT_PATH = 'deep_q_pong' 12 | ACTIONS_COUNT = 3 13 | SCREEN_WIDTH, SCREEN_HEIGHT = (80, 80) 14 | FUTURE_REWARD_DISCOUNT = 0.99 15 | OBSERVATION_STEPS = 50000. # time steps to observe before training 16 | EXPLORE_STEPS = 2000000. 
# frames over which to anneal epsilon 17 | INITIAL_RANDOM_ACTION_PROB = 1.0 # starting chance of an action being random 18 | FINAL_RANDOM_ACTION_PROB = 0.05 # final chance of an action being random 19 | MEMORY_SIZE = 100000 # number of observations to remember 20 | MINI_BATCH_SIZE = 100 # size of mini batches 21 | STATE_FRAMES = 2 # number of frames to store in the state 22 | OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5) 23 | SAVE_EVERY_X_STEPS = 10000 24 | LEARN_RATE = 1e-6 25 | STORE_SCORES_LEN = 1000. 26 | verbose_logging = True 27 | 28 | 29 | def _create_network(): 30 | # network weights 31 | convolution_weights_1 = tf.Variable(tf.truncated_normal([8, 8, STATE_FRAMES, 32], stddev=0.01)) 32 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[32])) 33 | 34 | convolution_weights_2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.01)) 35 | convolution_bias_2 = tf.Variable(tf.constant(0.01, shape=[64])) 36 | 37 | convolution_weights_3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.01)) 38 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[64])) 39 | 40 | feed_forward_weights_1 = tf.Variable(tf.truncated_normal([256, 256], stddev=0.01)) 41 | feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[256])) 42 | 43 | feed_forward_weights_2 = tf.Variable(tf.truncated_normal([256, ACTIONS_COUNT], stddev=0.01)) 44 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[ACTIONS_COUNT])) 45 | 46 | input_layer = tf.placeholder("float", [None, SCREEN_WIDTH, SCREEN_HEIGHT, 47 | STATE_FRAMES]) 48 | 49 | hidden_convolutional_layer_1 = tf.nn.relu( 50 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 4, 4, 1], padding="SAME") + convolution_bias_1) 51 | 52 | hidden_max_pooling_layer_1 = tf.nn.max_pool(hidden_convolutional_layer_1, ksize=[1, 2, 2, 1], 53 | strides=[1, 2, 2, 1], padding="SAME") 54 | 55 | hidden_convolutional_layer_2 = tf.nn.relu( 56 | tf.nn.conv2d(hidden_max_pooling_layer_1, convolution_weights_2, strides=[1, 2, 2, 1], 57 | padding="SAME") + convolution_bias_2) 58 | 59 | hidden_max_pooling_layer_2 = tf.nn.max_pool(hidden_convolutional_layer_2, ksize=[1, 2, 2, 1], 60 | strides=[1, 2, 2, 1], padding="SAME") 61 | 62 | hidden_convolutional_layer_3 = tf.nn.relu( 63 | tf.nn.conv2d(hidden_max_pooling_layer_2, convolution_weights_3, 64 | strides=[1, 1, 1, 1], padding="SAME") + convolution_bias_3) 65 | 66 | hidden_max_pooling_layer_3 = tf.nn.max_pool(hidden_convolutional_layer_3, ksize=[1, 2, 2, 1], 67 | strides=[1, 2, 2, 1], padding="SAME") 68 | 69 | hidden_convolutional_layer_3_flat = tf.reshape(hidden_max_pooling_layer_3, [-1, 256]) 70 | 71 | final_hidden_activations = tf.nn.relu( 72 | tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 73 | 74 | output_layer = tf.matmul(final_hidden_activations, feed_forward_weights_2) + feed_forward_bias_2 75 | 76 | return input_layer, output_layer 77 | 78 | 79 | _session = tf.Session() 80 | _input_layer, _output_layer = _create_network() 81 | 82 | _action = tf.placeholder("float", [None, ACTIONS_COUNT]) 83 | _target = tf.placeholder("float", [None]) 84 | 85 | readout_action = tf.reduce_sum(tf.mul(_output_layer, _action), reduction_indices=1) 86 | 87 | cost = tf.reduce_mean(tf.square(_target - readout_action)) 88 | _train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost) 89 | 90 | _observations = deque() 91 | _last_scores = deque() 92 | 93 | # set the first action to do nothing 94 | 
_last_action = np.zeros(ACTIONS_COUNT) 95 | _last_action[1] = 1 96 | 97 | _last_state = None 98 | _probability_of_random_action = INITIAL_RANDOM_ACTION_PROB 99 | _time = 0 100 | 101 | _session.run(tf.initialize_all_variables()) 102 | 103 | saver = tf.train.Saver() 104 | 105 | if not os.path.exists(CHECKPOINT_PATH): 106 | os.mkdir(CHECKPOINT_PATH) 107 | 108 | if resume: 109 | checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_PATH) 110 | if checkpoint: 111 | saver.restore(_session, checkpoint.model_checkpoint_path) 112 | 113 | 114 | def _choose_next_action(): 115 | new_action = np.zeros([ACTIONS_COUNT]) 116 | 117 | if random.random() <= _probability_of_random_action: 118 | # choose an action randomly 119 | action_index = random.randrange(ACTIONS_COUNT) 120 | else: 121 | # choose an action given our last state 122 | readout_t = _session.run(_output_layer, feed_dict={_input_layer: [_last_state]})[0] 123 | if verbose_logging: 124 | print("Action Q-Values are %s" % readout_t) 125 | action_index = np.argmax(readout_t) 126 | 127 | new_action[action_index] = 1 128 | return new_action 129 | 130 | 131 | def pre_process(screen_image): 132 | """ change the 210x160x3 uint8 frame into 6400 (80x80) float """ 133 | screen_image = screen_image[35:195] # crop 134 | screen_image = screen_image[::2, ::2, 0] # downsample by factor of 2 135 | screen_image[screen_image == 144] = 0 # erase background (background type 1) 136 | screen_image[screen_image == 109] = 0 # erase background (background type 2) 137 | screen_image[screen_image != 0] = 1 # everything else (paddles, ball) just set to 1 138 | return screen_image.astype(np.float) 139 | 140 | 141 | def _key_presses_from_action(action_set): 142 | # 1 = still 143 | # 2 = up 144 | # 3 = down 145 | 146 | if action_set[0] == 1: 147 | return 1 148 | elif action_set[1] == 1: 149 | return 2 150 | elif action_set[2] == 1: 151 | return 3 152 | raise Exception("Unexpected action") 153 | 154 | 155 | def _train(): 156 | # sample a mini_batch to train on 157 | mini_batch = random.sample(_observations, MINI_BATCH_SIZE) 158 | # get the batch variables 159 | previous_states = [d[OBS_LAST_STATE_INDEX] for d in mini_batch] 160 | actions = [d[OBS_ACTION_INDEX] for d in mini_batch] 161 | rewards = [d[OBS_REWARD_INDEX] for d in mini_batch] 162 | current_states = [d[OBS_CURRENT_STATE_INDEX] for d in mini_batch] 163 | agents_expected_reward = [] 164 | # this gives us the agents expected reward for each action we might take 165 | agents_reward_per_action = _session.run(_output_layer, feed_dict={_input_layer: current_states}) 166 | for i in range(len(mini_batch)): 167 | if mini_batch[i][OBS_TERMINAL_INDEX]: 168 | # this was a terminal frame so there is no future reward... 
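# In Pong the environment only returns a non-zero reward (+1 or -1) when a point is
# scored, and the game loop further down marks exactly those frames as terminal, so
# their target is just that +/-1 reward; all other frames have reward 0 and get the
# target 0 + FUTURE_REWARD_DISCOUNT * max Q(next state).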
169 | agents_expected_reward.append(rewards[i]) 170 | else: 171 | agents_expected_reward.append( 172 | rewards[i] + FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i])) 173 | 174 | # learn that these actions in these states lead to this reward 175 | _session.run(_train_operation, feed_dict={ 176 | _input_layer: previous_states, 177 | _action: actions, 178 | _target: agents_expected_reward}) 179 | 180 | # save checkpoints for later 181 | if _time % SAVE_EVERY_X_STEPS == 0: 182 | saver.save(_session, CHECKPOINT_PATH + '/network', global_step=_time) 183 | 184 | env = gym.make("Pong-v0") 185 | observation = env.reset() 186 | next_action = 1 187 | 188 | while True: 189 | env.render() 190 | 191 | observation, reward, done, info = env.step(next_action) 192 | 193 | if done: 194 | env.reset() 195 | 196 | terminal = False 197 | 198 | screen_binary = pre_process(observation) 199 | 200 | if reward != 0.0: 201 | terminal = True 202 | _last_scores.append(reward) 203 | if len(_last_scores) > STORE_SCORES_LEN: 204 | _last_scores.popleft() 205 | 206 | # first frame must be handled differently 207 | if _last_state is None: 208 | # the _last_state will contain the image data from the last self.STATE_FRAMES frames 209 | _last_state = np.stack(tuple(screen_binary for _ in range(STATE_FRAMES)), axis=2) 210 | next_action = _key_presses_from_action(_last_action) 211 | else: 212 | screen_binary = np.reshape(screen_binary, 213 | (SCREEN_WIDTH, SCREEN_HEIGHT, 1)) 214 | current_state = np.append(_last_state[:, :, 1:], screen_binary, axis=2) 215 | 216 | # store the transition in previous_observations 217 | _observations.append((_last_state, _last_action, reward, current_state, terminal)) 218 | 219 | if len(_observations) > MEMORY_SIZE: 220 | _observations.popleft() 221 | 222 | # only train if done observing 223 | if len(_observations) > OBSERVATION_STEPS: 224 | _train() 225 | _time += 1 226 | 227 | # update the old values 228 | _last_state = current_state 229 | 230 | _last_action = _choose_next_action() 231 | 232 | # gradually reduce the probability of a random action 233 | if _probability_of_random_action > FINAL_RANDOM_ACTION_PROB \ 234 | and len(_observations) > OBSERVATION_STEPS: 235 | _probability_of_random_action -= \ 236 | (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS 237 | 238 | print("Time: %s random_action_prob: %s reward %s scores differential %s" % 239 | (_time, _probability_of_random_action, reward, 240 | sum(_last_scores) / STORE_SCORES_LEN)) 241 | 242 | next_action = _key_presses_from_action(_last_action) -------------------------------------------------------------------------------- /Chapter 06/language model/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, division 3 | 4 | import time 5 | import codecs 6 | import locale 7 | import sys 8 | import numpy as np 9 | import tensorflow as tf 10 | import data_reader 11 | 12 | class Model(object): 13 | """RNN language model.""" 14 | def __init__(self, batch_size, sequence_length, lstm_sizes, dropout, 15 | labels, save_path): 16 | self.batch_size = batch_size 17 | self.sequence_length = sequence_length 18 | self.lstm_sizes = lstm_sizes 19 | self.labels = labels 20 | self.label_map = {val: idx for idx, val in enumerate(labels)} 21 | self.number_of_characters = len(labels) 22 | self.save_path = save_path 23 | self.dropout = dropout 24 | 25 | def init_graph(self): 26 | # Variable sequence length 27 | self.inputs = 
tf.placeholder( 28 | tf.int32, [self.batch_size, self.sequence_length]) 29 | self.targets = tf.placeholder( 30 | tf.int32, [self.batch_size, self.sequence_length]) 31 | self.init_architecture() 32 | self.saver = tf.train.Saver(tf.trainable_variables()) 33 | 34 | def init_architecture(self): 35 | # Define a multilayer LSTM cell 36 | self.one_hot_inputs = tf.one_hot( 37 | self.inputs, depth=self.number_of_characters) 38 | cell_list = [tf.nn.rnn_cell.LSTMCell(lstm_size, state_is_tuple=True) 39 | for lstm_size in self.lstm_sizes] 40 | self.multi_cell_lstm = tf.nn.rnn_cell.MultiRNNCell( 41 | cell_list, state_is_tuple=True) 42 | # Initial state of the LSTM memory. 43 | # Keep state in graph memory to use between batches 44 | self.initial_state = self.multi_cell_lstm.zero_state( 45 | self.batch_size, tf.float32) 46 | # Convert to variables so that the state can be stored between batches 47 | # Note that LSTM states is a tuple of tensors, this structure has to be 48 | # re-created in order to use as LSTM state. 49 | self.state_variables = tf.python.util.nest.pack_sequence_as( 50 | self.initial_state, 51 | [tf.Variable(var, trainable=False) 52 | for var in tf.python.util.nest.flatten(self.initial_state)]) 53 | # Define the rnn through time 54 | lstm_output, final_state = tf.nn.dynamic_rnn( 55 | cell=self.multi_cell_lstm, inputs=self.one_hot_inputs, 56 | initial_state=self.state_variables) 57 | # Force the initial state to be set to the new state for the next batch 58 | # before returning the output 59 | store_states = [ 60 | state_variable.assign(new_state) 61 | for (state_variable, new_state) in zip( 62 | tf.python.util.nest.flatten(self.state_variables), 63 | tf.python.util.nest.flatten(final_state))] 64 | with tf.control_dependencies(store_states): 65 | lstm_output = tf.identity(lstm_output) 66 | # Reshape so that we can apply the linear transformation to all outputs 67 | output_flat = tf.reshape(lstm_output, (-1, self.lstm_sizes[-1])) 68 | # Define output layer 69 | self.logit_weights = tf.Variable( 70 | tf.truncated_normal( 71 | (self.lstm_sizes[-1], self.number_of_characters), stddev=0.01), 72 | name='logit_weights') 73 | self.logit_bias = tf.Variable( 74 | tf.zeros((self.number_of_characters)), name='logit_bias') 75 | # Apply last layer transformation 76 | self.logits_flat = tf.matmul( 77 | output_flat, self.logit_weights) + self.logit_bias 78 | probabilities_flat = tf.nn.softmax(self.logits_flat) 79 | self.probabilities = tf.reshape( 80 | probabilities_flat, 81 | (self.batch_size, -1, self.number_of_characters)) 82 | 83 | def init_train_op(self, optimizer): 84 | # Flatten the targets to be compatible with the flattened logits 85 | targets_flat = tf.reshape(self.targets, (-1, )) 86 | # Get the loss over all outputs 87 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 88 | self.logits_flat, targets_flat, name='x_entropy') 89 | self.loss = tf.reduce_mean(loss) 90 | trainable_variables = tf.trainable_variables() 91 | gradients = tf.gradients(loss, trainable_variables) 92 | gradients, _ = tf.clip_by_global_norm( 93 | gradients, 5) 94 | self.train_op = optimizer.apply_gradients( 95 | zip(gradients, trainable_variables)) 96 | 97 | def sample(self, session, prime_string, sample_length): 98 | self.reset_state(session) 99 | # Prime state 100 | print('prime_string: ', prime_string) 101 | for character in prime_string: 102 | character_idx = self.label_map[character] 103 | out = session.run( 104 | self.probabilities, 105 | feed_dict={self.inputs: np.asarray([[character_idx]])}) 106 | sample_label 
= np.random.choice( 107 | self.labels, size=(1), p=out[0, 0]) 108 | output_sample = prime_string 109 | print('start sampling') 110 | # Sample for sample_length steps 111 | for _ in range(sample_length): 112 | sample_label = np.random.choice( 113 | self.labels, size=(1), p=out[0, 0])[0] 114 | output_sample += sample_label 115 | sample_idx = self.label_map[sample_label] 116 | out = session.run( 117 | self.probabilities, 118 | feed_dict={self.inputs: np.asarray([[sample_idx]])}) 119 | return output_sample 120 | 121 | def reset_state(self, session): 122 | for state in tf.python.util.nest.flatten(self.state_variables): 123 | session.run(state.initializer) 124 | 125 | def save(self, sess): 126 | self.saver.save(sess, self.save_path) 127 | 128 | def restore(self, sess): 129 | self.saver.restore(sess, self.save_path) 130 | 131 | 132 | def train_and_sample(minibatch_iterations, restore): 133 | tf.reset_default_graph() 134 | batch_size = 64 135 | lstm_sizes = [512, 512] 136 | batch_len = 100 137 | learning_rate = 2e-3 138 | 139 | filepath = './wap.txt' 140 | 141 | data_feed = data_reader.DataReader( 142 | filepath, batch_len, batch_size) 143 | labels = data_feed.char_list 144 | print('labels: ', labels) 145 | 146 | save_path = './model.tf' 147 | model = Model( 148 | batch_size, batch_len, lstm_sizes, 0.8, labels, 149 | save_path) 150 | model.init_graph() 151 | optimizer = tf.train.AdamOptimizer(learning_rate) 152 | model.init_train_op(optimizer) 153 | 154 | init_op = tf.initialize_all_variables() 155 | with tf.Session() as sess: 156 | sess.run(init_op) 157 | if restore: 158 | print('Restoring model') 159 | model.restore(sess) 160 | model.reset_state(sess) 161 | start_time = time.time() 162 | for i in range(minibatch_iterations): 163 | input_batch, target_batch = next(iter(data_feed)) 164 | loss, _ = sess.run( 165 | [model.loss, model.train_op], 166 | feed_dict={ 167 | model.inputs: input_batch, model.targets: target_batch}) 168 | if i % 50 == 0 and i != 0: 169 | print('i: ', i) 170 | duration = time.time() - start_time 171 | print('loss: {} ({} sec.)'.format(loss, duration)) 172 | start_time = time.time() 173 | if i % 1000 == 0 and i != 0: 174 | model.save(sess) 175 | if i % 100 == 0 and i != 0: 176 | print('Reset initial state') 177 | model.reset_state(sess) 178 | if i % 1000 == 0 and i != 0: 179 | print('Reset minibatch feeder') 180 | data_feed.reset_indices() 181 | model.save(sess) 182 | 183 | print('\n sampling after {} iterations'.format(minibatch_iterations)) 184 | tf.reset_default_graph() 185 | model = Model( 186 | 1, None, lstm_sizes, 1.0, labels, save_path) 187 | model.init_graph() 188 | init_op = tf.initialize_all_variables() 189 | with tf.Session() as sess: 190 | sess.run(init_op) 191 | model.restore(sess) 192 | print('\nSample 1:') 193 | sample = model.sample( 194 | sess, prime_string=u'\n\nThis feeling was ', sample_length=500) 195 | print(u'sample: \n{}'.format(sample)) 196 | print('\nSample 2:') 197 | sample = model.sample( 198 | sess, prime_string=u'She was born in the year ', sample_length=500) 199 | print(u'sample: \n{}'.format(sample)) 200 | print('\nSample 3:') 201 | sample = model.sample( 202 | sess, prime_string=u'The meaning of this all is ', 203 | sample_length=500) 204 | print(u'sample: \n{}'.format(sample)) 205 | print('\nSample 4:') 206 | sample = model.sample( 207 | sess, 208 | prime_string=u'In the midst of a conversation on political matters Anna Pávlovna burst out:,', 209 | sample_length=500) 210 | print(u'sample: \n{}'.format(sample)) 211 | print('\nSample 5:') 212 | 
sample = model.sample( 213 | sess, prime_string=u'\n\nCHAPTER X\n\n', 214 | sample_length=500) 215 | print(u'sample: \n{}'.format(sample)) 216 | print('\nSample 6:') 217 | sample = model.sample( 218 | sess, prime_string=u'"If only you knew,"', 219 | sample_length=500) 220 | print(u'sample: \n{}'.format(sample)) 221 | 222 | 223 | def main(): 224 | total_iterations = 500 225 | print('\n\n\nTrain for {}'.format(total_iterations)) 226 | print('Total iters: {}'.format(total_iterations)) 227 | train_and_sample(total_iterations, restore=False) 228 | for i in [500, 1000, 3000, 5000, 10000, 30000, 50000, 100000, 300000]: 229 | total_iterations += i 230 | print('\n\n\nTrain for {}'.format(i)) 231 | print('Total iters: {}'.format(total_iterations)) 232 | train_and_sample(i, restore=True) 233 | 234 | 235 | if __name__ == "__main__": 236 | main() 237 | --------------------------------------------------------------------------------
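For reference, the heart of Model.sample in model.py above is a feed-back loop: the softmax output over characters is sampled with np.random.choice and the sampled character becomes the next input. A minimal numpy-only sketch of that loop, with a made-up transition table standing in for the trained network (purely illustrative):

import numpy as np

labels = ['a', 'b', ' ']
# Stand-in for the RNN: a fixed next-character distribution for each character.
fake_model = {'a': [0.1, 0.7, 0.2],
              'b': [0.6, 0.1, 0.3],
              ' ': [0.5, 0.5, 0.0]}

output = 'a'  # the "prime" string
for _ in range(20):
    probs = fake_model[output[-1]]
    next_char = np.random.choice(labels, p=probs)  # same sampling call used in Model.sample
    output += next_char
print(output)  # e.g. 'abab ba babababab aba'

In the real method the distribution comes from session.run(self.probabilities, ...) for the current character, and the LSTM state variables carry the context forward between calls.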