├── Chapter 01 ├── Chapter1_ex1_v2.py ├── Chapter1_ex2_v2.py └── Chapter1_ex3_v2.py ├── Chapter 06 └── language model │ ├── data_processing.py │ ├── data_reader.py │ └── model.py ├── LICENSE ├── Chapter 05 ├── astro_chapter5.py ├── mnist_chapter5_example.py ├── mnist_chapter5_example_convolution.py └── cifar_chapter5_example_convolution.py ├── Chapter 03 └── mnist_chapter3_example.py ├── Chapter 08 ├── q_learning_1d.py ├── q_learning_1d_terminal.py ├── deep_q_cart_pole.py ├── actor_critic_baseline_cart_pole.py ├── actor_critic_advantage_cart_pole.py ├── deep_q_breakout.py └── deep_q_pong.py ├── Chapter 04 └── restricted_boltzmann_machine.py ├── README.md ├── Chapter 07 ├── policy_gradient.py ├── min_max.py ├── monte_carlo.py ├── tic_tac_toe.py ├── connect_4.py └── tic_tac_toe_x.py └── Chapter 02 └── Ch2Example.py /Chapter 01/Chapter1_ex1_v2.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network.multilayer_perceptron import MLPClassifier 2 | from sklearn import datasets 3 | from sklearn.metrics import accuracy_score 4 | 5 | iris = datasets.load_iris() 6 | data = iris.data 7 | labels = iris.target 8 | 9 | # We add max_iter=1000 becaue the default is max_iter=200 and 10 | # it is not enough for full convergence 11 | mlp = MLPClassifier(random_state=1, max_iter=1000) 12 | mlp.fit(data, labels) 13 | 14 | pred = mlp.predict(data) 15 | 16 | print() 17 | print('Accuracy: %.2f' % accuracy_score(labels, pred)) 18 | -------------------------------------------------------------------------------- /Chapter 06/language model/data_processing.py: -------------------------------------------------------------------------------- 1 | """Process text file for language model training.""" 2 | from __future__ import print_function, division 3 | 4 | import re 5 | import codecs 6 | 7 | 8 | filepath = 'war_and_peace.txt' # in 9 | out_file = 'wap.txt' # out 10 | 11 | # Regexes used to clean up the text 12 | NEW_LINE_IN_PARAGRAPH_REGEX = re.compile(r'(\S)\n(\S)') 13 | MULTIPLE_NEWLINES_REGEX = re.compile(r'(\n)(\n)+') 14 | 15 | # Read text as string 16 | with codecs.open(filepath, encoding='utf-8', mode='r') as f_input: 17 | book_str = f_input.read() 18 | 19 | # Cleanup 20 | book_str = NEW_LINE_IN_PARAGRAPH_REGEX.sub('\g<1> \g<2>', book_str) 21 | book_str = MULTIPLE_NEWLINES_REGEX.sub('\n\n', book_str) 22 | 23 | # Write proccessed text to file 24 | with codecs.open(out_file, encoding='utf-8', mode='w')as f_output: 25 | f_output.write(book_str) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter 05/astro_chapter5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy 4 | import theano 5 | import matplotlib.pyplot as plt 6 | import theano.tensor as T 7 | from theano.tensor.nnet import conv 8 | import skimage.data 9 | 10 | import matplotlib.cm as cm 11 | 12 | depth = 4 13 | filter_shape = (3, 3) 14 | 15 | input = T.tensor4(name='input') 16 | 17 | w_shape = (depth, 3, filter_shape[0], filter_shape[1]) 18 | dist = numpy.random.uniform(-0.2, 0.2, size=w_shape) 19 | W = theano.shared(numpy.asarray(dist, dtype=input.dtype), name = 'W') 20 | conv_output = conv.conv2d(input, W) 21 | output = T.nnet.sigmoid(conv_output) 22 | f = theano.function([input], output) 23 | 24 | astronaut = skimage.data.astronaut() 25 | img = numpy.asarray(astronaut, dtype='float32') / 255 26 | filtered_img = f(img.transpose(2, 0, 1).reshape(1, 3, 512, 512)) 27 | 28 | 29 | plt.axis('off') 30 | plt.imshow(img) 31 | plt.show() 32 | for img in range(depth): 33 | fig = plt.figure() 34 | plt.axis( 'off') 35 | plt.imshow(filtered_img[0, img, :, :, ], cmap = cm.gray) 36 | plt.show() 37 | 38 | filename = "astro" + str(img) 39 | fig.savefig(filename, bbox_inches='tight') 40 | 41 | 42 | -------------------------------------------------------------------------------- /Chapter 05/mnist_chapter5_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from keras.datasets import mnist 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Activation 6 | from keras.utils import np_utils 7 | 8 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 9 | 10 | X_train = X_train.reshape(60000, 784) 11 | X_test = X_test.reshape(10000, 784) 12 | X_train = X_train.astype('float32') 13 | X_test = X_test.astype('float32') 14 | X_train /= 255 15 | X_test /= 255 16 | 17 | 18 | classes = 10 19 | Y_train = np_utils.to_categorical(Y_train, classes) 20 | Y_test = np_utils.to_categorical(Y_test, classes) 21 | 22 | input_size = 784 23 | batch_size = 100 24 | hidden_neurons = 400 25 | epochs = 30 26 | 27 | model = Sequential() 28 | model.add(Dense(hidden_neurons, input_dim=input_size)) 29 | model.add(Activation('relu')) 30 | model.add(Dense(classes, input_dim=hidden_neurons)) 31 | model.add(Activation('softmax')) 32 | 33 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adadelta') 34 | 35 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1) 36 | 37 | score = model.evaluate(X_test, Y_test, verbose=1) 38 | print('Test accuracy:', score[1]) 39 | 40 | -------------------------------------------------------------------------------- /Chapter 01/Chapter1_ex2_v2.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network.multilayer_perceptron import MLPClassifier 2 | from sklearn 
import datasets 3 | from sklearn.metrics import accuracy_score 4 | 5 | # Since the book came out, the cross_validation method has been moved to 6 | # the model_selection library from the cross_validation library 7 | #from sklearn.cross_validation import train_test_split 8 | from sklearn.model_selection import train_test_split 9 | 10 | from sklearn.preprocessing import StandardScaler 11 | 12 | iris = datasets.load_iris() 13 | data = iris.data 14 | labels = iris.target 15 | 16 | data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=0.5, random_state=1) 17 | 18 | scaler = StandardScaler() 19 | scaler.fit(data) 20 | data_train_std = scaler.transform(data_train) 21 | data_test_std = scaler.transform(data_test) 22 | 23 | data_train = data_train_std 24 | data_test = data_test_std 25 | 26 | # We add max_iter=1000 becaue the default is max_iter=200 and 27 | # it is not enough for full convergence 28 | mlp = MLPClassifier(random_state=1, max_iter=1000) 29 | mlp.fit(data, labels) 30 | mlp.fit(data_train, labels_train) 31 | pred = mlp.predict(data_test) 32 | 33 | print() 34 | print('Misclassified samples: %d' % (labels_test != pred).sum()) 35 | print('Accuracy: %.2f' % accuracy_score(labels_test, pred)) 36 | -------------------------------------------------------------------------------- /Chapter 03/mnist_chapter3_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from keras.datasets import mnist 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Activation 6 | from keras.utils import np_utils 7 | 8 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 9 | 10 | X_train = X_train.reshape(60000, 784) 11 | X_test = X_test.reshape(10000, 784) 12 | 13 | classes = 10 14 | Y_train = np_utils.to_categorical(Y_train, classes) 15 | Y_test = np_utils.to_categorical(Y_test, classes) 16 | 17 | input_size = 784 18 | batch_size = 100 19 | hidden_neurons = 100 20 | epochs = 30 21 | 22 | model = Sequential() 23 | model.add(Dense(hidden_neurons, input_dim=input_size)) 24 | model.add(Activation('sigmoid')) 25 | model.add(Dense(classes, input_dim=hidden_neurons)) 26 | model.add(Activation('softmax')) 27 | 28 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='sgd') 29 | 30 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, verbose=1) 31 | 32 | score = model.evaluate(X_test, Y_test, verbose=1) 33 | print('Test accuracy:', score[1]) 34 | 35 | 36 | weights = model.layers[0].get_weights() 37 | 38 | import matplotlib.pyplot as plt 39 | import matplotlib.cm as cm 40 | import numpy 41 | 42 | fig = plt.figure() 43 | 44 | w = weights[0].T 45 | for neuron in range(hidden_neurons): 46 | ax = fig.add_subplot(10, 10, neuron+1) 47 | ax.axis("off") 48 | ax.imshow(numpy.reshape(w[neuron], (28, 28)), cmap = cm.Greys_r) 49 | 50 | plt.savefig("neuron_images.png", dpi=300) 51 | plt.show() 52 | -------------------------------------------------------------------------------- /Chapter 05/mnist_chapter5_example_convolution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | np.random.seed(0) #for reproducibility 5 | 6 | from keras.datasets import mnist 7 | from keras.models import Sequential 8 | from keras.layers import Dense, Activation 9 | from keras.layers import Convolution2D, MaxPooling2D 10 | from keras.layers import Dropout, Flatten 11 | 12 | from 
keras.utils import np_utils 13 | 14 | input_size = 784 15 | batch_size = 100 16 | hidden_neurons = 200 17 | classes = 10 18 | epochs = 8 19 | 20 | (X_train, Y_train), (X_test, Y_test) = mnist.load_data() 21 | 22 | X_train = X_train.reshape(60000, 28, 28, 1) 23 | X_test = X_test.reshape(10000, 28, 28, 1) 24 | 25 | X_train = X_train.astype('float32') 26 | X_test = X_test.astype('float32') 27 | X_train /= 255 28 | X_test /= 255 29 | 30 | Y_train = np_utils.to_categorical(Y_train, classes) 31 | Y_test = np_utils.to_categorical(Y_test, classes) 32 | 33 | model = Sequential() 34 | model.add(Convolution2D(32, (3, 3), input_shape=(28, 28, 1))) 35 | model.add(Activation('relu')) 36 | model.add(Convolution2D(32, (3, 3))) 37 | model.add(Activation('relu')) 38 | model.add(MaxPooling2D(pool_size=(2, 2))) 39 | model.add(Dropout(0.25)) 40 | 41 | model.add(Flatten()) 42 | 43 | model.add(Dense(hidden_neurons)) 44 | model.add(Activation('relu')) 45 | model.add(Dense(classes)) 46 | model.add(Activation('softmax')) 47 | 48 | 49 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adadelta') 50 | 51 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, validation_split = 0.1, verbose=1) 52 | 53 | score = model.evaluate(X_test, Y_test, verbose=1) 54 | print('Test accuracy:', score[1]) 55 | 56 | -------------------------------------------------------------------------------- /Chapter 08/q_learning_1d.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | states = [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] 5 | NUM_STATES = len(states) 6 | NUM_ACTIONS = 2 7 | DISCOUNT_FACTOR = 0.5 8 | 9 | 10 | def one_hot_state(index): 11 | array = np.zeros(NUM_STATES) 12 | array[index] = 1. 
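    # the one-hot vector is the network input: all zeros except a 1 at the agent's current position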
13 | return array 14 | 15 | 16 | session = tf.Session() 17 | state = tf.placeholder("float", [None, NUM_STATES]) 18 | targets = tf.placeholder("float", [None, NUM_ACTIONS]) 19 | 20 | weights = tf.Variable(tf.constant(0., shape=[NUM_STATES, NUM_ACTIONS])) 21 | 22 | output = tf.matmul(state, weights) 23 | 24 | loss = tf.reduce_mean(tf.square(output - targets)) 25 | train_operation = tf.train.GradientDescentOptimizer(1.).minimize(loss) 26 | 27 | session.run(tf.initialize_all_variables()) 28 | 29 | for _ in range(50): 30 | state_batch = [] 31 | rewards_batch = [] 32 | 33 | for state_index in range(NUM_STATES): 34 | state_batch.append(one_hot_state(state_index)) 35 | 36 | minus_action_index = (state_index - 1) % NUM_STATES 37 | plus_action_index = (state_index + 1) % NUM_STATES 38 | 39 | minus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(minus_action_index)]}) 40 | plus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(plus_action_index)]}) 41 | 42 | minus_action_q_value = DISCOUNT_FACTOR * (states[minus_action_index] + np.max(minus_action_state_reward)) 43 | plus_action_q_value = DISCOUNT_FACTOR * (states[plus_action_index] + np.max(plus_action_state_reward)) 44 | 45 | action_rewards = [minus_action_q_value, plus_action_q_value] 46 | rewards_batch.append(action_rewards) 47 | 48 | session.run(train_operation, feed_dict={ 49 | state: state_batch, 50 | targets: rewards_batch}) 51 | 52 | print([states[x] + np.max(session.run(output, feed_dict={state: [one_hot_state(x)]})) 53 | for x in range(NUM_STATES)]) 54 | -------------------------------------------------------------------------------- /Chapter 05/cifar_chapter5_example_convolution.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy 4 | 5 | from keras.datasets import cifar10 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Activation 8 | from keras.layers import Convolution2D, MaxPooling2D, Flatten 9 | from keras.layers import Dropout 10 | from keras.utils import np_utils 11 | 12 | batch_size = 100 13 | hidden_neurons = 200 14 | classes = 10 15 | epochs = 20 16 | 17 | (X_train, Y_train), (X_test, Y_test) = cifar10.load_data() 18 | 19 | 20 | Y_train = np_utils.to_categorical(Y_train, classes) 21 | Y_test = np_utils.to_categorical(Y_test, classes) 22 | 23 | model = Sequential() 24 | model.add(Convolution2D(32, (3, 3), input_shape=(32, 32, 3))) 25 | model.add(Activation('relu')) 26 | model.add(Convolution2D(32, (3, 3))) 27 | model.add(Activation('relu')) 28 | model.add(MaxPooling2D(pool_size=(2, 2))) 29 | model.add(Dropout(0.25)) 30 | 31 | model.add(Convolution2D(64, (3, 3))) 32 | model.add(Activation('relu')) 33 | model.add(Convolution2D(64, (3, 3))) 34 | model.add(Activation('relu')) 35 | model.add(MaxPooling2D(pool_size=(2, 2))) 36 | model.add(Dropout(0.25)) 37 | 38 | model.add(Flatten()) 39 | 40 | model.add(Dense(hidden_neurons)) 41 | model.add(Activation('relu')) 42 | model.add(Dropout(0.5)) 43 | model.add(Dense(classes)) 44 | model.add(Activation('softmax')) 45 | 46 | 47 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adadelta') 48 | 49 | model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, validation_split = 0.1, verbose=1) 50 | 51 | score = model.evaluate(X_test, Y_test, verbose=1) 52 | print('Test accuracy:', score[1]) 53 | 54 | numpy.set_printoptions(threshold='nan') 55 | index = 0 56 | for layer in model.layers: 57 | 
filename = "conv_layer_" + str(index) 58 | f1 = open(filename, 'w+') 59 | f1.write(repr(layer.get_weights())) 60 | f1.close() 61 | print (filename + " has been opened and closed") 62 | index = index+1 63 | -------------------------------------------------------------------------------- /Chapter 04/restricted_boltzmann_machine.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.examples.tutorials.mnist import input_data 3 | 4 | VISIBLE_NODES = 784 5 | HIDDEN_NODES = 400 6 | LEARNING_RATE = 0.01 7 | 8 | mnist = input_data.read_data_sets("MNIST_data/") 9 | 10 | input_placeholder = tf.placeholder("float", shape=(None, VISIBLE_NODES)) 11 | 12 | weights = tf.Variable(tf.random_normal((VISIBLE_NODES, HIDDEN_NODES), mean=0.0, stddev=1. / VISIBLE_NODES)) 13 | hidden_bias = tf.Variable(tf.zeros([HIDDEN_NODES])) 14 | visible_bias = tf.Variable(tf.zeros([VISIBLE_NODES])) 15 | 16 | hidden_activation = tf.nn.sigmoid(tf.matmul(input_placeholder, weights) + hidden_bias) 17 | visible_reconstruction = tf.nn.sigmoid(tf.matmul(hidden_activation, tf.transpose(weights)) + visible_bias) 18 | 19 | final_hidden_activation = tf.nn.sigmoid(tf.matmul(visible_reconstruction, weights) + hidden_bias) 20 | 21 | positive_phase = tf.matmul(tf.transpose(input_placeholder), hidden_activation) 22 | negative_phase = tf.matmul(tf.transpose(visible_reconstruction), final_hidden_activation) 23 | 24 | weight_update = weights.assign_add(LEARNING_RATE * (positive_phase - negative_phase)) 25 | visible_bias_update = visible_bias.assign_add(LEARNING_RATE * 26 | tf.reduce_mean(input_placeholder - visible_reconstruction, 0)) 27 | hidden_bias_update = hidden_bias.assign_add(LEARNING_RATE * 28 | tf.reduce_mean(hidden_activation - final_hidden_activation, 0)) 29 | 30 | train_op = tf.group(weight_update, visible_bias_update, hidden_bias_update) 31 | 32 | loss_op = tf.reduce_sum(tf.square(input_placeholder - visible_reconstruction)) 33 | 34 | session = tf.Session() 35 | 36 | session.run(tf.initialize_all_variables()) 37 | 38 | current_epochs = 0 39 | 40 | for i in range(20): 41 | total_loss = 0 42 | while mnist.train.epochs_completed == current_epochs: 43 | batch_inputs, batch_labels = mnist.train.next_batch(100) 44 | _, reconstruction_loss = session.run([train_op, loss_op], feed_dict={input_placeholder: batch_inputs}) 45 | total_loss += reconstruction_loss 46 | 47 | print("epochs %s loss %s" % (current_epochs, reconstruction_loss)) 48 | current_epochs = mnist.train.epochs_completed 49 | -------------------------------------------------------------------------------- /Chapter 08/q_learning_1d_terminal.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | states = [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] 5 | terminal = [False, False, False, False, True, False, False, False, False, False] 6 | NUM_STATES = len(states) 7 | NUM_ACTIONS = 2 8 | DISCOUNT_FACTOR = 0.5 9 | 10 | 11 | def one_hot_state(index): 12 | array = np.zeros(NUM_STATES) 13 | array[index] = 1. 
14 | return array 15 | 16 | 17 | session = tf.Session() 18 | state = tf.placeholder("float", [None, NUM_STATES]) 19 | targets = tf.placeholder("float", [None, NUM_ACTIONS]) 20 | 21 | weights = tf.Variable(tf.constant(0., shape=[NUM_STATES, NUM_ACTIONS])) 22 | 23 | output = tf.matmul(state, weights) 24 | 25 | loss = tf.reduce_mean(tf.square(output - targets)) 26 | train_operation = tf.train.GradientDescentOptimizer(1.).minimize(loss) 27 | 28 | session.run(tf.initialize_all_variables()) 29 | 30 | for _ in range(50): 31 | state_batch = [] 32 | rewards_batch = [] 33 | 34 | for state_index in range(NUM_STATES): 35 | state_batch.append(one_hot_state(state_index)) 36 | 37 | minus_action_index = (state_index - 1) % NUM_STATES 38 | plus_action_index = (state_index + 1) % NUM_STATES 39 | 40 | if terminal[minus_action_index]: 41 | minus_action_q_value = DISCOUNT_FACTOR * states[minus_action_index] 42 | else: 43 | minus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(minus_action_index)]}) 44 | minus_action_q_value = DISCOUNT_FACTOR * (states[minus_action_index] + np.max(minus_action_state_reward)) 45 | 46 | if terminal[plus_action_index]: 47 | plus_action_q_value = DISCOUNT_FACTOR * states[plus_action_index] 48 | else: 49 | plus_action_state_reward = session.run(output, feed_dict={state: [one_hot_state(plus_action_index)]}) 50 | plus_action_q_value = DISCOUNT_FACTOR * (states[plus_action_index] + np.max(plus_action_state_reward)) 51 | 52 | action_rewards = [minus_action_q_value, plus_action_q_value] 53 | rewards_batch.append(action_rewards) 54 | 55 | session.run(train_operation, feed_dict={ 56 | state: state_batch, 57 | targets: rewards_batch}) 58 | 59 | print([states[x] + (1-float(terminal[x]))*np.max(session.run(output, feed_dict={state: [one_hot_state(x)]})) 60 | for x in range(NUM_STATES)]) 61 | -------------------------------------------------------------------------------- /Chapter 06/language model/data_reader.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | from six.moves import range 4 | import codecs 5 | import numpy as np 6 | 7 | 8 | class DataReader(object): 9 | """Data reader used for training language model.""" 10 | def __init__(self, filepath, batch_length, batch_size): 11 | self.batch_length = batch_length 12 | self.batch_size = batch_size 13 | # Read data into string 14 | with codecs.open(filepath, encoding='utf-8', mode='r') as f: 15 | self.data_str = f.read() 16 | self.data_length = len(self.data_str) 17 | print('data_length: ', self.data_length) 18 | # Create a list of characters, indices are class indices for softmax 19 | char_set = set() 20 | for ch in self.data_str: 21 | char_set.add(ch) 22 | self.char_list = sorted(list(char_set)) 23 | print('char_list: ', len(self.char_list), self.char_list) 24 | # Create reverse mapping to look up the index based on the character 25 | self.char_dict = {val: idx for idx, val in enumerate(self.char_list)} 26 | print('char_dict: ', self.char_dict) 27 | # Initalise random start indices 28 | self.reset_indices() 29 | 30 | def reset_indices(self): 31 | self.start_idxs = np.random.random_integers( 32 | 0, self.data_length, self.batch_size) 33 | 34 | def get_sample(self, start_idx, length): 35 | # Get a sample and wrap around the data string 36 | return [self.char_dict[self.data_str[i % self.data_length]] 37 | for i in range(start_idx, start_idx+length)] 38 | 39 | def get_input_target_sample(self, start_idx): 40 | sample = 
self.get_sample(start_idx, self.batch_length+1) 41 | inpt = sample[0:self.batch_length] 42 | trgt = sample[1:self.batch_length+1] 43 | return inpt, trgt 44 | 45 | def get_batch(self, start_idxs): 46 | input_batch = np.zeros((self.batch_size, self.batch_length), 47 | dtype=np.int32) 48 | target_batch = np.zeros((self.batch_size, self.batch_length), 49 | dtype=np.int32) 50 | for i, start_idx in enumerate(start_idxs): 51 | inpt, trgt = self.get_input_target_sample(start_idx) 52 | input_batch[i, :] = inpt 53 | target_batch[i, :] = trgt 54 | return input_batch, target_batch 55 | 56 | def __iter__(self): 57 | while True: 58 | input_batch, target_batch = self.get_batch(self.start_idxs) 59 | self.start_idxs = ( 60 | self.start_idxs + self.batch_length) % self.data_length 61 | yield input_batch, target_batch 62 | 63 | 64 | def main(): 65 | filepath = './wap.txt' 66 | batch_length = 10 67 | batch_size = 2 68 | reader = DataReader(filepath, batch_length, batch_size) 69 | s = 'As in the question of astronomy then, so in the question of history now,' 70 | print([reader.char_dict[c] for c in s]) 71 | 72 | if __name__ == "__main__": 73 | main() 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Python Deep Learning 5 | This is the code repository for [Python Deep Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-deep-learning?utm_source=github&utm_medium=repository&utm_campaign=9781786464453), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | With an increasing interest in AI around the world, deep learning has attracted a great deal of public attention. Every day, deep learning algorithms are used broadly across different industries. 8 | 9 | The book will give you all the practical information available on the subject, including the best practices, using real-world use cases. You will learn to recognize and extract information to increase predictive accuracy and optimize results. 10 | 11 | Starting with a quick recap of important machine learning concepts, the book will delve straight into deep learning principles using Sci-kit learn. Moving ahead, you will learn to use the latest open source libraries such as Theano, Keras, Google's TensorFlow, and H20. Use this guide to uncover the difficulties of pattern recognition, scaling data with greater accuracy and discussing deep learning algorithms and techniques. 12 | ## Instructions and Navigation 13 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 
14 | 15 | 16 | 17 | The code will look like the following: 18 | ``` 19 | mlp.fit(data_train, labels_train) 20 | pred = mlp.predict(data_test) 21 | print('Misclassified samples: %d' % (labels_test != pred).sum()) 22 | from sklearn.metrics import accuracy_score print('Accuracy: %.2f' % accuracy_score(labels_test, pred)) 23 | ``` 24 | 25 | 26 | 27 | ## Related Products 28 | * [Python: Deeper Insights into Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-deeper-insights-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781787128576) 29 | 30 | * [Deep Learning with Keras](https://www.packtpub.com/big-data-and-business-intelligence/deep-learning-keras?utm_source=github&utm_medium=repository&utm_campaign=9781787128422) 31 | 32 | * [Deep Learning with Hadoop](https://www.packtpub.com/big-data-and-business-intelligence/deep-learning-hadoop?utm_source=github&utm_medium=repository&utm_campaign=9781787124769) 33 | 34 | * [Python Machine Learning](https://www.packtpub.com/big-data-and-business-intelligence/python-machine-learning?utm_source=github&utm_medium=repository&utm_campaign=9781783555130) 35 | 36 | ### Suggestions and Feedback 37 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 38 | ### Download a free PDF 39 | 40 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
41 |

https://packt.link/free-ebook/9781789348460

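### A note on library versions

The scripts were written against the library versions available when the book was published, and a few import paths have since moved (the Chapter 1 files already note the `cross_validation` to `model_selection` change in scikit-learn). As a minimal sketch, assuming a recent scikit-learn release, the Chapter 1 workflow can be written with the public import paths only:

```
from sklearn.neural_network import MLPClassifier  # public path; the private multilayer_perceptron module was removed in newer releases
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score

iris = datasets.load_iris()
data_train, data_test, labels_train, labels_test = train_test_split(
    iris.data, iris.target, test_size=0.5, random_state=1)

mlp = MLPClassifier(random_state=1, max_iter=1000)
mlp.fit(data_train, labels_train)
pred = mlp.predict(data_test)
print('Misclassified samples: %d' % (labels_test != pred).sum())
print('Accuracy: %.2f' % accuracy_score(labels_test, pred))
```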
-------------------------------------------------------------------------------- /Chapter 01/Chapter1_ex3_v2.py: -------------------------------------------------------------------------------- 1 | from sklearn.neural_network.multilayer_perceptron import MLPClassifier 2 | from sklearn import datasets 3 | 4 | # Since the book came out, the cross_validation method has been moved to 5 | # the model_selection library from the cross_validation library 6 | #from sklearn.cross_validation import train_test_split 7 | from sklearn.model_selection import train_test_split 8 | 9 | from sklearn.preprocessing import StandardScaler 10 | from sklearn.metrics import accuracy_score 11 | 12 | import numpy 13 | from matplotlib.colors import ListedColormap 14 | import matplotlib.pyplot as plt 15 | 16 | #Apply standardization 17 | standardised = True 18 | 19 | M = {0:"sepal length", 1:"sepal width", 2:"petal length", 3:"petal width"} 20 | 21 | #Choose two features 22 | x=1 #1 corresponds to the sepal width 23 | y=3 #3 corresponds to the petal width 24 | 25 | iris = datasets.load_iris() 26 | data = iris.data[:,[x,y]] 27 | 28 | labels = iris.target 29 | 30 | X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.5, random_state=1) 31 | 32 | reg = StandardScaler() 33 | reg.fit(data) 34 | X_train_std = reg.transform(X_train) 35 | X_test_std = reg.transform(X_test) 36 | 37 | if (standardised == False): 38 | X_train_std = X_train 39 | X_test_std = X_test 40 | 41 | # We add max_iter=1000 becaue the default is max_iter=200 and 42 | # it is not enough for full convergence 43 | mlp = MLPClassifier(random_state=1, max_iter=1000) 44 | mlp.fit(X_train_std, y_train) 45 | 46 | y_pred = mlp.predict(X_test_std) 47 | print('Misclassified samples: %d' % (y_test != y_pred).sum()) 48 | 49 | print('Accuracy: %.2f' % accuracy_score(y_test, y_pred)) 50 | 51 | 52 | def plot_decision_regions(data, labels, classifier, resolution=0.01): 53 | markers = ('s', '*', '^') 54 | colors = ('blue', 'green', 'red') 55 | cmap = ListedColormap(colors) 56 | # plot the decision surface 57 | x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1 58 | y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1 59 | 60 | x, y = numpy.meshgrid(numpy.arange(x_min, x_max, resolution), numpy.arange(y_min, y_max, resolution)) 61 | Z = classifier.predict(numpy.array([x.ravel(), y.ravel()]).T) 62 | Z = Z.reshape(x.shape) 63 | 64 | plt.pcolormesh(x, y, Z, cmap=cmap) 65 | plt.xlim(x.min(), x.max()) 66 | plt.ylim(y.min(), y.max()) 67 | 68 | colors = ('yellow', 'white', 'black') 69 | #cmap = ListedColormap(colors) 70 | #plot the data 71 | classes = ["setosa", "versicolor", "verginica"] 72 | for index, cl in enumerate(numpy.unique(labels)): 73 | plt.scatter(data[labels == cl, 0], data[labels == cl, 1], c=cmap(index), marker=markers[index], edgecolor="black", alpha=1.0, s=50, label=classes[index]) 74 | 75 | X_combined_std = numpy.vstack((X_train_std, X_test_std)) 76 | y_combined = numpy.hstack((y_train, y_test)) 77 | plot_decision_regions(X_combined_std, y_combined, classifier=mlp) 78 | 79 | if (standardised == False): 80 | xString = M[x] + " [not standardized]" 81 | yString = M[y] + " [not standardized]" 82 | else: 83 | xString = M[x] + " [standardized]" 84 | yString = M[y] + " [standardized]" 85 | 86 | plt.xlabel(xString) 87 | plt.ylabel(yString) 88 | plt.legend(loc='upper left') 89 | plt.show() -------------------------------------------------------------------------------- /Chapter 07/policy_gradient.py: 
-------------------------------------------------------------------------------- 1 | import collections 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from tic_tac_toe import play_game, random_player 6 | 7 | HIDDEN_NODES = (100, 100, 100) # number of hidden layer neurons 8 | INPUT_NODES = 3 * 3 # board size 9 | BATCH_SIZE = 100 # every how many games to do a parameter update? 10 | LEARN_RATE = 1e-4 11 | OUTPUT_NODES = INPUT_NODES 12 | PRINT_RESULTS_EVERY_X = 1000 # every how many games to print the results 13 | 14 | input_placeholder = tf.placeholder("float", shape=(None, INPUT_NODES)) 15 | reward_placeholder = tf.placeholder("float", shape=(None,)) 16 | actual_move_placeholder = tf.placeholder("float", shape=(None, OUTPUT_NODES)) 17 | 18 | hidden_weights_1 = tf.Variable(tf.truncated_normal((INPUT_NODES, HIDDEN_NODES[0]), stddev=1. / np.sqrt(INPUT_NODES))) 19 | hidden_weights_2 = tf.Variable( 20 | tf.truncated_normal((HIDDEN_NODES[0], HIDDEN_NODES[1]), stddev=1. / np.sqrt(HIDDEN_NODES[0]))) 21 | hidden_weights_3 = tf.Variable( 22 | tf.truncated_normal((HIDDEN_NODES[1], HIDDEN_NODES[2]), stddev=1. / np.sqrt(HIDDEN_NODES[1]))) 23 | output_weights = tf.Variable(tf.truncated_normal((HIDDEN_NODES[-1], OUTPUT_NODES), stddev=1. / np.sqrt(OUTPUT_NODES))) 24 | 25 | hidden_layer_1 = tf.nn.relu( 26 | tf.matmul(input_placeholder, hidden_weights_1) + tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[0],)))) 27 | hidden_layer_2 = tf.nn.relu( 28 | tf.matmul(hidden_layer_1, hidden_weights_2) + tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[1],)))) 29 | hidden_layer_3 = tf.nn.relu( 30 | tf.matmul(hidden_layer_2, hidden_weights_3) + tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[2],)))) 31 | output_layer = tf.nn.softmax( 32 | tf.matmul(hidden_layer_3, output_weights) + tf.Variable(tf.constant(0.01, shape=(OUTPUT_NODES,)))) 33 | 34 | policy_gradient = tf.reduce_sum(tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer) 35 | train_step = tf.train.RMSPropOptimizer(LEARN_RATE).minimize(-policy_gradient) 36 | 37 | sess = tf.Session() 38 | sess.run(tf.initialize_all_variables()) 39 | 40 | board_states, actual_moves, rewards = [], [], [] 41 | episode_number = 1 42 | results = collections.deque() 43 | 44 | 45 | def make_move(board_state, side): 46 | board_state_flat = np.ravel(board_state) 47 | board_states.append(board_state_flat) 48 | probability_of_actions = sess.run(output_layer, feed_dict={input_placeholder: [board_state_flat]})[0] 49 | 50 | try: 51 | move = np.random.multinomial(1, probability_of_actions) 52 | except ValueError: 53 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 
54 | # so need to reduce slightly to be a valid value 55 | move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-7)) 56 | 57 | actual_moves.append(move) 58 | 59 | move_index = move.argmax() 60 | return move_index / 3, move_index % 3 61 | 62 | 63 | while True: 64 | reward = play_game(make_move, random_player) 65 | 66 | results.append(reward) 67 | if len(results) > PRINT_RESULTS_EVERY_X: 68 | results.popleft() 69 | 70 | last_game_length = len(board_states) - len(rewards) 71 | 72 | # we scale here so winning quickly is better winning slowly and loosing slowly better than loosing quick 73 | reward /= float(last_game_length) 74 | 75 | rewards += ([reward] * last_game_length) 76 | 77 | episode_number += 1 78 | 79 | if episode_number % BATCH_SIZE == 0: 80 | normalized_rewards = rewards - np.mean(rewards) 81 | normalized_rewards /= np.std(normalized_rewards) 82 | 83 | sess.run(train_step, feed_dict={input_placeholder: board_states, 84 | reward_placeholder: normalized_rewards, 85 | actual_move_placeholder: actual_moves}) 86 | 87 | # clear batches 88 | del board_states[:] 89 | del actual_moves[:] 90 | del rewards[:] 91 | 92 | if episode_number % PRINT_RESULTS_EVERY_X == 0: 93 | print("episode: %s win_rate: %s" % (episode_number, 0.5 + sum(results) / (PRINT_RESULTS_EVERY_X * 2.))) 94 | -------------------------------------------------------------------------------- /Chapter 07/min_max.py: -------------------------------------------------------------------------------- 1 | from tic_tac_toe import available_moves, apply_move, has_winner 2 | import sys 3 | 4 | 5 | def _score_line(line): 6 | minus_count = line.count(-1) 7 | plus_count = line.count(1) 8 | if minus_count + plus_count < 3: 9 | if minus_count == 2: 10 | return -1 11 | elif plus_count == 2: 12 | return 1 13 | return 0 14 | 15 | 16 | def evaluate(board_state): 17 | """Get a rough score for how good we think this board position is for the plus_player. Does this based on number of 18 | 2 in row lines we have. 19 | 20 | Args: 21 | board_state (3x3 tuple of int): The board state we are evaluating 22 | 23 | Returns: 24 | int: evaluated score for the position for the plus player, posative is good for the plus player, negative good 25 | for the minus player 26 | """ 27 | score = 0 28 | for x in range(3): 29 | score += _score_line(board_state[x]) 30 | for y in range(3): 31 | score += _score_line([i[y] for i in board_state]) 32 | 33 | # diagonals 34 | score += _score_line([board_state[i][i] for i in range(3)]) 35 | score += _score_line([board_state[2 - i][i] for i in range(3)]) 36 | 37 | return score 38 | 39 | 40 | def min_max(board_state, side, max_depth, evaluation_func=evaluate): 41 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 42 | move 43 | 44 | Args: 45 | board_state (3x3 tuple of int): The board state we are evaluating 46 | side (int): either +1 or -1 47 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 48 | position is. 
49 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player 50 | 51 | Returns: 52 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 53 | """ 54 | best_score = None 55 | best_score_move = None 56 | 57 | moves = list(available_moves(board_state)) 58 | if not moves: 59 | # this is a draw 60 | return 0, None 61 | 62 | for move in moves: 63 | new_board_state = apply_move(board_state, move, side) 64 | winner = has_winner(new_board_state) 65 | if winner != 0: 66 | return winner * 10000, move 67 | else: 68 | if max_depth <= 1: 69 | score = evaluation_func(new_board_state) 70 | else: 71 | score, _ = min_max(new_board_state, -side, max_depth - 1) 72 | if side > 0: 73 | if best_score is None or score > best_score: 74 | best_score = score 75 | best_score_move = move 76 | else: 77 | if best_score is None or score < best_score: 78 | best_score = score 79 | best_score_move = move 80 | return best_score, best_score_move 81 | 82 | 83 | def min_max_alpha_beta(board_state, side, max_depth, evaluation_func=evaluate, alpha=-sys.float_info.max, 84 | beta=sys.float_info.max): 85 | """Runs the min_max_algorithm on a given board_sate for a given side, to a given depth in order to find the best 86 | move 87 | 88 | Args: 89 | board_state (3x3 tuple of int): The board state we are evaluating 90 | side (int): either +1 or -1 91 | max_depth (int): how deep we want our tree to go before we use the evaluate method to determine how good the 92 | position is. 93 | evaluation_func (board_state -> int): Function used to evaluate the position for the plus player 94 | alpha (float): Used when this is called recursively, normally ignore 95 | beta (float): Used when this is called recursively, normally ignore 96 | 97 | Returns: 98 | (best_score(int), best_score_move((int, int)): the move found to be best and what it's min-max score was 99 | """ 100 | best_score_move = None 101 | moves = list(available_moves(board_state)) 102 | if not moves: 103 | return 0, None 104 | 105 | for move in moves: 106 | new_board_state = apply_move(board_state, move, side) 107 | winner = has_winner(new_board_state) 108 | if winner != 0: 109 | return winner * 10000, move 110 | else: 111 | if max_depth <= 1: 112 | score = evaluation_func(new_board_state) 113 | else: 114 | score, _ = min_max_alpha_beta(new_board_state, -side, max_depth - 1, alpha, beta) 115 | 116 | if side > 0: 117 | if score > alpha: 118 | alpha = score 119 | best_score_move = move 120 | else: 121 | if score < beta: 122 | beta = score 123 | best_score_move = move 124 | if alpha >= beta: 125 | break 126 | 127 | return alpha if side > 0 else beta, best_score_move 128 | 129 | 130 | def min_max_player(board_state, side): 131 | return min_max(board_state, side, 5)[1] 132 | -------------------------------------------------------------------------------- /Chapter 08/deep_q_cart_pole.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import random 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | 9 | env = gym.make('CartPole-v0') 10 | 11 | ACTIONS_COUNT = 2 12 | FUTURE_REWARD_DISCOUNT = 0.9 13 | OBSERVATION_STEPS = 5000. # time steps to observe before training 14 | EXPLORE_STEPS = 15000. 
# frames over which to anneal epsilon 15 | INITIAL_RANDOM_ACTION_PROB = 1.0 # starting chance of an action being random 16 | FINAL_RANDOM_ACTION_PROB = 0.0 # final chance of an action being random 17 | MEMORY_SIZE = 20000 # number of observations to remember 18 | MINI_BATCH_SIZE = 100 # size of mini batches 19 | OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5) 20 | LEARN_RATE = 1e-3 21 | STORE_SCORES_LEN = 100. 22 | INPUT_NODES = env.observation_space.shape[0] 23 | HIDDEN_NODES = 20 24 | 25 | session = tf.Session() 26 | 27 | feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, HIDDEN_NODES], stddev=0.01)) 28 | feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[HIDDEN_NODES])) 29 | 30 | feed_forward_weights_2 = tf.Variable(tf.truncated_normal([HIDDEN_NODES, ACTIONS_COUNT], stddev=0.01)) 31 | feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[ACTIONS_COUNT])) 32 | 33 | input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 34 | hidden_layer = tf.nn.tanh(tf.matmul(input_placeholder, feed_forward_weights_1) + feed_forward_bias_1) 35 | output_layer = tf.matmul(hidden_layer, feed_forward_weights_2) + feed_forward_bias_2 36 | 37 | action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT]) 38 | target_placeholder = tf.placeholder("float", [None]) 39 | 40 | readout_action = tf.reduce_sum(tf.mul(output_layer, action_placeholder), reduction_indices=1) 41 | 42 | cost = tf.reduce_mean(tf.square(target_placeholder - readout_action)) 43 | train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost) 44 | 45 | observations = deque(maxlen=MEMORY_SIZE) 46 | scores = deque(maxlen=STORE_SCORES_LEN) 47 | 48 | # set the first action to do nothing 49 | last_action = np.zeros(ACTIONS_COUNT) 50 | last_action[1] = 1 51 | 52 | probability_of_random_action = INITIAL_RANDOM_ACTION_PROB 53 | time = 0 54 | 55 | session.run(tf.initialize_all_variables()) 56 | 57 | 58 | def choose_next_action(state): 59 | new_action = np.zeros([ACTIONS_COUNT]) 60 | 61 | if random.random() <= probability_of_random_action: 62 | # choose an action randomly 63 | action_index = random.randrange(ACTIONS_COUNT) 64 | else: 65 | # choose an action given our state 66 | action_values = session.run(output_layer, feed_dict={input_placeholder: [state]})[0] 67 | # we will take the highest value action 68 | action_index = np.argmax(action_values) 69 | 70 | new_action[action_index] = 1 71 | return new_action 72 | 73 | 74 | def train(): 75 | # sample a mini_batch to train on 76 | mini_batch = random.sample(observations, MINI_BATCH_SIZE) 77 | 78 | # get the batch variables 79 | previous_states = [d[OBS_LAST_STATE_INDEX] for d in mini_batch] 80 | actions = [d[OBS_ACTION_INDEX] for d in mini_batch] 81 | rewards = [d[OBS_REWARD_INDEX] for d in mini_batch] 82 | current_states = [d[OBS_CURRENT_STATE_INDEX] for d in mini_batch] 83 | agents_expected_reward = [] 84 | # this gives us the agents expected reward for each action we might take 85 | agents_reward_per_action = session.run(output_layer, feed_dict={input_placeholder: current_states}) 86 | for i in range(len(mini_batch)): 87 | if mini_batch[i][OBS_TERMINAL_INDEX]: 88 | # this was a terminal frame so there is no future reward... 
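            # the Q-learning target is just the observed reward here; in the non-terminal branch below it is reward + FUTURE_REWARD_DISCOUNT * max_a Q(next_state, a)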
89 | agents_expected_reward.append(rewards[i]) 90 | else: 91 | agents_expected_reward.append( 92 | rewards[i] + FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i])) 93 | 94 | # learn that these actions in these states lead to this reward 95 | session.run(train_operation, feed_dict={ 96 | input_placeholder: previous_states, 97 | action_placeholder: actions, 98 | target_placeholder: agents_expected_reward}) 99 | 100 | 101 | last_state = env.reset() 102 | total_reward = 0 103 | 104 | while True: 105 | env.render() 106 | last_action = choose_next_action(last_state) 107 | current_state, reward, terminal, info = env.step(np.argmax(last_action)) 108 | total_reward += reward 109 | 110 | if terminal: 111 | reward = -1. 112 | scores.append(total_reward) 113 | 114 | print("Time: %s random_action_prob: %s reward %s scores differential %s" % 115 | (time, probability_of_random_action, total_reward, 116 | np.mean(scores))) 117 | total_reward = 0 118 | 119 | # store the transition in previous_observations 120 | observations.append((last_state, last_action, reward, current_state, terminal)) 121 | 122 | # only train if done observing 123 | if len(observations) > OBSERVATION_STEPS: 124 | train() 125 | time += 1 126 | 127 | # update the old values 128 | if terminal: 129 | last_state = env.reset() 130 | else: 131 | last_state = current_state 132 | 133 | # gradually reduce the probability of a random action 134 | if probability_of_random_action > FINAL_RANDOM_ACTION_PROB \ 135 | and len(observations) > OBSERVATION_STEPS: 136 | probability_of_random_action -= \ 137 | (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS 138 | -------------------------------------------------------------------------------- /Chapter 07/monte_carlo.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import random 3 | import math 4 | from tic_tac_toe import has_winner, available_moves, apply_move 5 | 6 | 7 | def monte_carlo_sample(board_state, side): 8 | """Sample a single rollout from the current board_state and side. Moves are made to the current board_state until we 9 | reach a terminal state then the result and the first move made to get there is returned. 10 | 11 | Args: 12 | board_state (3x3 tuple of int): state of the board 13 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 14 | 15 | Returns: 16 | (result(int), move(int,int)): The result from this rollout, +1 for a win for the plus player -1 for a win for 17 | the minus player, 0 for a draw 18 | """ 19 | result = has_winner(board_state) 20 | if result != 0: 21 | return result, None 22 | moves = list(available_moves(board_state)) 23 | if not moves: 24 | return 0, None 25 | 26 | # select a random move 27 | move = random.choice(moves) 28 | result, next_move = monte_carlo_sample(apply_move(board_state, move, side), -side) 29 | return result, move 30 | 31 | 32 | def monte_carlo_tree_search(board_state, side, number_of_samples): 33 | """Evaluate the best from the current board_state for the given side using monte carlo sampling. 34 | 35 | Args: 36 | board_state (3x3 tuple of int): state of the board 37 | side (int): side currently to play. 
+1 for the plus player, -1 for the minus player 38 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 39 | better the estimation of the position 40 | 41 | Returns: 42 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 43 | """ 44 | move_wins = collections.defaultdict(int) 45 | move_samples = collections.defaultdict(int) 46 | for _ in range(number_of_samples): 47 | result, move = monte_carlo_sample(board_state, side) 48 | # store the result and a count of the number of times we have tried this move 49 | if result == side: 50 | move_wins[move] += 1 51 | move_samples[move] += 1 52 | 53 | # get the move with the best average result 54 | move = max(move_wins, key=lambda x: move_wins.get(x) / move_samples[move]) 55 | 56 | return move_wins[move] / move_samples[move], move 57 | 58 | 59 | def _upper_confidence_bounds(payout, samples_for_this_machine, log_total_samples): 60 | return payout / samples_for_this_machine + math.sqrt((2 * log_total_samples) / samples_for_this_machine) 61 | 62 | 63 | def monte_carlo_tree_search_uct(board_state, side, number_of_samples): 64 | """Evaluate the best from the current board_state for the given side using monte carlo sampling with upper 65 | confidence bounds for trees. 66 | 67 | Args: 68 | board_state (3x3 tuple of int): state of the board 69 | side (int): side currently to play. +1 for the plus player, -1 for the minus player 70 | number_of_samples (int): number of samples rollouts to run from the current position, the higher the number the 71 | better the estimation of the position 72 | 73 | Returns: 74 | (result(int), move(int,int)): The average result for the best move from this position and what that move was. 75 | """ 76 | state_results = collections.defaultdict(float) 77 | state_samples = collections.defaultdict(float) 78 | 79 | for _ in range(number_of_samples): 80 | current_side = side 81 | current_board_state = board_state 82 | first_unvisited_node = True 83 | rollout_path = [] 84 | result = 0 85 | 86 | while result == 0: 87 | move_states = {move: apply_move(current_board_state, move, current_side) 88 | for move in available_moves(current_board_state)} 89 | 90 | if not move_states: 91 | result = 0 92 | break 93 | 94 | if all((state in state_samples) for _, state in move_states): 95 | log_total_samples = math.log(sum(state_samples[s] for s in move_states.values())) 96 | move, state = max(move_states, key=lambda _, s: _upper_confidence_bounds(state_results[s], 97 | state_samples[s], 98 | log_total_samples)) 99 | else: 100 | move = random.choice(list(move_states.keys())) 101 | 102 | current_board_state = move_states[move] 103 | 104 | if first_unvisited_node: 105 | rollout_path.append((current_board_state, current_side)) 106 | if current_board_state not in state_samples: 107 | first_unvisited_node = False 108 | 109 | current_side = -current_side 110 | 111 | result = has_winner(current_board_state) 112 | 113 | for path_board_state, path_side in rollout_path: 114 | state_samples[path_board_state] += 1. 115 | result *= path_side 116 | # normalize results to be between 0 and 1 before this it between -1 and 1 117 | result /= 2. 
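        # (UCB1 assumes payouts in [0, 1], so a loss maps to 0, a draw to 0.5 and a win to 1 after the shift on the next line)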
118 | result += .5 119 | state_results[path_board_state] += result 120 | 121 | move_states = {move: apply_move(board_state, move, side) for move in available_moves(board_state)} 122 | 123 | move = max(move_states, key=lambda x: state_results[move_states[x]] / state_samples[move_states[x]]) 124 | 125 | return state_results[move_states[move]] / state_samples[move_states[move]], move 126 | 127 | 128 | if __name__ == '__main__': 129 | board_state = ((1, 0, -1), 130 | (1, 0, 0), 131 | (0, -1, 0)) 132 | 133 | print(monte_carlo_tree_search_uct(board_state, -1, 10000)) 134 | -------------------------------------------------------------------------------- /Chapter 02/Ch2Example.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | from matplotlib.colors import ListedColormap 3 | import matplotlib.pyplot as plt 4 | 5 | def tanh(x): 6 | return (1.0 - numpy.exp(-2*x))/(1.0 + numpy.exp(-2*x)) 7 | 8 | def tanh_derivative(x): 9 | return (1 + tanh(x))*(1 - tanh(x)) 10 | 11 | class NeuralNetwork: 12 | #network consists of a list of integers, indicating 13 | #the number of neurons in each layer 14 | def __init__(self, net_arch): 15 | numpy.random.seed(0) 16 | self.activity = tanh 17 | self.activity_derivative = tanh_derivative 18 | self.layers = len(net_arch) 19 | self.steps_per_epoch = 1000 20 | self.arch = net_arch 21 | 22 | self.weights = [] 23 | #range of weight values (-1,1) 24 | for layer in range(len(net_arch) - 1): 25 | w = 2*numpy.random.rand(net_arch[layer] + 1, net_arch[layer+1]) - 1 26 | self.weights.append(w) 27 | 28 | def fit(self, data, labels, learning_rate=0.1, epochs=10): 29 | #Add bias units to the input layer 30 | ones = numpy.ones((1, data.shape[0])) 31 | Z = numpy.concatenate((ones.T, data), axis=1) 32 | training = epochs*self.steps_per_epoch 33 | 34 | 35 | for k in range(training): 36 | if k % self.steps_per_epoch == 0: 37 | #print ('epochs:', k/self.steps_per_epoch) 38 | print('epochs: {}'.format(k/self.steps_per_epoch)) 39 | for s in data: 40 | print(s, self.predict(s)) 41 | 42 | sample = numpy.random.randint(data.shape[0]) 43 | y = [Z[sample]] 44 | 45 | for i in range(len(self.weights)-1): 46 | activation = numpy.dot(y[i], self.weights[i]) 47 | activity = self.activity(activation) 48 | #add the bias for the next layer 49 | activity = numpy.concatenate((numpy.ones(1), numpy.array(activity))) 50 | y.append(activity) 51 | 52 | #last layer 53 | activation = numpy.dot(y[-1], self.weights[-1]) 54 | activity = self.activity(activation) 55 | y.append(activity) 56 | 57 | #error for the output layer 58 | error = labels[sample] - y[-1] 59 | delta_vec = [error * self.activity_derivative(y[-1])] 60 | 61 | #we need to begin from the back from the next to last layer 62 | for i in range(self.layers-2, 0, -1): 63 | #delta_vec [1].dot(self.weights[i][1:].T) 64 | error = delta_vec[-1].dot(self.weights[i][1:].T) 65 | error = error*self.activity_derivative(y[i][1:]) 66 | delta_vec.append(error) 67 | 68 | # reverse 69 | # [level3(output)->level2(hidden)] => [level2(hidden)->level3(output)] 70 | delta_vec.reverse() 71 | 72 | # backpropagation 73 | # 1. Multiply its output delta and input activation 74 | # to get the gradient of the weight. 75 | # 2. 
Subtract a ratio (percentage) of the gradient from the weight 76 | for i in range(len(self.weights)): 77 | layer = y[i].reshape(1, self.arch[i]+1) 78 | 79 | delta = delta_vec[i].reshape(1, self.arch[i+1]) 80 | self.weights[i] += learning_rate * layer.T.dot(delta) 81 | 82 | def predict(self, x): 83 | val = numpy.concatenate((numpy.ones(1).T, numpy.array(x))) 84 | for i in range(0, len(self.weights)): 85 | val = self.activity(numpy.dot(val, self.weights[i])) 86 | val = numpy.concatenate((numpy.ones(1).T, numpy.array(val))) 87 | 88 | return val[1] 89 | 90 | def plot_decision_regions(self, X, y, points=200): 91 | markers = ('o', '^') 92 | colors = ('red', 'blue') 93 | cmap = ListedColormap(colors) 94 | # plot the decision surface 95 | x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1 96 | x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1 97 | 98 | resolution = max(x1_max - x1_min, x2_max - x2_min)/float(points) 99 | #resolution = 0.01 100 | 101 | xx1, xx2 = numpy.meshgrid(numpy.arange(x1_min, x1_max, resolution), numpy.arange(x2_min, x2_max, resolution)) 102 | input = numpy.array([xx1.ravel(), xx2.ravel()]).T 103 | Z = numpy.empty(0) 104 | for i in range(input.shape[0]): 105 | val = self.predict(numpy.array(input[i])) 106 | if val < 0.5: val = 0 107 | if val >= 0.5: val = 1 108 | Z = numpy.append(Z, val) 109 | 110 | Z = Z.reshape(xx1.shape) 111 | 112 | plt.pcolormesh(xx1, xx2, Z, cmap=cmap) 113 | plt.xlim(xx1.min(), xx1.max()) 114 | plt.ylim(xx2.min(), xx2.max()) 115 | # plot all samples 116 | 117 | classes = ["False", "True"] 118 | for idx, cl in enumerate(numpy.unique(y)): 119 | plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=1.0, c=cmap(idx), marker=markers[idx], s=80, label=classes[idx]) 120 | 121 | plt.xlabel('x-axis') 122 | plt.ylabel('y-axis') 123 | plt.legend(loc='upper left') 124 | plt.show() 125 | 126 | if __name__ == '__main__': 127 | 128 | nn = NeuralNetwork([2,2,1]) 129 | 130 | X = numpy.array([[0, 0], 131 | [0, 1], 132 | [1, 0], 133 | [1, 1]]) 134 | 135 | y = numpy.array([0, 1, 1, 0]) 136 | 137 | nn.fit(X, y, epochs=10) 138 | 139 | print "Final prediction" 140 | for s in X: 141 | print(s, nn.predict(s)) 142 | 143 | nn.plot_decision_regions(X, y) 144 | -------------------------------------------------------------------------------- /Chapter 08/actor_critic_baseline_cart_pole.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import pickle 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | 9 | env = gym.make('CartPole-v0') 10 | 11 | ACTIONS_COUNT = 2 12 | FUTURE_REWARD_DISCOUNT = 0.9 13 | LEARN_RATE_ACTOR = 0.01 14 | LEARN_RATE_CRITIC = 0.01 15 | STORE_SCORES_LEN = 5 16 | GAMES_PER_TRAINING = 3 17 | INPUT_NODES = env.observation_space.shape[0] 18 | 19 | ACTOR_HIDDEN = 20 20 | 21 | session = tf.Session() 22 | 23 | actor_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, ACTOR_HIDDEN], stddev=0.01)) 24 | actor_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[ACTOR_HIDDEN])) 25 | 26 | actor_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([ACTOR_HIDDEN, ACTIONS_COUNT], stddev=0.01)) 27 | actor_feed_forward_bias_2 = tf.Variable(tf.constant(0.1, shape=[ACTIONS_COUNT])) 28 | 29 | actor_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 30 | actor_hidden_layer = tf.nn.tanh( 31 | tf.matmul(actor_input_placeholder, actor_feed_forward_weights_1) + actor_feed_forward_bias_1) 32 | 
actor_output_layer = tf.nn.softmax( 33 | tf.matmul(actor_hidden_layer, actor_feed_forward_weights_2) + actor_feed_forward_bias_2) 34 | 35 | actor_action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT]) 36 | actor_advantage_placeholder = tf.placeholder("float", [None, 1]) 37 | 38 | policy_gradient = tf.reduce_mean(actor_advantage_placeholder * actor_action_placeholder * tf.log(actor_output_layer)) 39 | actor_train_operation = tf.train.AdamOptimizer(LEARN_RATE_ACTOR).minimize(-policy_gradient) 40 | 41 | CRITIC_HIDDEN = 20 42 | 43 | critic_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, CRITIC_HIDDEN], stddev=0.01)) 44 | critic_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[CRITIC_HIDDEN])) 45 | 46 | critic_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([CRITIC_HIDDEN, 1], stddev=0.01)) 47 | critic_feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[1])) 48 | 49 | critic_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 50 | critic_hidden_layer = tf.nn.tanh( 51 | tf.matmul(critic_input_placeholder, critic_feed_forward_weights_1) + critic_feed_forward_bias_1) 52 | critic_output_layer = tf.matmul(critic_hidden_layer, critic_feed_forward_weights_2) + critic_feed_forward_bias_2 53 | 54 | critic_target_placeholder = tf.placeholder("float", [None, 1]) 55 | 56 | critic_cost = tf.reduce_mean(tf.square(critic_target_placeholder - critic_output_layer)) 57 | critic_train_operation = tf.train.AdamOptimizer(LEARN_RATE_CRITIC).minimize(critic_cost) 58 | 59 | critic_advantages = critic_target_placeholder - critic_output_layer 60 | 61 | scores = deque(maxlen=STORE_SCORES_LEN) 62 | 63 | # set the first action to do nothing 64 | last_action = np.zeros(ACTIONS_COUNT) 65 | last_action[1] = 1 66 | 67 | time = 0 68 | 69 | session.run(tf.initialize_all_variables()) 70 | 71 | 72 | def choose_next_action(state): 73 | probability_of_actions = session.run(actor_output_layer, feed_dict={actor_input_placeholder: [state]})[0] 74 | try: 75 | move = np.random.multinomial(1, probability_of_actions) 76 | except ValueError: 77 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 
78 | # so need to reduce slightly to be a valid value 79 | move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-6)) 80 | return move 81 | 82 | 83 | def train(states, actions_taken, advantages): 84 | # learn that these actions in these states lead to this reward 85 | session.run(actor_train_operation, feed_dict={ 86 | actor_input_placeholder: states, 87 | actor_action_placeholder: actions_taken, 88 | actor_advantage_placeholder: advantages}) 89 | 90 | 91 | last_state = env.reset() 92 | total_reward = 0 93 | current_game_observations = [] 94 | current_game_rewards = [] 95 | current_game_actions = [] 96 | 97 | episode_observation = [] 98 | episode_rewards = [] 99 | episode_actions = [] 100 | games = 0 101 | 102 | critic_costs = deque(maxlen=100) 103 | 104 | while True: 105 | env.render() 106 | last_action = choose_next_action(last_state) 107 | current_state, reward, terminal, info = env.step(np.argmax(last_action)) 108 | total_reward += reward 109 | 110 | if terminal: 111 | reward = -.10 112 | 113 | current_game_observations.append(last_state) 114 | current_game_rewards.append(reward) 115 | current_game_actions.append(last_action) 116 | 117 | if terminal: 118 | games += 1 119 | scores.append(total_reward) 120 | 121 | # get temporal difference values for critic 122 | cumulative_reward = 0 123 | for i in reversed(range(len(current_game_observations))): 124 | cumulative_reward = current_game_rewards[i] + FUTURE_REWARD_DISCOUNT * cumulative_reward 125 | current_game_rewards[i] = [cumulative_reward] 126 | 127 | _, cost, advantages = session.run([critic_train_operation, critic_cost, critic_advantages], { 128 | critic_input_placeholder: current_game_observations, 129 | critic_target_placeholder: current_game_rewards}) 130 | 131 | critic_costs.append(cost) 132 | 133 | print("Game: %s reward %s average scores %s critic cost %s" % 134 | (games, total_reward, 135 | np.mean(scores), np.mean(critic_costs))) 136 | 137 | episode_observation.extend(current_game_observations) 138 | episode_actions.extend(current_game_actions) 139 | episode_rewards.extend(advantages) 140 | 141 | total_reward = 0 142 | current_game_observations = [] 143 | current_game_rewards = [] 144 | current_game_actions = [] 145 | 146 | if games % GAMES_PER_TRAINING == 0: 147 | train(episode_observation, episode_actions, episode_rewards) 148 | 149 | episode_observation = [] 150 | episode_actions = [] 151 | episode_rewards = [] 152 | 153 | time += 1 154 | 155 | # update the old values 156 | if terminal: 157 | last_state = env.reset() 158 | else: 159 | last_state = current_state 160 | -------------------------------------------------------------------------------- /Chapter 07/tic_tac_toe.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a 3 by 3 board. 3 | Two players take turns making moves on squares of the board, the first to get 3 in a row, including diagonals, wins. If 4 | there are no valid moves left to make the game ends a draw. 5 | 6 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 7 | where each player plays. 8 | The board is represented by a 3 x 3 tuple of ints. A 0 means no player has played in a space, 1 means player one has 9 | played there, -1 means the seconds player has played there. The apply_move method can be used to return a copy of a 10 | given state with a given move applied. 
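For instance, a minimal sketch of that usage (assuming this module is importable as tic_tac_toe):

from tic_tac_toe import apply_move, has_winner

board = ((0, 0, 0), (0, 0, 0), (0, 0, 0))  # empty board
board = apply_move(board, (0, 0), 1)       # player one takes the top-left square
board = apply_move(board, (1, 1), -1)      # player two takes the centre
print(board)                               # ((1, 0, 0), (0, -1, 0), (0, 0, 0))
print(has_winner(board))                   # 0 - no one has three in a line yet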
This can be useful for doing min-max or monte carlo sampling. 11 | """ 12 | import random 13 | import itertools 14 | 15 | 16 | def _new_board(): 17 | """Return a emprty tic-tac-toe board we can use for simulating a game. 18 | 19 | Returns: 20 | 3x3 tuple of ints 21 | """ 22 | return ((0, 0, 0), 23 | (0, 0, 0), 24 | (0, 0, 0)) 25 | 26 | 27 | def apply_move(board_state, move, side): 28 | """Returns a copy of the given board_state with the desired move applied. 29 | 30 | Args: 31 | board_state (3x3 tuple of int): The given board_state we want to apply the move to. 32 | move (int, int): The position we want to make the move in. 33 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 34 | 35 | Returns: 36 | (3x3 tuple of int): A copy of the board_state with the given move applied for the given side. 37 | """ 38 | move_x, move_y = move 39 | 40 | def get_tuples(): 41 | for x in range(3): 42 | if move_x == x: 43 | temp = list(board_state[x]) 44 | temp[move_y] = side 45 | yield tuple(temp) 46 | else: 47 | yield board_state[x] 48 | 49 | return tuple(get_tuples()) 50 | 51 | 52 | def available_moves(board_state): 53 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 54 | pieces played. 55 | 56 | Args: 57 | board_state: The board_state we want to check for valid moves. 58 | 59 | Returns: 60 | Generator of (int, int): All the valid moves that can be played in this position. 61 | """ 62 | for x, y in itertools.product(range(3), range(3)): 63 | if board_state[x][y] == 0: 64 | yield (x, y) 65 | 66 | 67 | def _has_3_in_a_line(line): 68 | return all(x == -1 for x in line) | all(x == 1 for x in line) 69 | 70 | 71 | def has_winner(board_state): 72 | """Determine if a player has won on the given board_state. 73 | 74 | Args: 75 | board_state (3x3 tuple of int): The current board_state we want to evaluate. 76 | 77 | Returns: 78 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 79 | """ 80 | # check rows 81 | for x in range(3): 82 | if _has_3_in_a_line(board_state[x]): 83 | return board_state[x][0] 84 | # check columns 85 | for y in range(3): 86 | if _has_3_in_a_line([i[y] for i in board_state]): 87 | return board_state[0][y] 88 | 89 | # check diagonals 90 | if _has_3_in_a_line([board_state[i][i] for i in range(3)]): 91 | return board_state[0][0] 92 | if _has_3_in_a_line([board_state[2 - i][i] for i in range(3)]): 93 | return board_state[0][2] 94 | 95 | return 0 # no one has won, return 0 for a draw 96 | 97 | 98 | def play_game(plus_player_func, minus_player_func, log=False): 99 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 100 | player. 101 | 102 | Args: 103 | plus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 104 | current board_state and side this player is playing, and returns the move the player wants to play. 105 | minus_player_func ((board_state(3 by 3 tuple of int), side(int)) -> move((int, int))): Function that takes the 106 | current board_state and side this player is playing, and returns the move the player wants to play. 
107 | log (bool): If True progress is logged to console, defaults to False 108 | 109 | Returns: 110 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 111 | """ 112 | board_state = _new_board() 113 | player_turn = 1 114 | 115 | while True: 116 | _available_moves = list(available_moves(board_state)) 117 | 118 | if len(_available_moves) == 0: 119 | # draw 120 | if log: 121 | print("no moves left, game ended a draw") 122 | return 0. 123 | if player_turn > 0: 124 | move = plus_player_func(board_state, 1) 125 | else: 126 | move = minus_player_func(board_state, -1) 127 | 128 | if move not in _available_moves: 129 | # if a player makes an invalid move the other player wins 130 | if log: 131 | print("illegal move ", move) 132 | return -player_turn 133 | 134 | board_state = apply_move(board_state, move, player_turn) 135 | if log: 136 | print(board_state) 137 | 138 | winner = has_winner(board_state) 139 | if winner != 0: 140 | if log: 141 | print("we have a winner, side: %s" % player_turn) 142 | return winner 143 | player_turn = -player_turn 144 | 145 | 146 | def random_player(board_state, _): 147 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 148 | valid moves in the current state. 149 | 150 | Args: 151 | board_state (3x3 tuple of int): The current state of the board 152 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 153 | 154 | Returns: 155 | (int, int): the move we want to play on the current board 156 | """ 157 | moves = list(available_moves(board_state)) 158 | return random.choice(moves) 159 | 160 | 161 | if __name__ == '__main__': 162 | # example of playing a game 163 | play_game(random_player, random_player, log=True) -------------------------------------------------------------------------------- /Chapter 08/actor_critic_advantage_cart_pole.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import pickle 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | env = gym.make('CartPole-v0') 11 | 12 | ACTIONS_COUNT = 2 13 | FUTURE_REWARD_DISCOUNT = 0.9 14 | LEARN_RATE_ACTOR = 0.01 15 | LEARN_RATE_CRITIC = 0.01 16 | STORE_SCORES_LEN = 5 17 | GAMES_PER_TRAINING = 3 18 | INPUT_NODES = env.observation_space.shape[0] 19 | 20 | ACTOR_HIDDEN = 20 21 | CRITIC_HIDDEN = 20 22 | 23 | session = tf.Session() 24 | 25 | actor_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, ACTOR_HIDDEN], stddev=0.01)) 26 | actor_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[ACTOR_HIDDEN])) 27 | 28 | actor_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([ACTOR_HIDDEN, ACTIONS_COUNT], stddev=0.01)) 29 | actor_feed_forward_bias_2 = tf.Variable(tf.constant(0.1, shape=[ACTIONS_COUNT])) 30 | 31 | actor_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 32 | actor_hidden_layer = tf.nn.tanh( 33 | tf.matmul(actor_input_placeholder, actor_feed_forward_weights_1) + actor_feed_forward_bias_1) 34 | actor_output_layer = tf.nn.softmax( 35 | tf.matmul(actor_hidden_layer, actor_feed_forward_weights_2) + actor_feed_forward_bias_2) 36 | 37 | actor_action_placeholder = tf.placeholder("float", [None, ACTIONS_COUNT]) 38 | actor_advantage_placeholder = tf.placeholder("float", [None, 1]) 39 | 40 | policy_gradient = 
tf.reduce_mean(actor_advantage_placeholder * actor_action_placeholder * tf.log(actor_output_layer)) 41 | actor_train_operation = tf.train.AdamOptimizer(LEARN_RATE_ACTOR).minimize(-policy_gradient) 42 | 43 | critic_feed_forward_weights_1 = tf.Variable(tf.truncated_normal([INPUT_NODES, CRITIC_HIDDEN], stddev=0.01)) 44 | critic_feed_forward_bias_1 = tf.Variable(tf.constant(0.0, shape=[CRITIC_HIDDEN])) 45 | 46 | critic_feed_forward_weights_2 = tf.Variable(tf.truncated_normal([CRITIC_HIDDEN, 1], stddev=0.01)) 47 | critic_feed_forward_bias_2 = tf.Variable(tf.constant(0.0, shape=[1])) 48 | 49 | critic_input_placeholder = tf.placeholder("float", [None, INPUT_NODES]) 50 | critic_hidden_layer = tf.nn.tanh( 51 | tf.matmul(critic_input_placeholder, critic_feed_forward_weights_1) + critic_feed_forward_bias_1) 52 | critic_output_layer = tf.matmul(critic_hidden_layer, critic_feed_forward_weights_2) + critic_feed_forward_bias_2 53 | 54 | critic_target_placeholder = tf.placeholder("float", [None, 1]) 55 | 56 | critic_cost = tf.reduce_mean(tf.square(critic_target_placeholder - critic_output_layer)) 57 | critic_train_operation = tf.train.AdamOptimizer(LEARN_RATE_CRITIC).minimize(critic_cost) 58 | 59 | critic_baseline = critic_target_placeholder - critic_output_layer 60 | 61 | scores = deque(maxlen=STORE_SCORES_LEN) 62 | 63 | # set the first action to do nothing 64 | last_action = np.zeros(ACTIONS_COUNT) 65 | last_action[1] = 1 66 | 67 | time = 0 68 | 69 | session.run(tf.initialize_all_variables()) 70 | 71 | 72 | def choose_next_action(state): 73 | probability_of_actions = session.run(actor_output_layer, feed_dict={actor_input_placeholder: [state]})[0] 74 | try: 75 | move = np.random.multinomial(1, probability_of_actions) 76 | except ValueError: 77 | # sometimes because of rounding errors we end up with probability_of_actions summing to greater than 1. 
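Stepping back to the policy_gradient op defined near the top of this script, a minimal numpy sketch (with toy numbers, not taken from a real run) of the quantity it computes:

import numpy as np

probs = np.array([[0.7, 0.3],    # actor softmax outputs for a toy batch of two states
                  [0.2, 0.8]])
actions = np.array([[1.0, 0.0],  # one-hot encoding of the actions actually taken
                    [0.0, 1.0]])
advantages = np.array([[1.5],    # advantage estimate for each state
                       [-0.5]])

# The one-hot mask zeroes out every log-probability except the chosen action's, so this
# is the mean of advantage * log pi(action | state), spread over batch * action entries,
# matching tf.reduce_mean over the same product.
objective = np.mean(advantages * actions * np.log(probs))
print(objective)  # about -0.106

The training op then minimises the negative of this quantity, i.e. performs gradient ascent on it.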
78 | # so need to reduce slightly to be a valid value 79 | move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-6)) 80 | return move 81 | 82 | 83 | def train(states, actions_taken, advantages): 84 | # learn that these actions in these states lead to this reward 85 | session.run(actor_train_operation, feed_dict={ 86 | actor_input_placeholder: states, 87 | actor_action_placeholder: actions_taken, 88 | actor_advantage_placeholder: advantages}) 89 | 90 | 91 | last_state = env.reset() 92 | total_reward = 0 93 | current_game_observations = [] 94 | current_game_rewards = [] 95 | current_game_actions = [] 96 | 97 | episode_observation = [] 98 | episode_rewards = [] 99 | episode_actions = [] 100 | games = 0 101 | plot_x = [] 102 | plot_y = [] 103 | 104 | critic_costs = deque(maxlen=10) 105 | 106 | 107 | while True: 108 | env.render() 109 | last_action = choose_next_action(last_state) 110 | current_state, reward, terminal, info = env.step(np.argmax(last_action)) 111 | total_reward += reward 112 | 113 | if terminal: 114 | reward = -.10 115 | else: 116 | reward = 0.1 117 | 118 | current_game_observations.append(last_state) 119 | current_game_rewards.append(reward) 120 | current_game_actions.append(last_action) 121 | 122 | if terminal: 123 | games += 1 124 | scores.append(total_reward) 125 | 126 | if games % STORE_SCORES_LEN == 0: 127 | plot_x.append(games) 128 | plot_y.append(np.mean(scores)) 129 | 130 | # get temporal difference values for critic 131 | cumulative_reward = 0 132 | for i in reversed(range(len(current_game_observations))): 133 | cumulative_reward = current_game_rewards[i] + FUTURE_REWARD_DISCOUNT * cumulative_reward 134 | current_game_rewards[i] = [cumulative_reward] 135 | 136 | values_t = session.run(critic_output_layer, { 137 | critic_input_placeholder: current_game_observations}) 138 | advantages = [] 139 | 140 | for i in range(len(current_game_observations) - 1): 141 | advantages.append([current_game_rewards[i][0] + FUTURE_REWARD_DISCOUNT*values_t[i+1][0] - values_t[i][0]]) 142 | 143 | advantages.append([current_game_rewards[-1][0]-values_t[-1][0]]) 144 | 145 | _, cost = session.run([critic_train_operation, critic_cost], { 146 | critic_input_placeholder: current_game_observations, 147 | critic_target_placeholder: current_game_rewards}) 148 | 149 | critic_costs.append(cost) 150 | 151 | print("Game: %s reward %s average scores %s critic cost %s" % 152 | (games, total_reward, 153 | np.mean(scores), np.mean(critic_costs))) 154 | 155 | episode_observation.extend(current_game_observations) 156 | episode_actions.extend(current_game_actions) 157 | episode_rewards.extend(advantages) 158 | 159 | total_reward = 0 160 | current_game_observations = [] 161 | current_game_rewards = [] 162 | current_game_actions = [] 163 | 164 | if games % GAMES_PER_TRAINING == 0: 165 | episode_rewards = np.array(episode_rewards) 166 | normalized_rewards = episode_rewards - np.mean(episode_rewards) 167 | normalized_rewards /= np.std(normalized_rewards) 168 | 169 | train(episode_observation, episode_actions, normalized_rewards) 170 | 171 | episode_observation = [] 172 | episode_actions = [] 173 | episode_rewards = [] 174 | 175 | time += 1 176 | 177 | # update the old values 178 | if terminal: 179 | last_state = env.reset() 180 | else: 181 | last_state = current_state -------------------------------------------------------------------------------- /Chapter 07/connect_4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for 
running a game of connect 4 on a board_width, board_height and winning length can be specified in relevant 3 | methods. Allowing you to play connect 5, 6, 7, etc. Defaults are board_width = 7, board_height = 6, winning_length = 4 4 | 5 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 6 | where each player plays. 7 | The board is represented by a board_width x board_height tuple of ints. A 0 means no player has played in a space, 1 8 | means player one has played there, -1 means the seconds player has played there. The apply_move method can be used to 9 | return a copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 10 | """ 11 | 12 | import random 13 | 14 | 15 | def _new_board(board_width, board_height): 16 | """Return a emprty tic-tac-toe board we can use for simulating a game. 17 | 18 | Args: 19 | board_width (int): The width of the board, a board_width * board_height board is created 20 | board_height (int): The height of the board, a board_width * board_height board is created 21 | 22 | Returns: 23 | board_width x board_height tuple of ints 24 | """ 25 | return tuple(tuple(0 for _ in range(board_height)) for _ in range(board_width)) 26 | 27 | 28 | def apply_move(board_state, move_x, side): 29 | """Returns a copy of the given board_state with the desired move applied. 30 | 31 | Args: 32 | board_state (2d tuple of int): The given board_state we want to apply the move to. 33 | move_x (int): Which column we are going to "drop" our piece in 34 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 35 | 36 | Returns: 37 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 38 | """ 39 | # find position in which move will settle 40 | move_y = 0 41 | for x in board_state[move_x]: 42 | if x == 0: 43 | break 44 | else: 45 | move_y += 1 46 | 47 | def get_tuples(): 48 | for i in range(len(board_state)): 49 | if move_x == i: 50 | temp = list(board_state[i]) 51 | temp[move_y] = side 52 | yield tuple(temp) 53 | else: 54 | yield board_state[i] 55 | 56 | return tuple(get_tuples()) 57 | 58 | 59 | def available_moves(board_state): 60 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 61 | pieces played. 62 | 63 | Args: 64 | board_state: The board_state we want to check for valid moves. 65 | 66 | Returns: 67 | Generator of int: All the valid moves that can be played in this position. 68 | """ 69 | for x in range(len(board_state)): 70 | if any(y == 0 for y in board_state[x]): 71 | yield x 72 | 73 | 74 | def _has_winning_line(line, winning_length): 75 | count = 0 76 | last_side = 0 77 | for x in line: 78 | if x == last_side: 79 | count += 1 80 | if count == winning_length: 81 | return last_side 82 | else: 83 | count = 1 84 | last_side = x 85 | return 0 86 | 87 | 88 | def has_winner(board_state, winning_length=4): 89 | """Determine if a player has won on the given board_state. 90 | 91 | Args: 92 | board_state (2d tuple of int): The current board_state we want to evaluate. 93 | winning_length (int): The number of moves in a row needed for a win. 94 | 95 | Returns: 96 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 
97 | """ 98 | board_width = len(board_state) 99 | board_height = len(board_state[0]) 100 | 101 | # check rows 102 | for x in range(board_width): 103 | winner = _has_winning_line(board_state[x], winning_length) 104 | if winner != 0: 105 | return winner 106 | # check columns 107 | for y in range(board_height): 108 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 109 | if winner != 0: 110 | return winner 111 | 112 | # check diagonals 113 | diagonals_start = -(board_width - winning_length) 114 | diagonals_end = (board_width - winning_length) 115 | for d in range(diagonals_start, diagonals_end): 116 | winner = _has_winning_line( 117 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 118 | winning_length) 119 | if winner != 0: 120 | return winner 121 | for d in range(diagonals_start, diagonals_end): 122 | winner = _has_winning_line( 123 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 124 | winning_length) 125 | if winner != 0: 126 | return winner 127 | 128 | return 0 # no one has won, return 0 for a draw 129 | 130 | 131 | def play_game(plus_player_func, minus_player_func, board_width=7, board_height=6, winning_length=4, log=False): 132 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 133 | player. 134 | 135 | Args: 136 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 137 | Function that takes the current board_state and side this player is playing, and returns the move the player 138 | wants to play. 139 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 140 | Function that takes the current board_state and side this player is playing, and returns the move the player 141 | wants to play. 142 | board_width (int): The width of the board, a board_width * board_height board is created 143 | board_height (int): The height of the board, a board_width * board_height board is created 144 | winning_length (int): The number of pieces in a row needed to win a game. 145 | log (bool): If True progress is logged to console, defaults to False 146 | 147 | Returns: 148 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 149 | """ 150 | board_state = _new_board(board_width, board_height) 151 | player_turn = 1 152 | 153 | while True: 154 | _avialable_moves = list(available_moves(board_state)) 155 | if len(_avialable_moves) == 0: 156 | # draw 157 | if log: 158 | print("no moves left, game ended a draw") 159 | return 0. 160 | if player_turn > 0: 161 | move = plus_player_func(board_state, 1) 162 | else: 163 | move = minus_player_func(board_state, -1) 164 | 165 | if move not in _avialable_moves: 166 | # if a player makes an invalid move the other player wins 167 | if log: 168 | print("illegal move ", move) 169 | return -player_turn 170 | 171 | board_state = apply_move(board_state, move, player_turn) 172 | if log: 173 | print(board_state) 174 | 175 | winner = has_winner(board_state, winning_length) 176 | if winner != 0: 177 | if log: 178 | print("we have a winner, side: %s" % player_turn) 179 | return winner 180 | player_turn = -player_turn 181 | 182 | 183 | def random_player(board_state, _): 184 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 185 | valid moves in the current state. 
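As a quick standalone sketch of the drop mechanics and win detection defined above (assuming this module is importable as connect_4):

from connect_4 import _new_board, apply_move, has_winner

board = _new_board(7, 6)
for _ in range(4):        # player one drops four pieces into column 3
    board = apply_move(board, 3, 1)

print(board[3])                              # (1, 1, 1, 1, 0, 0) - pieces stack from index 0 upwards
print(has_winner(board, winning_length=4))   # 1 - four in a vertical line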
186 | 187 | Args: 188 | board_state (2d tuple of int): The current state of the board 189 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 190 | 191 | Returns: 192 | (int, int): the move we want to play on the current board 193 | """ 194 | moves = list(available_moves(board_state)) 195 | return random.choice(moves) 196 | 197 | 198 | if __name__ == '__main__': 199 | # example of playing a game 200 | play_game(random_player, random_player, log=True, board_width=7, board_height=6, winning_length=4) 201 | -------------------------------------------------------------------------------- /Chapter 07/tic_tac_toe_x.py: -------------------------------------------------------------------------------- 1 | """ 2 | Full code for running a game of tic-tac-toe on a board of any size with a specified number in a row for the win. This is 3 | similar to tic_tac_toe.py but all relevent moves are paramiterized by board_size arg that sets how big the board is and 4 | winning_length which determines how many in a row are needed to win. Defaults are 5 and 4. This allows you to play games 5 | in a more complex environment than standard tic-tac-toe. 6 | 7 | Two players take turns making moves on squares of the board, the first to get winning_length in a row, including 8 | diagonals, wins. If there are no valid moves left to make the game ends a draw. 9 | 10 | The main method to use here is play_game which simulates a game to the end using the function args it takes to determine 11 | where each player plays. 12 | The board is represented by a board_size x board_size tuple of ints. A 0 means no player has played in a space, 1 means 13 | player one has played there, -1 means the seconds player has played there. The apply_move method can be used to return a 14 | copy of a given state with a given move applied. This can be useful for doing min-max or monte carlo sampling. 15 | """ 16 | import random 17 | import itertools 18 | 19 | 20 | def _new_board(board_size): 21 | """Return a emprty tic-tac-toe board we can use for simulating a game. 22 | 23 | Args: 24 | board_size (int): The size of one side of the board, a board_size * board_size board is created 25 | 26 | Returns: 27 | board_size x board_size tuple of ints 28 | """ 29 | return tuple(tuple(0 for _ in range(board_size)) for _ in range(board_size)) 30 | 31 | 32 | def apply_move(board_state, move, side): 33 | """Returns a copy of the given board_state with the desired move applied. 34 | 35 | Args: 36 | board_state (2d tuple of int): The given board_state we want to apply the move to. 37 | move (int, int): The position we want to make the move in. 38 | side (int): The side we are making this move for, 1 for the first player, -1 for the second player. 39 | 40 | Returns: 41 | (2d tuple of int): A copy of the board_state with the given move applied for the given side. 42 | """ 43 | move_x, move_y = move 44 | 45 | def get_tuples(): 46 | for x in range(len(board_state)): 47 | if move_x == x: 48 | temp = list(board_state[x]) 49 | temp[move_y] = side 50 | yield tuple(temp) 51 | else: 52 | yield board_state[x] 53 | 54 | return tuple(get_tuples()) 55 | 56 | 57 | def available_moves(board_state): 58 | """Get all legal moves for the current board_state. For Tic-tac-toe that is all positions that do not currently have 59 | pieces played. 60 | 61 | Args: 62 | board_state: The board_state we want to check for valid moves. 
63 | 64 | Returns: 65 | Generator of (int, int): All the valid moves that can be played in this position. 66 | """ 67 | for x, y in itertools.product(range(len(board_state)), range(len(board_state[0]))): 68 | if board_state[x][y] == 0: 69 | yield (x, y) 70 | 71 | 72 | def _has_winning_line(line, winning_length): 73 | count = 0 74 | last_side = 0 75 | for x in line: 76 | if x == last_side: 77 | count += 1 78 | if count == winning_length: 79 | return last_side 80 | else: 81 | count = 1 82 | last_side = x 83 | return 0 84 | 85 | 86 | def has_winner(board_state, winning_length): 87 | """Determine if a player has won on the given board_state. 88 | 89 | Args: 90 | board_state (2d tuple of int): The current board_state we want to evaluate. 91 | winning_length (int): The number of moves in a row needed for a win. 92 | 93 | Returns: 94 | int: 1 if player one has won, -1 if player 2 has won, otherwise 0. 95 | """ 96 | board_width = len(board_state) 97 | board_height = len(board_state[0]) 98 | 99 | # check rows 100 | for x in range(board_width): 101 | winner = _has_winning_line(board_state[x], winning_length) 102 | if winner != 0: 103 | return winner 104 | # check columns 105 | for y in range(board_height): 106 | winner = _has_winning_line((i[y] for i in board_state), winning_length) 107 | if winner != 0: 108 | return winner 109 | 110 | # check diagonals 111 | diagonals_start = -(board_width - winning_length) 112 | diagonals_end = (board_width - winning_length) 113 | for d in range(diagonals_start, diagonals_end+1): 114 | winner = _has_winning_line( 115 | (board_state[i][i + d] for i in range(max(-d, 0), min(board_width, board_height - d))), 116 | winning_length) 117 | if winner != 0: 118 | return winner 119 | for d in range(diagonals_start, diagonals_end+1): 120 | winner = _has_winning_line( 121 | (board_state[i][board_height - i - d - 1] for i in range(max(-d, 0), min(board_width, board_height - d))), 122 | winning_length) 123 | if winner != 0: 124 | return winner 125 | 126 | return 0 # no one has won, return 0 for a draw 127 | 128 | 129 | def play_game(plus_player_func, minus_player_func, board_size=5, winning_length=4, log=False): 130 | """Run a single game of tic-tac-toe until the end, using the provided function args to determine the moves for each 131 | player. 132 | 133 | Args: 134 | plus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 135 | Function that takes the current board_state and side this player is playing, and returns the move the player 136 | wants to play. 137 | minus_player_func ((board_state(board_size by board_size tuple of int), side(int)) -> move((int, int))): 138 | Function that takes the current board_state and side this player is playing, and returns the move the player 139 | wants to play. 140 | board_size (int): The size of a single side of the board. Game is played on a board_size*board_size sized board 141 | winning_length (int): The number of pieces in a row needed to win a game. 142 | log (bool): If True progress is logged to console, defaults to False 143 | 144 | Returns: 145 | int: 1 if the plus_player_func won, -1 if the minus_player_func won and 0 for a draw 146 | """ 147 | board_state = _new_board(board_size) 148 | player_turn = 1 149 | 150 | while True: 151 | _available_moves = list(available_moves(board_state)) 152 | if len(_available_moves) == 0: 153 | # draw 154 | if log: 155 | print("no moves left, game ended a draw") 156 | return 0. 
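A standalone sketch of the generalised win detection above on a 5x5 board (assuming this module is importable as tic_tac_toe_x):

from tic_tac_toe_x import _new_board, apply_move, has_winner

board = _new_board(5)
for i in range(4):        # four in a row along the main diagonal
    board = apply_move(board, (i, i), 1)

print(has_winner(board, winning_length=4))   # 1 - player one wins
print(has_winner(board, winning_length=5))   # 0 - five in a row would be needed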
157 | if player_turn > 0: 158 | move = plus_player_func(board_state, 1) 159 | else: 160 | move = minus_player_func(board_state, -1) 161 | 162 | if move not in _available_moves: 163 | # if a player makes an invalid move the other player wins 164 | if log: 165 | print("illegal move ", move) 166 | return -player_turn 167 | 168 | board_state = apply_move(board_state, move, player_turn) 169 | print(board_state) 170 | 171 | winner = has_winner(board_state, winning_length) 172 | if winner != 0: 173 | if log: 174 | print("we have a winner, side: %s" % player_turn) 175 | return winner 176 | player_turn = -player_turn 177 | 178 | 179 | def random_player(board_state, _): 180 | """A player func that can be used in the play_game method. Given a board state it chooses a move randomly from the 181 | valid moves in the current state. 182 | 183 | Args: 184 | board_state (2d tuple of int): The current state of the board 185 | _: the side this player is playing, not used in this function because we are simply choosing the moves randomly 186 | 187 | Returns: 188 | (int, int): the move we want to play on the current board 189 | """ 190 | moves = list(available_moves(board_state)) 191 | return random.choice(moves) 192 | 193 | 194 | if __name__ == '__main__': 195 | # example of playing a game 196 | play_game(random_player, random_player, log=True, board_size=10, winning_length=4) 197 | -------------------------------------------------------------------------------- /Chapter 08/deep_q_breakout.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import pickle 3 | import random 4 | from collections import deque 5 | 6 | import tensorflow as tf 7 | import gym 8 | import numpy as np 9 | import os 10 | 11 | import zlib 12 | 13 | resume = True 14 | CHECKPOINT_PATH = 'deep_q_breakout_path' 15 | ACTIONS_COUNT = 3 16 | SCREEN_WIDTH, SCREEN_HEIGHT = (72, 84) 17 | FUTURE_REWARD_DISCOUNT = 0.99 18 | OBSERVATION_STEPS = 100000. # time steps to observe before training 19 | EXPLORE_STEPS = 2000000. 
# frames over which to anneal epsilon 20 | INITIAL_RANDOM_ACTION_PROB = 1.0 # starting chance of an action being random 21 | FINAL_RANDOM_ACTION_PROB = 0.05 # final chance of an action being random 22 | MEMORY_SIZE = 800000 # number of observations to remember 23 | MINI_BATCH_SIZE = 128 # size of mini batches 24 | STATE_FRAMES = 2 # number of frames to store in the state 25 | OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5) 26 | SAVE_EVERY_X_STEPS = 20000 27 | LEARN_RATE = 1e-4 28 | STORE_SCORES_LEN = 100 29 | verbose_logging = True 30 | 31 | 32 | def _create_network(): 33 | CONVOLUTIONS_LAYER_1 = 32 34 | CONVOLUTIONS_LAYER_2 = 64 35 | CONVOLUTIONS_LAYER_3 = 64 36 | FLAT_SIZE = 11*9*CONVOLUTIONS_LAYER_3 37 | FLAT_HIDDEN_NODES = 512 38 | 39 | # network weights 40 | convolution_weights_1 = tf.Variable(tf.truncated_normal([8, 8, STATE_FRAMES, CONVOLUTIONS_LAYER_1], stddev=0.01)) 41 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_1])) 42 | 43 | convolution_weights_2 = tf.Variable(tf.truncated_normal([4, 4, CONVOLUTIONS_LAYER_1, CONVOLUTIONS_LAYER_2], stddev=0.01)) 44 | convolution_bias_2 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 45 | 46 | convolution_weights_3 = tf.Variable(tf.truncated_normal([3, 3, CONVOLUTIONS_LAYER_2, CONVOLUTIONS_LAYER_3], stddev=0.01)) 47 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[CONVOLUTIONS_LAYER_2])) 48 | 49 | feed_forward_weights_1 = tf.Variable(tf.truncated_normal([FLAT_SIZE, FLAT_HIDDEN_NODES], stddev=0.01)) 50 | feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[FLAT_HIDDEN_NODES])) 51 | 52 | feed_forward_weights_2 = tf.Variable(tf.truncated_normal([FLAT_HIDDEN_NODES, ACTIONS_COUNT], stddev=0.01)) 53 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[ACTIONS_COUNT])) 54 | 55 | input_layer = tf.placeholder("float", [None, SCREEN_HEIGHT, SCREEN_WIDTH, 56 | STATE_FRAMES]) 57 | 58 | hidden_convolutional_layer_1 = tf.nn.relu( 59 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 4, 4, 1], padding="SAME") + convolution_bias_1) 60 | 61 | hidden_convolutional_layer_2 = tf.nn.relu( 62 | tf.nn.conv2d(hidden_convolutional_layer_1, convolution_weights_2, strides=[1, 2, 2, 1], 63 | padding="SAME") + convolution_bias_2) 64 | 65 | hidden_convolutional_layer_3 = tf.nn.relu( 66 | tf.nn.conv2d(hidden_convolutional_layer_2, convolution_weights_3, strides=[1, 1, 1, 1], 67 | padding="SAME") + convolution_bias_3) 68 | 69 | hidden_convolutional_layer_3_flat = tf.reshape(hidden_convolutional_layer_3, [-1, FLAT_SIZE]) 70 | 71 | final_hidden_activations = tf.nn.relu( 72 | tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 73 | 74 | output_layer = tf.matmul(final_hidden_activations, feed_forward_weights_2) + feed_forward_bias_2 75 | 76 | return input_layer, output_layer 77 | 78 | 79 | _session = tf.Session() 80 | _input_layer, _output_layer = _create_network() 81 | 82 | _action = tf.placeholder("float", [None, ACTIONS_COUNT]) 83 | _target = tf.placeholder("float", [None]) 84 | 85 | readout_action = tf.reduce_sum(tf.mul(_output_layer, _action), reduction_indices=1) 86 | 87 | cost = tf.reduce_mean(tf.square(_target - readout_action)) 88 | _train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost) 89 | 90 | _observations = deque(maxlen=MEMORY_SIZE) 91 | _last_scores = deque(maxlen=STORE_SCORES_LEN) 92 | 93 | # set the first action to do nothing 94 | _last_action = 
np.zeros(ACTIONS_COUNT) 95 | _last_action[1] = 1 96 | 97 | _last_state = None 98 | _probability_of_random_action = INITIAL_RANDOM_ACTION_PROB 99 | _time = 0 100 | 101 | _session.run(tf.initialize_all_variables()) 102 | 103 | saver = tf.train.Saver() 104 | 105 | if not os.path.exists(CHECKPOINT_PATH): 106 | os.mkdir(CHECKPOINT_PATH) 107 | 108 | if resume: 109 | checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_PATH) 110 | if checkpoint: 111 | saver.restore(_session, checkpoint.model_checkpoint_path) 112 | 113 | 114 | def _choose_next_action(state): 115 | new_action = np.zeros([ACTIONS_COUNT]) 116 | 117 | if random.random() <= _probability_of_random_action: 118 | # choose an action randomly 119 | action_index = random.randrange(ACTIONS_COUNT) 120 | else: 121 | # choose an action given our last state 122 | readout_t = _session.run(_output_layer, feed_dict={_input_layer: [state]})[0] 123 | if verbose_logging: 124 | print("Action Q-Values are %s" % readout_t) 125 | action_index = np.argmax(readout_t) 126 | 127 | new_action[action_index] = 1 128 | return new_action 129 | 130 | 131 | def pre_process(screen_image): 132 | """ change the 210x160x3 uint8 frame into 84x72 float """ 133 | screen_image = screen_image[32:-10, 8:-8] # crop 134 | screen_image = screen_image[::2, ::2, 0] # downsample by factor of 2 135 | screen_image[screen_image != 0] = 1 # set everything is either black:0 or white:1 136 | return screen_image.astype(np.float) 137 | 138 | 139 | def _key_presses_from_action(action_set): 140 | if action_set[0] == 1: 141 | return 1 142 | elif action_set[1] == 1: 143 | return 2 144 | elif action_set[2] == 1: 145 | return 3 146 | raise Exception("Unexpected action") 147 | 148 | 149 | def _train(): 150 | # sample a mini_batch to train on 151 | mini_batch_compressed = random.sample(_observations, MINI_BATCH_SIZE) 152 | mini_batch = [pickle.loads(zlib.decompress(comp_item)) for comp_item in mini_batch_compressed] 153 | 154 | # get the batch variables 155 | previous_states = [d[OBS_LAST_STATE_INDEX] for d in mini_batch] 156 | actions = [d[OBS_ACTION_INDEX] for d in mini_batch] 157 | rewards = [d[OBS_REWARD_INDEX] for d in mini_batch] 158 | current_states = [d[OBS_CURRENT_STATE_INDEX] for d in mini_batch] 159 | agents_expected_reward = [] 160 | # this gives us the agents expected reward for each action we might take 161 | agents_reward_per_action = _session.run(_output_layer, feed_dict={_input_layer: current_states}) 162 | for i in range(len(mini_batch)): 163 | if mini_batch[i][OBS_TERMINAL_INDEX]: 164 | # this was a terminal frame so there is no future reward... 
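# As a worked illustration of the target being built here (numbers made up): with
# FUTURE_REWARD_DISCOUNT = 0.99, a non-terminal transition with reward 1.0 whose next
# state has predicted Q-values [0.2, 1.5, 0.7] gets the target
# 1.0 + 0.99 * max(0.2, 1.5, 0.7) = 1.0 + 0.99 * 1.5 = 2.485,
# while a terminal transition with reward -1.0 keeps just its reward of -1.0.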
165 | agents_expected_reward.append(rewards[i]) 166 | else: 167 | agents_expected_reward.append( 168 | rewards[i] + FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i])) 169 | 170 | # learn that these actions in these states lead to this reward 171 | _session.run(_train_operation, feed_dict={ 172 | _input_layer: previous_states, 173 | _action: actions, 174 | _target: agents_expected_reward}) 175 | 176 | # save checkpoints for later 177 | if _time % SAVE_EVERY_X_STEPS == 0: 178 | saver.save(_session, CHECKPOINT_PATH + '/network', global_step=_time) 179 | 180 | 181 | env = gym.make("Breakout-v0") 182 | observation = env.reset() 183 | reward = 0 184 | score_pre_game = 0 185 | 186 | while True: 187 | env.render() 188 | 189 | observation, reward, terminal, info = env.step(_key_presses_from_action(_last_action)) 190 | score_pre_game += reward 191 | 192 | screen_binary = pre_process(observation) 193 | 194 | # first frame must be handled differently 195 | if _last_state is None: 196 | # the _last_state will contain the image data from the last self.STATE_FRAMES frames 197 | _last_state = np.stack(tuple(screen_binary for _ in range(STATE_FRAMES)), axis=2) 198 | else: 199 | screen_binary = np.reshape(screen_binary, 200 | (SCREEN_HEIGHT, SCREEN_WIDTH, 1)) 201 | current_state = np.append(_last_state[:, :, 1:], screen_binary, axis=2) 202 | 203 | _observations.append( 204 | zlib.compress(pickle.dumps((_last_state, _last_action, reward, current_state, terminal), 2), 2)) 205 | 206 | # only train if done observing 207 | if len(_observations) > OBSERVATION_STEPS: 208 | _train() 209 | _time += 1 210 | 211 | if terminal: 212 | _last_scores.append(score_pre_game) 213 | score_pre_game = 0 214 | env.reset() 215 | _last_state = None 216 | else: 217 | # update the old values 218 | _last_state = current_state 219 | _last_action = _choose_next_action(_last_state) 220 | 221 | # gradually reduce the probability of a random action 222 | if _probability_of_random_action > FINAL_RANDOM_ACTION_PROB \ 223 | and len(_observations) > OBSERVATION_STEPS: 224 | _probability_of_random_action -= \ 225 | (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS 226 | 227 | print("Time: %s random_action_prob: %s reward %s scores differential %s" % 228 | (_time, _probability_of_random_action, reward, 229 | np.mean(_last_scores))) 230 | -------------------------------------------------------------------------------- /Chapter 08/deep_q_pong.py: -------------------------------------------------------------------------------- 1 | # note must import tensorflow before gym 2 | import random 3 | from collections import deque 4 | 5 | import tensorflow as tf 6 | import gym 7 | import numpy as np 8 | import os 9 | 10 | resume = True 11 | CHECKPOINT_PATH = 'deep_q_pong' 12 | ACTIONS_COUNT = 3 13 | SCREEN_WIDTH, SCREEN_HEIGHT = (80, 80) 14 | FUTURE_REWARD_DISCOUNT = 0.99 15 | OBSERVATION_STEPS = 50000. # time steps to observe before training 16 | EXPLORE_STEPS = 2000000. 
# frames over which to anneal epsilon 17 | INITIAL_RANDOM_ACTION_PROB = 1.0 # starting chance of an action being random 18 | FINAL_RANDOM_ACTION_PROB = 0.05 # final chance of an action being random 19 | MEMORY_SIZE = 100000 # number of observations to remember 20 | MINI_BATCH_SIZE = 100 # size of mini batches 21 | STATE_FRAMES = 2 # number of frames to store in the state 22 | OBS_LAST_STATE_INDEX, OBS_ACTION_INDEX, OBS_REWARD_INDEX, OBS_CURRENT_STATE_INDEX, OBS_TERMINAL_INDEX = range(5) 23 | SAVE_EVERY_X_STEPS = 10000 24 | LEARN_RATE = 1e-6 25 | STORE_SCORES_LEN = 1000. 26 | verbose_logging = True 27 | 28 | 29 | def _create_network(): 30 | # network weights 31 | convolution_weights_1 = tf.Variable(tf.truncated_normal([8, 8, STATE_FRAMES, 32], stddev=0.01)) 32 | convolution_bias_1 = tf.Variable(tf.constant(0.01, shape=[32])) 33 | 34 | convolution_weights_2 = tf.Variable(tf.truncated_normal([4, 4, 32, 64], stddev=0.01)) 35 | convolution_bias_2 = tf.Variable(tf.constant(0.01, shape=[64])) 36 | 37 | convolution_weights_3 = tf.Variable(tf.truncated_normal([3, 3, 64, 64], stddev=0.01)) 38 | convolution_bias_3 = tf.Variable(tf.constant(0.01, shape=[64])) 39 | 40 | feed_forward_weights_1 = tf.Variable(tf.truncated_normal([256, 256], stddev=0.01)) 41 | feed_forward_bias_1 = tf.Variable(tf.constant(0.01, shape=[256])) 42 | 43 | feed_forward_weights_2 = tf.Variable(tf.truncated_normal([256, ACTIONS_COUNT], stddev=0.01)) 44 | feed_forward_bias_2 = tf.Variable(tf.constant(0.01, shape=[ACTIONS_COUNT])) 45 | 46 | input_layer = tf.placeholder("float", [None, SCREEN_WIDTH, SCREEN_HEIGHT, 47 | STATE_FRAMES]) 48 | 49 | hidden_convolutional_layer_1 = tf.nn.relu( 50 | tf.nn.conv2d(input_layer, convolution_weights_1, strides=[1, 4, 4, 1], padding="SAME") + convolution_bias_1) 51 | 52 | hidden_max_pooling_layer_1 = tf.nn.max_pool(hidden_convolutional_layer_1, ksize=[1, 2, 2, 1], 53 | strides=[1, 2, 2, 1], padding="SAME") 54 | 55 | hidden_convolutional_layer_2 = tf.nn.relu( 56 | tf.nn.conv2d(hidden_max_pooling_layer_1, convolution_weights_2, strides=[1, 2, 2, 1], 57 | padding="SAME") + convolution_bias_2) 58 | 59 | hidden_max_pooling_layer_2 = tf.nn.max_pool(hidden_convolutional_layer_2, ksize=[1, 2, 2, 1], 60 | strides=[1, 2, 2, 1], padding="SAME") 61 | 62 | hidden_convolutional_layer_3 = tf.nn.relu( 63 | tf.nn.conv2d(hidden_max_pooling_layer_2, convolution_weights_3, 64 | strides=[1, 1, 1, 1], padding="SAME") + convolution_bias_3) 65 | 66 | hidden_max_pooling_layer_3 = tf.nn.max_pool(hidden_convolutional_layer_3, ksize=[1, 2, 2, 1], 67 | strides=[1, 2, 2, 1], padding="SAME") 68 | 69 | hidden_convolutional_layer_3_flat = tf.reshape(hidden_max_pooling_layer_3, [-1, 256]) 70 | 71 | final_hidden_activations = tf.nn.relu( 72 | tf.matmul(hidden_convolutional_layer_3_flat, feed_forward_weights_1) + feed_forward_bias_1) 73 | 74 | output_layer = tf.matmul(final_hidden_activations, feed_forward_weights_2) + feed_forward_bias_2 75 | 76 | return input_layer, output_layer 77 | 78 | 79 | _session = tf.Session() 80 | _input_layer, _output_layer = _create_network() 81 | 82 | _action = tf.placeholder("float", [None, ACTIONS_COUNT]) 83 | _target = tf.placeholder("float", [None]) 84 | 85 | readout_action = tf.reduce_sum(tf.mul(_output_layer, _action), reduction_indices=1) 86 | 87 | cost = tf.reduce_mean(tf.square(_target - readout_action)) 88 | _train_operation = tf.train.AdamOptimizer(LEARN_RATE).minimize(cost) 89 | 90 | _observations = deque() 91 | _last_scores = deque() 92 | 93 | # set the first action to do nothing 94 | 
_last_action = np.zeros(ACTIONS_COUNT) 95 | _last_action[1] = 1 96 | 97 | _last_state = None 98 | _probability_of_random_action = INITIAL_RANDOM_ACTION_PROB 99 | _time = 0 100 | 101 | _session.run(tf.initialize_all_variables()) 102 | 103 | saver = tf.train.Saver() 104 | 105 | if not os.path.exists(CHECKPOINT_PATH): 106 | os.mkdir(CHECKPOINT_PATH) 107 | 108 | if resume: 109 | checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_PATH) 110 | if checkpoint: 111 | saver.restore(_session, checkpoint.model_checkpoint_path) 112 | 113 | 114 | def _choose_next_action(): 115 | new_action = np.zeros([ACTIONS_COUNT]) 116 | 117 | if random.random() <= _probability_of_random_action: 118 | # choose an action randomly 119 | action_index = random.randrange(ACTIONS_COUNT) 120 | else: 121 | # choose an action given our last state 122 | readout_t = _session.run(_output_layer, feed_dict={_input_layer: [_last_state]})[0] 123 | if verbose_logging: 124 | print("Action Q-Values are %s" % readout_t) 125 | action_index = np.argmax(readout_t) 126 | 127 | new_action[action_index] = 1 128 | return new_action 129 | 130 | 131 | def pre_process(screen_image): 132 | """ change the 210x160x3 uint8 frame into 6400 (80x80) float """ 133 | screen_image = screen_image[35:195] # crop 134 | screen_image = screen_image[::2, ::2, 0] # downsample by factor of 2 135 | screen_image[screen_image == 144] = 0 # erase background (background type 1) 136 | screen_image[screen_image == 109] = 0 # erase background (background type 2) 137 | screen_image[screen_image != 0] = 1 # everything else (paddles, ball) just set to 1 138 | return screen_image.astype(np.float) 139 | 140 | 141 | def _key_presses_from_action(action_set): 142 | # 1 = still 143 | # 2 = up 144 | # 3 = down 145 | 146 | if action_set[0] == 1: 147 | return 1 148 | elif action_set[1] == 1: 149 | return 2 150 | elif action_set[2] == 1: 151 | return 3 152 | raise Exception("Unexpected action") 153 | 154 | 155 | def _train(): 156 | # sample a mini_batch to train on 157 | mini_batch = random.sample(_observations, MINI_BATCH_SIZE) 158 | # get the batch variables 159 | previous_states = [d[OBS_LAST_STATE_INDEX] for d in mini_batch] 160 | actions = [d[OBS_ACTION_INDEX] for d in mini_batch] 161 | rewards = [d[OBS_REWARD_INDEX] for d in mini_batch] 162 | current_states = [d[OBS_CURRENT_STATE_INDEX] for d in mini_batch] 163 | agents_expected_reward = [] 164 | # this gives us the agents expected reward for each action we might take 165 | agents_reward_per_action = _session.run(_output_layer, feed_dict={_input_layer: current_states}) 166 | for i in range(len(mini_batch)): 167 | if mini_batch[i][OBS_TERMINAL_INDEX]: 168 | # this was a terminal frame so there is no future reward... 
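# In Pong the environment only returns a non-zero reward (+1 or -1) when a point is
# scored, and the game loop further down marks exactly those frames as terminal, so
# their target is just that +/-1 reward; all other frames have reward 0 and get the
# target 0 + FUTURE_REWARD_DISCOUNT * max Q(next state).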
169 | agents_expected_reward.append(rewards[i]) 170 | else: 171 | agents_expected_reward.append( 172 | rewards[i] + FUTURE_REWARD_DISCOUNT * np.max(agents_reward_per_action[i])) 173 | 174 | # learn that these actions in these states lead to this reward 175 | _session.run(_train_operation, feed_dict={ 176 | _input_layer: previous_states, 177 | _action: actions, 178 | _target: agents_expected_reward}) 179 | 180 | # save checkpoints for later 181 | if _time % SAVE_EVERY_X_STEPS == 0: 182 | saver.save(_session, CHECKPOINT_PATH + '/network', global_step=_time) 183 | 184 | env = gym.make("Pong-v0") 185 | observation = env.reset() 186 | next_action = 1 187 | 188 | while True: 189 | env.render() 190 | 191 | observation, reward, done, info = env.step(next_action) 192 | 193 | if done: 194 | env.reset() 195 | 196 | terminal = False 197 | 198 | screen_binary = pre_process(observation) 199 | 200 | if reward != 0.0: 201 | terminal = True 202 | _last_scores.append(reward) 203 | if len(_last_scores) > STORE_SCORES_LEN: 204 | _last_scores.popleft() 205 | 206 | # first frame must be handled differently 207 | if _last_state is None: 208 | # the _last_state will contain the image data from the last self.STATE_FRAMES frames 209 | _last_state = np.stack(tuple(screen_binary for _ in range(STATE_FRAMES)), axis=2) 210 | next_action = _key_presses_from_action(_last_action) 211 | else: 212 | screen_binary = np.reshape(screen_binary, 213 | (SCREEN_WIDTH, SCREEN_HEIGHT, 1)) 214 | current_state = np.append(_last_state[:, :, 1:], screen_binary, axis=2) 215 | 216 | # store the transition in previous_observations 217 | _observations.append((_last_state, _last_action, reward, current_state, terminal)) 218 | 219 | if len(_observations) > MEMORY_SIZE: 220 | _observations.popleft() 221 | 222 | # only train if done observing 223 | if len(_observations) > OBSERVATION_STEPS: 224 | _train() 225 | _time += 1 226 | 227 | # update the old values 228 | _last_state = current_state 229 | 230 | _last_action = _choose_next_action() 231 | 232 | # gradually reduce the probability of a random action 233 | if _probability_of_random_action > FINAL_RANDOM_ACTION_PROB \ 234 | and len(_observations) > OBSERVATION_STEPS: 235 | _probability_of_random_action -= \ 236 | (INITIAL_RANDOM_ACTION_PROB - FINAL_RANDOM_ACTION_PROB) / EXPLORE_STEPS 237 | 238 | print("Time: %s random_action_prob: %s reward %s scores differential %s" % 239 | (_time, _probability_of_random_action, reward, 240 | sum(_last_scores) / STORE_SCORES_LEN)) 241 | 242 | next_action = _key_presses_from_action(_last_action) -------------------------------------------------------------------------------- /Chapter 06/language model/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function, division 3 | 4 | import time 5 | import codecs 6 | import locale 7 | import sys 8 | import numpy as np 9 | import tensorflow as tf 10 | import data_reader 11 | 12 | class Model(object): 13 | """RNN language model.""" 14 | def __init__(self, batch_size, sequence_length, lstm_sizes, dropout, 15 | labels, save_path): 16 | self.batch_size = batch_size 17 | self.sequence_length = sequence_length 18 | self.lstm_sizes = lstm_sizes 19 | self.labels = labels 20 | self.label_map = {val: idx for idx, val in enumerate(labels)} 21 | self.number_of_characters = len(labels) 22 | self.save_path = save_path 23 | self.dropout = dropout 24 | 25 | def init_graph(self): 26 | # Variable sequence length 27 | self.inputs = 
tf.placeholder( 28 | tf.int32, [self.batch_size, self.sequence_length]) 29 | self.targets = tf.placeholder( 30 | tf.int32, [self.batch_size, self.sequence_length]) 31 | self.init_architecture() 32 | self.saver = tf.train.Saver(tf.trainable_variables()) 33 | 34 | def init_architecture(self): 35 | # Define a multilayer LSTM cell 36 | self.one_hot_inputs = tf.one_hot( 37 | self.inputs, depth=self.number_of_characters) 38 | cell_list = [tf.nn.rnn_cell.LSTMCell(lstm_size, state_is_tuple=True) 39 | for lstm_size in self.lstm_sizes] 40 | self.multi_cell_lstm = tf.nn.rnn_cell.MultiRNNCell( 41 | cell_list, state_is_tuple=True) 42 | # Initial state of the LSTM memory. 43 | # Keep state in graph memory to use between batches 44 | self.initial_state = self.multi_cell_lstm.zero_state( 45 | self.batch_size, tf.float32) 46 | # Convert to variables so that the state can be stored between batches 47 | # Note that LSTM states is a tuple of tensors, this structure has to be 48 | # re-created in order to use as LSTM state. 49 | self.state_variables = tf.python.util.nest.pack_sequence_as( 50 | self.initial_state, 51 | [tf.Variable(var, trainable=False) 52 | for var in tf.python.util.nest.flatten(self.initial_state)]) 53 | # Define the rnn through time 54 | lstm_output, final_state = tf.nn.dynamic_rnn( 55 | cell=self.multi_cell_lstm, inputs=self.one_hot_inputs, 56 | initial_state=self.state_variables) 57 | # Force the initial state to be set to the new state for the next batch 58 | # before returning the output 59 | store_states = [ 60 | state_variable.assign(new_state) 61 | for (state_variable, new_state) in zip( 62 | tf.python.util.nest.flatten(self.state_variables), 63 | tf.python.util.nest.flatten(final_state))] 64 | with tf.control_dependencies(store_states): 65 | lstm_output = tf.identity(lstm_output) 66 | # Reshape so that we can apply the linear transformation to all outputs 67 | output_flat = tf.reshape(lstm_output, (-1, self.lstm_sizes[-1])) 68 | # Define output layer 69 | self.logit_weights = tf.Variable( 70 | tf.truncated_normal( 71 | (self.lstm_sizes[-1], self.number_of_characters), stddev=0.01), 72 | name='logit_weights') 73 | self.logit_bias = tf.Variable( 74 | tf.zeros((self.number_of_characters)), name='logit_bias') 75 | # Apply last layer transformation 76 | self.logits_flat = tf.matmul( 77 | output_flat, self.logit_weights) + self.logit_bias 78 | probabilities_flat = tf.nn.softmax(self.logits_flat) 79 | self.probabilities = tf.reshape( 80 | probabilities_flat, 81 | (self.batch_size, -1, self.number_of_characters)) 82 | 83 | def init_train_op(self, optimizer): 84 | # Flatten the targets to be compatible with the flattened logits 85 | targets_flat = tf.reshape(self.targets, (-1, )) 86 | # Get the loss over all outputs 87 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits( 88 | self.logits_flat, targets_flat, name='x_entropy') 89 | self.loss = tf.reduce_mean(loss) 90 | trainable_variables = tf.trainable_variables() 91 | gradients = tf.gradients(loss, trainable_variables) 92 | gradients, _ = tf.clip_by_global_norm( 93 | gradients, 5) 94 | self.train_op = optimizer.apply_gradients( 95 | zip(gradients, trainable_variables)) 96 | 97 | def sample(self, session, prime_string, sample_length): 98 | self.reset_state(session) 99 | # Prime state 100 | print('prime_string: ', prime_string) 101 | for character in prime_string: 102 | character_idx = self.label_map[character] 103 | out = session.run( 104 | self.probabilities, 105 | feed_dict={self.inputs: np.asarray([[character_idx]])}) 106 | sample_label 
= np.random.choice( 107 | self.labels, size=(1), p=out[0, 0]) 108 | output_sample = prime_string 109 | print('start sampling') 110 | # Sample for sample_length steps 111 | for _ in range(sample_length): 112 | sample_label = np.random.choice( 113 | self.labels, size=(1), p=out[0, 0])[0] 114 | output_sample += sample_label 115 | sample_idx = self.label_map[sample_label] 116 | out = session.run( 117 | self.probabilities, 118 | feed_dict={self.inputs: np.asarray([[sample_idx]])}) 119 | return output_sample 120 | 121 | def reset_state(self, session): 122 | for state in tf.python.util.nest.flatten(self.state_variables): 123 | session.run(state.initializer) 124 | 125 | def save(self, sess): 126 | self.saver.save(sess, self.save_path) 127 | 128 | def restore(self, sess): 129 | self.saver.restore(sess, self.save_path) 130 | 131 | 132 | def train_and_sample(minibatch_iterations, restore): 133 | tf.reset_default_graph() 134 | batch_size = 64 135 | lstm_sizes = [512, 512] 136 | batch_len = 100 137 | learning_rate = 2e-3 138 | 139 | filepath = './wap.txt' 140 | 141 | data_feed = data_reader.DataReader( 142 | filepath, batch_len, batch_size) 143 | labels = data_feed.char_list 144 | print('labels: ', labels) 145 | 146 | save_path = './model.tf' 147 | model = Model( 148 | batch_size, batch_len, lstm_sizes, 0.8, labels, 149 | save_path) 150 | model.init_graph() 151 | optimizer = tf.train.AdamOptimizer(learning_rate) 152 | model.init_train_op(optimizer) 153 | 154 | init_op = tf.initialize_all_variables() 155 | with tf.Session() as sess: 156 | sess.run(init_op) 157 | if restore: 158 | print('Restoring model') 159 | model.restore(sess) 160 | model.reset_state(sess) 161 | start_time = time.time() 162 | for i in range(minibatch_iterations): 163 | input_batch, target_batch = next(iter(data_feed)) 164 | loss, _ = sess.run( 165 | [model.loss, model.train_op], 166 | feed_dict={ 167 | model.inputs: input_batch, model.targets: target_batch}) 168 | if i % 50 == 0 and i != 0: 169 | print('i: ', i) 170 | duration = time.time() - start_time 171 | print('loss: {} ({} sec.)'.format(loss, duration)) 172 | start_time = time.time() 173 | if i % 1000 == 0 and i != 0: 174 | model.save(sess) 175 | if i % 100 == 0 and i != 0: 176 | print('Reset initial state') 177 | model.reset_state(sess) 178 | if i % 1000 == 0 and i != 0: 179 | print('Reset minibatch feeder') 180 | data_feed.reset_indices() 181 | model.save(sess) 182 | 183 | print('\n sampling after {} iterations'.format(minibatch_iterations)) 184 | tf.reset_default_graph() 185 | model = Model( 186 | 1, None, lstm_sizes, 1.0, labels, save_path) 187 | model.init_graph() 188 | init_op = tf.initialize_all_variables() 189 | with tf.Session() as sess: 190 | sess.run(init_op) 191 | model.restore(sess) 192 | print('\nSample 1:') 193 | sample = model.sample( 194 | sess, prime_string=u'\n\nThis feeling was ', sample_length=500) 195 | print(u'sample: \n{}'.format(sample)) 196 | print('\nSample 2:') 197 | sample = model.sample( 198 | sess, prime_string=u'She was born in the year ', sample_length=500) 199 | print(u'sample: \n{}'.format(sample)) 200 | print('\nSample 3:') 201 | sample = model.sample( 202 | sess, prime_string=u'The meaning of this all is ', 203 | sample_length=500) 204 | print(u'sample: \n{}'.format(sample)) 205 | print('\nSample 4:') 206 | sample = model.sample( 207 | sess, 208 | prime_string=u'In the midst of a conversation on political matters Anna Pávlovna burst out:,', 209 | sample_length=500) 210 | print(u'sample: \n{}'.format(sample)) 211 | print('\nSample 5:') 212 | 
sample = model.sample( 213 | sess, prime_string=u'\n\nCHAPTER X\n\n', 214 | sample_length=500) 215 | print(u'sample: \n{}'.format(sample)) 216 | print('\nSample 6:') 217 | sample = model.sample( 218 | sess, prime_string=u'"If only you knew,"', 219 | sample_length=500) 220 | print(u'sample: \n{}'.format(sample)) 221 | 222 | 223 | def main(): 224 | total_iterations = 500 225 | print('\n\n\nTrain for {}'.format(total_iterations)) 226 | print('Total iters: {}'.format(total_iterations)) 227 | train_and_sample(total_iterations, restore=False) 228 | for i in [500, 1000, 3000, 5000, 10000, 30000, 50000, 100000, 300000]: 229 | total_iterations += i 230 | print('\n\n\nTrain for {}'.format(i)) 231 | print('Total iters: {}'.format(total_iterations)) 232 | train_and_sample(i, restore=True) 233 | 234 | 235 | if __name__ == "__main__": 236 | main() 237 | --------------------------------------------------------------------------------
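For reference, the heart of Model.sample in model.py above is a feed-back loop: the softmax output over characters is sampled with np.random.choice and the sampled character becomes the next input. A minimal numpy-only sketch of that loop, with a made-up transition table standing in for the trained network (purely illustrative):

import numpy as np

labels = ['a', 'b', ' ']
# Stand-in for the RNN: a fixed next-character distribution for each character.
fake_model = {'a': [0.1, 0.7, 0.2],
              'b': [0.6, 0.1, 0.3],
              ' ': [0.5, 0.5, 0.0]}

output = 'a'  # the "prime" string
for _ in range(20):
    probs = fake_model[output[-1]]
    next_char = np.random.choice(labels, p=probs)  # same sampling call used in Model.sample
    output += next_char
print(output)  # e.g. 'abab ba babababab aba'

In the real method the distribution comes from session.run(self.probabilities, ...) for the current character, and the LSTM state variables carry the context forward between calls.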