├── .gitignore
├── .floydignore
├── keras_mnist_cnn.py
├── tf_mnist_cnn.py
├── pytorch_mnist_cnn.py
├── tf_mnist_cnn_jupyter.ipynb
├── keras_mnist_cnn_jupyter.ipynb
├── README.md
└── pytorch_mnist_cnn_jupyter.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | .floydexpt


--------------------------------------------------------------------------------
/.floydignore:
--------------------------------------------------------------------------------
 1 | 
 2 | # Directories and files to ignore when uploading code to floyd
 3 | 
 4 | .git
 5 | .eggs
 6 | eggs
 7 | lib
 8 | lib64
 9 | parts
10 | sdist
11 | var
12 | *.pyc
13 | *.swp
14 | .DS_Store
15 | 


--------------------------------------------------------------------------------
/keras_mnist_cnn.py:
--------------------------------------------------------------------------------
  1 | """Convolutional Neural Network for MNIST, built with Keras.
  2 | 
  3 | Adapted from
  4 | https://github.com/minimaxir/deep-learning-cpu-gpu-benchmark/blob/master/test_files/mnist_cnn.py
  5 | 
  6 | MIT License
  7 | 
  8 | Copyright (c) 2017 Max Woolf
  9 | 
 10 | Permission is hereby granted, free of charge, to any person obtaining a copy
 11 | of this software and associated documentation files (the "Software"), to deal
 12 | in the Software without restriction, including without limitation the rights
 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 14 | copies of the Software, and to permit persons to whom the Software is
 15 | furnished to do so, subject to the following conditions:
 16 | 
 17 | The above copyright notice and this permission notice shall be included in all
 18 | copies or substantial portions of the Software.
 19 | 
 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 26 | SOFTWARE.
 27 | """
 28 | 
 29 | from __future__ import print_function
 30 | import keras
 31 | import os.path
 32 | from keras.datasets import mnist
 33 | from keras.models import Sequential
 34 | from keras.layers import Dense, Dropout, Flatten
 35 | from keras.layers import Conv2D, MaxPooling2D
 36 | from keras import backend as K
 37 | from keras.callbacks import ModelCheckpoint
 38 | 
 39 | # Path to saved model weights(as hdf5)
 40 | resume_weights = "/model/mnist-cnn-best.hdf5"
 41 | 
 42 | # Hyper-parameters
 43 | batch_size = 128
 44 | num_classes = 10
 45 | epochs = 12
 46 | 
 47 | # input image dimensions
 48 | img_rows, img_cols = 28, 28
 49 | 
 50 | # MNIST handwritten image classification
 51 | # the data, shuffled and split between train and test sets
 52 | (x_train, y_train), (x_test, y_test) = mnist.load_data()
 53 | 
 54 | # Reshape strategy according to backend
 55 | if K.image_data_format() == 'channels_first':
 56 | 	x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
 57 | 	x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
 58 | 	# 1 x 28 x 28 [number_of_channels (colors) x height x width]
 59 | 	input_shape = (1, img_rows, img_cols)
 60 | else:
 61 | 	x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
 62 | 	x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
 63 | 	# 28 x 28 x 1 [height x width x number_of_channels (colors)]
 64 | 	input_shape = (img_rows, img_cols, 1)
 65 | 
 66 | # Reshape, type, normalized, print
 67 | x_train = x_train.astype('float32')
 68 | x_test = x_test.astype('float32')
 69 | x_train /= 255
 70 | x_test /= 255
 71 | 
 72 | # Dataset info
 73 | print('x_train shape:', x_train.shape)
 74 | print(x_train.shape[0], 'train samples')
 75 | print(x_test.shape[0], 'test samples')
 76 | 
 77 | # convert class vectors to binary class matrices
 78 | y_train = keras.utils.to_categorical(y_train, num_classes)
 79 | y_test = keras.utils.to_categorical(y_test, num_classes)
 80 | 
 81 | # MODEL
 82 | # Conv(32,3,3)[ReLU] -> Conv(64,3,3)[ReLU] -> MaxPool(2,2)[Dropout 0.25] ->
 83 | # FC(_, 128)[ReLU][Dropout 0.5] -> FC(128, 10)[Softmax]
 84 | model = Sequential()
 85 | model.add(Conv2D(32, kernel_size=(3, 3),
 86 | 					activation='relu',
 87 | 					input_shape=input_shape))
 88 | model.add(Conv2D(64, (3, 3), activation='relu'))
 89 | model.add(MaxPooling2D(pool_size=(2, 2)))
 90 | model.add(Dropout(0.25))
 91 | model.add(Flatten())
 92 | model.add(Dense(128, activation='relu'))
 93 | model.add(Dropout(0.5))
 94 | model.add(Dense(num_classes, activation='softmax'))
 95 | 
 96 | model.summary()
 97 | 
 98 | # If a best-model checkpoint exists, load its weights
 99 | if os.path.isfile(resume_weights):
100 | 	print ("Resumed model's weights from {}".format(resume_weights))
101 | 	# load weights
102 | 	model.load_weights(resume_weights)
103 | 
104 | # CEE, Adam
105 | model.compile(loss=keras.losses.categorical_crossentropy,
106 | 			optimizer=keras.optimizers.Adam(),
107 | 			metrics=['accuracy'])
108 | 
109 | # Checkpoint In the /output folder
110 | filepath = "/output/mnist-cnn-best.hdf5"
111 | 
112 | # Keep only a single checkpoint, the best over test accuracy.
113 | checkpoint = ModelCheckpoint(filepath,
114 | 							monitor='val_acc',
115 | 							verbose=1,
116 | 							save_best_only=True,
117 | 							mode='max')
118 | 
119 | # Train
120 | model.fit(x_train, y_train,
121 | 				batch_size=batch_size,
122 | 				epochs=epochs,
123 | 				verbose=1,
124 | 				validation_data=(x_test, y_test),
125 | 				callbacks=[checkpoint])
126 | 
127 | # Eval
128 | score = model.evaluate(x_test, y_test, verbose=0)
129 | print('Test loss:', score[0])
130 | print('Test accuracy:', score[1])
131 | 


--------------------------------------------------------------------------------
/tf_mnist_cnn.py:
--------------------------------------------------------------------------------
  1 | """Convolutional Neural Network Estimator for MNIST, built with tf.layers.
  2 | 
  3 | Adapted from:
  4 | https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/layers/cnn_mnist.py
  5 | 
  6 | Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  7 | 
  8 | Licensed under the Apache License, Version 2.0 (the "License");
  9 | you may not use this file except in compliance with the License.
 10 | You may obtain a copy of the License at
 11 | 
 12 | http://www.apache.org/licenses/LICENSE-2.0
 13 | 
 14 | Unless required by applicable law or agreed to in writing, software
 15 | distributed under the License is distributed on an "AS IS" BASIS,
 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 17 | See the License for the specific language governing permissions and
 18 | limitations under the License.
 19 | """
 20 | 
 21 | from __future__ import absolute_import
 22 | from __future__ import division
 23 | from __future__ import print_function
 24 | 
 25 | from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
 26 | 
 27 | import numpy as np
 28 | import tensorflow as tf
 29 | import shutil
 30 | import os
 31 | 
 32 | tf.logging.set_verbosity(tf.logging.INFO)
 33 | 
 34 | 
 35 | # Where to resume a checkpoint from (/model) and where to save it (/output)
 36 | resumepath = "/model/mnist_convnet_model"
 37 | filepath = "/output/mnist_convnet_model"
 38 | 
 39 | # Hyper-parameters
 40 | batch_size = 128
 41 | num_classes = 10
 42 | num_epochs = 12
 43 | learning_rate = 1e-3
 44 | 
 45 | # If a checkpoint from a previous run exists, copy it into the /output folder
 46 | if os.path.exists(resumepath):
 47 |     shutil.copytree(resumepath, filepath)
 48 | 
 49 | # Load training and eval data
 50 | mnist = read_data_sets(train_dir='/input/MNIST_data', validation_size=0)
 51 | train_data = mnist.train.images  # Returns np.array
 52 | train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
 53 | eval_data = mnist.test.images  # Returns np.array
 54 | eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)
 55 | 
 56 | print (train_data.shape)
 57 | print (eval_data.shape)
 58 | 
 59 | def cnn_model_fn(features, labels, mode):
 60 |     """Model function for CNN."""
 61 |     # Input Layer
 62 |     # Reshape X to 4-D tensor: [batch_size, height, width, channels]
 63 |     # MNIST images are 28x28 pixels, and have one color channel
 64 |     input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])
 65 | 
 66 |     # Convolutional Layer #1
 67 |     # Computes 32 features using a 3x3 filter with ReLU activation.
 68 |     # Input Tensor Shape: [batch_size, 28, 28, 1]
 69 |     # Output Tensor Shape: [batch_size, 26, 26, 32]
 70 |     conv1 = tf.layers.conv2d(
 71 |       inputs=input_layer,
 72 |       filters=32,
 73 |       kernel_size=[3, 3],
 74 |       activation=tf.nn.relu)
 75 | 
 76 |     # Convolutional Layer #2
 77 |     # Computes 64 features using a 3x3 filter.
 78 |     # Input Tensor Shape: [batch_size, 26, 26, 32]
 79 |     # Output Tensor Shape: [batch_size, 24, 24, 64]
 80 |     conv2 = tf.layers.conv2d(
 81 |       inputs=conv1,
 82 |       filters=64,
 83 |       kernel_size=[3, 3],
 84 |       activation=tf.nn.relu)
 85 | 
 86 |     # Pooling Layer
 87 |     # Max pooling layer with a 2x2 filter and stride of 2
 88 |     # Input Tensor Shape: [batch_size, 24, 24, 64]
 89 |     # Output Tensor Shape: [batch_size, 12, 12, 64]
 90 |     pool = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
 91 | 
 92 |     # Dropout # 1
 93 |     # Add dropout operation; 0.25 probability that element will be dropped
 94 |     dropout = tf.layers.dropout(
 95 |       inputs=pool, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN)
 96 | 
 97 |     # Flatten tensor into a batch of vectors
 98 |     # Input Tensor Shape: [batch_size, 12, 12, 64]
 99 |     # Output Tensor Shape: [batch_size, 12 * 12 * 64]
100 |     flat = tf.reshape(dropout, [-1, 12 * 12 * 64])  # 9216
101 | 
102 | 
103 |     # Dense Layer # 1
104 |     # Densely connected layer with 128 neurons
105 |     # Input Tensor Shape: [batch_size, 12 * 12 * 64] (batch_size, 9216)
106 |     # Output Tensor Shape: [batch_size, 128]
107 |     dense1 = tf.layers.dense(inputs=flat, units=128, activation=tf.nn.relu)
108 | 
109 |     # Dropout # 2
110 |     # Add dropout operation; 0.5 probability that element will be dropped
111 |     dropout2 = tf.layers.dropout(
112 |       inputs=dense1, rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)
113 | 
114 |     # Logits layer
115 |     # Input Tensor Shape: [batch_size, 128]
116 |     # Output Tensor Shape: [batch_size, 10]
117 |     logits = tf.layers.dense(inputs=dropout2, units=num_classes)
118 | 
119 |     predictions = {
120 |         # Generate predictions (for PREDICT and EVAL mode)
121 |         "classes": tf.argmax(input=logits, axis=1),
122 |         # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
123 |         # `logging_hook`.
124 |         "probabilities": tf.nn.softmax(logits, name="softmax_tensor")
125 |     }
126 |     # Inference (for PREDICT mode)
127 |     if mode == tf.estimator.ModeKeys.PREDICT:
128 |         return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
129 | 
130 |     # Calculate Loss (for both TRAIN and EVAL modes)
131 |     onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_classes)
132 |     # Cross Entropy
133 |     loss = tf.losses.softmax_cross_entropy(
134 |       onehot_labels=onehot_labels, logits=logits)
135 | 
136 |     # Configure the Training Op (for TRAIN mode)
137 |     if mode == tf.estimator.ModeKeys.TRAIN:
138 |         # AdamOptimizer
139 |         optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
140 |         train_op = optimizer.minimize(
141 |             loss=loss,
142 |             global_step=tf.train.get_global_step())
143 |         return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
144 | 
145 |     # Add evaluation metrics (for EVAL mode)
146 |     eval_metric_ops = {
147 |       "accuracy": tf.metrics.accuracy(
148 |           labels=labels, predictions=predictions["classes"])}
149 |     return tf.estimator.EstimatorSpec(
150 |       mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
151 | 
152 | # Checkpoint Strategy configuration
153 | run_config = tf.contrib.learn.RunConfig(
154 |     model_dir=filepath,
155 |     keep_checkpoint_max=1)
156 | 
157 | # Create the Estimator
158 | mnist_classifier = tf.estimator.Estimator(
159 |       model_fn=cnn_model_fn, config=run_config)
160 | 
161 | # Keep track of the best accuracy
162 | best_acc = 0
163 | 
164 | # Training for num_epochs
165 | for i in range(num_epochs):
166 |     print("Begin Training - Epoch {}/{}".format(i+1, num_epochs))
167 |     # Train the model for 1 epoch
168 |     train_input_fn = tf.estimator.inputs.numpy_input_fn(
169 |         x={"x": train_data},
170 |         y=train_labels,
171 |         batch_size=batch_size,
172 |         num_epochs=1,
173 |         shuffle=True)
174 | 
175 |     mnist_classifier.train(
176 |         input_fn=train_input_fn)
177 | 
178 |     # Evaluate the model and print results
179 |     eval_input_fn = tf.estimator.inputs.numpy_input_fn(
180 |         x={"x": eval_data},
181 |         y=eval_labels,
182 |         num_epochs=1,
183 |         shuffle=False)
184 | 
185 |     eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
186 | 
187 |     accuracy = eval_results["accuracy"] * 100
188 |     # Set the best acc if we have a new best or if it is the first step
189 |     if accuracy > best_acc or i == 0:
190 |         best_acc = accuracy
191 |         print ("=> New Best Accuracy {}".format(accuracy))
192 |     else:
193 |         print("=> Validation Accuracy did not improve")
194 | 


--------------------------------------------------------------------------------
/pytorch_mnist_cnn.py:
--------------------------------------------------------------------------------
  1 | """Convolutional Neural Network for MNIST, built with PyTorch.
  2 | 
  3 | Adapted from:
  4 | https://github.com/pytorch/examples/blob/master/mnist/main.py
  5 | https://github.com/pytorch/examples/blob/master/imagenet/main.py
  6 | 
  7 | BSD 3-Clause License
  8 | 
  9 | Copyright (c) 2017,
 10 | All rights reserved.
 11 | 
 12 | Redistribution and use in source and binary forms, with or without
 13 | modification, are permitted provided that the following conditions are met:
 14 | 
 15 | * Redistributions of source code must retain the above copyright notice, this
 16 |   list of conditions and the following disclaimer.
 17 | 
 18 | * Redistributions in binary form must reproduce the above copyright notice,
 19 |   this list of conditions and the following disclaimer in the documentation
 20 |   and/or other materials provided with the distribution.
 21 | 
 22 | * Neither the name of the copyright holder nor the names of its
 23 |   contributors may be used to endorse or promote products derived from
 24 |   this software without specific prior written permission.
 25 | 
 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 29 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 30 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 31 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 32 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 33 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 34 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 35 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 36 | """
 37 | 
 38 | import torch
 39 | import torchvision.datasets as dsets
 40 | import torch.nn as nn
 41 | import torch.nn.functional as F
 42 | import torchvision.transforms as transforms
 43 | from torch.autograd import Variable
 44 | from torchvision.utils import make_grid
 45 | import shutil
 46 | import os.path
 47 | import time
 48 | import numpy as np
 49 | 
 50 | # Hyperparameter
 51 | batch_size = 128
 52 | input_size = 784  # 28 * 28
 53 | hidden_size = 500
 54 | num_classes = 10
 55 | learning_rate = 1e-3
 56 | num_epochs = 12
 57 | print_every = 100
 58 | best_accuracy = torch.FloatTensor([0])
 59 | start_epoch = 0
 60 | 
 61 | # Path to the saved checkpoint (a torch.save() file, not hdf5)
 62 | resume_weights = "/model/checkpoint.pth.tar"
 63 | 
 64 | # CUDA?
 65 | cuda = torch.cuda.is_available()
 66 | 
 67 | # Seed for reproducibility
 68 | torch.manual_seed(1)
 69 | if cuda:
 70 | 	torch.cuda.manual_seed(1)
 71 | 
 72 | 
 73 | def train(model, optimizer, train_loader, test_loader, loss_fn):
 74 | 	"""Perform a full training over dataset"""
 75 | 	average_time = 0
 76 | 	# Model train mode
 77 | 	model.train()
 78 | 	for i, (images, labels) in enumerate(train_loader):
 79 | 		# measure data loading time
 80 | 		batch_time = time.time()
 81 | 		images = Variable(images)
 82 | 		labels = Variable(labels)
 83 | 
 84 | 		if cuda:
 85 | 			images, labels = images.cuda(), labels.cuda()
 86 | 
 87 | 		# Forward + Backward + Optimize
 88 | 		optimizer.zero_grad()
 89 | 		outputs = model(images)
 90 | 		loss = loss_fn(outputs, labels)
 91 | 
 92 | 		# Load loss on CPU
 93 | 		if cuda:
 94 | 			loss.cpu()
 95 | 
 96 | 		loss.backward()
 97 | 		optimizer.step()
 98 | 
 99 | 		# Measure elapsed time
100 | 		batch_time = time.time() - batch_time
101 | 		# Accumulate over batch
102 | 		average_time += batch_time
103 | 
104 | 		# ### Keep track of metric every batch
105 | 		# Accuracy Metric
106 | 		prediction = outputs.data.max(1)[1]   # first column has actual prob.
107 | 		accuracy = prediction.eq(labels.data).sum() / batch_size * 100
108 | 
109 | 		# Log
110 | 		if (i + 1) % print_every == 0:
111 | 			print ('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f, Accuracy: %.4f, Batch time: %f'
112 | 				% (epoch + 1,
113 | 					num_epochs,
114 | 					i + 1,
115 | 					len(train_dataset) // batch_size,
116 | 					loss.data[0],
117 | 					accuracy,
118 | 					average_time/print_every))  # Average
119 | 
120 | 
121 | def eval(model, optimizer, test_loader):
122 | 	"""Eval over test set"""
123 | 	model.eval()
124 | 	correct = 0
125 | 	# Get Batch
126 | 	for data, target in test_loader:
127 | 		data, target = Variable(data, volatile=True), Variable(target)
128 | 		if cuda:
129 | 			data, target = data.cuda(), target.cuda()
130 | 		# Evaluate
131 | 		output = model(data)
132 | 		# Load output on CPU
133 | 		if cuda:
134 | 			output.cpu()
135 | 		# Compute Accuracy
136 | 		prediction = output.data.max(1)[1]
137 | 		correct += prediction.eq(target.data).sum()
138 | 	return correct
139 | 
140 | 
141 | def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):
142 | 	"""Save checkpoint if a new best is achieved"""
143 | 	if is_best:
144 | 		print ("=> Saving a new best")
145 | 		torch.save(state, filename)  # save checkpoint
146 | 	else:
147 | 		print ("=> Validation Accuracy did not improve")
148 | 
149 | 
150 | # MNIST Dataset (Images and Labels)
151 | # If you have not mounted the dataset, you can download it
152 | # just adding download=True as parameter
153 | train_dataset = dsets.MNIST(root='/input',
154 | 							train=True,
155 | 							download=True,
156 | 							transform=transforms.ToTensor())
157 | x_train_mnist, y_train_mnist = train_dataset.train_data.type(torch.FloatTensor), \
158 | 							train_dataset.train_labels
159 | test_dataset = dsets.MNIST(root='/input',
160 | 							train=False,
161 | 							download=True,
162 | 							transform=transforms.ToTensor())
163 | x_test_mnist, y_test_mnist = test_dataset.test_data.type(torch.FloatTensor), \
164 | 							test_dataset.test_labels
165 | 
166 | # Dataset info
167 | print('Training Data Size: ', x_train_mnist.size(), '-', y_train_mnist.size())
168 | print('Testing Data Size: ', x_test_mnist.size(), '-', y_test_mnist.size())
169 | 
170 | # Training Dataset Loader (Input Pipeline)
171 | train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
172 | 											batch_size=batch_size,
173 | 											shuffle=True)
174 | # Testing Dataset Loader (Input Pipeline)
175 | test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
176 | 											batch_size=batch_size,
177 | 											shuffle=False)
178 | 
179 | # #### Model ####
180 | # Convolutional Neural Network Model
181 | class CNN(nn.Module):
182 | 	"""Conv[ReLU] -> Conv[ReLU] -> MaxPool -> Dropout(0.25)-
183 | 	-> Flatten -> FC()[ReLU] -> Dropout(0.5) -> FC()[Softmax]
184 | 	"""
185 | 	def __init__(self, num_classes):
186 | 		super(CNN, self).__init__()
187 | 		self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
188 | 		self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
189 | 		self.drop1 = nn.Dropout2d(p=0.25)
190 | 		self.fc1 = nn.Linear(9216, 128)
191 | 		self.drop2 = nn.Dropout2d(p=0.5)
192 | 		self.fc2 = nn.Linear(128, num_classes)
193 | 
194 | 	def forward(self, x):
195 | 		x = F.relu(self.conv1(x))
196 | 		x = F.max_pool2d(F.relu(self.conv2(x)), 2)
197 | 		x = self.drop1(x)
198 | 		x = x.view(-1, 9216)
199 | 		x = F.relu(self.fc1(x))
200 | 		x = self.drop2(x)
201 | 		x = self.fc2(x)
202 | 		return F.log_softmax(x)
203 | 
204 | model = CNN(num_classes)
205 | print(model)
206 | 
207 | # If you are running a GPU instance, load the model on GPU
208 | if cuda:
209 | 	model.cuda()
210 | 
211 | # #### Loss and Optimizer ####
212 | # Softmax is internally computed.
213 | loss_fn = nn.CrossEntropyLoss()
214 | # If you are running a GPU instance, compute the loss on GPU
215 | if cuda:
216 | 	loss_fn.cuda()
217 | 
218 | # Set parameters to be updated.
219 | optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
220 | 
221 | # If a best-model checkpoint exists, load its weights
222 | if os.path.isfile(resume_weights):
223 | 	print("=> loading checkpoint '{}' ...".format(resume_weights))
224 | 	if cuda:
225 | 		checkpoint = torch.load(resume_weights)
226 | 	else:
227 | 		# Load GPU model on CPU
228 | 		checkpoint = torch.load(resume_weights,
229 | 								map_location=lambda storage,
230 | 								loc: storage)
231 | 	start_epoch = checkpoint['epoch']
232 | 	best_accuracy = checkpoint['best_accuracy']
233 | 	model.load_state_dict(checkpoint['state_dict'])
234 | 	print("=> loaded checkpoint '{}' (trained for {} epochs)".format(resume_weights,
235 | 		checkpoint['epoch']))
236 | 
237 | 
238 | # Training the Model
239 | for epoch in range(num_epochs):
240 | 	train(model, optimizer, train_loader, test_loader, loss_fn)
241 | 	acc = eval(model, optimizer, test_loader)
242 | 	acc = 100. * acc / len(test_loader.dataset)
243 | 	print('=> Test set: Accuracy: {:.2f}%'.format(acc))
244 | 	acc = torch.FloatTensor([acc])
245 | 	# Get bool not ByteTensor
246 | 	is_best = bool(acc.numpy() > best_accuracy.numpy())
247 | 	# Get greater Tensor to keep track best acc
248 | 	best_accuracy = torch.FloatTensor(max(acc.numpy(), best_accuracy.numpy()))
249 | 	# Save checkpoint if is a new best
250 | 	save_checkpoint({
251 | 		'epoch': start_epoch + epoch + 1,
252 | 		'state_dict': model.state_dict(),
253 | 		'best_accuracy': best_accuracy
254 | 	}, is_best)
255 | 


--------------------------------------------------------------------------------
/tf_mnist_cnn_jupyter.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Save and Resume a Tensorflow MNIST ConvNet Model\n",
  8 |     "\n",
  9 |     "This Jupyter notebook shows you how to save and resume a TensorFlow model. In this example we will use the deep-learning hello-world: the MNIST classification task.\n",
 10 |     "Note: to run code cell you have to press **`Shift + Enter`**."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "metadata": {},
 16 |    "source": [
 17 |     "### Import Packages\n",
 18 |     "First we need a single point with all the dependencies:"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 1,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "from __future__ import absolute_import\n",
 28 |     "from __future__ import division\n",
 29 |     "from __future__ import print_function\n",
 30 |     "\n",
 31 |     "from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets\n",
 32 |     "\n",
 33 |     "import numpy as np\n",
 34 |     "import tensorflow as tf\n",
 35 |     "import shutil, os\n",
 36 |     "\n",
 37 |     "tf.logging.set_verbosity(tf.logging.INFO)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "### Hyper Parameters and Variables\n",
 45 |     "\n",
 46 |     "It is also good practice to define hyper-parameters and variables in a single place: this improves code readability and speeds up experiment iteration."
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 2,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "# Where to resume a checkpoint from (/model) and where to save it (/output)\n",
 56 |     "resumepath =\"/model/mnist_convnet_model\"\n",
 57 |     "filepath = \"/output/mnist_convnet_model\" \n",
 58 |     "\n",
 59 |     "# Hyper-parameters\n",
 60 |     "batch_size = 128\n",
 61 |     "num_classes = 10\n",
 62 |     "num_epochs = 12\n",
 63 |     "learning_rate = 1e-3"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "### Resuming from Previous Run\n",
 71 |     "\n",
 72 |     "If we have mounted a previous run, copy its checkpoint to the `/output` folder so that the model resumes from it and saves everything there."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 3,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "# If a checkpoint from a previous run exists, copy it into the /output folder\n",
 82 |     "if os.path.exists(resumepath):\n",
 83 |     "    shutil.copytree(resumepath, filepath)"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "### Data Processing and Transformation\n",
 91 |     "Next, we load the dataset samples as NumPy arrays, ready to be fed into the model."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 4,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.\n",
104 |       "Extracting /input/MNIST_data/train-images-idx3-ubyte.gz\n",
105 |       "Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.\n",
106 |       "Extracting /input/MNIST_data/train-labels-idx1-ubyte.gz\n",
107 |       "Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.\n",
108 |       "Extracting /input/MNIST_data/t10k-images-idx3-ubyte.gz\n",
109 |       "Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.\n",
110 |       "Extracting /input/MNIST_data/t10k-labels-idx1-ubyte.gz\n",
111 |       "(60000, 784)\n",
112 |       "(10000, 784)\n"
113 |      ]
114 |     }
115 |    ],
116 |    "source": [
117 |     "# Load training and eval data\n",
118 |     "mnist = read_data_sets(train_dir='/input/MNIST_data', validation_size=0)\n",
119 |     "train_data = mnist.train.images  # Returns np.array\n",
120 |     "train_labels = np.asarray(mnist.train.labels, dtype=np.int32)\n",
121 |     "eval_data = mnist.test.images  # Returns np.array\n",
122 |     "eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)\n",
123 |     "\n",
124 |     "print (train_data.shape)\n",
125 |     "print (eval_data.shape)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "### Define the Model\n",
133 |     "A ConvNet model, the state-of-the-art architecture for image classification tasks."
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 9,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "def cnn_model_fn(features, labels, mode):\n",
143 |     "    \"\"\"Model function for CNN.\"\"\"\n",
144 |     "    # Input Layer\n",
145 |     "    # Reshape X to 4-D tensor: [batch_size, height, width, channels]\n",
146 |     "    # MNIST images are 28x28 pixels, and have one color channel\n",
147 |     "    input_layer = tf.reshape(features[\"x\"], [-1, 28, 28, 1])\n",
148 |     "\n",
149 |     "    # Convolutional Layer #1\n",
150 |     "    # Computes 32 features using a 3x3 filter with ReLU activation.\n",
151 |     "    # Input Tensor Shape: [batch_size, 28, 28, 1]\n",
152 |     "    # Output Tensor Shape: [batch_size, 26, 26, 32]\n",
153 |     "    conv1 = tf.layers.conv2d(\n",
154 |     "      inputs=input_layer,\n",
155 |     "      filters=32,\n",
156 |     "      kernel_size=[3, 3],\n",
157 |     "      activation=tf.nn.relu)\n",
158 |     "\n",
159 |     "    # Convolutional Layer #2\n",
160 |     "    # Computes 64 features using a 3x3 filter.\n",
161 |     "    # Input Tensor Shape: [batch_size, 26, 26, 32]\n",
162 |     "    # Output Tensor Shape: [batch_size, 24, 24, 64]\n",
163 |     "    conv2 = tf.layers.conv2d(\n",
164 |     "      inputs=conv1,\n",
165 |     "      filters=64,\n",
166 |     "      kernel_size=[3, 3],\n",
167 |     "      activation=tf.nn.relu)\n",
168 |     "\n",
169 |     "    # Pooling Layer\n",
170 |     "    # Max pooling layer with a 2x2 filter and stride of 2\n",
171 |     "    # Input Tensor Shape: [batch_size, 24, 24, 64]\n",
172 |     "    # Output Tensor Shape: [batch_size, 12, 12, 64]\n",
173 |     "    pool = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)\n",
174 |     "\n",
175 |     "    # Dropout # 1\n",
176 |     "    # Add dropout operation; 0.25 probability that element will be dropped\n",
177 |     "    dropout = tf.layers.dropout(\n",
178 |     "      inputs=pool, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN)\n",
179 |     "\n",
180 |     "    # Flatten tensor into a batch of vectors\n",
181 |     "    # Input Tensor Shape: [batch_size, 12, 12, 64]\n",
182 |     "    # Output Tensor Shape: [batch_size, 12 * 12 * 64]\n",
183 |     "    flat = tf.reshape(dropout, [-1, 12 * 12 * 64])  # 9216\n",
184 |     "\n",
185 |     "    \n",
186 |     "    # Dense Layer # 1\n",
187 |     "    # Densely connected layer with 128 neurons\n",
188 |     "    # Input Tensor Shape: [batch_size, 12 * 12 * 64] (batch_size, 9216)\n",
189 |     "    # Output Tensor Shape: [batch_size, 128]\n",
190 |     "    dense1 = tf.layers.dense(inputs=flat, units=128, activation=tf.nn.relu)\n",
191 |     "    \n",
192 |     "    # Dropout # 2\n",
193 |     "    # Add dropout operation; 0.5 probability that element will be dropped\n",
194 |     "    dropout2 = tf.layers.dropout(\n",
195 |     "      inputs=dense1, rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)\n",
196 |     "\n",
197 |     "    # Logits layer\n",
198 |     "    # Input Tensor Shape: [batch_size, 128]\n",
199 |     "    # Output Tensor Shape: [batch_size, 10]\n",
200 |     "    logits = tf.layers.dense(inputs=dropout2, units=num_classes)\n",
201 |     "\n",
202 |     "    predictions = {\n",
203 |     "        # Generate predictions (for PREDICT and EVAL mode)\n",
204 |     "        \"classes\": tf.argmax(input=logits, axis=1),\n",
205 |     "        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the\n",
206 |     "        # `logging_hook`.\n",
207 |     "        \"probabilities\": tf.nn.softmax(logits, name=\"softmax_tensor\")\n",
208 |     "    }\n",
209 |     "    # Inference (for TEST mode)\n",
210 |     "    if mode == tf.estimator.ModeKeys.PREDICT:\n",
211 |     "        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n",
212 |     "\n",
213 |     "    # Calculate Loss (for both TRAIN and EVAL modes)\n",
214 |     "    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_classes)\n",
215 |     "    # Cross Entropy\n",
216 |     "    loss = tf.losses.softmax_cross_entropy(\n",
217 |     "      onehot_labels=onehot_labels, logits=logits)\n",
218 |     "\n",
219 |     "    # Configure the Training Op (for TRAIN mode)\n",
220 |     "    if mode == tf.estimator.ModeKeys.TRAIN:\n",
221 |     "        # AdamOptimizer\n",
222 |     "        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
223 |     "        train_op = optimizer.minimize(\n",
224 |     "            loss=loss,\n",
225 |     "            global_step=tf.train.get_global_step())\n",
226 |     "        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)\n",
227 |     "\n",
228 |     "    # Add evaluation metrics (for EVAL mode)\n",
229 |     "    eval_metric_ops = {\n",
230 |     "      \"accuracy\": tf.metrics.accuracy(\n",
231 |     "          labels=labels, predictions=predictions[\"classes\"])}\n",
232 |     "    return tf.estimator.EstimatorSpec(\n",
233 |     "      mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "markdown",
238 |    "metadata": {},
239 |    "source": [
240 |     "### Checkpoint Strategy\n",
241 |     "\n",
242 |     "The strategy we have adopted for this example is the following:\n",
243 |     "\n",
244 |     "- Keep only one checkpoint\n",
245 |     "- Trigger the strategy at the end of every epoch"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": 6,
251 |    "metadata": {},
252 |    "outputs": [],
253 |    "source": [
254 |     "# Checkpoint Strategy configuration\n",
255 |     "run_config = tf.contrib.learn.RunConfig(\n",
256 |     "    model_dir=filepath,\n",
257 |     "    keep_checkpoint_max=1)"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 7,
263 |    "metadata": {},
264 |    "outputs": [
265 |     {
266 |      "name": "stdout",
267 |      "output_type": "stream",
268 |      "text": [
269 |       "INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc252758d30>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {\n",
270 |       "  per_process_gpu_memory_fraction: 1.0\n",
271 |       "}\n",
272 |       ", '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/output/mnist_convnet_model'}\n"
273 |      ]
274 |     }
275 |    ],
276 |    "source": [
277 |     "# Create the Estimator\n",
278 |     "mnist_classifier = tf.estimator.Estimator(\n",
279 |     "      model_fn=cnn_model_fn, config=run_config)"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "markdown",
284 |    "metadata": {},
285 |    "source": [
286 |     "### Training\n",
287 |     "Let's train the model and see our checkpoint strategy in action."
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {},
294 |    "outputs": [
295 |     {
296 |      "name": "stdout",
297 |      "output_type": "stream",
298 |      "text": [
299 |       "Begin Training - Epoch 1/12\n",
300 |       "INFO:tensorflow:Create CheckpointSaverHook.\n",
301 |       "INFO:tensorflow:Restoring parameters from /output/mnist_convnet_model/model.ckpt-3284\n",
302 |       "INFO:tensorflow:Saving checkpoints for 3285 into /output/mnist_convnet_model/model.ckpt.\n",
303 |       "INFO:tensorflow:loss = 0.0112094, step = 3285\n",
304 |       "INFO:tensorflow:global_step/sec: 55.4695\n",
305 |       "INFO:tensorflow:loss = 0.0575566, step = 3385 (1.804 sec)\n"
306 |      ]
307 |     }
308 |    ],
309 |    "source": [
310 |     "# Keep track of the best accuracy\n",
311 |     "best_acc = 0\n",
312 |     "\n",
313 |     "# Training for num_epochs\n",
314 |     "for i in range(num_epochs):\n",
315 |     "    print(\"Begin Training - Epoch {}/{}\".format(i+1, num_epochs))\n",
316 |     "    # Train the model for 1 epoch\n",
317 |     "    train_input_fn = tf.estimator.inputs.numpy_input_fn(\n",
318 |     "        x={\"x\": train_data},\n",
319 |     "        y=train_labels,\n",
320 |     "        batch_size=batch_size,\n",
321 |     "        num_epochs=1,\n",
322 |     "        shuffle=True)\n",
323 |     "\n",
324 |     "    mnist_classifier.train(\n",
325 |     "        input_fn=train_input_fn)\n",
326 |     "\n",
327 |     "    # Evaluate the model and print results\n",
328 |     "    eval_input_fn = tf.estimator.inputs.numpy_input_fn(\n",
329 |     "        x={\"x\": eval_data},\n",
330 |     "        y=eval_labels,\n",
331 |     "        num_epochs=1,\n",
332 |     "        shuffle=False)\n",
333 |     "    \n",
334 |     "    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)\n",
335 |     "    \n",
336 |     "    accuracy = eval_results[\"accuracy\"] * 100\n",
337 |     "    # Set the best acc if we have a new best or if it is the first step \n",
338 |     "    if accuracy > best_acc or i == 0:\n",
339 |     "        best_acc = accuracy\n",
340 |     "        print (\"=> New Best Accuracy {}\".format(accuracy))\n",
341 |     "    else:\n",
342 |     "        print(\"=> Validation Accuracy did not improve\")"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "markdown",
347 |    "metadata": {},
348 |    "source": [
349 |     "### Resume the checkpoint after the training\n",
350 |     "Let's take a look at the checkpoint just created. (you should see the `mnist_convnet_model` folder)"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 39,
356 |    "metadata": {},
357 |    "outputs": [
358 |     {
359 |      "name": "stdout",
360 |      "output_type": "stream",
361 |      "text": [
362 |       "\u001b[0m\u001b[01;34mMNIST-data\u001b[0m/     command.sh                     \u001b[01;34mmnist_convnet_model\u001b[0m/\r\n",
363 |       "README.md       keras_mnist_cnn.py             pytorch_mnist_cnn.py\r\n",
364 |       "Untitled.ipynb  keras_mnist_cnn_jupyter.ipynb  pytorch_mnist_cnn_jupyter.ipynb\r\n"
365 |      ]
366 |     }
367 |    ],
368 |    "source": [
369 |     "% ls"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "metadata": {},
375 |    "source": [
376 |     "The Jupyter Notebook runs in the `/output` folder, so the checkpoint is here. If you want to load it, you only need to rerun the **Training** code cell; the Estimator will take care of everything."
377 |    ]
378 |   }
379 |  ],
380 |  "metadata": {
381 |   "kernelspec": {
382 |    "display_name": "Python 3",
383 |    "language": "python",
384 |    "name": "python3"
385 |   },
386 |   "language_info": {
387 |    "codemirror_mode": {
388 |     "name": "ipython",
389 |     "version": 3
390 |    },
391 |    "file_extension": ".py",
392 |    "mimetype": "text/x-python",
393 |    "name": "python",
394 |    "nbconvert_exporter": "python",
395 |    "pygments_lexer": "ipython3",
396 |    "version": "3.6.2"
397 |   }
398 |  },
399 |  "nbformat": 4,
400 |  "nbformat_minor": 2
401 | }
402 | 


--------------------------------------------------------------------------------
/keras_mnist_cnn_jupyter.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Save and Resume a Keras MNIST ConvNet Model\n",
  8 |     "\n",
  9 |     "This Jupyter notebook shows you how to save and resume a Keras model. In this example we will use the Deep Learning hello-world: the MNIST classification task.\n",
 10 |     "\n",
 11 |     "Note: to run a code cell you have to press **`Shift + Enter`**."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "### Import Packages\n",
 19 |     "\n",
 20 |     "First we need a single point with all the dependencies:"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {},
 27 |    "outputs": [
 28 |     {
 29 |      "name": "stderr",
 30 |      "output_type": "stream",
 31 |      "text": [
 32 |       "Using TensorFlow backend.\n"
 33 |      ]
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "from __future__ import print_function\n",
 38 |     "import keras\n",
 39 |     "import os.path\n",
 40 |     "from keras.datasets import mnist\n",
 41 |     "from keras.models import Sequential\n",
 42 |     "from keras.layers import Dense, Dropout, Flatten\n",
 43 |     "from keras.layers import Conv2D, MaxPooling2D\n",
 44 |     "from keras import backend as K\n",
 45 |     "from keras.callbacks import ModelCheckpoint"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "### Hyper Parameters and Variables\n",
 53 |     "\n",
 54 |     "It is good practice to define Hyper-Parameters and Variables in a single place as well: this improves code readability and experiment iteration."
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 10,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "# Path to saved model weights(as hdf5)\n",
 64 |     "resume_weights = \"/model/mnist-cnn-best.hdf5\"\n",
 65 |     "\n",
 66 |     "# Where to save Checkpoint(In the /output folder)\n",
 67 |     "filepath = \"/output/mnist-cnn-best.hdf5\"\n",
 68 |     "\n",
 69 |     "# Hyper-parameters\n",
 70 |     "batch_size = 128\n",
 71 |     "num_classes = 10\n",
 72 |     "epochs = 12\n",
 73 |     "\n",
 74 |     "# input image dimensions\n",
 75 |     "img_rows, img_cols = 28, 28"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "markdown",
 80 |    "metadata": {},
 81 |    "source": [
 82 |     "### Data Processing and Transformation\n",
 83 |     "\n",
 84 |     "Next, we convert the dataset samples into tensors, ready to be fed into the model."
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 3,
 90 |    "metadata": {},
 91 |    "outputs": [
 92 |     {
 93 |      "name": "stdout",
 94 |      "output_type": "stream",
 95 |      "text": [
 96 |       "Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz\n",
 97 |       "10682368/11490434 [==========================>...] - ETA: 0sx_train shape: (60000, 28, 28, 1)\n",
 98 |       "60000 train samples\n",
 99 |       "10000 test samples\n"
100 |      ]
101 |     }
102 |    ],
103 |    "source": [
104 |     "# MNIST handwritten image classification\n",
105 |     "# the data, shuffled and split between train and test sets\n",
106 |     "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
107 |     "\n",
108 |     "# Reshape strategy according to backend\n",
109 |     "if K.image_data_format() == 'channels_first':\n",
110 |     "    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)\n",
111 |     "    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)\n",
112 |     "    # 1 x 28 x 28 [number_of_channels (colors) x height x width]\n",
113 |     "    input_shape = (1, img_rows, img_cols)\n",
114 |     "else:\n",
115 |     "    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)\n",
116 |     "    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)\n",
117 |     "    # 28 x 28 x 1 [height x width x number_of_channels (colors)]\n",
118 |     "    input_shape = (img_rows, img_cols, 1)\n",
119 |     "\n",
120 |     "# Reshape, type, normalized, print\n",
121 |     "x_train = x_train.astype('float32')\n",
122 |     "x_test = x_test.astype('float32')\n",
123 |     "x_train /= 255\n",
124 |     "x_test /= 255\n",
125 |     "\n",
126 |     "# Dataset info\n",
127 |     "print('x_train shape:', x_train.shape)\n",
128 |     "print(x_train.shape[0], 'train samples')\n",
129 |     "print(x_test.shape[0], 'test samples')\n",
130 |     "\n",
131 |     "# convert class vectors to binary class matrices\n",
132 |     "y_train = keras.utils.to_categorical(y_train, num_classes)\n",
133 |     "y_test = keras.utils.to_categorical(y_test, num_classes)"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "### Define the Model\n",
141 |     "\n",
142 |     "A ConvNet Model, state of the art for image classification task."
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 4,
148 |    "metadata": {},
149 |    "outputs": [
150 |     {
151 |      "name": "stdout",
152 |      "output_type": "stream",
153 |      "text": [
154 |       "_________________________________________________________________\n",
155 |       "Layer (type)                 Output Shape              Param #   \n",
156 |       "=================================================================\n",
157 |       "conv2d_1 (Conv2D)            (None, 26, 26, 32)        320       \n",
158 |       "_________________________________________________________________\n",
159 |       "conv2d_2 (Conv2D)            (None, 24, 24, 64)        18496     \n",
160 |       "_________________________________________________________________\n",
161 |       "max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64)        0         \n",
162 |       "_________________________________________________________________\n",
163 |       "dropout_1 (Dropout)          (None, 12, 12, 64)        0         \n",
164 |       "_________________________________________________________________\n",
165 |       "flatten_1 (Flatten)          (None, 9216)              0         \n",
166 |       "_________________________________________________________________\n",
167 |       "dense_1 (Dense)              (None, 128)               1179776   \n",
168 |       "_________________________________________________________________\n",
169 |       "dropout_2 (Dropout)          (None, 128)               0         \n",
170 |       "_________________________________________________________________\n",
171 |       "dense_2 (Dense)              (None, 10)                1290      \n",
172 |       "=================================================================\n",
173 |       "Total params: 1,199,882\n",
174 |       "Trainable params: 1,199,882\n",
175 |       "Non-trainable params: 0\n",
176 |       "_________________________________________________________________\n"
177 |      ]
178 |     }
179 |    ],
180 |    "source": [
181 |     "# MODEL\n",
182 |     "# Conv(32,3,3)[ReLU] -> Conv(64,3,3)[ReLU] -> MaxPool(2,2)[Dropout 0.25] ->\n",
183 |     "# FC(_, 128)[ReLU][Dropout 0.5] -> FC(128, 10)[Softmax]\n",
184 |     "model = Sequential()\n",
185 |     "model.add(Conv2D(32, kernel_size=(3, 3),\n",
186 |     "                    activation='relu',\n",
187 |     "                    input_shape=input_shape))\n",
188 |     "model.add(Conv2D(64, (3, 3), activation='relu'))\n",
189 |     "model.add(MaxPooling2D(pool_size=(2, 2)))\n",
190 |     "model.add(Dropout(0.25))\n",
191 |     "model.add(Flatten())\n",
192 |     "model.add(Dense(128, activation='relu'))\n",
193 |     "model.add(Dropout(0.5))\n",
194 |     "model.add(Dense(num_classes, activation='softmax'))\n",
195 |     "\n",
196 |     "model.summary()"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "### Resume a checkpoint\n",
204 |     "\n",
205 |     "Run the following line if you want to resume an existing checkpoint."
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 11,
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "Resumed model's weights from ./mnist-cnn-best.hdf5\n"
218 |      ]
219 |     }
220 |    ],
221 |    "source": [
222 |     "# If exists a best model, load its weights!\n",
223 |     "if os.path.isfile(resume_weights):\n",
224 |     "    print (\"Resumed model's weights from {}\".format(resume_weights))\n",
225 |     "    # load weights\n",
226 |     "    model.load_weights(resume_weights)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "### Define The Loss Function and The Optimizers\n",
234 |     "\n",
235 |     "In this example we use the Cross Entropy Loss and Adam Optimizer."
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 6,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "# CEE, Adam\n",
245 |     "model.compile(loss=keras.losses.categorical_crossentropy,\n",
246 |     "            optimizer=keras.optimizers.Adam(),\n",
247 |     "            metrics=['accuracy'])"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "markdown",
252 |    "metadata": {},
253 |    "source": [
254 |     "### Checkpoint Strategy\n",
255 |     "\n",
256 |     "The strategy we have adopted for this example is the following:\n",
257 |     "- Keep only one checkpoint\n",
258 |     "- Trigger the strategy at the end of every epoch\n",
259 |     "- Save the one with the best(max) validation accuracy "
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 7,
265 |    "metadata": {},
266 |    "outputs": [],
267 |    "source": [
268 |     "# Keep only a single checkpoint, the best over test accuracy.\n",
269 |     "checkpoint = ModelCheckpoint(filepath,\n",
270 |     "                            monitor='val_acc',\n",
271 |     "                            verbose=1,\n",
272 |     "                            save_best_only=True,\n",
273 |     "                            mode='max')"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "markdown",
278 |    "metadata": {},
279 |    "source": [
280 |     "### Training\n",
281 |     "\n",
282 |     "Let's train the model and see our checkpoint strategy in action."
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 12,
288 |    "metadata": {},
289 |    "outputs": [
290 |     {
291 |      "name": "stdout",
292 |      "output_type": "stream",
293 |      "text": [
294 |       "Train on 60000 samples, validate on 10000 samples\n",
295 |       "Epoch 1/12\n",
296 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0210 - acc: 0.9928Epoch 00000: val_acc did not improve\n",
297 |       "60000/60000 [==============================] - 9s - loss: 0.0210 - acc: 0.9929 - val_loss: 0.0309 - val_acc: 0.9912\n",
298 |       "Epoch 2/12\n",
299 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0207 - acc: 0.9931Epoch 00001: val_acc did not improve\n",
300 |       "60000/60000 [==============================] - 9s - loss: 0.0207 - acc: 0.9931 - val_loss: 0.0248 - val_acc: 0.9927\n",
301 |       "Epoch 3/12\n",
302 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0204 - acc: 0.9934Epoch 00002: val_acc did not improve\n",
303 |       "60000/60000 [==============================] - 9s - loss: 0.0205 - acc: 0.9934 - val_loss: 0.0270 - val_acc: 0.9922\n",
304 |       "Epoch 4/12\n",
305 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0186 - acc: 0.9939Epoch 00003: val_acc did not improve\n",
306 |       "60000/60000 [==============================] - 9s - loss: 0.0186 - acc: 0.9940 - val_loss: 0.0279 - val_acc: 0.9928\n",
307 |       "Epoch 5/12\n",
308 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0171 - acc: 0.9944Epoch 00004: val_acc did not improve\n",
309 |       "60000/60000 [==============================] - 9s - loss: 0.0171 - acc: 0.9944 - val_loss: 0.0273 - val_acc: 0.9924\n",
310 |       "Epoch 6/12\n",
311 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0153 - acc: 0.9948Epoch 00005: val_acc did not improve\n",
312 |       "60000/60000 [==============================] - 9s - loss: 0.0153 - acc: 0.9948 - val_loss: 0.0289 - val_acc: 0.9928\n",
313 |       "Epoch 7/12\n",
314 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0157 - acc: 0.9944Epoch 00006: val_acc improved from 0.99300 to 0.99360, saving model to /output/mnist-cnn-best.hdf5\n",
315 |       "60000/60000 [==============================] - 9s - loss: 0.0157 - acc: 0.9944 - val_loss: 0.0296 - val_acc: 0.9936\n",
316 |       "Epoch 8/12\n",
317 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0145 - acc: 0.9949Epoch 00007: val_acc did not improve\n",
318 |       "60000/60000 [==============================] - 9s - loss: 0.0145 - acc: 0.9949 - val_loss: 0.0281 - val_acc: 0.9928\n",
319 |       "Epoch 9/12\n",
320 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0152 - acc: 0.9953Epoch 00008: val_acc did not improve\n",
321 |       "60000/60000 [==============================] - 9s - loss: 0.0152 - acc: 0.9953 - val_loss: 0.0278 - val_acc: 0.9927\n",
322 |       "Epoch 10/12\n",
323 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0127 - acc: 0.9958Epoch 00009: val_acc did not improve\n",
324 |       "60000/60000 [==============================] - 9s - loss: 0.0127 - acc: 0.9959 - val_loss: 0.0303 - val_acc: 0.9926\n",
325 |       "Epoch 11/12\n",
326 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0136 - acc: 0.9955Epoch 00010: val_acc did not improve\n",
327 |       "60000/60000 [==============================] - 9s - loss: 0.0135 - acc: 0.9956 - val_loss: 0.0296 - val_acc: 0.9931\n",
328 |       "Epoch 12/12\n",
329 |       "59648/60000 [============================>.] - ETA: 0s - loss: 0.0128 - acc: 0.9955Epoch 00011: val_acc improved from 0.99360 to 0.99380, saving model to /output/mnist-cnn-best.hdf5\n",
330 |       "60000/60000 [==============================] - 9s - loss: 0.0130 - acc: 0.9954 - val_loss: 0.0276 - val_acc: 0.9938\n",
331 |       "Test loss: 0.0276465165614\n",
332 |       "Test accuracy: 0.9938\n"
333 |      ]
334 |     }
335 |    ],
336 |    "source": [
337 |     "# Train\n",
338 |     "model.fit(x_train, y_train,\n",
339 |     "                batch_size=batch_size,\n",
340 |     "                epochs=epochs,\n",
341 |     "                verbose=1,\n",
342 |     "                validation_data=(x_test, y_test),\n",
343 |     "                callbacks=[checkpoint])\n",
344 |     "\n",
345 |     "# Eval\n",
346 |     "score = model.evaluate(x_test, y_test, verbose=0)\n",
347 |     "print('Test loss:', score[0])\n",
348 |     "print('Test accuracy:', score[1])"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {},
354 |    "source": [
355 |     "### Resume the checkpoint after the training\n",
356 |     "\n",
357 |     "Let's take a look at the checkpoint just created. (you should see the `mnist-cnn-best.hdf5` file)"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 9,
363 |    "metadata": {},
364 |    "outputs": [
365 |     {
366 |      "name": "stdout",
367 |      "output_type": "stream",
368 |      "text": [
369 |       "\u001b[0m\u001b[01;34mMNIST_data\u001b[0m/     command.sh                     mnist-cnn-best.hdf5\r\n",
370 |       "README.md       keras_mnist_cnn.py             pytorch_mnist_cnn.py\r\n",
371 |       "Untitled.ipynb  keras_mnist_cnn_jupyter.ipynb  pytorch_mnist_cnn_jupyter.ipynb\r\n"
372 |      ]
373 |     }
374 |    ],
375 |    "source": [
376 |     "% ls"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "metadata": {},
382 |    "source": [
383 |     "The Jupyter Notebook runs in the `/output` folder, so the checkpoint is here.\n",
384 |     "If you want to load it, go to the Hyper Parameters and Variables code cell and replace the resume weights variable in this way:\n",
385 |     "`# Path to saved model weights(as hdf5)\n",
386 |     "resume_weights = \"./mnist-cnn-best.hdf5\"`, run the cell, go to the **Resume a checkpoint** Code Cell, run it, and rerun the **Training Code Cell**, that's it."
387 |    ]
388 |   }
389 |  ],
390 |  "metadata": {
391 |   "kernelspec": {
392 |    "display_name": "Python 3",
393 |    "language": "python",
394 |    "name": "python3"
395 |   },
396 |   "language_info": {
397 |    "codemirror_mode": {
398 |     "name": "ipython",
399 |     "version": 3
400 |    },
401 |    "file_extension": ".py",
402 |    "mimetype": "text/x-python",
403 |    "name": "python",
404 |    "nbconvert_exporter": "python",
405 |    "pygments_lexer": "ipython3",
406 |    "version": "3.6.2"
407 |   }
408 |  },
409 |  "nbformat": 4,
410 |  "nbformat_minor": 2
411 | }
412 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Save And Resume your Experiments
  2 | 
  3 | This repo contains the code to show how to save checkpoints during training and resume your experiments from them.
  4 | We will show you how to perform it on Tensorflow, Keras and PyTorch.
  5 | 
  6 | ## Why checkpointing?
  7 | 
  8 | ![save game screen FF-like](https://i.imgur.com/xdpSAzq.png)
  9 | 
 10 | Imagine your experiments as a video game: sometimes you want to save your game or resume it from an existing state. Checkpoints in Machine/Deep Learning experiments are the same thing: you do not want to lose your experiments due to a blackout, OS faults or other kinds of errors. Sometimes you just want to resume a particular state of the training for new experiments or to try different things. That's why you need checkpoints!
 11 | 
 12 | Not to mention that without a checkpoint at the end of the training, you will have lost all the training! Like finishing a game without saving at the end.
 13 | 
 14 | ## What is a checkpoint made of?
 15 | 
 16 | A checkpoint can consist of:
 17 | 
 18 | - The architecture of the model, allowing to re-create the model
 19 | - The weights of the model
 20 | - The training configuration (loss, optimizer, epochs and other meta-infos)
 21 | - The state of the optimizer, allowing to resume training exactly where you left off.
 22 | 
 23 | *Taken from Keras docs [how-can-i-save-a-keras-model](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model)*.
 24 | 
 25 | ## Checkpoint Strategies
 26 | 
 27 | There are different checkpoint strategies according to the type of training regime you are performing:
 28 | 
 29 | - Short Training Regime (minutes - hours)
 30 | - Normal Training Regime (hours - day)
 31 | - Long Training Regime (days - weeks)
 32 | 
 33 | ### Short Training Regime
 34 | In this type of training regime it is common practice to save a checkpoint only at the end of the training or at the end of every epoch.
 35 | 
 36 | ### Normal Training Regime
 37 | In this type of training regime it is common practice to save multiple checkpoints, one every n_epochs, and keep track of which one is the best with respect to the validation metric we care about. Usually only a fixed number of checkpoints is kept so as not to take up too much space, for example only 10 checkpoints (the new ones will replace the oldest ones).
 38 | 
 39 | ### Long Training Regime
 40 | In this type of training regime it is common practice to save multiple checkpoints, one every n_epochs, and keep track of which one is the best with respect to the validation metric we care about. Since the training can be really long, it is common to save less frequently but keep more checkpoint files, so that we are able to resume the training in particular situations.
 41 | 
 42 | *Obviously you can use a custom Checkpoint Strategy according to your need and the task you will run.*
 43 | 
 44 | ## The Tradeoff
 45 | 
 46 | The tradeoff is between the **frequency** and the **number of checkpoint files** to keep. Let's take a look at what happens when we act on these two parameters:
 47 | 
 48 | Frequency | Number of checkpoints to keep | Cons | Pro
 49 | --------- | ----------------------------- | ---- | ---
 50 | High | High | You need a lot of space!! | You can resume very quickly in almost all the interesting training states.
 51 | High | Low | You could lose precious states. | Minimize the storage space you need.
 52 | Low | High | If something happens between two checkpoints, it will cost you some time to recover it. | You can resume the experiments in a lot of interesting states.
 53 | Low | Low | You could lose precious states. | Minimize the storage space you need.
 54 | 
 55 | 
 56 | Now you have a good intuition about what's the best strategy you can adopt according to your training regime.
 57 | 
 58 | ## Save and Resume on FloydHub
 59 | 
 60 | Before you start, log in on FloydHub with the [floyd login](http://docs.floydhub.com/commands/login/) command, then fork and init the project:
 61 | 
 62 | ```bash
 63 | $ git clone https://github.com/floydhub/save-and-resume.git
 64 | $ cd save-and-resume
 65 | $ floyd init save-and-resume
 66 | ```
 67 | 
 68 | For these examples we use the Deep Learning hello-world: the [MNIST](http://yann.lecun.com/exdb/mnist/) classification task using a Convolutional Neural Network model.
 69 | 
 70 | The strategy we have adopted for the next example is the following:
 71 | - Keep only one checkpoint
 72 | - Trigger the strategy at the end of every epoch
 73 | - Save the one with the best(max) validation accuracy
 74 | 
 75 | Considering the toy example, a Short Training Regime provides a good strategy.
 76 | 
 77 | *As said, this tutorial follows a basic setup; if you have more sophisticated experiments you will have to hack it.*
 78 | 
 79 | This is the basic template you have to follow for saving and resuming when you run your experiments on FloydHub *via script*:
 80 | 
 81 | #### Saving Template command
 82 | 
 83 | ```bash
 84 | floyd run \
 85 |     [--gpu] \
 86 |     --env <env> \
 87 |     --data <your_dataset>:<mounting_point_dataset> \
 88 |     "python <script_and_parameters>"
 89 | ```
 90 | 
 91 | The checkpoint of this script must be saved in the `/output` folder.
 92 | 
 93 | #### Resuming Template after training
 94 | 
 95 | ```bash
 96 | floyd run \
 97 |     [--gpu] \
 98 |     --env <env> \
 99 |     --data <your_dataset>:<mounting_point_dataset> \
100 |     --data <output_of_previous_job>:<mounting_point_model> \
101 |     "python <script_and_parameters>"
102 | ```
103 | The script will resume the checkpoint from the previous Job's Output.
104 | 
105 | Let's see how to make it tangible for the different framework on FloydHub.
106 | 
107 | ## Tensorflow
108 | 
109 | <p align="center">
110 |     <img src="https://www.tensorflow.org/_static/images/tensorflow/logo.png">
111 | </p>
112 | 
113 | Tensorflow provides different ways of saving and resuming a checkpoint. In the example we will use the [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator) API, which behind the scenes uses [tf.train.Saver](https://www.tensorflow.org/api_docs/python/tf/train/Saver), [tf.train.CheckpointSaverHook](https://www.tensorflow.org/api_docs/python/tf/train/CheckpointSaverHook) and [tf.saved_model.builder.SavedModelBuilder](https://www.tensorflow.org/api_docs/python/tf/saved_model/builder/SavedModelBuilder).
114 | 
115 | In more detail, it uses the first function to save, the second one to act according to the adopted strategy and the last one to export the model to be served with the `export_savedmodel()` method.
116 | 
117 | ### Saving
118 | 
119 | Before initializing an Estimator, we have to define the checkpoint strategy. To do this we have to create a configuration for the Estimator using the [tf.estimator.RunConfig](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig) API such as this:
120 | 
121 | ```python
122 | # Checkpoint Strategy configuration
123 | run_config = tf.contrib.learn.RunConfig(
124 |     model_dir=filepath,
125 |     keep_checkpoint_max=1)
126 | ```
127 | 
128 | In this way we are telling the estimator in which directory to save or resume a checkpoint and how many checkpoints to keep.
129 | 
130 | Then we have to provide it, at the initialization of the Estimator:
131 | 
132 | ```python
133 | # Create the Estimator
134 | mnist_classifier = tf.estimator.Estimator(
135 |       model_fn=cnn_model_fn, config=run_config)
136 | ```
137 | 
138 | That's it about saving a checkpoint in Tensorflow using Estimator.
139 | 
140 | ### Resuming
141 | 
142 | After having configured the Estimator, everything is done. If it finds a checkpoint inside the given model folder, it will load the last one.
143 | 
144 | That's it about resuming a checkpoint in Tensorflow using Estimator.
145 | 
146 | ### Run on FloydHub
147 | Here are the steps to run the example on FloydHub.
148 | 
149 | #### Via script
150 | 
151 | First time training:
152 | 
153 | ```bash
154 | floyd run \
155 |     --gpu \
156 |     --env tensorflow-1.3 \
157 |     --data redeipirati/datasets/mnist/1:input \
158 |     'python tf_mnist_cnn.py'
159 | ```
160 | 
161 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6,
162 | - The `--data` flag specifies that the mnist dataset should be available at the `/input` directory,
163 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
164 | 
165 | Resuming:
166 | 
167 | ```bash
168 | floyd run \
169 |     --gpu \
170 |     --env tensorflow-1.3 \
171 |     --data redeipirati/datasets/mnist/1:input \
172 |     --data <your-username>/projects/save-and-resume/<jobs>/output:/model \
173 |     'python tf_mnist_cnn.py'
174 | ```
175 | 
176 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6,
177 | - The first `--data` flag specifies that the mnist dataset should be available at the `/input` directory,
178 | - The second `--data` flag specifies that the output of a previous Job should be available at the `/model` directory,
179 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
180 | 
181 | 
182 | #### Via Jupyter
183 | 
184 | ```bash
185 | floyd run \
186 |     --gpu \
187 |     --env tensorflow-1.3 \
188 |     --data redeipirati/datasets/mnist/1:input \
189 |     --mode jupyter
190 | ```
191 | 
192 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6.
193 | - The `--data` flag specifies that the mnist dataset should be available at the `/input` directory,
194 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
195 | - The `--mode` flag specifies that this job should provide us a Jupyter notebook.
196 | 
197 | Add `--data <your-username>/projects/save-and-resume/<jobs>/output:/model`, if you want to load a checkpoint from a previous Job.
198 | 
199 | ## Keras
200 | 
201 | ![Keras logo](https://s3.amazonaws.com/keras.io/img/keras-logo-2018-large-1200.png)
202 | 
203 | Keras provides a great API for saving and loading checkpoints. Let's take a look:
204 | 
205 | ### Saving
206 | Keras provides a set of functions called [callbacks](https://keras.io/callbacks/): you can think of them as events that will be triggered at certain training states. The callback we need for checkpointing is the [ModelCheckpoint](https://keras.io/callbacks/#modelcheckpoint) which provides all the features we need according to the checkpoint strategy adopted.
207 | 
208 | **This function saves only the model's weights**; if you want to save the whole model or some of its components take a look at [how can i save a keras model from Keras docs](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model).
209 | 
210 | First of all we have to import the callback functions:
211 | ```python
212 | from keras.callbacks import ModelCheckpoint
213 | ```
214 | Next, just before the call to `model.fit(...)` it's time to prepare the checkpoint strategy.
215 | 
216 | ```python
217 | # Checkpoint In the /output folder
218 | filepath = "/output/mnist-cnn-best.hdf5"
219 | 
220 | # Keep only a single checkpoint, the best over test accuracy.
221 | checkpoint = ModelCheckpoint(filepath,
222 |                             monitor='val_acc',
223 |                             verbose=1,
224 |                             save_best_only=True,
225 |                             mode='max')
226 | ```
227 | - `filepath="/output/mnist-cnn-best.hdf5"`: FloydHub returns only the contents inside the `/output` folder! See [save output in the docs](https://docs.floydhub.com/guides/data/storing_output/),
228 | - `monitor='val_acc'`: the metric we care about, validation accuracy,
229 | - `verbose=1`: it will print more information,
230 | - `save_best_only=True`: Keep only the best one (in terms of max val_acc),
231 | - `mode='max'`: save the one with max validation accuracy.
232 | 
233 | The default period (checkpointing frequency) is set to 1, which means at the end of every epoch.
234 | 
235 | For more information (such as filepath formatting options, checkpointing period and more) we encourage you to explore the [ModelCheckpoint](https://keras.io/callbacks/#modelcheckpoint) API.
236 | 
237 | Now we are ready to see it applied during training. To do this, we need to pass the callback variable to the `model.fit(...)` call:
238 | 
239 | ```python
240 | # Train
241 | model.fit(x_train, y_train,
242 |                 batch_size=batch_size,
243 |                 epochs=epochs,
244 |                 verbose=1,
245 |                 validation_data=(x_test, y_test),
246 |                 callbacks=[checkpoint])  # <- Apply our checkpoint strategy
247 | ```
248 | 
249 | According to the chosen strategy you will see:
250 | ```
251 | # This line when the training reach a new max
252 | Epoch <n_epoch>: val_acc improved from <previus val_acc> to <new max val_acc>, saving model to /output/mnist-cnn-best.hdf5
253 | 
254 | # Or this line
255 | Epoch <n_epoch>: val_acc did not improve
256 | ```
257 | 
258 | That's it about saving a checkpoint in Keras.
259 | 
260 | ### Resuming
261 | Keras models have the [`load_weights()`](https://github.com/fchollet/keras/blob/master/keras/models.py#L718-L735) method which loads the weights from an hdf5 file.
262 | 
263 | To load the model's weights you have to add this line just after the model definition:
264 | 
265 | ```python
266 | ... # Model Definition
267 | 
268 | model.load_weights(resume_weights)
269 | ```
270 | 
271 | That's it about resuming a checkpoint in Keras.
272 | 
273 | 
274 | ### Run on FloydHub
275 | Here are the steps to run the example on FloydHub.
276 | 
277 | #### Via script
278 | 
279 | First time training:
280 | 
281 | ```bash
282 | floyd run \
283 |     --gpu \
284 |     --env tensorflow-1.3 \
285 |     'python keras_mnist_cnn.py'
286 | ```
287 | 
288 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6.
289 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
290 | 
291 | [Keras provides an API to handle MNIST data](https://keras.io/datasets/#mnist-database-of-handwritten-digits), so we can skip the dataset mounting since the dataset size is irrelevant.
292 | 
293 | Resuming:
294 | 
295 | ```bash
296 | floyd run \
297 |     --gpu \
298 |     --env tensorflow-1.3 \
299 |     --data <your-username>/projects/save-and-resume/<jobs>/output:/model \
300 |     'python keras_mnist_cnn.py'
301 | ```
302 | 
303 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6.
304 | - The `--data` flag specifies that the output of a previous Job should be available at the `/model` directory
305 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
306 | 
307 | 
308 | #### Via Jupyter
309 | 
310 | ```bash
311 | floyd run \
312 |     --gpu \
313 |     --env tensorflow-1.3 \
314 |     --mode jupyter
315 | ```
316 | 
317 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6.
318 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
319 | - The `--mode` flag specifies that this job should provide us a Jupyter notebook.
320 | 
321 | Add `--data <your-username>/projects/save-and-resume/<jobs>/output:/model`, if you want to load a checkpoint from a previous Job.
322 | 
323 | 
324 | ## PyTorch
325 | 
326 | ![Pytorch logo](http://pytorch.org/docs/master/_static/pytorch-logo-dark.svg)
327 | 
328 | Unfortunately, at the moment PyTorch does not have as great an API as Keras, therefore we need to write our own solution according to the checkpoint strategy adopted (the same we have used on Keras).
329 | 
330 | 
331 | ### Saving
332 | PyTorch does not provide an all-in-one API which defines the checkpoint strategy, but it provides a simple way to save and resume a checkpoint. According to the official docs about [semantic serialization](http://pytorch.org/docs/master/notes/serialization.html), the best practice consists of saving only the weights, due to code refactoring issues.
333 | 
334 | Let's take a look at how to save the model weights in PyTorch:
335 | 
336 | 
337 | First of all define a `save_checkpoint` function which handles all the instructions about the number of checkpoints to keep and the serialization on file:
338 | 
339 | ```python
340 | def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):
341 |     """Save checkpoint if a new best is achieved"""
342 |     if is_best:
343 |         print ("=> Saving a new best")
344 |         torch.save(state, filename)  # save checkpoint
345 |     else:
346 |         print ("=> Validation Accuracy did not improve")
347 | ```
348 | 
349 | Then, inside the training (usually a for loop with the number of epochs), we define the checkpoint frequency (at the end of every epoch) and the information (epochs, model weights and best accuracy achieved) we want to save:
350 | 
351 | ```python
352 | ...
353 | 
354 | # Training the Model
355 | for epoch in range(num_epochs):
356 |     train(...)  # Train
357 |     acc = eval(...)  # Evaluate after every epoch
358 | 
359 |     # Some stuff with acc(accuracy)
360 |     ...
361 | 
362 |     # Get bool not ByteTensor
363 |     is_best = bool(acc.numpy() > best_accuracy.numpy())
364 |     # Get greater Tensor to keep track best acc
365 |     best_accuracy = torch.FloatTensor(max(acc.numpy(), best_accuracy.numpy()))
366 |     # Save checkpoint if is a new best
367 |     save_checkpoint({
368 |         'epoch': start_epoch + epoch + 1,
369 |         'state_dict': model.state_dict(),
370 |         'best_accuracy': best_accuracy
371 |     }, is_best)
372 | ```
373 | 
374 | That's it about saving a checkpoint in PyTorch.
375 | 
376 | ### Resuming
377 | To resume a checkpoint, before the training we have to load the weights and the meta information we need:
378 | 
379 | ```python
380 | # cuda = torch.cuda.is_available()
381 | if cuda:
382 |     checkpoint = torch.load(resume_weights)
383 | else:
384 |     # Load GPU model on CPU
385 |     checkpoint = torch.load(resume_weights,
386 |                             map_location=lambda storage,
387 |                             loc: storage)
388 | start_epoch = checkpoint['epoch']
389 | best_accuracy = checkpoint['best_accuracy']
390 | model.load_state_dict(checkpoint['state_dict'])
391 | print("=> loaded checkpoint '{}' (trained for {} epochs)".format(resume_weights, checkpoint['epoch']))
392 | ```
393 | 
394 | For more info about loading GPU trained weights on CPU, see this [PyTorch discussion](https://discuss.pytorch.org/t/loading-weights-for-cpu-model-while-trained-on-gpu/1032).
395 | 
396 | That's it about resuming a checkpoint in PyTorch.
397 | 
398 | ### Run on FloydHub
399 | Here are the steps to run the example on FloydHub.
400 | 
401 | #### Via script
402 | 
403 | First time training:
404 | 
405 | ```bash
406 | floyd run \
407 |     --gpu \
408 |     --env pytorch-0.2 \
409 |     --data redeipirati/datasets/pytorch-mnist/1:input \
410 |     'python pytorch_mnist_cnn.py'
411 | ```
412 | 
413 | - The `--env` flag specifies the environment that this project should run on, which is a PyTorch 0.2.0 on Python 3.
414 | - The `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory
415 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
416 | 
417 | 
418 | Resuming:
419 | 
420 | ```bash
421 | floyd run \
422 |     --gpu \
423 |     --env pytorch-0.2 \
424 |     --data redeipirati/datasets/pytorch-mnist/1:input \
425 |     --data <your-username>/projects/save-and-resume/<jobs>/output:/model \
426 |     'python pytorch_mnist_cnn.py'
427 | ```
428 | 
429 | - The `--env` flag specifies the environment that this project should run on, which is a PyTorch 0.2.0 on Python 3.
430 | - The first `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory
431 | - The second `--data` flag specifies that the output of a previous Job should be available at the `/model` directory
432 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
433 | 
434 | #### Via Jupyter
435 | 
436 | ```bash
437 | floyd run \
438 |     --gpu \
439 |     --env pytorch-0.2 \
440 |     --data redeipirati/datasets/pytorch-mnist/1:input \
441 |     --mode jupyter
442 | ```
443 | 
444 | - The `--env` flag specifies the environment that this project should run on, which is a PyTorch 0.2.0 on Python 3.
445 | - The `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory
446 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine.
447 | - The `--mode` flag specifies that this job should provide us a Jupyter notebook.
448 | 
449 | Add `--data <your-username>/projects/save-and-resume/<jobs>/output:/model`, if you want to load a checkpoint from a previous Job.
450 | 
451 | Have a great training :)
452 | 
453 | ## Contributing
454 | 
455 | For any questions, bugs (even typos) and/or feature requests, do not hesitate to contact me, open an issue or a PR!
456 | 
457 | 
458 | 


--------------------------------------------------------------------------------
/pytorch_mnist_cnn_jupyter.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Save and Resume a PyTorch MNIST ConvNet Model\n",
  8 |     "\n",
  9 |     "This Jupyter notebook shows you how to save and resume a PyTorch Model. In this example we will use the Deep Learning hello-world: the MNIST classification task.\n",
 10 |     "\n",
 11 |     "Note: to run code cell you have to press **`Shift + Enter`**."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "### Import Packages\n",
 19 |     "\n",
 20 |     "First we need a single point with all the dependencies:"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import torch\n",
 32 |     "import torchvision.datasets as dsets\n",
 33 |     "import torch.nn as nn\n",
 34 |     "import torch.nn.functional as F\n",
 35 |     "import torchvision.transforms as transforms\n",
 36 |     "from torch.autograd import Variable\n",
 37 |     "from torchvision.utils import make_grid\n",
 38 |     "import shutil\n",
 39 |     "import os.path\n",
 40 |     "import time\n",
 41 |     "import numpy as np"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "### Hyper Parameters and Variables\n",
 49 |     "\n",
 50 |     "Even for Hyper-Parameters and Variables it is good practice to have a single point: it improves code readability and experiment iteration."
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 14,
 56 |    "metadata": {
 57 |     "collapsed": true
 58 |    },
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# Hyperparameter\n",
 62 |     "batch_size = 128\n",
 63 |     "input_size = 784  # 28 * 28\n",
 64 |     "hidden_size = 500\n",
 65 |     "num_classes = 10\n",
 66 |     "learning_rate = 1e-3\n",
 67 |     "num_epochs = 12\n",
 68 |     "print_every = 100\n",
 69 |     "best_accuracy = torch.FloatTensor([0])\n",
 70 |     "start_epoch = 0\n",
 71 |     "\n",
 72 |     "# Path to saved model checkpoint (.pth.tar)\n",
 73 |     "resume_weights = \"/model/checkpoint.pth.tar\"\n",
 74 |     "\n",
 75 |     "# CUDA?\n",
 76 |     "cuda = torch.cuda.is_available()\n",
 77 |     "\n",
 78 |     "# Seed for reproducibility\n",
 79 |     "torch.manual_seed(1)\n",
 80 |     "if cuda:\n",
 81 |     "    torch.cuda.manual_seed(1)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "markdown",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "## Utility function\n",
 89 |     "\n",
 90 |     "In this cell we have the training and evaluation functions:"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 5,
 96 |    "metadata": {
 97 |     "collapsed": true
 98 |    },
 99 |    "outputs": [],
100 |    "source": [
101 |     "def train(model, optimizer, train_loader, test_loader, loss_fn):\n",
102 |     "    \"\"\"Perform a full training over dataset\"\"\"\n",
103 |     "    average_time = 0\n",
104 |     "    # Model train mode\n",
105 |     "    model.train()\n",
106 |     "    for i, (images, labels) in enumerate(train_loader):\n",
107 |     "        # measure data loading time\n",
108 |     "        batch_time = time.time()\n",
109 |     "        images = Variable(images)\n",
110 |     "        labels = Variable(labels)\n",
111 |     "\n",
112 |     "        if cuda:\n",
113 |     "            images, labels = images.cuda(), labels.cuda()\n",
114 |     "\n",
115 |     "        # Forward + Backward + Optimize\n",
116 |     "        optimizer.zero_grad()\n",
117 |     "        outputs = model(images)\n",
118 |     "        loss = loss_fn(outputs, labels)\n",
119 |     "\n",
120 |     "        # Load loss on CPU\n",
121 |     "        if cuda:\n",
122 |     "            loss.cpu()\n",
123 |     "\n",
124 |     "        loss.backward()\n",
125 |     "        optimizer.step()\n",
126 |     "\n",
127 |     "        # Measure elapsed time\n",
128 |     "        batch_time = time.time() - batch_time\n",
129 |     "        # Accumulate over batch\n",
130 |     "        average_time += batch_time\n",
131 |     "\n",
132 |     "        # ### Keep track of metric every batch\n",
133 |     "        # Accuracy Metric\n",
134 |     "        prediction = outputs.data.max(1)[1]   # first column has actual prob.\n",
135 |     "        accuracy = prediction.eq(labels.data).sum() / batch_size * 100\n",
136 |     "\n",
137 |     "        # Log\n",
138 |     "        if (i + 1) % print_every == 0:\n",
139 |     "            print ('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f, Accuracy: %.4f, Batch time: %f'\n",
140 |     "                % (epoch + 1,\n",
141 |     "                    num_epochs,\n",
142 |     "                    i + 1,\n",
143 |     "                    len(train_dataset) // batch_size,\n",
144 |     "                    loss.data[0],\n",
145 |     "                    accuracy,\n",
146 |     "                    average_time/print_every))  # Average\n",
147 |     "\n",
148 |     "\n",
149 |     "def eval(model, optimizer, test_loader):\n",
150 |     "    \"\"\"Eval over test set\"\"\"\n",
151 |     "    model.eval()\n",
152 |     "    correct = 0\n",
153 |     "    # Get Batch\n",
154 |     "    for data, target in test_loader:\n",
155 |     "        data, target = Variable(data, volatile=True), Variable(target)\n",
156 |     "        if cuda:\n",
157 |     "            data, target = data.cuda(), target.cuda()\n",
158 |     "        # Evaluate\n",
159 |     "        output = model(data)\n",
160 |     "        # Load output on CPU\n",
161 |     "        if cuda:\n",
162 |     "            output.cpu()\n",
163 |     "        # Compute Accuracy\n",
164 |     "        prediction = output.data.max(1)[1]\n",
165 |     "        correct += prediction.eq(target.data).sum()\n",
166 |     "    return correct"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "### Data Processing and Transformation\n",
174 |     "\n",
175 |     "Next, we convert the dataset samples into tensors, ready to be fed into the model."
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 6,
181 |    "metadata": {},
182 |    "outputs": [
183 |     {
184 |      "name": "stdout",
185 |      "output_type": "stream",
186 |      "text": [
187 |       "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n",
188 |       "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n",
189 |       "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n",
190 |       "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n",
191 |       "Processing...\n",
192 |       "Done!\n",
193 |       "Training Data Size:  torch.Size([60000, 28, 28]) - torch.Size([60000])\n",
194 |       "Testing Data Size:  torch.Size([10000, 28, 28]) - torch.Size([10000])\n"
195 |      ]
196 |     }
197 |    ],
198 |    "source": [
199 |     "# MNIST Dataset (Images and Labels)\n",
200 |     "# If you have not mounted the dataset, you can download it\n",
201 |     "# just adding download=True as parameter\n",
202 |     "train_dataset = dsets.MNIST(root='/input',\n",
203 |     "                        train=True,\n",
204 |     "                        download=True,\n",
205 |     "                        transform=transforms.ToTensor())\n",
206 |     "x_train_mnist, y_train_mnist = train_dataset.train_data.type(torch.FloatTensor), \\\n",
207 |     "                        train_dataset.train_labels\n",
208 |     "test_dataset = dsets.MNIST(root='/input',\n",
209 |     "                        train=False,\n",
210 |     "                        download=True,\n",
211 |     "                        transform=transforms.ToTensor())\n",
212 |     "x_test_mnist, y_test_mnist = test_dataset.test_data.type(torch.FloatTensor), \\\n",
213 |     "                        test_dataset.test_labels\n",
214 |     "\n",
215 |     "# Dataset info\n",
216 |     "print('Training Data Size: ', x_train_mnist.size(), '-', y_train_mnist.size())\n",
217 |     "print('Testing Data Size: ', x_test_mnist.size(), '-', y_test_mnist.size())\n",
218 |     "\n",
219 |     "# Training Dataset Loader (Input Pipline)\n",
220 |     "train_loader = torch.utils.data.DataLoader(dataset=train_dataset,\n",
221 |     "                                        batch_size=batch_size,\n",
222 |     "                                        shuffle=True)\n",
223 |     "# Testing Dataset Loader (Input Pipline)\n",
224 |     "test_loader = torch.utils.data.DataLoader(dataset=test_dataset,\n",
225 |     "                                        batch_size=batch_size,\n",
226 |     "                                        shuffle=False)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {},
232 |    "source": [
233 |     "### Define the Model\n",
234 |     "\n",
235 |     "A ConvNet Model, state of the art for image classification tasks."
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 8,
241 |    "metadata": {},
242 |    "outputs": [
243 |     {
244 |      "name": "stdout",
245 |      "output_type": "stream",
246 |      "text": [
247 |       "CNN (\n",
248 |       "  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))\n",
249 |       "  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))\n",
250 |       "  (drop1): Dropout2d (p=0.25)\n",
251 |       "  (fc1): Linear (9216 -> 128)\n",
252 |       "  (drop2): Dropout2d (p=0.5)\n",
253 |       "  (fc2): Linear (128 -> 10)\n",
254 |       ")\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "# #### Model ####\n",
260 |     "# Convolutional Neural Network Model\n",
261 |     "class CNN(nn.Module):\n",
262 |     "    \"\"\"Conv[ReLU] -> Conv[ReLU] -> MaxPool -> Dropout(0.25)-\n",
263 |     "    -> Flatten -> FC()[ReLU] -> Dropout(0.5) -> FC()[Softmax]\n",
264 |     "    \"\"\"\n",
265 |     "    def __init__(self, num_classes):\n",
266 |     "        super(CNN, self).__init__()\n",
267 |     "        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)\n",
268 |     "        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)\n",
269 |     "        self.drop1 = nn.Dropout2d(p=0.25)\n",
270 |     "        self.fc1 = nn.Linear(9216, 128)\n",
271 |     "        self.drop2 = nn.Dropout2d(p=0.5)\n",
272 |     "        self.fc2 = nn.Linear(128, num_classes)\n",
273 |     "\n",
274 |     "    def forward(self, x):\n",
275 |     "        x = F.relu(self.conv1(x))\n",
276 |     "        x = F.max_pool2d(F.relu(self.conv2(x)), 2)\n",
277 |     "        x = self.drop1(x)\n",
278 |     "        x = x.view(-1, 9216)\n",
279 |     "        x = F.relu(self.fc1(x))\n",
280 |     "        x = self.drop2(x)\n",
281 |     "        x = self.fc2(x)\n",
282 |     "        return F.log_softmax(x)\n",
283 |     "\n",
284 |     "model = CNN(num_classes)\n",
285 |     "print(model)\n",
286 |     "\n",
287 |     "# If you are running a GPU instance, load the model on GPU\n",
288 |     "if cuda:\n",
289 |     "    model.cuda()"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "markdown",
294 |    "metadata": {},
295 |    "source": [
296 |     "### Resume a checkpoint\n",
297 |     "\n",
298 |     "Run the following line if you want to resume an existing checkpoint."
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": 15,
304 |    "metadata": {},
305 |    "outputs": [
306 |     {
307 |      "name": "stdout",
308 |      "output_type": "stream",
309 |      "text": [
310 |       "=> loading checkpoint './checkpoint.pth.tar' ...\n",
311 |       "=> loaded checkpoint './checkpoint.pth.tar' (trained for 10 epochs)\n"
312 |      ]
313 |     }
314 |    ],
315 |    "source": [
316 |     "# If exists a best model, load its weights!\n",
317 |     "if os.path.isfile(resume_weights):\n",
318 |     "    print(\"=> loading checkpoint '{}' ...\".format(resume_weights))\n",
319 |     "    if cuda:\n",
320 |     "        checkpoint = torch.load(resume_weights)\n",
321 |     "    else:\n",
322 |     "        # Load GPU model on CPU\n",
323 |     "        checkpoint = torch.load(resume_weights,\n",
324 |     "                                map_location=lambda storage,\n",
325 |     "                                loc: storage)\n",
326 |     "    start_epoch = checkpoint['epoch']\n",
327 |     "    best_accuracy = checkpoint['best_accuracy']\n",
328 |     "    model.load_state_dict(checkpoint['state_dict'])\n",
329 |     "    print(\"=> loaded checkpoint '{}' (trained for {} epochs)\".format(resume_weights,\n",
330 |     "        checkpoint['epoch']))"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "markdown",
335 |    "metadata": {},
336 |    "source": [
337 |     "### Define The Loss Function and The Optimizers\n",
338 |     "\n",
339 |     "In this example we use the Cross Entropy Loss and Adam Optimizer."
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": 10,
345 |    "metadata": {
346 |     "collapsed": true
347 |    },
348 |    "outputs": [],
349 |    "source": [
350 |     "# #### Loss and Optimizer ####\n",
351 |     "# Softmax is internally computed.\n",
352 |     "loss_fn = nn.CrossEntropyLoss()\n",
353 |     "# If you are running a GPU instance, compute the loss on GPU\n",
354 |     "if cuda:\n",
355 |     "    loss_fn.cuda()\n",
356 |     "\n",
357 |     "# Set parameters to be updated.\n",
358 |     "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {},
364 |    "source": [
365 |     "### Checkpoint Strategy\n",
366 |     "\n",
367 |     "The strategy we have adopted for this example is the following:\n",
368 |     "- Keep only one checkpoint\n",
369 |     "- Trigger the strategy at the end of every epoch\n",
370 |     "- Save the one with the best(max) validation accuracy "
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 11,
376 |    "metadata": {
377 |     "collapsed": true
378 |    },
379 |    "outputs": [],
380 |    "source": [
381 |     "# Keep only a single checkpoint, the best over test accuracy.\n",
382 |     "def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):\n",
383 |     "    \"\"\"Save checkpoint if a new best is achieved\"\"\"\n",
384 |     "    if is_best:\n",
385 |     "        print (\"=> Saving a new best\")\n",
386 |     "        torch.save(state, filename)  # save checkpoint\n",
387 |     "    else:\n",
388 |     "        print (\"=> Validation Accuracy did not improve\")"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "markdown",
393 |    "metadata": {},
394 |    "source": [
395 |     "### Training\n",
396 |     "\n",
397 |     "Let's train the model and see our checkpoint strategy in action."
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {},
404 |    "outputs": [
405 |     {
406 |      "name": "stdout",
407 |      "output_type": "stream",
408 |      "text": [
409 |       "Epoch: [1/12], Step: [100/468], Loss: 0.0304, Accuracy: 98.4375, Batch time: 0.005967\n",
410 |       "Epoch: [1/12], Step: [200/468], Loss: 0.0331, Accuracy: 99.2188, Batch time: 0.011944\n",
411 |       "Epoch: [1/12], Step: [300/468], Loss: 0.0181, Accuracy: 99.2188, Batch time: 0.017946\n",
412 |       "Epoch: [1/12], Step: [400/468], Loss: 0.0079, Accuracy: 100.0000, Batch time: 0.023949\n",
413 |       "=> Test set: Accuracy: 99.15%\n",
414 |       "=> Validation Accuracy did not improve\n",
415 |       "Epoch: [2/12], Step: [100/468], Loss: 0.0141, Accuracy: 99.2188, Batch time: 0.005985\n",
416 |       "Epoch: [2/12], Step: [200/468], Loss: 0.0186, Accuracy: 99.2188, Batch time: 0.011918\n",
417 |       "Epoch: [2/12], Step: [300/468], Loss: 0.0136, Accuracy: 100.0000, Batch time: 0.017855\n",
418 |       "Epoch: [2/12], Step: [400/468], Loss: 0.0307, Accuracy: 99.2188, Batch time: 0.023823\n",
419 |       "=> Test set: Accuracy: 99.27%\n",
420 |       "=> Saving a new best\n",
421 |       "Epoch: [3/12], Step: [100/468], Loss: 0.0545, Accuracy: 98.4375, Batch time: 0.005938\n",
422 |       "Epoch: [3/12], Step: [200/468], Loss: 0.0043, Accuracy: 100.0000, Batch time: 0.011886\n",
423 |       "Epoch: [3/12], Step: [300/468], Loss: 0.0408, Accuracy: 96.8750, Batch time: 0.017852\n",
424 |       "Epoch: [3/12], Step: [400/468], Loss: 0.0161, Accuracy: 99.2188, Batch time: 0.023796\n",
425 |       "=> Test set: Accuracy: 99.23%\n",
426 |       "=> Validation Accuracy did not improve\n",
427 |       "Epoch: [4/12], Step: [100/468], Loss: 0.0357, Accuracy: 98.4375, Batch time: 0.005919\n",
428 |       "Epoch: [4/12], Step: [200/468], Loss: 0.0415, Accuracy: 99.2188, Batch time: 0.011863\n",
429 |       "Epoch: [4/12], Step: [300/468], Loss: 0.0079, Accuracy: 100.0000, Batch time: 0.017821\n",
430 |       "Epoch: [4/12], Step: [400/468], Loss: 0.0173, Accuracy: 99.2188, Batch time: 0.023815\n",
431 |       "=> Test set: Accuracy: 99.24%\n",
432 |       "=> Validation Accuracy did not improve\n",
433 |       "Epoch: [5/12], Step: [100/468], Loss: 0.0064, Accuracy: 100.0000, Batch time: 0.005956\n",
434 |       "Epoch: [5/12], Step: [200/468], Loss: 0.0075, Accuracy: 100.0000, Batch time: 0.011898\n",
435 |       "Epoch: [5/12], Step: [300/468], Loss: 0.0220, Accuracy: 99.2188, Batch time: 0.017835\n",
436 |       "Epoch: [5/12], Step: [400/468], Loss: 0.0158, Accuracy: 99.2188, Batch time: 0.023799\n",
437 |       "=> Test set: Accuracy: 99.23%\n",
438 |       "=> Validation Accuracy did not improve\n",
439 |       "Epoch: [6/12], Step: [100/468], Loss: 0.0175, Accuracy: 100.0000, Batch time: 0.006003\n",
440 |       "Epoch: [6/12], Step: [200/468], Loss: 0.0097, Accuracy: 99.2188, Batch time: 0.011995\n",
441 |       "Epoch: [6/12], Step: [300/468], Loss: 0.0392, Accuracy: 99.2188, Batch time: 0.017989\n",
442 |       "Epoch: [6/12], Step: [400/468], Loss: 0.0161, Accuracy: 99.2188, Batch time: 0.023942\n",
443 |       "=> Test set: Accuracy: 99.28%\n",
444 |       "=> Saving a new best\n",
445 |       "Epoch: [7/12], Step: [100/468], Loss: 0.0579, Accuracy: 98.4375, Batch time: 0.005972\n",
446 |       "Epoch: [7/12], Step: [200/468], Loss: 0.0248, Accuracy: 99.2188, Batch time: 0.011897\n",
447 |       "Epoch: [7/12], Step: [300/468], Loss: 0.0006, Accuracy: 100.0000, Batch time: 0.017830\n",
448 |       "Epoch: [7/12], Step: [400/468], Loss: 0.0103, Accuracy: 100.0000, Batch time: 0.023758\n",
449 |       "=> Test set: Accuracy: 99.25%\n",
450 |       "=> Validation Accuracy did not improve\n",
451 |       "Epoch: [8/12], Step: [100/468], Loss: 0.0637, Accuracy: 98.4375, Batch time: 0.005992\n",
452 |       "Epoch: [8/12], Step: [200/468], Loss: 0.0023, Accuracy: 100.0000, Batch time: 0.011934\n",
453 |       "Epoch: [8/12], Step: [300/468], Loss: 0.0076, Accuracy: 100.0000, Batch time: 0.017924\n",
454 |       "Epoch: [8/12], Step: [400/468], Loss: 0.0016, Accuracy: 100.0000, Batch time: 0.023878\n",
455 |       "=> Test set: Accuracy: 99.26%\n",
456 |       "=> Validation Accuracy did not improve\n",
457 |       "Epoch: [9/12], Step: [100/468], Loss: 0.0120, Accuracy: 100.0000, Batch time: 0.005922\n",
458 |       "Epoch: [9/12], Step: [200/468], Loss: 0.0008, Accuracy: 100.0000, Batch time: 0.011840\n",
459 |       "Epoch: [9/12], Step: [300/468], Loss: 0.0016, Accuracy: 100.0000, Batch time: 0.017767\n",
460 |       "Epoch: [9/12], Step: [400/468], Loss: 0.0299, Accuracy: 99.2188, Batch time: 0.023730\n",
461 |       "=> Test set: Accuracy: 99.29%\n",
462 |       "=> Saving a new best\n",
463 |       "Epoch: [10/12], Step: [100/468], Loss: 0.0009, Accuracy: 100.0000, Batch time: 0.006006\n",
464 |       "Epoch: [10/12], Step: [200/468], Loss: 0.0075, Accuracy: 100.0000, Batch time: 0.012032\n",
465 |       "Epoch: [10/12], Step: [300/468], Loss: 0.0016, Accuracy: 100.0000, Batch time: 0.018024\n",
466 |       "Epoch: [10/12], Step: [400/468], Loss: 0.0007, Accuracy: 100.0000, Batch time: 0.023979\n"
467 |      ]
468 |     }
469 |    ],
470 |    "source": [
471 |     "# Training the Model\n",
472 |     "for epoch in range(num_epochs):\n",
473 |     "    train(model, optimizer, train_loader, test_loader, loss_fn)\n",
474 |     "    acc = eval(model, optimizer, test_loader)\n",
475 |     "    acc = 100. * acc / len(test_loader.dataset)\n",
476 |     "    print('=> Test set: Accuracy: {:.2f}%'.format(acc))\n",
477 |     "    acc = torch.FloatTensor([acc])\n",
478 |     "    # Get bool not ByteTensor\n",
479 |     "    is_best = bool(acc.numpy() > best_accuracy.numpy())\n",
480 |     "    # Get greater Tensor to keep track best acc\n",
481 |     "    best_accuracy = torch.FloatTensor(max(acc.numpy(), best_accuracy.numpy()))\n",
482 |     "    # Save checkpoint if is a new best\n",
483 |     "    save_checkpoint({\n",
484 |     "        'epoch': start_epoch + epoch + 1,\n",
485 |     "        'state_dict': model.state_dict(),\n",
486 |     "        'best_accuracy': best_accuracy\n",
487 |     "    }, is_best)"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "markdown",
492 |    "metadata": {},
493 |    "source": [
494 |     "### Resume the checkpoint after the training\n",
495 |     "\n",
496 |     "Let's take a look at the checkpoint that was just created. (You should see the `checkpoint.pth.tar` file.)"
497 |    ]
498 |   },
499 |   {
500 |    "cell_type": "code",
501 |    "execution_count": 13,
502 |    "metadata": {},
503 |    "outputs": [
504 |     {
505 |      "name": "stdout",
506 |      "output_type": "stream",
507 |      "text": [
508 |       "README.md           keras_mnist_cnn_jupyter.ipynb\r\n",
509 |       "checkpoint.pth.tar  pytorch_mnist_cnn.py\r\n",
510 |       "command.sh          pytorch_mnist_cnn_jupyter.ipynb\r\n",
511 |       "keras_mnist_cnn.py\r\n"
512 |      ]
513 |     }
514 |    ],
515 |    "source": [
516 |     "% ls"
517 |    ]
518 |   },
519 |   {
520 |    "cell_type": "markdown",
521 |    "metadata": {},
522 |    "source": [
523 |     "The Jupyter Notebook runs in the `/output` folder, so the checkpoint file is saved here.\n",
524 |     "If you want to load it, go to the Hyper parameters and Variables Code Cell and replace the resume weights variable in this way:\n",
525 |     "`# Path to saved model weights(as hdf5)\n",
526 |     "resume_weights = \"./checkpoint.pth.tar\"`. Then run that cell, go to the **Resume a checkpoint** Code Cell and run it, and finally rerun the **Training Code Cell**. That's it."
527 |    ]
528 |   }
529 |  ],
530 |  "metadata": {
531 |   "kernelspec": {
532 |    "display_name": "Python 3",
533 |    "language": "python",
534 |    "name": "python3"
535 |   },
536 |   "language_info": {
537 |    "codemirror_mode": {
538 |     "name": "ipython",
539 |     "version": 3
540 |    },
541 |    "file_extension": ".py",
542 |    "mimetype": "text/x-python",
543 |    "name": "python",
544 |    "nbconvert_exporter": "python",
545 |    "pygments_lexer": "ipython3",
546 |    "version": "3.5.3"
547 |   }
548 |  },
549 |  "nbformat": 4,
550 |  "nbformat_minor": 2
551 | }
552 | 


--------------------------------------------------------------------------------