├── .gitignore ├── .floydignore ├── keras_mnist_cnn.py ├── tf_mnist_cnn.py ├── pytorch_mnist_cnn.py ├── tf_mnist_cnn_jupyter.ipynb ├── keras_mnist_cnn_jupyter.ipynb ├── README.md └── pytorch_mnist_cnn_jupyter.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .floydexpt -------------------------------------------------------------------------------- /.floydignore: -------------------------------------------------------------------------------- 1 | 2 | # Directories and files to ignore when uploading code to floyd 3 | 4 | .git 5 | .eggs 6 | eggs 7 | lib 8 | lib64 9 | parts 10 | sdist 11 | var 12 | *.pyc 13 | *.swp 14 | .DS_Store 15 | -------------------------------------------------------------------------------- /keras_mnist_cnn.py: -------------------------------------------------------------------------------- 1 | """Convolutional Neural Network for MNIST, built with Keras. 2 | 3 | Adapted from 4 | https://github.com/minimaxir/deep-learning-cpu-gpu-benchmark/blob/master/test_files/mnist_cnn.py 5 | 6 | MIT License 7 | 8 | Copyright (c) 2017 Max Woolf 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | """ 28 | 29 | from __future__ import print_function 30 | import keras 31 | import os.path 32 | from keras.datasets import mnist 33 | from keras.models import Sequential 34 | from keras.layers import Dense, Dropout, Flatten 35 | from keras.layers import Conv2D, MaxPooling2D 36 | from keras import backend as K 37 | from keras.callbacks import ModelCheckpoint 38 | 39 | # Path to saved model weights(as hdf5) 40 | resume_weights = "/model/mnist-cnn-best.hdf5" 41 | 42 | # Hyper-parameters 43 | batch_size = 128 44 | num_classes = 10 45 | epochs = 12 46 | 47 | # input image dimensions 48 | img_rows, img_cols = 28, 28 49 | 50 | # MNIST handwritten image classification 51 | # the data, shuffled and split between train and test sets 52 | (x_train, y_train), (x_test, y_test) = mnist.load_data() 53 | 54 | # Reshape strategy according to backend 55 | if K.image_data_format() == 'channels_first': 56 | x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) 57 | x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) 58 | # 1 x 28 x 28 [number_of_channels (colors) x height x weight] 59 | input_shape = (1, img_rows, img_cols) 60 | else: 61 | x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) 62 | x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) 63 | # 28 x 28 x 1 [height x weight x number_of_channels (colors)] 64 | input_shape = (img_rows, img_cols, 1) 65 | 66 | # Reshape, type, normalized, print 67 | x_train = x_train.astype('float32') 68 | x_test = x_test.astype('float32') 69 | x_train /= 255 70 | x_test /= 255 71 | 72 | # Dataset info 73 | print('x_train shape:', x_train.shape) 74 | print(x_train.shape[0], 'train samples') 75 | print(x_test.shape[0], 
'test samples') 76 | 77 | # convert class vectors to binary class matrices 78 | y_train = keras.utils.to_categorical(y_train, num_classes) 79 | y_test = keras.utils.to_categorical(y_test, num_classes) 80 | 81 | # MODEL 82 | # Conv(32,3,3)[ReLU] -> Conv(64,3,3)[ReLU] -> MaxPool(2,2)[Dropout 0.25] -> 83 | # FC(_, 128)[ReLU][Dropout 0.5] -> FC(128, 10)[Softmax] 84 | model = Sequential() 85 | model.add(Conv2D(32, kernel_size=(3, 3), 86 | activation='relu', 87 | input_shape=input_shape)) 88 | model.add(Conv2D(64, (3, 3), activation='relu')) 89 | model.add(MaxPooling2D(pool_size=(2, 2))) 90 | model.add(Dropout(0.25)) 91 | model.add(Flatten()) 92 | model.add(Dense(128, activation='relu')) 93 | model.add(Dropout(0.5)) 94 | model.add(Dense(num_classes, activation='softmax')) 95 | 96 | model.summary() 97 | 98 | # If exists a best model, load its weights! 99 | if os.path.isfile(resume_weights): 100 | print ("Resumed model's weights from {}".format(resume_weights)) 101 | # load weights 102 | model.load_weights(resume_weights) 103 | 104 | # CEE, Adam 105 | model.compile(loss=keras.losses.categorical_crossentropy, 106 | optimizer=keras.optimizers.Adam(), 107 | metrics=['accuracy']) 108 | 109 | # Checkpoint In the /output folder 110 | filepath = "/output/mnist-cnn-best.hdf5" 111 | 112 | # Keep only a single checkpoint, the best over test accuracy. 
113 | checkpoint = ModelCheckpoint(filepath, 114 | monitor='val_acc', 115 | verbose=1, 116 | save_best_only=True, 117 | mode='max') 118 | 119 | # Train 120 | model.fit(x_train, y_train, 121 | batch_size=batch_size, 122 | epochs=epochs, 123 | verbose=1, 124 | validation_data=(x_test, y_test), 125 | callbacks=[checkpoint]) 126 | 127 | # Eval 128 | score = model.evaluate(x_test, y_test, verbose=0) 129 | print('Test loss:', score[0]) 130 | print('Test accuracy:', score[1]) 131 | -------------------------------------------------------------------------------- /tf_mnist_cnn.py: -------------------------------------------------------------------------------- 1 | """Convolutional Neural Network Estimator for MNIST, built with tf.layers. 2 | 3 | Adapted from: 4 | https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/examples/tutorials/layers/cnn_mnist.py 5 | 6 | Copyright 2016 The TensorFlow Authors. All Rights Reserved. 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 
19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets 26 | 27 | import numpy as np 28 | import tensorflow as tf 29 | import shutil 30 | import os 31 | 32 | tf.logging.set_verbosity(tf.logging.INFO) 33 | 34 | 35 | # Where to save Checkpoint(In the /output folder) 36 | resumepath = "/model/mnist_convnet_model" 37 | filepath = "/output/mnist_convnet_model" 38 | 39 | # Hyper-parameters 40 | batch_size = 128 41 | num_classes = 10 42 | num_epochs = 12 43 | learning_rate = 1e-3 44 | 45 | # If exists an checkpoint model, move it into the /output folder 46 | if os.path.exists(resumepath): 47 | shutil.copytree(resumepath, filepath) 48 | 49 | # Load training and eval data 50 | mnist = read_data_sets(train_dir='/input/MNIST_data', validation_size=0) 51 | train_data = mnist.train.images # Returns np.array 52 | train_labels = np.asarray(mnist.train.labels, dtype=np.int32) 53 | eval_data = mnist.test.images # Returns np.array 54 | eval_labels = np.asarray(mnist.test.labels, dtype=np.int32) 55 | 56 | print (train_data.shape) 57 | print (eval_data.shape) 58 | 59 | def cnn_model_fn(features, labels, mode): 60 | """Model function for CNN.""" 61 | # Input Layer 62 | # Reshape X to 4-D tensor: [batch_size, width, height, channels] 63 | # MNIST images are 28x28 pixels, and have one color channel 64 | input_layer = tf.reshape(features["x"], [-1, 28, 28, 1]) 65 | 66 | # Convolutional Layer #1 67 | # Computes 32 features using a 3x3 filter with ReLU activation. 68 | # Input Tensor Shape: [batch_size, 28, 28, 1] 69 | # Output Tensor Shape: [batch_size, 26, 26, 32] 70 | conv1 = tf.layers.conv2d( 71 | inputs=input_layer, 72 | filters=32, 73 | kernel_size=[3, 3], 74 | activation=tf.nn.relu) 75 | 76 | # Convolutional Layer #2 77 | # Computes 64 features using a 3x3 filter. 
78 | # Input Tensor Shape: [batch_size, 26, 26, 32] 79 | # Output Tensor Shape: [batch_size, 24, 24, 64] 80 | conv2 = tf.layers.conv2d( 81 | inputs=conv1, 82 | filters=64, 83 | kernel_size=[3, 3], 84 | activation=tf.nn.relu) 85 | 86 | # Pooling Layer 87 | # Max pooling layer with a 2x2 filter and stride of 2 88 | # Input Tensor Shape: [batch_size, 24, 24, 64] 89 | # Output Tensor Shape: [batch_size, 12, 12, 64] 90 | pool = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2) 91 | 92 | # Dropout # 1 93 | # Add dropout operation; 0.25 probability that element will be dropped 94 | dropout = tf.layers.dropout( 95 | inputs=pool, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN) 96 | 97 | # Flatten tensor into a batch of vectors 98 | # Input Tensor Shape: [batch_size, 12, 12, 64] 99 | # Output Tensor Shape: [batch_size, 12 * 12 * 64] 100 | flat = tf.reshape(dropout, [-1, 12 * 12 * 64]) # 9216 101 | 102 | 103 | # Dense Layer # 1 104 | # Densely connected layer with 128 neurons 105 | # Input Tensor Shape: [batch_size, 12 * 12 * 64] (batch_size, 9216) 106 | # Output Tensor Shape: [batch_size, 128] 107 | dense1 = tf.layers.dense(inputs=flat, units=128, activation=tf.nn.relu) 108 | 109 | # Dropout # 2 110 | # Add dropout operation; 0.5 probability that element will be dropped 111 | dropout2 = tf.layers.dropout( 112 | inputs=dense1, rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN) 113 | 114 | # Logits layer 115 | # Input Tensor Shape: [batch_size, 128] 116 | # Output Tensor Shape: [batch_size, 10] 117 | logits = tf.layers.dense(inputs=dropout2, units=num_classes) 118 | 119 | predictions = { 120 | # Generate predictions (for PREDICT and EVAL mode) 121 | "classes": tf.argmax(input=logits, axis=1), 122 | # Add `softmax_tensor` to the graph. It is used for PREDICT and by the 123 | # `logging_hook`.
124 | "probabilities": tf.nn.softmax(logits, name="softmax_tensor") 125 | } 126 | # Inference (for TEST mode) 127 | if mode == tf.estimator.ModeKeys.PREDICT: 128 | return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions) 129 | 130 | # Calculate Loss (for both TRAIN and EVAL modes) 131 | onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_classes) 132 | # Cross Entropy 133 | loss = tf.losses.softmax_cross_entropy( 134 | onehot_labels=onehot_labels, logits=logits) 135 | 136 | # Configure the Training Op (for TRAIN mode) 137 | if mode == tf.estimator.ModeKeys.TRAIN: 138 | # AdamOptimizer 139 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 140 | train_op = optimizer.minimize( 141 | loss=loss, 142 | global_step=tf.train.get_global_step()) 143 | return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op) 144 | 145 | # Add evaluation metrics (for EVAL mode) 146 | eval_metric_ops = { 147 | "accuracy": tf.metrics.accuracy( 148 | labels=labels, predictions=predictions["classes"])} 149 | return tf.estimator.EstimatorSpec( 150 | mode=mode, loss=loss, eval_metric_ops=eval_metric_ops) 151 | 152 | # Checkpoint Strategy configuration 153 | run_config = tf.contrib.learn.RunConfig( 154 | model_dir=filepath, 155 | keep_checkpoint_max=1) 156 | 157 | # Create the Estimator 158 | mnist_classifier = tf.estimator.Estimator( 159 | model_fn=cnn_model_fn, config=run_config) 160 | 161 | # Keep track of the best accuracy 162 | best_acc = 0 163 | 164 | # Training for num_epochs 165 | for i in range(num_epochs): 166 | print("Begin Training - Epoch {}/{}".format(i+1, num_epochs)) 167 | # Train the model for 1 epoch 168 | train_input_fn = tf.estimator.inputs.numpy_input_fn( 169 | x={"x": train_data}, 170 | y=train_labels, 171 | batch_size=batch_size, 172 | num_epochs=1, 173 | shuffle=True) 174 | 175 | mnist_classifier.train( 176 | input_fn=train_input_fn) 177 | 178 | # Evaluate the model and print results 179 | eval_input_fn = 
tf.estimator.inputs.numpy_input_fn( 180 | x={"x": eval_data}, 181 | y=eval_labels, 182 | num_epochs=1, 183 | shuffle=False) 184 | 185 | eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn) 186 | 187 | accuracy = eval_results["accuracy"] * 100 188 | # Set the best acc if we have a new best or if it is the first step 189 | if accuracy > best_acc or i == 0: 190 | best_acc = accuracy 191 | print ("=> New Best Accuracy {}".format(accuracy)) 192 | else: 193 | print("=> Validation Accuracy did not improve") 194 | -------------------------------------------------------------------------------- /pytorch_mnist_cnn.py: -------------------------------------------------------------------------------- 1 | """Convolutional Neural Network for MNIST, built with PyTorch. 2 | 3 | Adapted from: 4 | https://github.com/pytorch/examples/blob/master/mnist/main.py 5 | https://github.com/pytorch/examples/blob/master/imagenet/main.py 6 | 7 | BSD 3-Clause License 8 | 9 | Copyright (c) 2017, 10 | All rights reserved. 11 | 12 | Redistribution and use in source and binary forms, with or without 13 | modification, are permitted provided that the following conditions are met: 14 | 15 | * Redistributions of source code must retain the above copyright notice, this 16 | list of conditions and the following disclaimer. 17 | 18 | * Redistributions in binary form must reproduce the above copyright notice, 19 | this list of conditions and the following disclaimer in the documentation 20 | and/or other materials provided with the distribution. 21 | 22 | * Neither the name of the copyright holder nor the names of its 23 | contributors may be used to endorse or promote products derived from 24 | this software without specific prior written permission. 
25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 29 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 30 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 32 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 33 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 34 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | """ 37 | 38 | import torch 39 | import torchvision.datasets as dsets 40 | import torch.nn as nn 41 | import torch.nn.functional as F 42 | import torchvision.transforms as transforms 43 | from torch.autograd import Variable 44 | from torchvision.utils import make_grid 45 | import shutil 46 | import os.path 47 | import time 48 | import numpy as np 49 | 50 | # Hyperparameter 51 | batch_size = 128 52 | input_size = 784 # 28 * 28 53 | hidden_size = 500 54 | num_classes = 10 55 | learning_rate = 1e-3 56 | num_epochs = 12 57 | print_every = 100 58 | best_accuracy = torch.FloatTensor([0]) 59 | start_epoch = 0 60 | 61 | # Path to saved model weights(as hdf5) 62 | resume_weights = "/model/checkpoint.pth.tar" 63 | 64 | # CUDA? 
cuda = torch.cuda.is_available()

# Seed the CPU (and, when available, the GPU) generator for reproducibility
torch.manual_seed(1)
if cuda:
    torch.cuda.manual_seed(1)


def train(model, optimizer, train_loader, test_loader, loss_fn):
    """Train ``model`` for one pass over ``train_loader``, logging progress.

    Relies on the module-level names ``cuda``, ``print_every``,
    ``num_epochs`` and ``epoch``.  ``epoch`` and ``num_epochs`` are only read
    when a log line is printed, so the caller's epoch loop must have bound
    them by then.

    Args:
        model: Network to optimise; it is switched to training mode.
        optimizer: Optimiser whose ``zero_grad``/``step`` run once per batch.
        train_loader: Sized iterable yielding ``(images, labels)`` batches.
        test_loader: Unused; kept so existing call sites keep working.
        loss_fn: Callable mapping ``(outputs, labels)`` to the loss.
    """
    window_time = 0.0
    model.train()
    for i, (images, labels) in enumerate(train_loader):
        # Time only the compute part of the step; fetching the batch from
        # the loader has already happened when the timer starts.
        step_start = time.time()
        images = Variable(images)
        labels = Variable(labels)

        if cuda:
            images, labels = images.cuda(), labels.cuda()

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_fn(outputs, labels)
        # The former `loss.cpu()` call discarded its result (`.cpu()` is not
        # in-place), so it has been dropped.
        loss.backward()
        optimizer.step()

        window_time += time.time() - step_start

        if (i + 1) % print_every != 0:
            continue

        # Accuracy of the batch that triggered the log line.  Divide by the
        # real batch length (the last batch may be shorter than
        # `batch_size`) and go through float() so the ratio is never
        # truncated by integer division.
        prediction = outputs.data.max(1)[1]  # index of the highest score
        n_correct = float(prediction.eq(labels.data).sum())
        accuracy = 100.0 * n_correct / labels.size(0)

        # float(x.sum()) yields a Python number whether `.sum()` returns a
        # plain number or a 0-dim tensor; indexing `loss.data[0]` is rejected
        # for 0-dim tensors (see the PyTorch 0.4 migration guide).
        loss_value = float(loss.data.sum())

        print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f, Accuracy: %.4f, Batch time: %f'
              % (epoch + 1,
                 num_epochs,
                 i + 1,
                 len(train_loader),  # counts the final partial batch too
                 loss_value,
                 accuracy,
                 window_time / print_every))  # mean over the last window
        # Start a fresh window; without this reset every later report would
        # divide the running total by `print_every` and keep growing.
        window_time = 0.0


def eval(model, optimizer, test_loader):
    """Count how many samples of ``test_loader`` the model classifies right.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        optimizer: Unused; kept so existing call sites keep working.
        test_loader: Iterable yielding ``(data, target)`` batches.

    Returns:
        int: Number of samples whose highest-scoring class equals the target.
    """
    model.eval()
    correct = 0
    for data, target in test_loader:
        # NOTE(review): `volatile=True` is the pre-0.4 way to skip autograd
        # bookkeeping; newer PyTorch reportedly ignores it with a warning --
        # confirm against the targeted version.
        data, target = Variable(data, volatile=True), Variable(target)
        if cuda:
            data, target = data.cuda(), target.cuda()
        output = model(data)
        # The former `output.cpu()` call discarded its result, so it has been
        # dropped; prediction and target stay on the same device.
        prediction = output.data.max(1)[1]
        # int() keeps the running total a plain Python integer regardless of
        # what `.sum()` returns, so callers can do ordinary arithmetic on it.
        correct += int(prediction.eq(target.data).sum())
    return correct


def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` only when ``is_best`` is true.

    Args:
        state: Object handed to ``torch.save``.
        is_best: Whether the current evaluation beat the previous best.
        filename: Destination path of the checkpoint file.
    """
    if not is_best:
        print("=> Validation Accuracy did not improve")
        return
    print("=> Saving a new best")
    torch.save(state, filename)  # save checkpoint


# MNIST Dataset (Images and Labels)
# `download=True` is passed below, so the data is expected to be fetched into
# /input when it has not been mounted there already.
train_dataset = dsets.MNIST(root='/input',
                            train=True,
                            download=True,
                            transform=transforms.ToTensor())
x_train_mnist, y_train_mnist = train_dataset.train_data.type(torch.FloatTensor), \
    train_dataset.train_labels
test_dataset = dsets.MNIST(root='/input',
                           train=False,
                           download=True,
                           transform=transforms.ToTensor())
x_test_mnist, y_test_mnist = test_dataset.test_data.type(torch.FloatTensor), \
    test_dataset.test_labels

# Dataset info
print('Training Data Size: ', x_train_mnist.size(), '-', y_train_mnist.size())
print('Testing Data Size: ', x_test_mnist.size(), '-', y_test_mnist.size())

# Training Dataset Loader (Input Pipeline)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
# Testing Dataset Loader (Input Pipeline)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)


# #### Model ####
# Convolutional Neural Network Model
class CNN(nn.Module):
    """Conv[ReLU] -> Conv[ReLU] -> MaxPool -> Dropout(0.25)
    -> Flatten -> FC[ReLU] -> Dropout(0.5) -> FC[LogSoftmax]

    The flatten size 9216 is 64 channels * 12 * 12, which is what a
    1 x 28 x 28 input becomes after two 3x3 convolutions and a 2x2 max-pool.
    """

    def __init__(self, num_classes):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        # Applied to the 4-D feature maps, so whole channels are dropped.
        self.drop1 = nn.Dropout2d(p=0.25)
        self.fc1 = nn.Linear(9216, 128)
        # Applied to the flat (batch, 128) activations: plain element-wise
        # dropout is the matching layer here, Dropout2d expects feature maps.
        self.drop2 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = self.drop1(x)
        x = x.view(-1, 9216)
        x = F.relu(self.fc1(x))
        x = self.drop2(x)
        x = self.fc2(x)
        return F.log_softmax(x)


model = CNN(num_classes)
print(model)

# If you are running a GPU instance, load the model on GPU
if cuda:
    model.cuda()

# #### Loss and Optimizer ####
# CrossEntropyLoss applies log-softmax internally.  The model already returns
# log-probabilities; log-softmax of log-probabilities leaves them unchanged,
# so the loss value is not affected by the extra application.
loss_fn = nn.CrossEntropyLoss()
# If you are running a GPU instance, compute the loss on GPU
if cuda:
    loss_fn.cuda()

# Set parameters to be updated.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# If a best-model checkpoint exists, load its weights!
222 | if os.path.isfile(resume_weights): 223 | print("=> loading checkpoint '{}' ...".format(resume_weights)) 224 | if cuda: 225 | checkpoint = torch.load(resume_weights) 226 | else: 227 | # Load GPU model on CPU 228 | checkpoint = torch.load(resume_weights, 229 | map_location=lambda storage, 230 | loc: storage) 231 | start_epoch = checkpoint['epoch'] 232 | best_accuracy = checkpoint['best_accuracy'] 233 | model.load_state_dict(checkpoint['state_dict']) 234 | print("=> loaded checkpoint '{}' (trained for {} epochs)".format(resume_weights, 235 | checkpoint['epoch'])) 236 | 237 | 238 | # Training the Model 239 | for epoch in range(num_epochs): 240 | train(model, optimizer, train_loader, test_loader, loss_fn) 241 | acc = eval(model, optimizer, test_loader) 242 | acc = 100. * acc / len(test_loader.dataset) 243 | print('=> Test set: Accuracy: {:.2f}%'.format(acc)) 244 | acc = torch.FloatTensor([acc]) 245 | # Get bool not ByteTensor 246 | is_best = bool(acc.numpy() > best_accuracy.numpy()) 247 | # Get greater Tensor to keep track best acc 248 | best_accuracy = torch.FloatTensor(max(acc.numpy(), best_accuracy.numpy())) 249 | # Save checkpoint if is a new best 250 | save_checkpoint({ 251 | 'epoch': start_epoch + epoch + 1, 252 | 'state_dict': model.state_dict(), 253 | 'best_accuracy': best_accuracy 254 | }, is_best) 255 | -------------------------------------------------------------------------------- /tf_mnist_cnn_jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Save and Resume a Tensorflow MNIST ConvNet Model\n", 8 | "\n", 9 | "This jupyter notebook, show you how to save and resume a Tensorflow Model. In this example we will use the Deep Learning hello-world!: the MNIST classification task.\n", 10 | "Note: to run code cell you have to press **`Shift + Enter`**." 
11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Import Packages\n", 18 | "First, we gather all the dependencies in a single place:" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from __future__ import absolute_import\n", 28 | "from __future__ import division\n", 29 | "from __future__ import print_function\n", 30 | "\n", 31 | "from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets\n", 32 | "\n", 33 | "import numpy as np\n", 34 | "import tensorflow as tf\n", 35 | "import shutil, os\n", 36 | "\n", 37 | "tf.logging.set_verbosity(tf.logging.INFO)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Hyper Parameters and Variables\n", 45 | "\n", 46 | "It is also good practice to define hyper-parameters and variables in a single place: this improves code readability and speeds up experiment iteration." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Where to save Checkpoint(In the /output folder)\n", 56 | "resumepath =\"/model/mnist_convnet_model\"\n", 57 | "filepath = \"/output/mnist_convnet_model\" \n", 58 | "\n", 59 | "# Hyper-parameters\n", 60 | "batch_size = 128\n", 61 | "num_classes = 10\n", 62 | "num_epochs = 12\n", 63 | "learning_rate = 1e-3" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "### Resuming from Previous Run\n", 71 | "\n", 72 | "If we have mounted a previous run, copy the checkpoint to the `/output` folder so that the Model will continue from that and save everything in it."
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# If exists an checkpoint model, move it into the /output folder\n", 82 | "if os.path.exists(resumepath):\n", 83 | " shutil.copytree(resumepath, filepath)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "### Data Processing and Transformation\n", 91 | "Next, we process the dataset sample in tensor, ready to be feed into the model." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.\n", 104 | "Extracting /input/MNIST_data/train-images-idx3-ubyte.gz\n", 105 | "Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.\n", 106 | "Extracting /input/MNIST_data/train-labels-idx1-ubyte.gz\n", 107 | "Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.\n", 108 | "Extracting /input/MNIST_data/t10k-images-idx3-ubyte.gz\n", 109 | "Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.\n", 110 | "Extracting /input/MNIST_data/t10k-labels-idx1-ubyte.gz\n", 111 | "(60000, 784)\n", 112 | "(10000, 784)\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Load training and eval data\n", 118 | "mnist = read_data_sets(train_dir='/input/MNIST_data', validation_size=0)\n", 119 | "train_data = mnist.train.images # Returns np.array\n", 120 | "train_labels = np.asarray(mnist.train.labels, dtype=np.int32)\n", 121 | "eval_data = mnist.test.images # Returns np.array\n", 122 | "eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)\n", 123 | "\n", 124 | "print (train_data.shape)\n", 125 | "print (eval_data.shape)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### Define the Model\n", 133 | "A ConvNet Model, state 
of the art for image classification task." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "def cnn_model_fn(features, labels, mode):\n", 143 | " \"\"\"Model function for CNN.\"\"\"\n", 144 | " # Input Layer\n", 145 | " # Reshape X to 4-D tensor: [batch_size, width, height, channels]\n", 146 | " # MNIST images are 28x28 pixels, and have one color channel\n", 147 | " input_layer = tf.reshape(features[\"x\"], [-1, 28, 28, 1])\n", 148 | "\n", 149 | " # Convolutional Layer #1\n", 150 | " # Computes 32 features using a 3x3 filter with ReLU activation.\n", 151 | " # Input Tensor Shape: [batch_size, 28, 28, 1]\n", 152 | " # Output Tensor Shape: [batch_size, 26, 26, 32]\n", 153 | " conv1 = tf.layers.conv2d(\n", 154 | " inputs=input_layer,\n", 155 | " filters=32,\n", 156 | " kernel_size=[3, 3],\n", 157 | " activation=tf.nn.relu)\n", 158 | "\n", 159 | " # Convolutional Layer #2\n", 160 | " # Computes 64 features using a 3x3 filter.\n", 161 | " # Input Tensor Shape: [batch_size, 26, 26 32]\n", 162 | " # Output Tensor Shape: [batch_size, 24, 24, 64]\n", 163 | " conv2 = tf.layers.conv2d(\n", 164 | " inputs=conv1,\n", 165 | " filters=64,\n", 166 | " kernel_size=[3, 3],\n", 167 | " activation=tf.nn.relu)\n", 168 | "\n", 169 | " # Pooling Layer\n", 170 | " # Max pooling layer with a 2x2 filter and stride of 2\n", 171 | " # Input Tensor Shape: [batch_size, 24, 24, 64]\n", 172 | " # Output Tensor Shape: [batch_size, 12, 12, 64]\n", 173 | " pool = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)\n", 174 | "\n", 175 | " # Dropout # 1\n", 176 | " # Add dropout operation; 0.25 probability that element will be kept\n", 177 | " dropout = tf.layers.dropout(\n", 178 | " inputs=pool, rate=0.25, training=mode == tf.estimator.ModeKeys.TRAIN)\n", 179 | "\n", 180 | " # Flatten tensor into a batch of vectors\n", 181 | " # Input Tensor Shape: [batch_size, 12, 12, 64]\n", 182 | " # 
Output Tensor Shape: [batch_size, 12 * 12 * 64]\n", 183 | " flat = tf.reshape(dropout, [-1, 12 * 12 * 64]) # 9216\n", 184 | "\n", 185 | " \n", 186 | " # Dense Layer # 1\n", 187 | " # Densely connected layer with 128 neurons\n", 188 | " # Input Tensor Shape: [batch_size, 12 * 12 * 64] (batch_size, 9216)\n", 189 | " # Output Tensor Shape: [batch_size, 128]\n", 190 | " dense1 = tf.layers.dense(inputs=flat, units=128, activation=tf.nn.relu)\n", 191 | " \n", 192 | " # Dropout # 2\n", 193 | " # Add dropout operation; 0.5 probability that element will be kept\n", 194 | " dropout2 = tf.layers.dropout(\n", 195 | " inputs=dense1, rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)\n", 196 | "\n", 197 | " # Logits layer\n", 198 | " # Input Tensor Shape: [batch_size, 128]\n", 199 | " # Output Tensor Shape: [batch_size, 10]\n", 200 | " logits = tf.layers.dense(inputs=dropout2, units=num_classes)\n", 201 | "\n", 202 | " predictions = {\n", 203 | " # Generate predictions (for PREDICT and EVAL mode)\n", 204 | " \"classes\": tf.argmax(input=logits, axis=1),\n", 205 | " # Add `softmax_tensor` to the graph. 
It is used for PREDICT and by the\n", 206 | " # `logging_hook`.\n", 207 | " \"probabilities\": tf.nn.softmax(logits, name=\"softmax_tensor\")\n", 208 | " }\n", 209 | " # Inference (for TEST mode)\n", 210 | " if mode == tf.estimator.ModeKeys.PREDICT:\n", 211 | " return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)\n", 212 | "\n", 213 | " # Calculate Loss (for both TRAIN and EVAL modes)\n", 214 | " onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=num_classes)\n", 215 | " # Cross Entropy\n", 216 | " loss = tf.losses.softmax_cross_entropy(\n", 217 | " onehot_labels=onehot_labels, logits=logits)\n", 218 | "\n", 219 | " # Configure the Training Op (for TRAIN mode)\n", 220 | " if mode == tf.estimator.ModeKeys.TRAIN:\n", 221 | " # AdamOptimizer\n", 222 | " optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n", 223 | " train_op = optimizer.minimize(\n", 224 | " loss=loss,\n", 225 | " global_step=tf.train.get_global_step())\n", 226 | " return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)\n", 227 | "\n", 228 | " # Add evaluation metrics (for EVAL mode)\n", 229 | " eval_metric_ops = {\n", 230 | " \"accuracy\": tf.metrics.accuracy(\n", 231 | " labels=labels, predictions=predictions[\"classes\"])}\n", 232 | " return tf.estimator.EstimatorSpec(\n", 233 | " mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Checkpoint Strategy\n", 241 | "\n", 242 | "The strategy we have adopted for the this example is the following:\n", 243 | "\n", 244 | "- Keep only one checkpoints\n", 245 | "- Trigger the strategy at the end of every epoch" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 6, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "# Checkpoint Strategy configuration\n", 255 | "run_config = tf.contrib.learn.RunConfig(\n", 256 | " model_dir=filepath,\n", 257 | " 
keep_checkpoint_max=1)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 7, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "name": "stdout", 267 | "output_type": "stream", 268 | "text": [ 269 | "INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': , '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_tf_config': gpu_options {\n", 270 | " per_process_gpu_memory_fraction: 1.0\n", 271 | "}\n", 272 | ", '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/output/mnist_convnet_model'}\n" 273 | ] 274 | } 275 | ], 276 | "source": [ 277 | "# Create the Estimator\n", 278 | "mnist_classifier = tf.estimator.Estimator(\n", 279 | " model_fn=cnn_model_fn, config=run_config)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "### Training\n", 287 | "Let's train the model and see our checkpoint strategy in action." 
288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "Begin Training - Epoch 1/12\n", 300 | "INFO:tensorflow:Create CheckpointSaverHook.\n", 301 | "INFO:tensorflow:Restoring parameters from /output/mnist_convnet_model/model.ckpt-3284\n", 302 | "INFO:tensorflow:Saving checkpoints for 3285 into /output/mnist_convnet_model/model.ckpt.\n", 303 | "INFO:tensorflow:loss = 0.0112094, step = 3285\n", 304 | "INFO:tensorflow:global_step/sec: 55.4695\n", 305 | "INFO:tensorflow:loss = 0.0575566, step = 3385 (1.804 sec)\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "# Keep track of the best accuracy\n", 311 | "best_acc = 0\n", 312 | "\n", 313 | "# Training for num_epochs\n", 314 | "for i in range(num_epochs):\n", 315 | " print(\"Begin Training - Epoch {}/{}\".format(i+1, num_epochs))\n", 316 | " # Train the model for 1 epoch\n", 317 | " train_input_fn = tf.estimator.inputs.numpy_input_fn(\n", 318 | " x={\"x\": train_data},\n", 319 | " y=train_labels,\n", 320 | " batch_size=batch_size,\n", 321 | " num_epochs=1,\n", 322 | " shuffle=True)\n", 323 | "\n", 324 | " mnist_classifier.train(\n", 325 | " input_fn=train_input_fn)\n", 326 | "\n", 327 | " # Evaluate the model and print results\n", 328 | " eval_input_fn = tf.estimator.inputs.numpy_input_fn(\n", 329 | " x={\"x\": eval_data},\n", 330 | " y=eval_labels,\n", 331 | " num_epochs=1,\n", 332 | " shuffle=False)\n", 333 | " \n", 334 | " eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)\n", 335 | " \n", 336 | " accuracy = eval_results[\"accuracy\"] * 100\n", 337 | " # Set the best acc if we have a new best or if it is the first step \n", 338 | " if accuracy > best_acc or i == 0:\n", 339 | " best_acc = accuracy\n", 340 | " print (\"=> New Best Accuracy {}\".format(accuracy))\n", 341 | " else:\n", 342 | " print(\"=> Validation Accuracy did not improve\")" 343 | ] 
344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "### Resume the checkpoint after the training\n", 350 | "Let's take a look at the checkpoint just created. (you should see the `mnist_convnet_model` folder)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 39, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "\u001b[0m\u001b[01;34mMNIST-data\u001b[0m/ command.sh \u001b[01;34mmnist_convnet_model\u001b[0m/\r\n", 363 | "README.md keras_mnist_cnn.py pytorch_mnist_cnn.py\r\n", 364 | "Untitled.ipynb keras_mnist_cnn_jupyter.ipynb pytorch_mnist_cnn_jupyter.ipynb\r\n" 365 | ] 366 | } 367 | ], 368 | "source": [ 369 | "% ls" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "Jupyter Notebook run in the `/output folder`, so it's here. If you want to load it, you only need to restart the **Training** Cell Code, the Estimator will take care of everything." 
377 | ] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "Python 3", 383 | "language": "python", 384 | "name": "python3" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.6.2" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 2 401 | } 402 | -------------------------------------------------------------------------------- /keras_mnist_cnn_jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Save and Resume a Keras MNIST ConvNet Model\n", 8 | "\n", 9 | "This jupyter notebook, show you how to save and resume a Keras Model. In this example we will use the Deep Learning hello-world!: the MNIST classification task.\n", 10 | "\n", 11 | "Note: to run code cell you have to press **`Shift + Enter`**." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Import Packages\n", 19 | "\n", 20 | "First we need a single point with all the dependencies:" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "Using TensorFlow backend.\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "from __future__ import print_function\n", 38 | "import keras\n", 39 | "import os.path\n", 40 | "from keras.datasets import mnist\n", 41 | "from keras.models import Sequential\n", 42 | "from keras.layers import Dense, Dropout, Flatten\n", 43 | "from keras.layers import Conv2D, MaxPooling2D\n", 44 | "from keras import backend as K\n", 45 | "from keras.callbacks import ModelCheckpoint" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "### Hyper Parameters and Variables\n", 53 | "\n", 54 | "Even for Hyper-Parameters and Variables is a good practice have a single point, this improve code readability and experiments interation." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 10, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "# Path to saved model weights(as hdf5)\n", 64 | "resume_weights = \"/model/mnist-cnn-best.hdf5\"\n", 65 | "\n", 66 | "# Where to save Checkpoint(In the /output folder)\n", 67 | "filepath = \"/output/mnist-cnn-best.hdf5\"\n", 68 | "\n", 69 | "# Hyper-parameters\n", 70 | "batch_size = 128\n", 71 | "num_classes = 10\n", 72 | "epochs = 12\n", 73 | "\n", 74 | "# input image dimensions\n", 75 | "img_rows, img_cols = 28, 28" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "### Data Processing and Transformation\n", 83 | "\n", 84 | "Next, we process the dataset sample in tensor, ready to be feed into the model." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz\n", 97 | "10682368/11490434 [==========================>...] - ETA: 0sx_train shape: (60000, 28, 28, 1)\n", 98 | "60000 train samples\n", 99 | "10000 test samples\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "# MNIST handwritten image classification\n", 105 | "# the data, shuffled and split between train and test sets\n", 106 | "(x_train, y_train), (x_test, y_test) = mnist.load_data()\n", 107 | "\n", 108 | "# Reshape strategy according to backend\n", 109 | "if K.image_data_format() == 'channels_first':\n", 110 | " x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)\n", 111 | " x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)\n", 112 | " # 1 x 28 x 28 [number_of_channels (colors) x height x weight]\n", 113 | " input_shape = (1, img_rows, img_cols)\n", 114 | "else:\n", 115 | " x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)\n", 116 | " x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)\n", 117 | " # 28 x 28 x 1 [height x weight x number_of_channels (colors)]\n", 118 | " input_shape = (img_rows, img_cols, 1)\n", 119 | "\n", 120 | "# Reshape, type, normalized, print\n", 121 | "x_train = x_train.astype('float32')\n", 122 | "x_test = x_test.astype('float32')\n", 123 | "x_train /= 255\n", 124 | "x_test /= 255\n", 125 | "\n", 126 | "# Dataset info\n", 127 | "print('x_train shape:', x_train.shape)\n", 128 | "print(x_train.shape[0], 'train samples')\n", 129 | "print(x_test.shape[0], 'test samples')\n", 130 | "\n", 131 | "# convert class vectors to binary class matrices\n", 132 | "y_train = keras.utils.to_categorical(y_train, num_classes)\n", 133 | "y_test = keras.utils.to_categorical(y_test, num_classes)" 134 | ] 135 | }, 136 | { 137 | 
"cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "### Define the Model\n", 141 | "\n", 142 | "A ConvNet Model, state of the art for image classification task." 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 4, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "_________________________________________________________________\n", 155 | "Layer (type) Output Shape Param # \n", 156 | "=================================================================\n", 157 | "conv2d_1 (Conv2D) (None, 26, 26, 32) 320 \n", 158 | "_________________________________________________________________\n", 159 | "conv2d_2 (Conv2D) (None, 24, 24, 64) 18496 \n", 160 | "_________________________________________________________________\n", 161 | "max_pooling2d_1 (MaxPooling2 (None, 12, 12, 64) 0 \n", 162 | "_________________________________________________________________\n", 163 | "dropout_1 (Dropout) (None, 12, 12, 64) 0 \n", 164 | "_________________________________________________________________\n", 165 | "flatten_1 (Flatten) (None, 9216) 0 \n", 166 | "_________________________________________________________________\n", 167 | "dense_1 (Dense) (None, 128) 1179776 \n", 168 | "_________________________________________________________________\n", 169 | "dropout_2 (Dropout) (None, 128) 0 \n", 170 | "_________________________________________________________________\n", 171 | "dense_2 (Dense) (None, 10) 1290 \n", 172 | "=================================================================\n", 173 | "Total params: 1,199,882\n", 174 | "Trainable params: 1,199,882\n", 175 | "Non-trainable params: 0\n", 176 | "_________________________________________________________________\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# MODEL\n", 182 | "# Conv(32,3,3)[ReLU] -> Conv(64,3,3)[ReLU] -> MaxPool(2,2)[Dropout 0.25] ->\n", 183 | "# FC(_, 128)[ReLU][Dropout 0.5] -> FC(128, 
10)[Softmax]\n", 184 | "model = Sequential()\n", 185 | "model.add(Conv2D(32, kernel_size=(3, 3),\n", 186 | " activation='relu',\n", 187 | " input_shape=input_shape))\n", 188 | "model.add(Conv2D(64, (3, 3), activation='relu'))\n", 189 | "model.add(MaxPooling2D(pool_size=(2, 2)))\n", 190 | "model.add(Dropout(0.25))\n", 191 | "model.add(Flatten())\n", 192 | "model.add(Dense(128, activation='relu'))\n", 193 | "model.add(Dropout(0.5))\n", 194 | "model.add(Dense(num_classes, activation='softmax'))\n", 195 | "\n", 196 | "model.summary()" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "### Resume a checkpoint\n", 204 | "\n", 205 | "Run the following line if you want to resume an existing checkpoint." 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 11, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "Resumed model's weights from ./mnist-cnn-best.hdf5\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "# If exists a best model, load its weights!\n", 223 | "if os.path.isfile(resume_weights):\n", 224 | " print (\"Resumed model's weights from {}\".format(resume_weights))\n", 225 | " # load weights\n", 226 | " model.load_weights(resume_weights)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "### Define The Loss Function and The Optimizers\n", 234 | "\n", 235 | "In this example we use the Cross Entropy Loss and Adam Optimizer." 
236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 6, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "# CEE, Adam\n", 245 | "model.compile(loss=keras.losses.categorical_crossentropy,\n", 246 | " optimizer=keras.optimizers.Adam(),\n", 247 | " metrics=['accuracy'])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "metadata": {}, 253 | "source": [ 254 | "### Checkpoint Strategy\n", 255 | "\n", 256 | "The strategy we have adopted for the this example is the following:\n", 257 | "- Keep only one checkpoints\n", 258 | "- Trigger the strategy at the end of every epoch\n", 259 | "- Save the one with the best(max) validation accuracy " 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# Keep only a single checkpoint, the best over test accuracy.\n", 269 | "checkpoint = ModelCheckpoint(filepath,\n", 270 | " monitor='val_acc',\n", 271 | " verbose=1,\n", 272 | " save_best_only=True,\n", 273 | " mode='max')" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### Training\n", 281 | "\n", 282 | "Let's train the model and see our checkpoint strategy in action." 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 12, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Train on 60000 samples, validate on 10000 samples\n", 295 | "Epoch 1/12\n", 296 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0210 - acc: 0.9928Epoch 00000: val_acc did not improve\n", 297 | "60000/60000 [==============================] - 9s - loss: 0.0210 - acc: 0.9929 - val_loss: 0.0309 - val_acc: 0.9912\n", 298 | "Epoch 2/12\n", 299 | "59648/60000 [============================>.] 
- ETA: 0s - loss: 0.0207 - acc: 0.9931Epoch 00001: val_acc did not improve\n", 300 | "60000/60000 [==============================] - 9s - loss: 0.0207 - acc: 0.9931 - val_loss: 0.0248 - val_acc: 0.9927\n", 301 | "Epoch 3/12\n", 302 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0204 - acc: 0.9934Epoch 00002: val_acc did not improve\n", 303 | "60000/60000 [==============================] - 9s - loss: 0.0205 - acc: 0.9934 - val_loss: 0.0270 - val_acc: 0.9922\n", 304 | "Epoch 4/12\n", 305 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0186 - acc: 0.9939Epoch 00003: val_acc did not improve\n", 306 | "60000/60000 [==============================] - 9s - loss: 0.0186 - acc: 0.9940 - val_loss: 0.0279 - val_acc: 0.9928\n", 307 | "Epoch 5/12\n", 308 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0171 - acc: 0.9944Epoch 00004: val_acc did not improve\n", 309 | "60000/60000 [==============================] - 9s - loss: 0.0171 - acc: 0.9944 - val_loss: 0.0273 - val_acc: 0.9924\n", 310 | "Epoch 6/12\n", 311 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0153 - acc: 0.9948Epoch 00005: val_acc did not improve\n", 312 | "60000/60000 [==============================] - 9s - loss: 0.0153 - acc: 0.9948 - val_loss: 0.0289 - val_acc: 0.9928\n", 313 | "Epoch 7/12\n", 314 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0157 - acc: 0.9944Epoch 00006: val_acc improved from 0.99300 to 0.99360, saving model to /output/mnist-cnn-best.hdf5\n", 315 | "60000/60000 [==============================] - 9s - loss: 0.0157 - acc: 0.9944 - val_loss: 0.0296 - val_acc: 0.9936\n", 316 | "Epoch 8/12\n", 317 | "59648/60000 [============================>.] 
- ETA: 0s - loss: 0.0145 - acc: 0.9949Epoch 00007: val_acc did not improve\n", 318 | "60000/60000 [==============================] - 9s - loss: 0.0145 - acc: 0.9949 - val_loss: 0.0281 - val_acc: 0.9928\n", 319 | "Epoch 9/12\n", 320 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0152 - acc: 0.9953Epoch 00008: val_acc did not improve\n", 321 | "60000/60000 [==============================] - 9s - loss: 0.0152 - acc: 0.9953 - val_loss: 0.0278 - val_acc: 0.9927\n", 322 | "Epoch 10/12\n", 323 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0127 - acc: 0.9958Epoch 00009: val_acc did not improve\n", 324 | "60000/60000 [==============================] - 9s - loss: 0.0127 - acc: 0.9959 - val_loss: 0.0303 - val_acc: 0.9926\n", 325 | "Epoch 11/12\n", 326 | "59648/60000 [============================>.] - ETA: 0s - loss: 0.0136 - acc: 0.9955Epoch 00010: val_acc did not improve\n", 327 | "60000/60000 [==============================] - 9s - loss: 0.0135 - acc: 0.9956 - val_loss: 0.0296 - val_acc: 0.9931\n", 328 | "Epoch 12/12\n", 329 | "59648/60000 [============================>.] 
- ETA: 0s - loss: 0.0128 - acc: 0.9955Epoch 00011: val_acc improved from 0.99360 to 0.99380, saving model to /output/mnist-cnn-best.hdf5\n", 330 | "60000/60000 [==============================] - 9s - loss: 0.0130 - acc: 0.9954 - val_loss: 0.0276 - val_acc: 0.9938\n", 331 | "Test loss: 0.0276465165614\n", 332 | "Test accuracy: 0.9938\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "# Train\n", 338 | "model.fit(x_train, y_train,\n", 339 | " batch_size=batch_size,\n", 340 | " epochs=epochs,\n", 341 | " verbose=1,\n", 342 | " validation_data=(x_test, y_test),\n", 343 | " callbacks=[checkpoint])\n", 344 | "\n", 345 | "# Eval\n", 346 | "score = model.evaluate(x_test, y_test, verbose=0)\n", 347 | "print('Test loss:', score[0])\n", 348 | "print('Test accuracy:', score[1])" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Resume the checkpoint after the training\n", 356 | "\n", 357 | "Let's take a look at the checkpoint just created. (you should see the `mnist-cnn-best.hdf5` file)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 9, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "\u001b[0m\u001b[01;34mMNIST_data\u001b[0m/ command.sh mnist-cnn-best.hdf5\r\n", 370 | "README.md keras_mnist_cnn.py pytorch_mnist_cnn.py\r\n", 371 | "Untitled.ipynb keras_mnist_cnn_jupyter.ipynb pytorch_mnist_cnn_jupyter.ipynb\r\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "% ls" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "Jupyter Notebook run in the `/output` folder, so it's here.\n", 384 | "If you want to load it, go to the Hyper parameters and Varables Code Cell, replace the resume weigths var in this way:\n", 385 | "`# Path to saved model weights(as hdf5)\n", 386 | "resume_weights = \"./mnist-cnn-best.hdf5\"`, run the cell, go to the **Resume a checkpoint** Code 
Cell, run it, and rerun the **Training Code Cell**, that's it." 387 | ] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.6.2" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } 412 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Save And Resume your Experiments 2 | 3 | This repo contains the code to show how to save checkpoints during training and resume your experiments from them. 4 | We will show you how to perform it on Tensorflow, Keras and PyTorch. 5 | 6 | ## Why checkpointing? 7 | 8 | ![save game screen FF-like](https://i.imgur.com/xdpSAzq.png) 9 | 10 | Imagine your experiments as a video game: sometimes you want to save your game or resume it from an existing state. Checkpoints in Machine/Deep Learning experiments are the same thing: you do not want to lose your experiments due to a blackout, OS faults or other types of bad errors. Sometimes you just want to resume a particular state of the training for new experiments or to try different things. That's why you need checkpoints! 11 | 12 | Not to mention that without a checkpoint at the end of the training, you will have lost all the training! Like finishing a game without saving at the end. 13 | 14 | ## What is a checkpoint made of?
15 | 16 | A checkpoint can consist of: 17 | 18 | - The architecture of the model, allowing to re-create the model 19 | - The weights of the model 20 | - The training configuration (loss, optimizer, epochs and other meta-infos) 21 | - The state of the optimizer, allowing to resume training exactly where you left off. 22 | 23 | *Taken from Keras docs [how-can-i-save-a-keras-model](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model)*. 24 | 25 | ## Checkpoint Strategies 26 | 27 | There are different checkpoint strategies according to the type of training regime you are performing: 28 | 29 | - Short Training Regime (minutes - hours) 30 | - Normal Training Regime (hours - day) 31 | - Long Training Regime (days - weeks) 32 | 33 | ### Short Training Regime 34 | In this type of training regime it is a common practice to save only a checkpoint at the end of the training or at the end of every epoch. 35 | 36 | ### Normal Training Regime 37 | In this type of training regime it is a common practice to save multiple checkpoints every n_epochs and keep track of which one is the best with respect to the validation metric we care about. Usually there is a fixed number of checkpoints we care about so as not to take too much space, such as keeping only 10 checkpoints (the new ones will replace the oldest ones). 38 | 39 | ### Long Training Regime 40 | In this type of training regime it is a common practice to save multiple checkpoints every n_epochs and keep track of which one is the best with respect to the validation metric we care about. Since the training can be really long, it is common to save less frequently but keep more checkpoint files, so that we will be able to resume the training in particular situations. 41 | 42 | *Obviously you can use a custom Checkpoint Strategy according to your needs and the task you will run.* 43 | 44 | ## The Tradeoff 45 | 46 | The tradeoff is between the **frequency** and the **number of checkpoint files** to keep.
Let's take a look at what happens when we act on these two parameters: 47 | 48 | Frequency | Number of checkpoints to keep | Cons | Pro 49 | --------- | ----------------------------- | ---- | --- 50 | High | High | You need a lot of space!! | You can resume very quickly in almost all the interesting training states. 51 | High | Low | You could have lost precious states. | Minimize the storage space you need. 52 | Low | High | If some things happened between two checkpoints, it will cost you some time to retrieve it. | You can resume the experiments in a lot of interesting states. 53 | Low | Low | You could have lost precious states | Minimize the storage space you need. 54 | 55 | 56 | Now you have a good intuition about what's the best strategy you can adopt according to your training regime. 57 | 58 | ## Save and Resume on FloydHub 59 | 60 | Before you start, log in on FloydHub with the [floyd login](http://docs.floydhub.com/commands/login/) command, then fork and init the project: 61 | 62 | ```bash 63 | $ git clone https://github.com/floydhub/save-and-resume.git 64 | $ cd save-and-resume 65 | $ floyd init save-and-resume 66 | ``` 67 | 68 | For these examples we use the Deep Learning hello-world: the [MNIST](http://yann.lecun.com/exdb/mnist/) classification task using a Convolutional Neural Network model. 69 | 70 | The strategy we have adopted for the next example is the following: 71 | - Keep only one checkpoint 72 | - Trigger the strategy at the end of every epoch 73 | - Save the one with the best(max) validation accuracy 74 | 75 | Considering the toy example, a Short Training Regime provides a good strategy.
76 | 77 | *As said, this tutorial follows a basic setup; if you have more sophisticated experiments you will have to hack it.* 78 | 79 | This is the basic template you have to follow for saving and resuming when you run your experiments on FloydHub *via script*: 80 | 81 | #### Saving Template command 82 | 83 | ```bash 84 | floyd run \ 85 | [--gpu] \ 86 | --env \ 87 | --data : \ 88 | "python " 89 | ``` 90 | 91 | The checkpoint of this script must be saved in the `/output` folder. 92 | 93 | #### Resuming Template after training 94 | 95 | ```bash 96 | floyd run \ 97 | [--gpu] \ 98 | --env \ 99 | --data : \ 100 | --data : \ 101 | "python " 102 | ``` 103 | The script will resume the checkpoint from the previous Job's Output. 104 | 105 | Let's see how to make it tangible for the different frameworks on FloydHub. 106 | 107 | ## Tensorflow 108 | 109 |

110 | 111 |

112 | 113 | Tensorflow provides different ways for saving and resuming a checkpoint. In the example we will use the [tf.Estimator](https://www.tensorflow.org/api_docs/python/tf/estimator) API, which behind the scenes uses [tf.train.Saver](https://www.tensorflow.org/api_docs/python/tf/train/Saver), [tf.train.CheckpointSaverHook](https://www.tensorflow.org/api_docs/python/tf/train/CheckpointSaverHook) and [tf.saved_model.builder.SavedModelBuilder](https://www.tensorflow.org/api_docs/python/tf/saved_model/builder/SavedModelBuilder). 114 | 115 | In more detail, it uses the first function to save, the second one to act according to the adopted strategy and the last one to export the model to be served with the `export_savedmodel()` method. 116 | 117 | ### Saving 118 | 119 | Before initializing an Estimator, we have to define the checkpoint strategy. To do this we have to create a configuration for the Estimator using the [tf.estimator.RunConfig](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig) API like this: 120 | 121 | ```python 122 | # Checkpoint Strategy configuration 123 | run_config = tf.contrib.learn.RunConfig( 124 | model_dir=filepath, 125 | keep_checkpoint_max=1) 126 | ``` 127 | 128 | In this way we are telling the estimator in which directory to save or resume a checkpoint and how many checkpoints to keep. 129 | 130 | Then we have to provide it at the initialization of the Estimator: 131 | 132 | ```python 133 | # Create the Estimator 134 | mnist_classifier = tf.estimator.Estimator( 135 | model_fn=cnn_model_fn, config=run_config) 136 | ``` 137 | 138 | That's it about saving a checkpoint in Tensorflow using Estimator. 139 | 140 | ### Resuming 141 | 142 | After having configured the Estimator, everything is done. If it finds a checkpoint inside the given model folder, it will load the last one. 143 | 144 | That's it about resuming a checkpoint in Tensorflow using Estimator.
145 | 146 | ### Run on FloydHub 147 | Here are the steps to run the example on FloydHub. 148 | 149 | #### Via script 150 | 151 | First time training: 152 | 153 | ```bash 154 | floyd run \ 155 | --gpu \ 156 | --env tensorflow-1.3 \ 157 | --data redeipirati/datasets/mnist/1:input \ 158 | 'python tf_mnist_cnn.py' 159 | ``` 160 | 161 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6, 162 | - The `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory, 163 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 164 | 165 | Resuming: 166 | 167 | ```bash 168 | floyd run \ 169 | --gpu \ 170 | --env tensorflow-1.3 \ 171 | --data redeipirati/datasets/mnist/1:input \ 172 | --data /projects/save-and-resume//output:/model \ 173 | 'python tf_mnist_cnn.py' 174 | ``` 175 | 176 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6, 177 | - The first `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory, 178 | - The second `--data` flag specifies that the output of a previous Job should be available at the `/model` directory, 179 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 180 | 181 | 182 | #### Via Jupyter 183 | 184 | ```bash 185 | floyd run \ 186 | --gpu \ 187 | --env tensorflow-1.3 \ 188 | --data redeipirati/datasets/mnist/1:input \ 189 | --mode jupyter 190 | ``` 191 | 192 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6.
193 | - The `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory, 194 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 195 | - The `--mode` flag specifies that this job should provide us a Jupyter notebook. 196 | 197 | Add `--data /projects/save-and-resume//output:/model`, if you want to load a checkpoint from a previous Job. 198 | 199 | ## Keras 200 | 201 | ![Keras logo](https://s3.amazonaws.com/keras.io/img/keras-logo-2018-large-1200.png) 202 | 203 | Keras provide a great API for saving and loading a checkpoints. Let's take a look: 204 | 205 | ### Saving 206 | Keras provides a set of functions called [callback](https://keras.io/callbacks/): you can think of it as events that will triggered at certain training state. The callback we need for checkpointing is the [ModelCheckpoint](https://keras.io/callbacks/#modelcheckpoint) which provides all the features we need according to the checkpoint strategy adopted. 207 | 208 | **This function save only the model's weights**, if you want to save the whole model or some of the components take a look at [how can i save a keras model from Keras docs](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model). 209 | 210 | First of all we have to import the callback functions: 211 | ```python 212 | from keras.callbacks import ModelCheckpoint 213 | ``` 214 | Next, just before the call to `model.fit(...)` it's time to prepare the checkpoint strategy. 215 | 216 | ```python 217 | # Checkpoint In the /output folder 218 | filepath = "/output/mnist-cnn-best.hdf5" 219 | 220 | # Keep only a single checkpoint, the best over test accuracy. 221 | checkpoint = ModelCheckpoint(filepath, 222 | monitor='val_acc', 223 | verbose=1, 224 | save_best_only=True, 225 | mode='max') 226 | ``` 227 | - `filepath="/output/mnist-cnn-best.hdf5"`: FloydHub returns only the contents inside the `/output` folder! 
See [save output in the docs](https://docs.floydhub.com/guides/data/storing_output/), 228 | - `monitor='val_acc'`: the metric we care about, validation accuracy, 229 | - `verbose=1`: it will print more infos, 230 | - `save_best_only=True`: Keep only the best one(in term of max val_acc), 231 | - `mode='max'`: save the one with max validation accuracy. 232 | 233 | Default period(checkpointing frequency) is set to 1, this means at the end of every epoch. 234 | 235 | For more infos(such as filepath formatting options, checkpointing period and more) we encourage you to explore the [ModelCheckpoint](https://keras.io/callbacks/#modelcheckpoint) API. 236 | 237 | Now we are ready to see it apply during training, to do this, we need to pass the callback variable to the `model.fit(...)` call: 238 | 239 | ```python 240 | # Train 241 | model.fit(x_train, y_train, 242 | batch_size=batch_size, 243 | epochs=epochs, 244 | verbose=1, 245 | validation_data=(x_test, y_test), 246 | callbacks=[checkpoint]) # <- Apply our checkpoint strategy 247 | ``` 248 | 249 | According to the chosen strategy you will see: 250 | ``` 251 | # This line when the training reach a new max 252 | Epoch : val_acc improved from to , saving model to /output/mnist-cnn-best.hdf5 253 | 254 | # Or this line 255 | Epoch : val_acc did not improve 256 | ``` 257 | 258 | That's it about saving a checkpoint in Keras. 259 | 260 | ### Resuming 261 | Keras models have the [`load_weights()`](https://github.com/fchollet/keras/blob/master/keras/models.py#L718-L735) method which load the weights from a hdf5 file. 262 | 263 | To load the model's weight you have to add this line just after the model definition: 264 | 265 | ```python 266 | ... # Model Definition 267 | 268 | model.load_weights(resume_weights) 269 | ``` 270 | 271 | That's it about resuming a checkpoint in Keras. 272 | 273 | 274 | ### Run on FloydHub 275 | Here's the steps to run the example on FloydHub. 
276 | 277 | #### Via script 278 | 279 | First time training: 280 | 281 | ```bash 282 | floyd run \ 283 | --gpu \ 284 | --env tensorflow-1.3 \ 285 | 'python keras_mnist_cnn.py' 286 | ``` 287 | 288 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6. 289 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 290 | 291 | [Keras provides an API to handle MNIST data](https://keras.io/datasets/#mnist-database-of-handwritten-digits), so we can skip the dataset mounting since the dataset size is negligible. 292 | 293 | Resuming: 294 | 295 | ```bash 296 | floyd run \ 297 | --gpu \ 298 | --env tensorflow-1.3 \ 299 | --data /projects/save-and-resume//output:/model \ 300 | 'python keras_mnist_cnn.py' 301 | ``` 302 | 303 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6. 304 | - The `--data` flag specifies that the output of a previous Job should be available at the `/model` directory 305 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 306 | 307 | 308 | #### Via Jupyter 309 | 310 | ```bash 311 | floyd run \ 312 | --gpu \ 313 | --env tensorflow-1.3 \ 314 | --mode jupyter 315 | ``` 316 | 317 | - The `--env` flag specifies the environment that this project should run on, which is Tensorflow 1.3.0 + Keras 2.0.6 on Python3.6. 318 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 319 | - The `--mode` flag specifies that this job should provide us a Jupyter notebook. 320 | 321 | Add `--data /projects/save-and-resume//output:/model`, if you want to load a checkpoint from a previous Job.
322 | 323 | 324 | ## PyTorch 325 | 326 | ![PyTorch logo](http://pytorch.org/docs/master/_static/pytorch-logo-dark.svg) 327 | 328 | Unfortunately, at the moment PyTorch does not offer a checkpointing API as convenient as the one in Keras, so we need to write our own solution following the checkpoint strategy adopted (the same one we used with Keras). 329 | 330 | 331 | ### Saving 332 | PyTorch does not provide an all-in-one API for defining the checkpoint strategy, but it does provide a simple way to save and resume a checkpoint. According to the official docs about [semantic serialization](http://pytorch.org/docs/master/notes/serialization.html), the best practice is to save only the weights, since a fully serialized model can break after code refactoring. 333 | 334 | Let's take a look at how to save the model weights in PyTorch: 335 | 336 | 337 | First of all, define a `save_checkpoint` function which handles all the instructions about the number of checkpoints to keep and the serialization to file: 338 | 339 | ```python 340 | def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'): 341 | """Save checkpoint if a new best is achieved""" 342 | if is_best: 343 | print ("=> Saving a new best") 344 | torch.save(state, filename) # save checkpoint 345 | else: 346 | print ("=> Validation Accuracy did not improve") 347 | ``` 348 | 349 | Then, inside the training loop (usually a `for` loop over the number of epochs), we define the checkpoint frequency (at the end of every epoch) and the information (epoch count, model weights and best accuracy achieved) we want to save: 350 | 351 | ```python 352 | ... 353 | 354 | # Training the Model 355 | for epoch in range(num_epochs): 356 | train(...) # Train 357 | acc = eval(...) # Evaluate after every epoch 358 | 359 | # Some stuff with acc(accuracy) 360 | ...
361 | 362 | # Get bool not ByteTensor 363 | is_best = bool(acc.numpy() > best_accuracy.numpy()) 364 | # Get greater Tensor to keep track best acc 365 | best_accuracy = torch.FloatTensor(max(acc.numpy(), best_accuracy.numpy())) 366 | # Save checkpoint if is a new best 367 | save_checkpoint({ 368 | 'epoch': start_epoch + epoch + 1, 369 | 'state_dict': model.state_dict(), 370 | 'best_accuracy': best_accuracy 371 | }, is_best) 372 | ``` 373 | 374 | That's it about saving a checkpoint in PyTorch. 375 | 376 | ### Resuming 377 | To resume a checkpoint, before the training we have to load the weights and the meta information we need: 378 | 379 | ```python 380 | # cuda = torch.cuda.is_available() 381 | if cuda: 382 | checkpoint = torch.load(resume_weights) 383 | else: 384 | # Load GPU model on CPU 385 | checkpoint = torch.load(resume_weights, 386 | map_location=lambda storage, 387 | loc: storage) 388 | start_epoch = checkpoint['epoch'] 389 | best_accuracy = checkpoint['best_accuracy'] 390 | model.load_state_dict(checkpoint['state_dict']) 391 | print("=> loaded checkpoint '{}' (trained for {} epochs)".format(resume_weights, checkpoint['epoch'])) 392 | ``` 393 | 394 | For more info about loading GPU trained weights on CPU, see this [PyTorch discussion](https://discuss.pytorch.org/t/loading-weights-for-cpu-model-while-trained-on-gpu/1032). 395 | 396 | That's it about resuming a checkpoint in PyTorch. 397 | 398 | ### Run on FloydHub 399 | Here's the steps to run the example on FloydHub. 400 | 401 | #### Via script 402 | 403 | First time training: 404 | 405 | ```bash 406 | floyd run \ 407 | --gpu \ 408 | --env pytorch-0.2 \ 409 | --data redeipirati/datasets/pytorch-mnist/1:input \ 410 | 'python pytorch_mnist_cnn.py' 411 | ``` 412 | 413 | - The `--env` flag specifies the environment that this project should run on, which is a PyTorch 0.2.0 on Python 3. 
414 | - The `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory 415 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 416 | 417 | 418 | Resuming: 419 | 420 | ```bash 421 | floyd run \ 422 | --gpu \ 423 | --env pytorch-0.2 \ 424 | --data redeipirati/datasets/pytorch-mnist/1:input \ 425 | --data /projects/save-and-resume//output:/model \ 426 | 'python pytorch_mnist_cnn.py' 427 | ``` 428 | 429 | - The `--env` flag specifies the environment that this project should run on, which is PyTorch 0.2.0 on Python 3. 430 | - The first `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory 431 | - The second `--data` flag specifies that the output of a previous Job should be available at the `/model` directory 432 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 433 | 434 | #### Via Jupyter 435 | 436 | ```bash 437 | floyd run \ 438 | --gpu \ 439 | --env pytorch-0.2 \ 440 | --data redeipirati/datasets/pytorch-mnist/1:input \ 441 | --mode jupyter 442 | ``` 443 | 444 | - The `--env` flag specifies the environment that this project should run on, which is PyTorch 0.2.0 on Python 3. 445 | - The `--data` flag specifies that the pytorch-mnist dataset should be available at the `/input` directory 446 | - Note that the `--gpu` flag is optional for now, unless you want to start right away to run the code on a GPU machine. 447 | - The `--mode` flag specifies that this job should provide us a Jupyter notebook. 448 | 449 | Add `--data /projects/save-and-resume//output:/model`, if you want to load a checkpoint from a previous Job. 450 | 451 | Have a great training :) 452 | 453 | ## Contributing 454 | 455 | For any questions, bugs (even typos) and/or feature requests, do not hesitate to contact me, open an issue or a PR!
456 | 457 | 458 | -------------------------------------------------------------------------------- /pytorch_mnist_cnn_jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Save and Resume a PyTorch MNIST ConvNet Model\n", 8 | "\n", 9 | "This Jupyter notebook shows you how to save and resume a PyTorch Model. In this example we will use the Deep Learning hello-world!: the MNIST classification task.\n", 10 | "\n", 11 | "Note: to run a code cell, press **`Shift + Enter`**." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Import Packages\n", 19 | "\n", 20 | "First we need a single point with all the dependencies:" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "import torch\n", 32 | "import torchvision.datasets as dsets\n", 33 | "import torch.nn as nn\n", 34 | "import torch.nn.functional as F\n", 35 | "import torchvision.transforms as transforms\n", 36 | "from torch.autograd import Variable\n", 37 | "from torchvision.utils import make_grid\n", 38 | "import shutil\n", 39 | "import os.path\n", 40 | "import time\n", 41 | "import numpy as np" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "### Hyper Parameters and Variables\n", 49 | "\n", 50 | "It is also good practice to define the Hyper-Parameters and Variables in a single place: it improves code readability and speeds up experiment iteration."
51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": { 57 | "collapsed": true 58 | }, 59 | "outputs": [], 60 | "source": [ 61 | "# Hyperparameter\n", 62 | "batch_size = 128\n", 63 | "input_size = 784 # 28 * 28\n", 64 | "hidden_size = 500\n", 65 | "num_classes = 10\n", 66 | "learning_rate = 1e-3\n", 67 | "num_epochs = 12\n", 68 | "print_every = 100\n", 69 | "best_accuracy = torch.FloatTensor([0])\n", 70 | "start_epoch = 0\n", 71 | "\n", 72 | "# Path to saved model weights(as hdf5)\n", 73 | "resume_weights = \"/model/checkpoint.pth.tar\"\n", 74 | "\n", 75 | "# CUDA?\n", 76 | "cuda = torch.cuda.is_available()\n", 77 | "\n", 78 | "# Seed for reproducibility\n", 79 | "torch.manual_seed(1)\n", 80 | "if cuda:\n", 81 | " torch.cuda.manual_seed(1)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Utility function\n", 89 | "\n", 90 | "In this Cell we have the training, evaluating and save checkpoint function:" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "def train(model, optimizer, train_loader, test_loader, loss_fn):\n", 102 | " \"\"\"Perform a full training over dataset\"\"\"\n", 103 | " average_time = 0\n", 104 | " # Model train mode\n", 105 | " model.train()\n", 106 | " for i, (images, labels) in enumerate(train_loader):\n", 107 | " # measure data loading time\n", 108 | " batch_time = time.time()\n", 109 | " images = Variable(images)\n", 110 | " labels = Variable(labels)\n", 111 | "\n", 112 | " if cuda:\n", 113 | " images, labels = images.cuda(), labels.cuda()\n", 114 | "\n", 115 | " # Forward + Backward + Optimize\n", 116 | " optimizer.zero_grad()\n", 117 | " outputs = model(images)\n", 118 | " loss = loss_fn(outputs, labels)\n", 119 | "\n", 120 | " # Load loss on CPU\n", 121 | " if cuda:\n", 122 | " loss.cpu()\n", 123 | "\n", 124 | " loss.backward()\n", 125 
| " optimizer.step()\n", 126 | "\n", 127 | " # Measure elapsed time\n", 128 | " batch_time = time.time() - batch_time\n", 129 | " # Accumulate over batch\n", 130 | " average_time += batch_time\n", 131 | "\n", 132 | " # ### Keep track of metric every batch\n", 133 | " # Accuracy Metric\n", 134 | " prediction = outputs.data.max(1)[1] # first column has actual prob.\n", 135 | " accuracy = prediction.eq(labels.data).sum() / batch_size * 100\n", 136 | "\n", 137 | " # Log\n", 138 | " if (i + 1) % print_every == 0:\n", 139 | " print ('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f, Accuracy: %.4f, Batch time: %f'\n", 140 | " % (epoch + 1,\n", 141 | " num_epochs,\n", 142 | " i + 1,\n", 143 | " len(train_dataset) // batch_size,\n", 144 | " loss.data[0],\n", 145 | " accuracy,\n", 146 | " average_time/print_every)) # Average\n", 147 | "\n", 148 | "\n", 149 | "def eval(model, optimizer, test_loader):\n", 150 | " \"\"\"Eval over test set\"\"\"\n", 151 | " model.eval()\n", 152 | " correct = 0\n", 153 | " # Get Batch\n", 154 | " for data, target in test_loader:\n", 155 | " data, target = Variable(data, volatile=True), Variable(target)\n", 156 | " if cuda:\n", 157 | " data, target = data.cuda(), target.cuda()\n", 158 | " # Evaluate\n", 159 | " output = model(data)\n", 160 | " # Load output on CPU\n", 161 | " if cuda:\n", 162 | " output.cpu()\n", 163 | " # Compute Accuracy\n", 164 | " prediction = output.data.max(1)[1]\n", 165 | " correct += prediction.eq(target.data).sum()\n", 166 | " return correct" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### Data Processing and Transformation\n", 174 | "\n", 175 | "Next, we process the dataset sample in tensor, ready to be feed into the model." 
176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 6, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", 188 | "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", 189 | "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", 190 | "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", 191 | "Processing...\n", 192 | "Done!\n", 193 | "Training Data Size: torch.Size([60000, 28, 28]) - torch.Size([60000])\n", 194 | "Testing Data Size: torch.Size([10000, 28, 28]) - torch.Size([10000])\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "# MNIST Dataset (Images and Labels)\n", 200 | "# If you have not mounted the dataset, you can download it\n", 201 | "# just adding download=True as parameter\n", 202 | "train_dataset = dsets.MNIST(root='/input',\n", 203 | " train=True,\n", 204 | " download=True,\n", 205 | " transform=transforms.ToTensor())\n", 206 | "x_train_mnist, y_train_mnist = train_dataset.train_data.type(torch.FloatTensor), \\\n", 207 | " train_dataset.train_labels\n", 208 | "test_dataset = dsets.MNIST(root='/input',\n", 209 | " train=False,\n", 210 | " download=True,\n", 211 | " transform=transforms.ToTensor())\n", 212 | "x_test_mnist, y_test_mnist = test_dataset.test_data.type(torch.FloatTensor), \\\n", 213 | " test_dataset.test_labels\n", 214 | "\n", 215 | "# Dataset info\n", 216 | "print('Training Data Size: ', x_train_mnist.size(), '-', y_train_mnist.size())\n", 217 | "print('Testing Data Size: ', x_test_mnist.size(), '-', y_test_mnist.size())\n", 218 | "\n", 219 | "# Training Dataset Loader (Input Pipline)\n", 220 | "train_loader = torch.utils.data.DataLoader(dataset=train_dataset,\n", 221 | " batch_size=batch_size,\n", 222 | " shuffle=True)\n", 223 | "# Testing Dataset Loader (Input Pipline)\n", 224 | "test_loader = 
torch.utils.data.DataLoader(dataset=test_dataset,\n", 225 | " batch_size=batch_size,\n", 226 | " shuffle=False)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "### Define the Model\n", 234 | "\n", 235 | "A ConvNet Model, state of the art for image classification task." 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 8, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "CNN (\n", 248 | " (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))\n", 249 | " (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))\n", 250 | " (drop1): Dropout2d (p=0.25)\n", 251 | " (fc1): Linear (9216 -> 128)\n", 252 | " (drop2): Dropout2d (p=0.5)\n", 253 | " (fc2): Linear (128 -> 10)\n", 254 | ")\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "# #### Model ####\n", 260 | "# Convolutional Neural Network Model\n", 261 | "class CNN(nn.Module):\n", 262 | " \"\"\"Conv[ReLU] -> Conv[ReLU] -> MaxPool -> Dropout(0.25)-\n", 263 | " -> Flatten -> FC()[ReLU] -> Dropout(0.5) -> FC()[Softmax]\n", 264 | " \"\"\"\n", 265 | " def __init__(self, num_classes):\n", 266 | " super(CNN, self).__init__()\n", 267 | " self.conv1 = nn.Conv2d(1, 32, kernel_size=3)\n", 268 | " self.conv2 = nn.Conv2d(32, 64, kernel_size=3)\n", 269 | " self.drop1 = nn.Dropout2d(p=0.25)\n", 270 | " self.fc1 = nn.Linear(9216, 128)\n", 271 | " self.drop2 = nn.Dropout2d(p=0.5)\n", 272 | " self.fc2 = nn.Linear(128, num_classes)\n", 273 | "\n", 274 | " def forward(self, x):\n", 275 | " x = F.relu(self.conv1(x))\n", 276 | " x = F.max_pool2d(F.relu(self.conv2(x)), 2)\n", 277 | " x = self.drop1(x)\n", 278 | " x = x.view(-1, 9216)\n", 279 | " x = F.relu(self.fc1(x))\n", 280 | " x = self.drop2(x)\n", 281 | " x = self.fc2(x)\n", 282 | " return F.log_softmax(x)\n", 283 | "\n", 284 | "model = CNN(num_classes)\n", 285 | "print(model)\n", 286 | "\n", 287 | "# If you are running a 
GPU instance, load the model on GPU\n", 288 | "if cuda:\n", 289 | " model.cuda()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Resume a checkpoint\n", 297 | "\n", 298 | "Run the following line if you want to resume an existing checkpoint." 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 15, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "=> loading checkpoint './checkpoint.pth.tar' ...\n", 311 | "=> loaded checkpoint './checkpoint.pth.tar' (trained for 10 epochs)\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "# If exists a best model, load its weights!\n", 317 | "if os.path.isfile(resume_weights):\n", 318 | " print(\"=> loading checkpoint '{}' ...\".format(resume_weights))\n", 319 | " if cuda:\n", 320 | " checkpoint = torch.load(resume_weights)\n", 321 | " else:\n", 322 | " # Load GPU model on CPU\n", 323 | " checkpoint = torch.load(resume_weights,\n", 324 | " map_location=lambda storage,\n", 325 | " loc: storage)\n", 326 | " start_epoch = checkpoint['epoch']\n", 327 | " best_accuracy = checkpoint['best_accuracy']\n", 328 | " model.load_state_dict(checkpoint['state_dict'])\n", 329 | " print(\"=> loaded checkpoint '{}' (trained for {} epochs)\".format(resume_weights,\n", 330 | " checkpoint['epoch']))" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "### Define The Loss Function and The Optimizers\n", 338 | "\n", 339 | "In this example we use the Cross Entropy Loss and Adam Optimizer." 
340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 10, 345 | "metadata": { 346 | "collapsed": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "# #### Loss and Optimizer ####\n", 351 | "# Softmax is internally computed.\n", 352 | "loss_fn = nn.CrossEntropyLoss()\n", 353 | "# If you are running a GPU instance, compute the loss on GPU\n", 354 | "if cuda:\n", 355 | " loss_fn.cuda()\n", 356 | "\n", 357 | "# Set parameters to be updated.\n", 358 | "optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "### Checkpoint Strategy\n", 366 | "\n", 367 | "The strategy we have adopted for the this example is the following:\n", 368 | "- Keep only one checkpoints\n", 369 | "- Trigger the strategy at the end of every epoch\n", 370 | "- Save the one with the best(max) validation accuracy " 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 11, 376 | "metadata": { 377 | "collapsed": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "# Keep only a single checkpoint, the best over test accuracy.\n", 382 | "def save_checkpoint(state, is_best, filename='/output/checkpoint.pth.tar'):\n", 383 | " \"\"\"Save checkpoint if a new best is achieved\"\"\"\n", 384 | " if is_best:\n", 385 | " print (\"=> Saving a new best\")\n", 386 | " torch.save(state, filename) # save checkpoint\n", 387 | " else:\n", 388 | " print (\"=> Validation Accuracy did not improve\")" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "### Training\n", 396 | "\n", 397 | "Let's train the model and see our checkpoint strategy in action." 
398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "Epoch: [1/12], Step: [100/468], Loss: 0.0304, Accuracy: 98.4375, Batch time: 0.005967\n", 410 | "Epoch: [1/12], Step: [200/468], Loss: 0.0331, Accuracy: 99.2188, Batch time: 0.011944\n", 411 | "Epoch: [1/12], Step: [300/468], Loss: 0.0181, Accuracy: 99.2188, Batch time: 0.017946\n", 412 | "Epoch: [1/12], Step: [400/468], Loss: 0.0079, Accuracy: 100.0000, Batch time: 0.023949\n", 413 | "=> Test set: Accuracy: 99.15%\n", 414 | "=> Validation Accuracy did not improve\n", 415 | "Epoch: [2/12], Step: [100/468], Loss: 0.0141, Accuracy: 99.2188, Batch time: 0.005985\n", 416 | "Epoch: [2/12], Step: [200/468], Loss: 0.0186, Accuracy: 99.2188, Batch time: 0.011918\n", 417 | "Epoch: [2/12], Step: [300/468], Loss: 0.0136, Accuracy: 100.0000, Batch time: 0.017855\n", 418 | "Epoch: [2/12], Step: [400/468], Loss: 0.0307, Accuracy: 99.2188, Batch time: 0.023823\n", 419 | "=> Test set: Accuracy: 99.27%\n", 420 | "=> Saving a new best\n", 421 | "Epoch: [3/12], Step: [100/468], Loss: 0.0545, Accuracy: 98.4375, Batch time: 0.005938\n", 422 | "Epoch: [3/12], Step: [200/468], Loss: 0.0043, Accuracy: 100.0000, Batch time: 0.011886\n", 423 | "Epoch: [3/12], Step: [300/468], Loss: 0.0408, Accuracy: 96.8750, Batch time: 0.017852\n", 424 | "Epoch: [3/12], Step: [400/468], Loss: 0.0161, Accuracy: 99.2188, Batch time: 0.023796\n", 425 | "=> Test set: Accuracy: 99.23%\n", 426 | "=> Validation Accuracy did not improve\n", 427 | "Epoch: [4/12], Step: [100/468], Loss: 0.0357, Accuracy: 98.4375, Batch time: 0.005919\n", 428 | "Epoch: [4/12], Step: [200/468], Loss: 0.0415, Accuracy: 99.2188, Batch time: 0.011863\n", 429 | "Epoch: [4/12], Step: [300/468], Loss: 0.0079, Accuracy: 100.0000, Batch time: 0.017821\n", 430 | "Epoch: [4/12], Step: [400/468], Loss: 0.0173, Accuracy: 99.2188, Batch 
time: 0.023815\n", 431 | "=> Test set: Accuracy: 99.24%\n", 432 | "=> Validation Accuracy did not improve\n", 433 | "Epoch: [5/12], Step: [100/468], Loss: 0.0064, Accuracy: 100.0000, Batch time: 0.005956\n", 434 | "Epoch: [5/12], Step: [200/468], Loss: 0.0075, Accuracy: 100.0000, Batch time: 0.011898\n", 435 | "Epoch: [5/12], Step: [300/468], Loss: 0.0220, Accuracy: 99.2188, Batch time: 0.017835\n", 436 | "Epoch: [5/12], Step: [400/468], Loss: 0.0158, Accuracy: 99.2188, Batch time: 0.023799\n", 437 | "=> Test set: Accuracy: 99.23%\n", 438 | "=> Validation Accuracy did not improve\n", 439 | "Epoch: [6/12], Step: [100/468], Loss: 0.0175, Accuracy: 100.0000, Batch time: 0.006003\n", 440 | "Epoch: [6/12], Step: [200/468], Loss: 0.0097, Accuracy: 99.2188, Batch time: 0.011995\n", 441 | "Epoch: [6/12], Step: [300/468], Loss: 0.0392, Accuracy: 99.2188, Batch time: 0.017989\n", 442 | "Epoch: [6/12], Step: [400/468], Loss: 0.0161, Accuracy: 99.2188, Batch time: 0.023942\n", 443 | "=> Test set: Accuracy: 99.28%\n", 444 | "=> Saving a new best\n", 445 | "Epoch: [7/12], Step: [100/468], Loss: 0.0579, Accuracy: 98.4375, Batch time: 0.005972\n", 446 | "Epoch: [7/12], Step: [200/468], Loss: 0.0248, Accuracy: 99.2188, Batch time: 0.011897\n", 447 | "Epoch: [7/12], Step: [300/468], Loss: 0.0006, Accuracy: 100.0000, Batch time: 0.017830\n", 448 | "Epoch: [7/12], Step: [400/468], Loss: 0.0103, Accuracy: 100.0000, Batch time: 0.023758\n", 449 | "=> Test set: Accuracy: 99.25%\n", 450 | "=> Validation Accuracy did not improve\n", 451 | "Epoch: [8/12], Step: [100/468], Loss: 0.0637, Accuracy: 98.4375, Batch time: 0.005992\n", 452 | "Epoch: [8/12], Step: [200/468], Loss: 0.0023, Accuracy: 100.0000, Batch time: 0.011934\n", 453 | "Epoch: [8/12], Step: [300/468], Loss: 0.0076, Accuracy: 100.0000, Batch time: 0.017924\n", 454 | "Epoch: [8/12], Step: [400/468], Loss: 0.0016, Accuracy: 100.0000, Batch time: 0.023878\n", 455 | "=> Test set: Accuracy: 99.26%\n", 456 | "=> Validation Accuracy did 
not improve\n", 457 | "Epoch: [9/12], Step: [100/468], Loss: 0.0120, Accuracy: 100.0000, Batch time: 0.005922\n", 458 | "Epoch: [9/12], Step: [200/468], Loss: 0.0008, Accuracy: 100.0000, Batch time: 0.011840\n", 459 | "Epoch: [9/12], Step: [300/468], Loss: 0.0016, Accuracy: 100.0000, Batch time: 0.017767\n", 460 | "Epoch: [9/12], Step: [400/468], Loss: 0.0299, Accuracy: 99.2188, Batch time: 0.023730\n", 461 | "=> Test set: Accuracy: 99.29%\n", 462 | "=> Saving a new best\n", 463 | "Epoch: [10/12], Step: [100/468], Loss: 0.0009, Accuracy: 100.0000, Batch time: 0.006006\n", 464 | "Epoch: [10/12], Step: [200/468], Loss: 0.0075, Accuracy: 100.0000, Batch time: 0.012032\n", 465 | "Epoch: [10/12], Step: [300/468], Loss: 0.0016, Accuracy: 100.0000, Batch time: 0.018024\n", 466 | "Epoch: [10/12], Step: [400/468], Loss: 0.0007, Accuracy: 100.0000, Batch time: 0.023979\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "# Training the Model\n", 472 | "for epoch in range(num_epochs):\n", 473 | " train(model, optimizer, train_loader, test_loader, loss_fn)\n", 474 | " acc = eval(model, optimizer, test_loader)\n", 475 | " acc = 100. * acc / len(test_loader.dataset)\n", 476 | " print('=> Test set: Accuracy: {:.2f}%'.format(acc))\n", 477 | " acc = torch.FloatTensor([acc])\n", 478 | " # Get bool not ByteTensor\n", 479 | " is_best = bool(acc.numpy() > best_accuracy.numpy())\n", 480 | " # Get greater Tensor to keep track best acc\n", 481 | " best_accuracy = torch.FloatTensor(max(acc.numpy(), best_accuracy.numpy()))\n", 482 | " # Save checkpoint if is a new best\n", 483 | " save_checkpoint({\n", 484 | " 'epoch': start_epoch + epoch + 1,\n", 485 | " 'state_dict': model.state_dict(),\n", 486 | " 'best_accuracy': best_accuracy\n", 487 | " }, is_best)" 488 | ] 489 | }, 490 | { 491 | "cell_type": "markdown", 492 | "metadata": {}, 493 | "source": [ 494 | "### Resume the checkpoint after the training\n", 495 | "\n", 496 | "Let's take a look at the checkpoint just created. 
(you should see the `checkpoint.pth.tar` file)" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 13, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | "README.md keras_mnist_cnn_jupyter.ipynb\r\n", 509 | "checkpoint.pth.tar pytorch_mnist_cnn.py\r\n", 510 | "command.sh pytorch_mnist_cnn_jupyter.ipynb\r\n", 511 | "keras_mnist_cnn.py\r\n" 512 | ] 513 | } 514 | ], 515 | "source": [ 516 | "% ls" 517 | ] 518 | }, 519 | { 520 | "cell_type": "markdown", 521 | "metadata": {}, 522 | "source": [ 523 | "The Jupyter Notebook runs in the `/output` folder, so the checkpoint is here.\n", 524 | "If you want to load it, go to the **Hyper Parameters and Variables** Code Cell, replace the `resume_weights` variable in this way:\n", 525 | "`# Path to saved model weights(as hdf5)\n", 526 | "resume_weights = \"./checkpoint.pth.tar\"`, run the cell, go to the **Resume a checkpoint** Code Cell, run it, and rerun the **Training** Code Cell. That's it." 527 | ] 528 | } 529 | ], 530 | "metadata": { 531 | "kernelspec": { 532 | "display_name": "Python 3", 533 | "language": "python", 534 | "name": "python3" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.5.3" 547 | } 548 | }, 549 | "nbformat": 4, 550 | "nbformat_minor": 2 551 | } 552 | --------------------------------------------------------------------------------