├── README.md
└── main.py


/README.md:
--------------------------------------------------------------------------------
1 | Using deep learning and neural networks, we can classify skin diseases as benign or malignant, which may help doctors diagnose cancer at an earlier stage.
2 | In this tutorial, we build a skin disease classifier that tries to distinguish benign (nevus and seborrheic keratosis) from malignant (melanoma) skin diseases using only photographic images, with the TensorFlow framework in Python.
3 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import tensorflow as tf
  2 | import tensorflow_hub as hub
  3 | import matplotlib.pyplot as plt
  4 | import numpy as np
  5 | import pandas as pd
  6 | import seaborn as sns
  7 | 
  8 | # from tensorflow.keras.utils import get_file
  9 | from keras.utils import get_file
 10 | from sklearn.metrics import accuracy_score
 11 | 
 12 | from sklearn.metrics import roc_curve, auc, confusion_matrix
 13 | from imblearn.metrics import sensitivity_score, specificity_score
 14 | 
 15 | import os
 16 | import glob
 17 | import zipfile
 18 | import random
 19 | 
import ssl
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, so certificate verification is disabled for every HTTPS
# request that relies on the default context in this process (including the
# dataset downloads below). Presumably a workaround for local certificate
# errors — confirm it is still needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context
 22 | 
 23 | 
# to get consistent results after multiple runs:
# seed TensorFlow, NumPy and Python's `random` module with the same fixed value
tf.random.set_seed(7)
np.random.seed(7)
random.seed(7)

# position in this list == integer label: 0 for benign, 1 for malignant
class_names = ["benign", "malignant"]
 31 | 
 32 | # C:\Users\Visitor\data if using VSCode
 33 | # C:\Users\Visitor\PycharmProjects\pythonProject\data if using PyCharm
 34 | def download_and_extract_dataset():
 35 |     # dataset from https://github.com/udacity/dermatologist-ai
 36 |     # 5.3GB
 37 |     train_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/train.zip"
 38 |     # 824.5MB
 39 |     valid_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/valid.zip"
 40 |     # 5.1GB
 41 |     test_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/test.zip"
 42 |     for i, download_link in enumerate([valid_url, train_url, test_url]):
 43 |         temp_file = f"temp{i}.zip"
 44 |         data_dir = get_file(origin=download_link, fname=os.path.join(os.getcwd(), temp_file))
 45 |         print("Extracting", download_link)
 46 |         with zipfile.ZipFile(data_dir, "r") as z:
 47 |             z.extractall("data")
 48 |         # remove the temp file
 49 |         os.remove(temp_file)
 50 | 
 51 | 
 52 | # comment the below line if you already downloaded the dataset
 53 | download_and_extract_dataset()
 54 | 
 55 | 
 56 | # preparing data
 57 | # generate CSV metadata file to read img paths and labels from it
 58 | def generate_csv(folder, label2int):
 59 |     folder_name = os.path.basename(folder)
 60 |     labels = list(label2int)
 61 |     # generate CSV file
 62 |     df = pd.DataFrame(columns=["filepath", "label"])
 63 |     i = 0
 64 |     for label in labels:
 65 |         print("Reading", os.path.join(folder, label, "*"))
 66 |         for filepath in glob.glob(os.path.join(folder, label, "*")):
 67 |             df.loc[i] = [filepath, label2int[label]]
 68 |             i += 1
 69 |     output_file = f"{folder_name}.csv"
 70 |     print("Saving", output_file)
 71 |     df.to_csv(output_file)
 72 | 
 73 | 
 74 | # generate CSV files for all data portions, labeling nevus and seborrheic keratosis
 75 | # as 0 (benign), and melanoma as 1 (malignant)
 76 | # you should replace "data" path to your extracted dataset path
 77 | # don't replace if you used download_and_extract_dataset() function
 78 | generate_csv("data/train", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})
 79 | generate_csv("data/valid", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})
 80 | generate_csv("data/test", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})
 81 | 
 82 | # loading data
 83 | train_metadata_filename = "train.csv"
 84 | valid_metadata_filename = "valid.csv"
 85 | # load CSV files as DataFrames
 86 | df_train = pd.read_csv(train_metadata_filename)
 87 | df_valid = pd.read_csv(valid_metadata_filename)
 88 | n_training_samples = len(df_train)
 89 | n_validation_samples = len(df_valid)
 90 | print("Number of training samples:", n_training_samples)
 91 | print("Number of validation samples:", n_validation_samples)
 92 | train_ds = tf.data.Dataset.from_tensor_slices((df_train["filepath"], df_train["label"]))
 93 | valid_ds = tf.data.Dataset.from_tensor_slices((df_valid["filepath"], df_valid["label"]))
 94 | 
 95 | 
 96 | # preprocess data
 97 | def decode_img(img):
 98 |     # convert the compressed string to a 3D uint8 tensor
 99 |     img = tf.image.decode_jpeg(img, channels=3)
100 |     # Use `convert_image_dtype` to convert to floats in the [0,1] range.
101 |     img = tf.image.convert_image_dtype(img, tf.float32)
102 |     # resize the image to the desired size.
103 |     return tf.image.resize(img, [299, 299])
104 | 
105 | 
def process_path(filepath, label):
    """Map a (filepath, label) pair to a (decoded image, label) pair.

    The file at `filepath` is read as a raw byte string and decoded with
    `decode_img`; `label` is passed through unchanged.
    """
    raw_bytes = tf.io.read_file(filepath)
    return decode_img(raw_bytes), label
111 | 
112 | 
# replace every (filepath, label) element with (decoded image, label)
valid_ds = valid_ds.map(process_path)
train_ds = train_ds.map(process_path)
# the test dataset is created and mapped later, in the evaluation section
# sanity check: print the shape and label of the first training example
for image, label in train_ds.take(1):
    print("Image shape:", image.shape)
    print("Label:", label.numpy())

# training parameters
# (both values are also embedded in the model/checkpoint name further down)
batch_size = 64
optimizer = "rmsprop"
123 | 
124 | 
def prepare_for_training(ds, cache=True, batch_size=64, shuffle_buffer_size=1000):
    """Cache, shuffle, repeat, batch and prefetch `ds` for use with `fit`.

    Args:
        ds: Dataset of (image, label) elements.
        cache: Falsy disables caching. A non-empty string is passed to
            `ds.cache(...)` as its argument; any other truthy value calls
            `ds.cache()` without arguments.
        batch_size: Number of elements per batch.
        shuffle_buffer_size: Buffer size handed to `ds.shuffle`.

    Returns:
        The transformed dataset. It repeats forever, so the caller has to
        bound each epoch with an explicit number of steps.
    """
    if cache:
        # only a string is forwarded to cache(); other truthy values mean
        # "cache with no argument"
        cache_args = (cache,) if isinstance(cache, str) else ()
        ds = ds.cache(*cache_args)
    # shuffle -> repeat forever -> split into batches -> prefetch, where
    # `prefetch` lets the dataset fetch batches in the background while the
    # model is training
    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)
        .repeat()
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )
141 | 
142 | 
# Build the input pipelines. Because `cache` is a string here,
# prepare_for_training forwards it to ds.cache(<that string>) instead of
# calling ds.cache() with no argument.
valid_ds = prepare_for_training(valid_ds, batch_size=batch_size, cache="valid-cached-data")
train_ds = prepare_for_training(train_ds, batch_size=batch_size, cache="train-cached-data")

# pull a single batch from the validation pipeline (displayed by show_batch below)
batch = next(iter(valid_ds))
147 | 
148 | 
def show_batch(batch):
    """Plot up to the first 25 images of a batch in a 5x5 grid.

    Args:
        batch: An (images, labels) pair. `labels[n]` must expose `.numpy()`
            and yield an integer index into `class_names`.

    Each subplot is titled with the title-cased class name of its label.
    Batches with fewer than 25 images are plotted without raising; the
    remaining grid cells are simply left empty.
    """
    images, labels = batch[0], batch[1]
    plt.figure(figsize=(12, 12))
    # never index past the end of a short batch
    for n in range(min(25, len(images))):
        plt.subplot(5, 5, n + 1)
        plt.imshow(images[n])
        plt.title(class_names[labels[n].numpy()].title())
        plt.axis('off')
156 | 
157 | 
# display the sample batch pulled from the validation pipeline
show_batch(batch)

# building the model
# InceptionV3 feature-vector module with pre-trained weights (kept frozen via
# trainable=False), followed by a single sigmoid unit for the binary decision
module_url = "https://tfhub.dev/google/tf2-preview/inception_v3/feature_vector/4"
m = tf.keras.Sequential()
m.add(hub.KerasLayer(module_url, output_shape=[2048], trainable=False))
m.add(tf.keras.layers.Dense(1, activation="sigmoid"))

m.build([None, 299, 299, 3])
m.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
m.summary()

# the run name encodes the batch size and optimizer; it is reused for the
# TensorBoard log directory and as the checkpoint filename prefix
model_name = f"benign-vs-malignant_{batch_size}_{optimizer}"
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=os.path.join("logs", model_name))
# save_best_only=True: a checkpoint is written only when we reach better weights
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
    model_name + "_{val_loss:.3f}.h5",
    save_best_only=True,
    verbose=1,
)

# both datasets repeat forever, so every epoch is bounded by explicit step counts
history = m.fit(
    train_ds,
    validation_data=valid_ds,
    steps_per_epoch=n_training_samples // batch_size,
    validation_steps=n_validation_samples // batch_size,
    epochs=100,
    verbose=1,
    callbacks=[tensorboard, modelcheckpoint],
)
182 | 
# evaluation
# load testing set
# "test.csv" is the metadata file written earlier by generate_csv("data/test", ...)
test_metadata_filename = "test.csv"
df_test = pd.read_csv(test_metadata_filename)
n_testing_samples = len(df_test)
print("Number of testing samples:", n_testing_samples)
# dataset of (filepath, label) pairs; images are decoded once process_path is mapped over it
test_ds = tf.data.Dataset.from_tensor_slices((df_test["filepath"], df_test["label"]))
190 | 
191 | 
def prepare_for_testing(ds, cache=True, shuffle_buffer_size=1000):
    """Optionally cache `ds`, then shuffle it; no repeat and no batching.

    Args:
        ds: Dataset to prepare.
        cache: Falsy disables caching. A non-empty string is passed to
            `ds.cache(...)` as its argument; any other truthy value calls
            `ds.cache()` without arguments.
        shuffle_buffer_size: Buffer size handed to `ds.shuffle`.

    Returns:
        The cached (if requested) and shuffled dataset.
    """
    if cache:
        # only a string is forwarded to cache(); other truthy values mean
        # "cache with no argument"
        cache_args = (cache,) if isinstance(cache, str) else ()
        ds = ds.cache(*cache_args)
    return ds.shuffle(buffer_size=shuffle_buffer_size)
200 | 
201 | 
test_ds = test_ds.map(process_path)
test_ds = prepare_for_testing(test_ds, cache="test-cached-data")

# convert testing set to numpy arrays to fit in memory (don't do that when
# the testing set is too large)
# images come out of decode_img as float32, so storing them as float32 loses
# nothing and takes half the memory of NumPy's default float64
y_test = np.zeros((n_testing_samples,))
X_test = np.zeros((n_testing_samples, 299, 299, 3), dtype=np.float32)
for i, (img, label) in enumerate(test_ds.take(n_testing_samples)):
    X_test[i] = img
    y_test[i] = label.numpy()

print("y_test.shape:", y_test.shape)


def _checkpoint_val_loss(path):
    """Return the validation loss embedded in a checkpoint filename.

    Checkpoints are saved as ``<model_name>_<val_loss>.h5``; a name that
    does not end in a parsable number sorts last (infinite loss).
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return float(stem.rsplit("_", 1)[1])
    except (IndexError, ValueError):
        return float("inf")


# load the weights with the least loss
# the validation loss reached by a run is not known in advance, so pick the
# best checkpoint from the files on disk instead of hard-coding its name
checkpoint_paths = glob.glob(f"{model_name}_*.h5")
if not checkpoint_paths:
    raise FileNotFoundError(f"No checkpoint matching {model_name}_*.h5 was found")
best_checkpoint = min(checkpoint_paths, key=_checkpoint_val_loss)
print("Loading weights from", best_checkpoint)
m.load_weights(best_checkpoint)

print("Evaluating the model...")
loss, accuracy = m.evaluate(X_test, y_test, verbose=0)
print("Loss:", loss, "  Accuracy:", accuracy)
222 | 
223 | 
def get_predictions(threshold=None):
    """
    Returns predictions for binary classification given `threshold`
    For instance, if threshold is 0.3, then it'll output 1 (malignant) for that sample if
    the probability of 1 is 30% or more (instead of 50%)

    Args:
        threshold: Minimum predicted probability for a sample to be labelled
            1 (malignant). `None` means the default of 0.5. An explicit 0 is
            honoured as 0 (every sample is labelled malignant) rather than
            being replaced by the default.

    Returns:
        A 1-D float array with one entry per row of `X_test`: 1.0 where the
        model output is at or above the threshold, 0.0 (benign) otherwise.
    """
    # `is None`, not truthiness: 0 / 0.0 are valid thresholds
    if threshold is None:
        threshold = 0.5
    # the model emits one sigmoid probability per sample, in column 0
    probabilities = np.asarray(m.predict(X_test))[:, 0]
    return (probabilities >= threshold).astype(np.float64)
240 | 
241 | 
threshold = 0.23
# get predictions with 23% threshold
# which means if the model is 23% sure or more that a sample is malignant,
# it's assigned as malignant, otherwise it's benign
# (y_pred therefore holds hard 0/1 labels, not probabilities)
y_pred = get_predictions(threshold)
# accuracy of those thresholded labels against the true test labels
accuracy_after = accuracy_score(y_test, y_pred)
print("Accuracy after setting the threshold:", accuracy_after)
249 | 
250 | 
def plot_confusion_matrix(y_test, y_pred):
    """Print and plot the row-normalised confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Every row of the matrix is divided by its own total, so each cell is the
    fraction of that true class which received the given prediction. The
    normalised matrix is printed and then shown as an annotated heatmap.
    """
    counts = confusion_matrix(y_test, y_pred)
    # normalise each row by its own total
    row_totals = counts.sum(axis=1, keepdims=True)
    cmn = counts.astype(float) / row_totals
    # print it
    print(cmn)
    predicted_names = [f"pred_{c}" for c in class_names]
    actual_names = [f"true_{c}" for c in class_names]
    # open a fresh 10x10 figure; the heatmap is drawn onto its axes
    plt.subplots(figsize=(10, 10))
    sns.heatmap(
        cmn,
        annot=True,
        fmt='.2f',
        xticklabels=predicted_names,
        yticklabels=actual_names,
        cmap="Blues",
    )
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    # plot the resulting confusion matrix
    plt.show()
267 | 
268 | 
# draw the row-normalised confusion matrix for the thresholded predictions
plot_confusion_matrix(y_test, y_pred)

# sensitivity / specificity from imbalanced-learn, computed on the same hard labels
# NOTE(review): presumably scored with label 1 (malignant) as the positive
# class, i.e. the library default — confirm against the imblearn documentation
sensitivity = sensitivity_score(y_test, y_pred)
specificity = specificity_score(y_test, y_pred)

print("Melanoma Sensitivity:", sensitivity)
print("Melanoma Specificity:", specificity)
276 | 
277 | 
def plot_roc_auc(y_true, y_pred):
    """
    Plot the ROC curve for `y_pred` against `y_true` and print its AUC.

    Args:
        y_true: True binary labels.
        y_pred: Scores or predictions handed to `roc_curve`.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred)
    # area under the ROC curve
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(f"ROC AUC: {roc_auc:.3f}")

    # draw the curve on a fresh figure, with the area shown in the legend
    plt.figure()
    curve_label = 'ROC curve (area = {f:.2f})'.format(f=roc_auc)
    plt.plot(false_pos_rate, true_pos_rate, color="blue", lw=2, label=curve_label)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves')
    plt.legend(loc="lower right")
    plt.show()
299 | 
300 | 
# A ROC curve is traced by sweeping the decision threshold over continuous
# scores, so pass the model's raw probabilities. The already-thresholded 0/1
# labels in y_pred would reduce the curve to a single operating point.
plot_roc_auc(y_test, m.predict(X_test).ravel())
302 | 
303 | 
def predict_image_class(img_path, model, threshold=0.5):
    """Predict whether a single image is benign or malignant, and display it.

    Args:
        img_path: Path of the image file to classify.
        model: Model whose `predict` returns one malignancy probability for
            a batch containing a single 299x299 RGB image.
        threshold: Minimum probability at which the image is reported as
            malignant.

    Prints the verdict with its confidence and shows the image. Returns None.
    """
    img = tf.keras.preprocessing.image.load_img(img_path, target_size=(299, 299))
    img = tf.keras.preprocessing.image.img_to_array(img)
    # Scale pixels from [0, 255] to [0, 1]. This matches decode_img, which
    # feeds the model [0, 1] floats during training and evaluation; rescaling
    # to any other range here would show the model inputs it never saw.
    img = img / 255.0
    img = tf.expand_dims(img, 0)  # Create a batch
    predictions = model.predict(img)
    score = predictions.squeeze()
    if score >= threshold:
        print(f"This image is {100 * score:.2f}% malignant.")
    else:
        print(f"This image is {100 * (1 - score):.2f}% benign.")
    # pixel values are in [0, 1], which imshow renders without clipping
    plt.imshow(img[0])
    plt.axis('off')
    plt.show()
320 | 
321 | 
# classify one sample image from each test folder with the trained model
for sample_path in (
    "data/test/melanoma/ISIC_0013767.jpg",
    "data/test/nevus/ISIC_0012092.jpg",
    "data/test/seborrheic_keratosis/ISIC_0012136.jpg",
):
    predict_image_class(sample_path, m)
327 | 
328 | 
329 | 
330 | 
331 | 


--------------------------------------------------------------------------------