"""Skin disease classifier: benign vs. malignant.

Using deep learning and neural networks, we'll be able to classify benign and
malignant skin diseases, which may help the doctor diagnose cancer at an
earlier stage.

In this tutorial, we will make a skin disease classifier that tries to
distinguish between benign (nevus and seborrheic keratosis) and malignant
(melanoma) skin diseases from only photographic images using the TensorFlow
framework in Python.
"""
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# from tensorflow.keras.utils import get_file
from keras.utils import get_file
from sklearn.metrics import accuracy_score

from sklearn.metrics import roc_curve, auc, confusion_matrix
from imblearn.metrics import sensitivity_score, specificity_score

import os
import glob
import zipfile
import random

import ssl

# NOTE(review): this disables HTTPS certificate verification for every
# download made by this process. It is kept because the script relies on it
# as a workaround, but it should be removed once the local certificate store
# is fixed.
ssl._create_default_https_context = ssl._create_unverified_context


# to get consistent results after multiple runs
tf.random.set_seed(7)
np.random.seed(7)
random.seed(7)

# 0 for benign, 1 for malignant
class_names = ["benign", "malignant"]


# C:\Users\Visitor\data if using VSCode
# C:\Users\Visitor\PycharmProjects\pythonProject\data if using PyCharm
def download_and_extract_dataset():
    """Download the three dataset archives and extract them into ``data``.

    Each archive is fetched with ``get_file`` into the current working
    directory, extracted into the ``data`` folder and then deleted.
    """
    # dataset from https://github.com/udacity/dermatologist-ai
    # 5.3GB
    train_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/train.zip"
    # 824.5MB
    valid_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/valid.zip"
    # 5.1GB
    test_url = "https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/test.zip"
    for i, download_link in enumerate([valid_url, train_url, test_url]):
        temp_file = f"temp{i}.zip"
        archive_path = get_file(origin=download_link, fname=os.path.join(os.getcwd(), temp_file))
        print("Extracting", download_link)
        with zipfile.ZipFile(archive_path, "r") as z:
            z.extractall("data")
        # remove the archive at the path get_file actually wrote to, rather
        # than assuming it ended up next to the script
        os.remove(archive_path)


# comment the below line if you already downloaded the dataset
download_and_extract_dataset()


# preparing data
# generate CSV metadata file to read img paths and labels from it
def generate_csv(folder, label2int):
    """Write ``<folder name>.csv`` listing every file in ``folder`` with its label.

    Args:
        folder: Directory containing one sub-directory per label name.
        label2int: Mapping from sub-directory (label) name to the integer
            class written to the ``label`` column.

    The CSV is written to the current working directory with the columns
    ``filepath`` and ``label`` (plus the DataFrame index).
    """
    # normpath drops a trailing separator so "data/train/" still yields "train"
    folder_name = os.path.basename(os.path.normpath(folder))
    rows = []
    for label, label_id in label2int.items():
        pattern = os.path.join(folder, label, "*")
        print("Reading", pattern)
        # sorted: glob order is filesystem dependent, and the rest of the
        # script is seeded to be reproducible
        rows.extend([filepath, label_id] for filepath in sorted(glob.glob(pattern)))
    # build the frame once instead of appending row by row with df.loc
    df = pd.DataFrame(rows, columns=["filepath", "label"])
    output_file = f"{folder_name}.csv"
    print("Saving", output_file)
    df.to_csv(output_file)


# generate CSV files for all data portions, labeling nevus and seborrheic keratosis
# as 0 (benign), and melanoma as 1 (malignant)
# you should replace "data" path to your extracted dataset path
# don't replace if you used download_and_extract_dataset() function
generate_csv("data/train", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})
generate_csv("data/valid", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})
generate_csv("data/test", {"nevus": 0, "seborrheic_keratosis": 0, "melanoma": 1})

# loading data
train_metadata_filename = "train.csv"
valid_metadata_filename = "valid.csv"
# load CSV files as DataFrames
df_train = pd.read_csv(train_metadata_filename)
df_valid = pd.read_csv(valid_metadata_filename)
n_training_samples = len(df_train)
n_validation_samples = len(df_valid)
print("Number of training samples:", n_training_samples)
print("Number of validation samples:", n_validation_samples)
train_ds = tf.data.Dataset.from_tensor_slices((df_train["filepath"], df_train["label"]))
valid_ds = tf.data.Dataset.from_tensor_slices((df_valid["filepath"], df_valid["label"]))


# preprocess data
def decode_img(img):
    """Decode JPEG bytes into a float32 image resized to 299x299."""
    # convert the compressed string to a 3D uint8 tensor
    img = tf.image.decode_jpeg(img, channels=3)
    # Use `convert_image_dtype` to convert to floats in the [0,1] range.
    img = tf.image.convert_image_dtype(img, tf.float32)
    # resize the image to the desired size.
    return tf.image.resize(img, [299, 299])


def process_path(filepath, label):
    """Read the file at ``filepath`` and return ``(decoded image, label)``."""
    # load the raw data from the file as a string
    img = tf.io.read_file(filepath)
    img = decode_img(img)
    return img, label


valid_ds = valid_ds.map(process_path)
train_ds = train_ds.map(process_path)
for image, label in train_ds.take(1):
    print("Image shape:", image.shape)
    print("Label:", label.numpy())

# training parameters
batch_size = 64
optimizer = "rmsprop"


def prepare_for_training(ds, cache=True, batch_size=64, shuffle_buffer_size=1000):
    """Cache, shuffle, repeat, batch and prefetch ``ds`` for ``fit``.

    Args:
        ds: Dataset of ``(image, label)`` pairs.
        cache: ``True`` to cache in memory, a string to cache to that file,
            or a falsy value to skip caching.
        batch_size: Number of samples per batch.
        shuffle_buffer_size: Buffer size passed to ``shuffle``.

    Returns:
        The transformed dataset; it repeats forever, so the caller must
        bound it with a step count.
    """
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()
    # shuffle the dataset
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    # Repeat forever
    ds = ds.repeat()
    # split to batches
    ds = ds.batch(batch_size)
    # `prefetch` lets the dataset fetch batches in the background while the model
    # is training.
    ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return ds


valid_ds = prepare_for_training(valid_ds, batch_size=batch_size, cache="valid-cached-data")
train_ds = prepare_for_training(train_ds, batch_size=batch_size, cache="train-cached-data")

batch = next(iter(valid_ds))


def show_batch(batch):
    """Draw the first 25 images of ``batch`` in a 5x5 grid titled by class."""
    plt.figure(figsize=(12, 12))
    for n in range(25):
        ax = plt.subplot(5, 5, n + 1)
        plt.imshow(batch[0][n])
        plt.title(class_names[batch[1][n].numpy()].title())
        plt.axis('off')


show_batch(batch)

# building the model
# InceptionV3 model & pre-trained weights
module_url = "https://tfhub.dev/google/tf2-preview/inception_v3/feature_vector/4"
m = tf.keras.Sequential([
    hub.KerasLayer(module_url, output_shape=[2048], trainable=False),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

m.build([None, 299, 299, 3])
m.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
m.summary()

model_name = f"benign-vs-malignant_{batch_size}_{optimizer}"
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=os.path.join("logs", model_name))
# saves model checkpoint whenever we reach better weights
modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(model_name + "_{val_loss:.3f}.h5", save_best_only=True, verbose=1)

history = m.fit(train_ds, validation_data=valid_ds,
                steps_per_epoch=n_training_samples // batch_size,
                validation_steps=n_validation_samples // batch_size, verbose=1, epochs=100,
                callbacks=[tensorboard, modelcheckpoint])

# evaluation
# load testing set
test_metadata_filename = "test.csv"
df_test = pd.read_csv(test_metadata_filename)
n_testing_samples = len(df_test)
print("Number of testing samples:", n_testing_samples)
test_ds = tf.data.Dataset.from_tensor_slices((df_test["filepath"], df_test["label"]))


def prepare_for_testing(ds, cache=True, shuffle_buffer_size=1000):
    """Cache and shuffle ``ds`` for evaluation (no repeat, no batching).

    Args:
        ds: Dataset of ``(image, label)`` pairs.
        cache: ``True`` to cache in memory, a string to cache to that file,
            or a falsy value to skip caching.
        shuffle_buffer_size: Buffer size passed to ``shuffle``.

    Returns:
        The transformed dataset.
    """
    if cache:
        if isinstance(cache, str):
            ds = ds.cache(cache)
        else:
            ds = ds.cache()
    ds = ds.shuffle(buffer_size=shuffle_buffer_size)
    return ds


test_ds = test_ds.map(process_path)
test_ds = prepare_for_testing(test_ds, cache="test-cached-data")

# convert testing set to numpy array to fit in memory (don't do that when testing
# set is too large)
y_test = np.zeros((n_testing_samples,))
# float32 matches the decoded images and halves the memory of the default float64
X_test = np.zeros((n_testing_samples, 299, 299, 3), dtype=np.float32)
for i, (img, label) in enumerate(test_ds.take(n_testing_samples)):
    X_test[i] = img
    y_test[i] = label.numpy()

print("y_test.shape:", y_test.shape)


def _find_best_checkpoint(prefix):
    """Return the ``<prefix>_<val_loss>.h5`` file with the lowest loss, or None.

    The loss is read back from the file name written by the checkpoint
    callback; files whose suffix is not a number are ignored.
    """
    def val_loss(path):
        stem = os.path.splitext(os.path.basename(path))[0]
        try:
            return float(stem[len(prefix) + 1:])
        except ValueError:
            return float("inf")

    candidates = [
        path for path in glob.glob(f"{glob.escape(prefix)}_*.h5")
        if val_loss(path) != float("inf")
    ]
    return min(candidates, key=val_loss) if candidates else None


# load the weights with the least loss; the file name depends on the loss
# reached during training, so it is looked up instead of hard-coded
best_checkpoint = _find_best_checkpoint(model_name)
if best_checkpoint is None:
    print("No saved checkpoint found for", model_name, "- keeping the current weights")
else:
    print("Loading weights from", best_checkpoint)
    m.load_weights(best_checkpoint)

print("Evaluating the model...")
loss, accuracy = m.evaluate(X_test, y_test, verbose=0)
print("Loss:", loss, " Accuracy:", accuracy)


def get_predictions(threshold=None):
    """
    Returns predictions for binary classification given `threshold`
    For instance, if threshold is 0.3, then it'll output 1 (malignant) for that sample if
    the probability of 1 is 30% or more (instead of 50%)

    `threshold=None` means the default of 0.5; an explicit 0 is honoured.
    The result is a float array of 0s and 1s with one entry per test sample.
    """
    if threshold is None:
        threshold = 0.5
    y_pred = m.predict(X_test)
    # column 0 is the model's single sigmoid output: the melanoma probability
    return (y_pred[:, 0] >= threshold).astype(np.float64)


threshold = 0.23
# get predictions with 23% threshold
# which means if the model is 23% sure or more that is malignant,
# it's assigned as
# malignant, otherwise it's benign
y_pred = get_predictions(threshold)
accuracy_after = accuracy_score(y_test, y_pred)
print("Accuracy after setting the threshold:", accuracy_after)


def plot_confusion_matrix(y_test, y_pred):
    """Print and plot the row-normalised confusion matrix.

    Args:
        y_test: True class labels.
        y_pred: Predicted class labels.
    """
    cmn = confusion_matrix(y_test, y_pred)
    # Normalise each row so it sums to 1 (fractions of the true class)
    cmn = cmn.astype('float') / cmn.sum(axis=1)[:, np.newaxis]
    # print it
    print(cmn)
    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(cmn, annot=True, fmt='.2f',
                xticklabels=[f"pred_{c}" for c in class_names],
                yticklabels=[f"true_{c}" for c in class_names],
                cmap="Blues"
                )
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    # plot the resulting confusion matrix
    plt.show()


plot_confusion_matrix(y_test, y_pred)

sensitivity = sensitivity_score(y_test, y_pred)
specificity = specificity_score(y_test, y_pred)

print("Melanoma Sensitivity:", sensitivity)
print("Melanoma Specificity:", specificity)


def plot_roc_auc(y_true, y_pred):
    """
    This function plots the ROC curves and provides the scores.

    `y_pred` should hold continuous scores (e.g. predicted probabilities);
    passing thresholded 0/1 labels collapses the curve to a single
    operating point.
    """
    # prepare for figure
    plt.figure()
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    # obtain ROC AUC
    roc_auc = auc(fpr, tpr)
    # print score
    print(f"ROC AUC: {roc_auc:.3f}")
    # plot ROC curve
    plt.plot(fpr, tpr, color="blue", lw=2,
             label=f'ROC curve (area = {roc_auc:.2f})')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curves')
    plt.legend(loc="lower right")
    plt.show()


# the ROC curve sweeps over thresholds itself, so give it the raw sigmoid
# scores rather than the labels already cut at `threshold`
y_score = m.predict(X_test).ravel()
plot_roc_auc(y_test, y_score)


# a function that, given a model, predicts the class of the image
def predict_image_class(img_path, model, threshold=0.5):
    """Print the predicted class of the image at ``img_path`` and display it.

    Args:
        img_path: Path of the image file to classify.
        model: Model whose ``predict`` returns a single malignant
            probability per image.
        threshold: Minimum probability for the image to be reported as
            malignant.
    """
    img = tf.keras.preprocessing.image.load_img(img_path, target_size=(299, 299))
    # img_to_array yields values in [0, 255]; scale to [0, 1], the same range
    # convert_image_dtype produces for the training images
    img = tf.keras.preprocessing.image.img_to_array(img) / 255.0
    img = tf.expand_dims(img, 0)  # Create a batch
    img = tf.image.convert_image_dtype(img, tf.float32)
    predictions = model.predict(img)
    score = predictions.squeeze()
    if score >= threshold:
        print(f"This image is {100 * score:.2f}% malignant.")
    else:
        print(f"This image is {100 * (1 - score):.2f}% benign.")
    plt.imshow(img[0])
    plt.axis('off')
    plt.show()


predict_image_class("data/test/melanoma/ISIC_0013767.jpg", m)

predict_image_class("data/test/nevus/ISIC_0012092.jpg", m)

predict_image_class("data/test/seborrheic_keratosis/ISIC_0012136.jpg", m)