├── Architecture_plot.py
├── __pycache__
    ├── data_input.cpython-35.pyc
    ├── model.cpython-35.pyc
    └── preparation.cpython-35.pyc
├── data_analysis.py
├── data_input.py
├── main.py
├── model.py
├── model
    └── model-0416-1.h5
├── pic
    ├── graph_large_attrs_key=_too_large_attrs&limit_attr_size=1024&run=.png
    └── model.png
├── post_process.py
├── preparation.py
├── readme.md
├── result
    └── sub-dsbowl2018_post_process_0416.zip
├── test.py
├── train.py
└── write_result.py


/Architecture_plot.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | """
 4 | python=3.5.2
 5 | """
 6 | 
 7 | from keras.utils import plot_model
 8 | 
 9 | from model import get_unet, dice_coef
10 | 
11 | 
# Build the U-Net using get_unet()'s default arguments.
model = get_unet()
print("We finish building the model")

# Render the model graph to 'unet_model.png'; show_shapes=True is passed so
# the diagram includes layer shapes.
plot_model(model, to_file='unet_model.png', show_shapes=True)


--------------------------------------------------------------------------------
/__pycache__/data_input.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/__pycache__/data_input.cpython-35.pyc


--------------------------------------------------------------------------------
/__pycache__/model.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/__pycache__/model.cpython-35.pyc


--------------------------------------------------------------------------------
/__pycache__/preparation.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/__pycache__/preparation.cpython-35.pyc


--------------------------------------------------------------------------------
/data_analysis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | """
4 | python=3.5.2
5 | """
6 | 
7 | 


--------------------------------------------------------------------------------
/data_input.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | """
  4 | python=3.5.2
  5 | """
  6 | 
  7 | import os
  8 | import random
  9 | import sys
 10 | import warnings
 11 | import numpy as np
 12 | from itertools import chain
 13 | import matplotlib.pyplot as plt
 14 | import skimage
 15 | from skimage.io import imread, imshow, imread_collection, concatenate_images
 16 | from skimage.transform import resize
 17 | from skimage.morphology import label
 18 | from keras.utils import Progbar
 19 | import scipy
 20 | warnings.filterwarnings('ignore', category=UserWarning, module='skimage')
 21 | from preparation import get_contour, split_overlay_mask_by_contour
 22 | 
 23 | # Setting seed for reproducability
 24 | seed = 42
 25 | random.seed = seed
 26 | np.random.seed = seed
 27 | 
# Data Path
# The first assignment is immediately overridden by the second one, so only
# the '/home/aaron/...' location is actually used.
data_root = 'E:/project_data/nucleus_detection/'
data_root = '/home/aaron/project_data/nucleus_detection/'
# data_root already ends with '/', so the resulting paths contain '//'.
TRAIN_PATH = data_root + '/stage1_train/'
TEST_PATH = data_root + '/stage2_test/'
INPUT_PATH = data_root + '/input/'

# Get train and test IDs: the names of the immediate sub-directories of each
# data folder. This runs at import time; next() raises StopIteration when
# os.walk yields nothing (e.g. the directory does not exist).
train_ids = next(os.walk(TRAIN_PATH))[1]
test_ids = next(os.walk(TEST_PATH))[1]
 38 | 
 39 | 
 40 | # Function read train images and mask return as nump array
 41 | def read_train_data(IMG_WIDTH=256, IMG_HEIGHT=256, IMG_CHANNELS=3):
 42 |     X_train = np.zeros((len(train_ids), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
 43 |     Y_train = np.zeros((len(train_ids), IMG_HEIGHT, IMG_WIDTH, 1), dtype=np.bool)
 44 |     print('Getting and resizing train images and masks ... ')
 45 |     sys.stdout.flush()
 46 |     if os.path.isfile(INPUT_PATH + "train_img.npy") and os.path.isfile(INPUT_PATH + "train_mask.npy"):
 47 |         print("Train file loaded from memory")
 48 |         X_train = np.load(INPUT_PATH + "train_img.npy")
 49 |         Y_train = np.load(INPUT_PATH + "train_mask.npy")
 50 |         return X_train, Y_train
 51 |     a = Progbar(len(train_ids))
 52 |     for n, id_ in enumerate(train_ids):
 53 |         path = TRAIN_PATH + id_
 54 |         img = imread(path + '/images/' + id_ + '.png')
 55 |         if len(img.shape) == 2:
 56 |             img = skimage.color.gray2rgb(img)
 57 |         img = img[:, :, :IMG_CHANNELS]
 58 |         img = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)
 59 |         X_train[n] = img
 60 |         masks, masks_counters = [], []
 61 |         for mask_file in next(os.walk(path + '/masks/'))[2]:
 62 |             mask_ = imread(path + '/masks/' + mask_file)
 63 |             masks.append(mask_)
 64 |             mask_contour = get_contour(mask_)
 65 |             masks_counters.append(mask_contour)
 66 |         masks = np.sum(np.array(masks), axis=0)
 67 |         masks_counters = np.sum(np.array(masks_counters), axis=0)
 68 |         split_masks = split_overlay_mask_by_contour(masks, masks_counters)
 69 |         Y_train[n] = np.expand_dims(resize(split_masks, (IMG_HEIGHT, IMG_WIDTH), mode='constant',
 70 |                                                                             preserve_range=True), axis=-1)
 71 |         a.update(n)
 72 | 
 73 |     np.save(INPUT_PATH + "train_img", X_train)
 74 |     np.save(INPUT_PATH + "train_mask", Y_train)
 75 |     return X_train, Y_train
 76 | 
 77 | 
 78 | # Function to read test images and return as numpy array
 79 | def read_test_data(IMG_WIDTH=256, IMG_HEIGHT=256, IMG_CHANNELS=3):
 80 |     X_test = np.zeros((len(test_ids), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
 81 |     sizes_test = []
 82 |     print('\nGetting and resizing test images ... ')
 83 |     sys.stdout.flush()
 84 |     if os.path.isfile(INPUT_PATH + "test_img.npy") and os.path.isfile(INPUT_PATH + "test_size.npy"):
 85 |         print("Test file loaded from memory")
 86 |         X_test = np.load(INPUT_PATH + "test_img.npy")
 87 |         sizes_test = np.load(INPUT_PATH + "test_size.npy")
 88 |         return X_test, sizes_test
 89 |     b = Progbar(len(test_ids))
 90 |     for n, id_ in enumerate(test_ids):
 91 |         path = TEST_PATH + id_
 92 |         img = imread(path + '/images/' + id_ + '.png')
 93 |         if len(img.shape) == 2:
 94 |             img = skimage.color.gray2rgb(img)
 95 |         img = img[:, :, :IMG_CHANNELS]
 96 |         sizes_test.append([img.shape[0], img.shape[1]])
 97 |         img = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)
 98 |         X_test[n] = img
 99 |         b.update(n)
100 |     np.save(INPUT_PATH + "test_img", X_test)
101 |     np.save(INPUT_PATH + "test_size", sizes_test)
102 |     return X_test, sizes_test
103 | 
104 | 
# Run-length encoding stolen from https://www.kaggle.com/rakhlin/fast-run-length-encoding-python
def rle_encoding(x):
    """Run-length encode the pixels of ``x`` that equal 1.

    The array is transposed and flattened before scanning. The result is a
    flat list ``[start_1, length_1, start_2, length_2, ...]`` in which the
    start positions are 1-based indices into that flattened array.
    """
    hit_positions = np.where(x.T.flatten() == 1)[0]
    encoded = []
    previous = -2  # sentinel: the first hit always opens a new run
    for position in hit_positions:
        if position > previous + 1:
            # Gap since the last hit: open a new run with a zero length.
            encoded.append(position + 1)
            encoded.append(0)
        encoded[-1] = encoded[-1] + 1
        previous = position
    return encoded
115 | 
116 | 
def prob_to_rles(x, cutoff=0.5):
    """Yield one run-length encoding per labelled region of ``x > cutoff``.

    ``x`` is thresholded at ``cutoff``, the result is passed to ``label`` and
    each label value from 1 up to the maximum is encoded with ``rle_encoding``.
    """
    labelled = label(x > cutoff)
    region_count = labelled.max()
    for region_id in range(1, region_count + 1):
        region_mask = labelled == region_id
        yield rle_encoding(region_mask)
121 | 
122 | 
# Iterate over the test IDs and generate run-length encodings for each separate mask identified by skimage
def mask_to_rle(preds_test_upsampled):
    """Encode every predicted test mask and pair each encoding with its image id.

    ``preds_test_upsampled[n]`` is taken to be the prediction for
    ``test_ids[n]``. Returns ``(new_test_ids, rles)``: two parallel lists in
    which an image id is repeated once for each encoding produced for it.
    """
    new_test_ids, rles = [], []
    for index, image_id in enumerate(test_ids):
        image_rles = [encoded for encoded in prob_to_rles(preds_test_upsampled[index])]
        rles += image_rles
        new_test_ids += [image_id] * len(image_rles)
    return new_test_ids, rles
132 | 
133 | 
if __name__ == '__main__':
    # Running this module directly just invokes both loaders.
    x, y = read_train_data()
    # x, y are rebound here to the two values returned by read_test_data().
    x, y = read_test_data()
137 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | """
 4 | python=3.5.2
 5 | """
 6 | 
 7 | from data_input import read_train_data, read_test_data, prob_to_rles, mask_to_rle, resize, np
 8 | from model import get_unet, dice_coef
 9 | import pandas as pd
10 | from post_process import post_processing
11 | from skimage.io import imshow
12 | import matplotlib.pyplot as plt
13 | from keras.models import load_model
14 | from keras.callbacks import TensorBoard
# End-to-end pipeline: load data, train the U-Net, predict on the test set,
# post-process the predictions and write the run-length-encoded submission.
epochs = 50
model_name = 'model-0416-test.h5'
# get train_data
train_img, train_mask = read_train_data()

# get test_data
test_img, test_img_sizes = read_test_data()

# get u_net model
u_net = get_unet()

# fit model on train_data (all of it; no validation data is passed to fit)
print("\n Training...")
tb = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
u_net.fit(train_img, train_mask, batch_size=16, epochs=epochs, callbacks=[tb])

print("\n Saving")
u_net.save(model_name)

# Reload the model that was just saved; dice_coef is supplied through
# custom_objects because it is a function defined in this project.
print("\n load model")
u_net = load_model(model_name, custom_objects={'dice_coef': dice_coef})

print("\n Predicting and Saving predict")
# Predict on test data
test_mask = u_net.predict(test_img, verbose=1)
np.save("test_img_pred", test_mask)

# Read the predictions back from the file written just above.
test_mask = np.load("test_img_pred.npy")
# get test_data
# test_img, test_img_sizes = read_test_data()

# post processing
post_test_mask = post_processing(test_mask)

post_test_mask = np.expand_dims(post_test_mask, axis=-1)
# Create list of upsampled test masks: each mask is resized to the original
# (height, width) recorded for that test image.
test_mask_upsampled = []
for i in range(len(post_test_mask)):
    test_mask_upsampled.append(resize(np.squeeze(post_test_mask[i]),
                                      (test_img_sizes[i][0], test_img_sizes[i][1]),
                                      mode='constant', preserve_range=True))
print('Done!')

# One (image id, run-length encoding) pair per encoded region.
test_ids, rles = mask_to_rle(test_mask_upsampled)

# Create submission DataFrame; each encoding is written as space-separated numbers.
sub = pd.DataFrame()
sub['ImageId'] = test_ids
sub['EncodedPixels'] = pd.Series(rles).apply(lambda x: ' '.join(str(y) for y in x))
sub.to_csv('sub-dsbowl2018_preprocess_post_process_0416_1.csv', index=False)

print("Data saved")


--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | """
 4 | python=3.5.2
 5 | """
 6 | 
 7 | from keras.models import Model, load_model
 8 | from keras.layers import Input, BatchNormalization
 9 | from keras.layers.core import Dropout, Lambda
10 | from keras.layers.convolutional import Conv2D, Conv2DTranspose, Convolution2D
11 | from keras.layers.pooling import MaxPooling2D
12 | from keras.layers.merge import concatenate
13 | from keras import backend as K
14 | 
# Constant added to both numerator and denominator of dice_coef.
smooth = 1.
# Padding mode passed to the Conv2D layers built before the first Conv2DTranspose.
padding_type_contract = 'same'
# Padding mode passed to the Conv2DTranspose layers and the Conv2D layers after them.
padding_type_expand = 'same'
18 | 
# Metric function
def dice_coef(y_true, y_pred):
    """Smoothed Dice coefficient computed over the flattened tensors.

    Evaluates ``(2 * sum(y_true * y_pred) + smooth) /
    (sum(y_true) + sum(y_pred) + smooth)`` using the module-level ``smooth``.
    """
    truth_flat = K.flatten(y_true)
    pred_flat = K.flatten(y_pred)
    overlap = K.sum(truth_flat * pred_flat)
    numerator = 2. * overlap + smooth
    denominator = K.sum(truth_flat) + K.sum(pred_flat) + smooth
    return numerator / denominator
25 | 
26 | 
# Loss function
def dice_coef_loss(y_true, y_pred):
    """Return the negated ``dice_coef`` so that minimising it maximises the Dice score."""
    score = dice_coef(y_true, y_pred)
    return -score
30 | 
31 | 
def _conv_block(x, filters, dropout_rate, padding):
    """Apply Conv2D -> Dropout -> BatchNormalization -> Conv2D (3x3, ELU, he_normal)."""
    x = Conv2D(filters, (3, 3), activation='elu', kernel_initializer='he_normal', padding=padding)(x)
    x = Dropout(dropout_rate)(x)
    x = BatchNormalization()(x)
    return Conv2D(filters, (3, 3), activation='elu', kernel_initializer='he_normal', padding=padding)(x)


def _up_block(x, skip, filters, dropout_rate):
    """Upsample ``x`` 2x with Conv2DTranspose, concatenate ``skip``, then apply a conv block."""
    up = Conv2DTranspose(filters, (2, 2), strides=(2, 2), padding=padding_type_expand)(x)
    # The tensors are 4-D, so the last axis is the same axis as index 3.
    up = concatenate([up, skip], axis=-1)
    return _conv_block(up, filters, dropout_rate, padding_type_expand)


def get_unet(IMG_WIDTH=256, IMG_HEIGHT=256, IMG_CHANNELS=3):
    """Build and compile the U-Net segmentation model.

    Layers are created in the same order as in the original hand-unrolled
    definition: four conv-block + 2x2 max-pool stages (16, 32, 64, 128
    filters), a 256-filter conv block, four transpose-conv stages with skip
    connections (128, 64, 32, 16 filters) and a final 1x1 sigmoid convolution.

    Args:
        IMG_WIDTH: input width in pixels.
        IMG_HEIGHT: input height in pixels.
        IMG_CHANNELS: number of input channels.

    Returns:
        A Keras ``Model`` compiled with the 'adam' optimizer, the
        'binary_crossentropy' loss and ``dice_coef`` as a metric.
    """
    inputs = Input((IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS))
    # Divide the raw input by 255 before the first normalisation layer.
    s = Lambda(lambda x: x / 255)(inputs)
    s = BatchNormalization()(s)

    # Contracting path.
    c1 = _conv_block(s, 16, 0.1, padding_type_contract)
    p1 = MaxPooling2D((2, 2))(c1)
    c2 = _conv_block(p1, 32, 0.1, padding_type_contract)
    p2 = MaxPooling2D((2, 2))(c2)
    c3 = _conv_block(p2, 64, 0.2, padding_type_contract)
    p3 = MaxPooling2D((2, 2))(c3)
    c4 = _conv_block(p3, 128, 0.2, padding_type_contract)
    p4 = MaxPooling2D(pool_size=(2, 2))(c4)

    # Deepest stage (no pooling afterwards).
    c5 = _conv_block(p4, 256, 0.3, padding_type_contract)

    # Expanding path; each stage is joined with the matching contracting output.
    c6 = _up_block(c5, c4, 128, 0.2)
    c7 = _up_block(c6, c3, 64, 0.2)
    c8 = _up_block(c7, c2, 32, 0.1)
    c9 = _up_block(c8, c1, 16, 0.1)

    outputs = Conv2D(1, (1, 1), activation='sigmoid')(c9)

    model = Model(inputs=[inputs], outputs=[outputs])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[dice_coef])
    return model
97 | 


--------------------------------------------------------------------------------
/model/model-0416-1.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/model/model-0416-1.h5


--------------------------------------------------------------------------------
/pic/graph_large_attrs_key=_too_large_attrs&limit_attr_size=1024&run=.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/pic/graph_large_attrs_key=_too_large_attrs&limit_attr_size=1024&run=.png


--------------------------------------------------------------------------------
/pic/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/pic/model.png


--------------------------------------------------------------------------------
/post_process.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | """
  4 | python=3.5.2
  5 | """
  6 | 
  7 | from skimage.io import imshow
  8 | import os
  9 | import cv2
 10 | import numpy as np
 11 | from matplotlib import pyplot as plt
 12 | from skimage.morphology import label
 13 | import scipy.ndimage as ndi
 14 | from skimage.measure import regionprops
 15 | 
 16 | 
 17 | def split_mask_v1(mask):
 18 |     thresh = mask.copy().astype(np.uint8)
 19 |     im2, contours, hierarchy = cv2.findContours(thresh, 2, 1)
 20 |     i = 0
 21 |     for contour in contours:
 22 |         if cv2.contourArea(contour) > 20:
 23 |             hull = cv2.convexHull(contour, returnPoints=False)
 24 |             defects = cv2.convexityDefects(contour, hull)
 25 |             if defects is None:
 26 |                 continue
 27 |             points = []
 28 |             dd = []
 29 | 
 30 |             #
 31 |             # In this loop we gather all defect points
 32 |             # so that they can be filtered later on.
 33 |             for i in range(defects.shape[0]):
 34 |                 s, e, f, d = defects[i, 0]
 35 |                 start = tuple(contour[s][0])
 36 |                 end = tuple(contour[e][0])
 37 |                 far = tuple(contour[f][0])
 38 |                 d = d / 256
 39 |                 dd.append(d)
 40 | 
 41 |             for i in range(len(dd)):
 42 |                 s, e, f, d = defects[i, 0]
 43 |                 start = tuple(contour[s][0])
 44 |                 end = tuple(contour[e][0])
 45 |                 far = tuple(contour[f][0])
 46 |                 if dd[i] > 1.0 and dd[i] / np.max(dd) > 0.2:
 47 |                     points.append(f)
 48 | 
 49 |             i = i + 1
 50 |             if len(points) >= 2:
 51 |                 for i in range(len(points)):
 52 |                     f1 = points[i]
 53 |                     p1 = tuple(contour[f1][0])
 54 |                     nearest = None
 55 |                     min_dist = np.inf
 56 |                     for j in range(len(points)):
 57 |                         if i != j:
 58 |                             f2 = points[j]
 59 |                             p2 = tuple(contour[f2][0])
 60 |                             dist = (p1[0] - p2[0]) * (p1[0] - p2[0]) + (p1[1] - p2[1]) * (p1[1] - p2[1])
 61 |                             if dist < min_dist:
 62 |                                 min_dist = dist
 63 |                                 nearest = p2
 64 | 
 65 |                     cv2.line(thresh, p1, nearest, [0, 0, 0], 2)
 66 |     return thresh
 67 | 
 68 | 
 69 | def process(img_gray):
 70 |     # green channel happends to produce slightly better results
 71 |     # than the grayscale image and other channels
 72 |     #     img_gray=img_rgb[:,:,1]#cv2.cvtColor(img_rgb, cv2.COLOR_BGR2GRAY)
 73 |     # morphological opening (size tuned on training data)
 74 |     circle7 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (7, 7))
 75 |     img_open = cv2.morphologyEx(img_gray, cv2.MORPH_OPEN, circle7)
 76 |     # Otsu thresholding
 77 |     img_th = cv2.threshold(img_open, 0, 255, cv2.THRESH_OTSU)[1]
 78 |     # Invert the image in case the objects of interest are in the dark side
 79 |     if (np.sum(img_th == 255) > np.sum(img_th == 0)):
 80 |         img_th = cv2.bitwise_not(img_th)
 81 |     # second morphological opening (on binary image this time)
 82 |     bin_open = cv2.morphologyEx(img_th, cv2.MORPH_OPEN, circle7)
 83 |     # connected components
 84 |     cc = cv2.connectedComponents(bin_open)[1]
 85 |     # cc=segment_on_dt(bin_open,20)
 86 |     return cc
 87 | 
 88 | 
 89 | def post_processing(test_masks):
 90 |     post_masks = []
 91 |     for mask_id, mask in enumerate(test_masks):
 92 |         #         print('mask shape', mask.shape)# (256, 256, 1)
 93 |         mask = np.squeeze(mask * 255).astype(np.uint8)
 94 |         mask = process(mask)
 95 |         labeled_mask, labels_num = ndi.label(mask)
 96 |         #         print(labels_num)
 97 |         post_mask = []
 98 |         if labels_num < 2:
 99 |             # only one area
100 |             post_masks.append(mask)
101 |             continue
102 |         for i in range(labels_num + 1):
103 |             # 分割出n个mask
104 |             if i == 0:  # id = 0 is for background
105 |                 continue
106 |             mask_i = (labeled_mask == i).astype(np.uint8)
107 |             props = regionprops(mask_i, cache=False)
108 |             if len(props) > 0:
109 |                 prop = props[0]
110 |                 if prop.convex_area / prop.filled_area > 1.1:
111 |                     mask_i = split_mask_v1(mask_i)
112 |             post_mask.append(mask_i)
113 |         # print(mask_i)
114 |         post_mask = np.array(post_mask)
115 |         post_mask_combined = np.amax(post_mask, axis=0)
116 |         labels = label(post_mask_combined)
117 |         post_masks.append(labels > 0)
118 |     # break
119 |     return np.array(post_masks)
120 | 


--------------------------------------------------------------------------------
/preparation.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | """
 4 | python=3.5.2
 5 | """
 6 | 
 7 | import glob
 8 | import os
 9 | 
10 | import cv2
11 | import numpy as np
12 | import scipy.ndimage as ndi
13 | from PIL import Image
14 | from imageio import imwrite
15 | from skimage.transform import resize
16 | from sklearn.cluster import KMeans
17 | from tqdm import tqdm
18 | 
19 | 
def train_valid_split(meta, validation_size, valid_category_ids=None):
    """Split the training rows of ``meta`` by their 'vgg_features_clusters' value.

    Only rows with ``is_train == 1`` are considered. The split itself is
    delegated to ``split_on_column`` with ``random_state=1234``.

    Returns:
        ``(meta_train_split, meta_valid_split)``.
    """
    train_rows = meta[meta['is_train'] == 1]
    split_kwargs = dict(column='vgg_features_clusters',
                        test_size=validation_size,
                        random_state=1234,
                        valid_category_ids=valid_category_ids)
    meta_train_split, meta_valid_split = split_on_column(train_rows, **split_kwargs)
    return meta_train_split, meta_valid_split
29 | 
30 | 
def split_on_column(meta, column, test_size, random_state=1, valid_category_ids=None):
    """Split ``meta`` into train/valid so that no category of ``column`` is shared.

    Args:
        meta: DataFrame to split.
        column: name of the column holding the category of each row.
        test_size: fraction of the distinct categories to place in the
            validation split (only used when ``valid_category_ids`` is None).
        random_state: seed for choosing categories and for shuffling rows.
        valid_category_ids: explicit category values for the validation split;
            when given, nothing is sampled.

    Returns:
        ``(train, valid)``: two shuffled DataFrames.
    """
    if valid_category_ids is None:
        categories = meta[column].unique()
        # NOTE: this reseeds NumPy's global RNG, as the function always did.
        np.random.seed(random_state)
        n_valid = min(int(test_size * len(categories)), len(categories))
        # Sample without replacement: with replacement the same category can be
        # drawn twice, leaving fewer validation categories than requested.
        valid_category_ids = np.random.choice(categories, n_valid, replace=False)
    in_valid = meta[column].isin(valid_category_ids)
    valid = meta[in_valid].sample(frac=1, random_state=random_state)
    train = meta[~in_valid].sample(frac=1, random_state=random_state)
    return train, valid
40 | 
41 | 
def overlay_centers(images_dir, subdir_name, target_dir):
    """Write one image of nucleus centres for every ``masks`` directory.

    For each ``<images_dir>/<subdir_name>/*/masks`` directory, every file in
    it is opened, scaled by 1/255 and passed to ``get_center``. The centre
    images are summed, thresholded at 128 into a 0/255 uint8 image and written
    to the path obtained by replacing ``images_dir`` with ``target_dir``,
    dropping the trailing ``masks`` component and appending ``.png``.
    """
    train_dir = os.path.join(images_dir, subdir_name)
    mask_dirs = glob.glob('{}/*/masks'.format(train_dir))
    for mask_dirname in tqdm(mask_dirs):
        centers = [
            get_center(np.asarray(Image.open(image_filepath)) / 255.0)
            for image_filepath in glob.glob('{}/*'.format(mask_dirname))
        ]
        summed = np.sum(centers, axis=0)
        overlayed_masks = np.where(summed > 128., 255., 0.).astype(np.uint8)

        relocated = mask_dirname.replace(images_dir, target_dir)
        parent_parts = relocated.split('/')[:-1]
        target_filepath = '/'.join(parent_parts) + '.png'
        os.makedirs(os.path.dirname(target_filepath), exist_ok=True)
        imwrite(target_filepath, overlayed_masks)
54 | 
55 | 
def get_contour(img):
    """Return a uint8 image of the same shape as ``img`` with its contours drawn.

    Contours are found on the uint8 cast of ``img`` and drawn in white (255)
    with a line thickness of 4 on an otherwise zero image.
    """
    img_contour = np.zeros_like(img).astype(np.uint8)
    # cv2.findContours returns (image, contours, hierarchy) on OpenCV 3.x but
    # (contours, hierarchy) on 2.x/4.x; contours is always second-to-last.
    contours = cv2.findContours(img.astype(np.uint8), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)[-2]
    cv2.drawContours(img_contour, contours, -1, (255, 255, 255), 4)
    return img_contour
61 | 
62 | 
def get_center(img):
    """Return a uint8 image marking the centre of mass of ``img``.

    A filled white circle of radius 4 is drawn at the centre of mass on an
    otherwise zero image of the same shape as ``img``. If the centre of mass
    is undefined (e.g. ``img`` is all zeros) the blank image is returned.
    """
    img_center = np.zeros_like(img).astype(np.uint8)
    # ndi.center_of_mass is the supported spelling; the ndi.measurements
    # namespace is deprecated in recent SciPy releases.
    y, x = ndi.center_of_mass(img)
    if not (np.isfinite(x) and np.isfinite(y)):
        # An empty mask gives a NaN centre, which int() cannot convert.
        return img_center
    cv2.circle(img_center, (int(x), int(y)), 4, (255, 255, 255), -1)
    return img_center
68 | 
69 | 
def split_overlay_mask_by_contour(masks, masks_counters):
    """Turn summed masks into a 0/1 mask with overlapping contours removed.

    Args:
        masks: element-wise sum of the individual masks.
        masks_counters: element-wise sum of the individual contour images.
            It is not modified.

    Returns:
        A new array holding 1 where the combined value is in (0, 255] and 0
        elsewhere. Pixels whose summed contour value exceeds 255 therefore
        become 0.
    """
    # Work on a copy so the caller's contour array is left untouched.
    overlap = np.array(masks_counters)
    # retain overlay contour
    overlap[overlap <= 255] = 0
    splited_masks = np.sum([overlap, masks], axis=0)
    # drop overlay contour
    splited_masks[splited_masks > 255] = 0
    # to bool
    splited_masks[splited_masks > 0] = 1
    return splited_masks
79 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # 1. 赛题背景
  4 | 
  5 | 通过自动化细胞核检测，有利于检测细胞对各种治疗方法的反应，了解潜在生物学过程。队伍需要分析数据观察模式，抽象出问题并通过建立计算机模型识别各种条件下的一系列细胞核。
  6 | 
  7 | # 2. 数据预处理
  8 | 
  9 | ## 数据分析
 10 | 
  11 | 数据集包含部分的分割核图像。由于其获取方式、细胞类型、放大倍数和呈现模式不同（brightfield vs. fluorescence），对算法的抽象概括能力要求较高。
 12 | 
 13 | 对于每个图片都有对应的ImageId，训练集包含有原始图片和图中每个细胞核的分割图像，对于测试集只有原始图片。
 14 | 
 15 | 1. 其中训练集有670幅图片，测试集1有65幅图片，测试集2有3019幅图。
 16 | 2. 训练集中共有9种分辨率图片，测试集1有11种，测试集2有26种。
 17 | 3. 对于原始图片，分为灰度图和彩图。（虽然都是3或者四通道，但是其中有些图片多个通道数值一样，实际为灰度图。）
 18 | 4. 训练集的每一张图片对应多个mask，即一张图中会有多个细胞核。
 19 | 
 20 | ## 图片大小归一化
 21 | 
 22 | 对于不同分辨率的图片，我们使用skimage.transform.resize将图片的分辨率统一为256x256。之所以选择这个分辨率，是因为大部分图片都是此分辨率。
 23 | 
 24 | 同时对于训练集中出现的灰度图片（只有一个），将其转换为三通道相同的RGB图片以便被预测。
 25 | 
 26 | ## 训练集mask分割
 27 | 
  28 | 训练集中一幅图片包含多个单细胞核的mask，当我们将所有mask合并时，mask之间难免会重叠。为了将合并后图中的mask彼此分隔开，我们将重叠部分置为0。下面为处理前后的结果。
 29 | 
 30 | 但是分析发现本赛题的数据中mask之间几乎没有重叠，大部分mask都是十分接近，因此我们将单个mask识别出边界，然后对边界使用合成图片，对于边界重叠的地方像素置为0以分隔开mask。
 31 | 
 32 | 下图为获得的边界重叠：![overlay_contour](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_overlay_contour.png)
 33 | 
 34 | 对于重叠的边界我们将其化为背景，来将每个细胞核分开，分割后的效果见下图
 35 | 
 36 | ![BOWL-2018_splited_mask](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_splited_mask.png)
 37 | 
 38 | 之后将其转化为bool类型矩阵，上述操作将成绩提高了0.01左右。
 39 | 
 40 | 
 41 | 
 42 | # 3. U-Net
 43 | 
 44 | ## 建模
 45 | 
 46 | 我们假设图像中有两个类，一类是背景，另一类是细胞核，即转化为一个二分类问题，因此，构建一个目标是预测一个bool类型的矩阵，即对应像素点是否为细胞核。
 47 | 
 48 | ## Architecture
 49 | 
 50 | U-Net实际是一个端到端的完全卷积编码网络，我们基于论文 [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/pdf/1505.04597.pdf) 和 [this repo](https://github.com/jocicmarko/ultrasound-nerve-segmentation)。
 51 | 
  52 | 结构包含收缩路径（contracting path）和对称扩展路径（symmetric expanding path），收缩路径是典型的卷积编码网络，每一层卷积核大小是3x3，并通过一个ReLU和2x2的最大池化操作组成一次下采样。每一个下采样后将特征通道数加倍。扩展路径每一层对特征映射进行上采样，包含2x2的上卷积，同样3x3的卷积核和ReLU层。在最后一层使用1x1卷积来将16个特征分量映射到类别中（即正负，是否为核）。注意：原论文使用ReLU，本仓库的实现（model.py）中激活函数使用的是ELU。
 53 | 
 54 | 网络结构见下图，
 55 | 
 56 | ![model_tensorboard](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_model_tensorboard.png)
 57 | 
 58 | ## 3.1 Training
 59 | 
 60 | 选用损失函数为binary_crossentropy，即
 61 | $$
 62 | E=\sum_{x\in \Omega} w(x) \log(p_{l(x)}(x))
 63 | $$
 64 | 其中l是每个像素的真实标签，w是权重地图，表示训练中某些像素更加重要。
 65 | 
  66 | 使用adam优化器来训练网络。训练过程中为了防止过拟合，将训练集划分1/10作为验证集，通过keras的callbacks函数中添加early_stopper和check_pointer来提前停止训练并保存最优的模型。验证函数见下公式，加入smooth是为了防止分母出现0。
 67 | $$
  68 | \mathrm{dice\_coef} = \frac{2\,|y_{true} \cap y_{pred}| + smooth}{|y_{true}|+|y_{pred}| + smooth}
 69 | $$
 70 | 实现如下。
 71 | 
 72 | ```python
 73 | # Metric function
 74 | def dice_coef(y_true, y_pred):
 75 |     y_true_f = K.flatten(y_true)
 76 |     y_pred_f = K.flatten(y_pred)
 77 |     intersection = K.sum(y_true_f * y_pred_f)
 78 |     return (2. * intersection + smooth) / (K.sum(y_true_f) + K.sum(y_pred_f) + smooth)
 79 | ```
 80 | 
 81 | ## 3.2 visualization
 82 | 
 83 | 通过keras调用tensorboard来可视化整个训练的过程，前期通过较大的迭代次数下，观察我们验证集上的验证函数dice_coef和binary_crossentropy的变化曲线，选择在曲线的梯度较小的迭代次数。
 84 | 
 85 | 训练过程见下图，结合图分析在30次迭代时曲线下降的梯度已经较小，因此选择了30次迭代。
 86 | 
 87 | ![BOWL-2018_tensorboard_score](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_tensorboard_score.jpg)
 88 | 
 89 | ## 3.3 Result
 90 | 
 91 | U-Net预测结果
 92 | 
  93 | ![U-Net predict](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_U-Net%20predict.png)
 94 | 
 95 | 
 96 | 
 97 | # 4. Post Process
 98 | 
  99 | 分析U-Net输出结果发现，图像中重叠的细胞核被分成了一个核，因此需要设法分离出单个的核。
100 | 
101 | 我们假设核是凸的，通过凸性分析来分离被合并的核。
102 | 
103 | 在分析分割前后的图片，我们发现有不错的分割也有过分割的案例，但是总体上来说好的分割多于坏的，同时也需要改进我们的分割方法。
104 | 
105 | 分割后的前后结果：
106 | 
107 | 单个对比：![single post process good](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_single%20post%20process%20good.png)
108 | 
109 | 整个对比
110 | 
111 | ![post%20process%20all%201](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_post%20process%20all%201.png)![postprocess%20all%20good](https://raw.githubusercontent.com/InsaneLife/MyPicture/master/kaggle-DATA-SCIENCE-BOWL-2018/BOWL-2018_postprocess%20all%20good.png)
112 | 
113 | 使用post process之后，整体成绩提高了0.04。
114 | 
115 | 最终将mask转换为RLE编码参考于代码[https://www.kaggle.com/rakhlin/fast-run-length-encoding-python](https://www.kaggle.com/rakhlin/fast-run-length-encoding-python)
116 | 
117 | # Conclusion
118 | 
119 | 1. 最终的方法即上面介绍的方法，最好的成绩是0.412，被选为最终提交的结果成绩是0.398，排名是507。
120 | 
121 | 2. 由于U-Net是一种端到端的方法，加上合适的数据预处理和后处理，使得最终能够对每个像素点做出预测。
122 | 
123 | 3. 建模过程和使用数据前文已经介绍。
124 | 
125 | 4. 通过adam优化器来训练网络使得损失降低。模型训练中通过keras的callbacks函数中添加early_stopper和check_pointer来提前停止训练并保存最优的模型。
126 | 5. 本实验是一个目标检测的问题。数据集是医疗方面的数据。因此算法使用了针对小数据的U-Net.
127 | 
128 | # Submit
129 | 
130 | | model                                 | score |
131 | | ------------------------------------- | ----- |
132 | | pixel threshold                       | 0.20  |
133 | | base U-Net                            | 0.236 |
134 | | U-Net V2                              | 0.334 |
135 | | U-Net with preprocess                 | 0.359 |
136 | | U-Net with preprocess and postprocess | 0.412 |
137 | | Add Batch Normalization               | 0.426 |
138 | 
139 | 
140 | 
141 | # Discussion
142 | 
143 | 1. 对于原始图片直接resize为固定的256x256，对于部分图形会有一定程度的变形（但是生物学上讲细胞变形很正常），可以尝试对图像使用padding查看效果。
144 | 2. 看了很多大神预处理用了erosion operation，还未尝试。
145 | 3. 模型只用了U-Net，还未来得及尝试其他模型。
146 | 4. post process还可以继续深入做，对于细胞形态学深度地研究。
147 | 
148 | github源码：[https://github.com/InsaneLife/nucleus_detection](https://github.com/InsaneLife/nucleus_detection)
149 | 持续更新中。。。。。
150 | 
151 | 原文出处：[https://blog.csdn.net/shine19930820/article/details/80098284](https://blog.csdn.net/shine19930820/article/details/80098284)
152 | 
153 | # Reference
154 | 
155 | 1. https://www.kaggle.com/rakhlin/fast-run-length-encoding-python
156 | 2. [『 论文阅读』U-Net Convolutional Networks for Biomedical Image Segmentation](https://blog.csdn.net/shine19930820/article/details/80098091)
157 | 3. https://www.kaggle.com/rexhaif/morphological-postprocessing-on-unet-lb-0-429/notebook
158 | 4. https://www.kaggle.com/voglinio/separating-nuclei-masks-using-convexity-defects
159 | 5. https://www.kaggle.com/keegil/keras-u-net-starter-lb-0-277?scriptVersionId=2164855
160 | 6. https://github.com/jocicmarko/ultrasound-nerve-segmentation
161 | 
162 | 


--------------------------------------------------------------------------------
/result/sub-dsbowl2018_post_process_0416.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InsaneLife/nucleus_detection/a53b10965b2963922b7c266bb93ad4cbe2906db0/result/sub-dsbowl2018_post_process_0416.zip


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | a = [[1, 2, 3, 2], [1, 2, 3, 1], [2, 3, 4, 1], [1, 0, 2, 0], [2, 1, 2, 0], [2, 1, 1, 1]]
 4 | b = [[1, 4, 3, 2], [1, 2, 3, 1], [2, 3, 4, 1], [1, 0, 2, 0], [2, 1, 2, 0], [2, 1, 1, 4]]
 5 | c = []
 6 | c.append(a)
 7 | c.append(b)
 8 | c = np.array(c)
 9 | print(c)
10 | c = np.sum(c, axis=0)
11 | print(c)
12 | print(c.max())
13 | print(a + b)
14 | 
15 | a = np.array(a)
16 | b = np.array(b)
17 | d = np.concatenate((a, b), axis=1)
18 | print(np.sum([a,b], axis=0))
19 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | """
 4 | python=3.5.2
 5 | """
 6 | 
 7 | from data_input import read_train_data, read_test_data, prob_to_rles, mask_to_rle, resize, np
 8 | from model import get_unet, dice_coef
 9 | import pandas as pd
10 | from post_process import post_processing
11 | from skimage.io import imshow
12 | import matplotlib.pyplot as plt
13 | from keras.models import load_model
14 | from keras.callbacks import TensorBoard, EarlyStopping, ModelCheckpoint
15 | 
16 | epochs = 50
17 | model_name = 'model-0416-bn.h5'
18 | # best_model_name = 'model-dsbowl2018-0416-best.h5'
19 | # get train_data
20 | train_img, train_mask = read_train_data()
21 | 
22 | # get test_data
23 | test_img, test_img_sizes = read_test_data()
24 | 
25 | # get u_net model
26 | u_net = get_unet()
27 | 
28 | # fit model on train_data
29 | print("\n Training...")
30 | tb = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
31 | # early_stopper = EarlyStopping(patience=5, verbose=1)
32 | # check_pointer = ModelCheckpoint(best_model_name, verbose=1, save_best_only=True)
33 | u_net.fit(train_img, train_mask, batch_size=16, epochs=epochs, callbacks=[tb])
34 | 
35 | 
36 | print("\n Saving")
37 | u_net.save(model_name)
38 | 
39 | print("\n load model")
40 | u_net = load_model(model_name, custom_objects={'dice_coef': dice_coef})
41 | 
42 | print("\n Predicting and Saving predict")
43 | # Predict on test data
44 | test_mask = u_net.predict(test_img, verbose=1)
45 | np.save("test_img_bn_pred", test_mask)
46 | 


--------------------------------------------------------------------------------
/write_result.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | """
 4 | python=3.5.2
 5 | """
 6 | 
 7 | from data_input import read_train_data, read_test_data, prob_to_rles, mask_to_rle, resize, np
 8 | from model import get_unet, dice_coef
 9 | import pandas as pd
10 | from post_process import post_processing
11 | from skimage.io import imshow
12 | import matplotlib.pyplot as plt
13 | from keras.models import load_model
14 | from keras.callbacks import TensorBoard
15 | epochs = 50
16 | model_name = 'model-0416-test.h5'
17 | # get train_data
18 | # train_img, train_mask = read_train_data()
19 | 
20 | # get test_data
21 | test_img, test_img_sizes = read_test_data()
22 | 
23 | # get u_net model
24 | u_net = get_unet()
25 | 
26 | # fit model on train_data
27 | # print("\n Training...")
28 | # tb = TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False, embeddings_freq=0, embeddings_layer_names=None, embeddings_metadata=None)
29 | # u_net.fit(train_img, train_mask, batch_size=16, epochs=epochs, callbacks=[tb])
30 | 
31 | # print("\n Saving")
32 | # u_net.save(model_name)
33 | 
34 | # print("\n load model")
35 | # u_net = load_model(model_name, custom_objects={'dice_coef': dice_coef})
36 | #
37 | # print("\n Predicting and Saving predict")
38 | # # Predict on test data
39 | # test_mask = u_net.predict(test_img, verbose=1)
40 | # np.save("test_img_pred", test_mask)
41 | 
42 | # load test_mask
43 | print("load predict")
44 | test_mask = np.load("test_img_bn_pred.npy")
45 | # get test_data
46 | # test_img, test_img_sizes = read_test_data()
47 | 
48 | # post processing
49 | post_test_mask = post_processing(test_mask)
50 | 
51 | post_test_mask = np.expand_dims(post_test_mask, axis=-1)
52 | # Create list of upsampled test masks
53 | test_mask_upsampled = []
54 | for i in range(len(post_test_mask)):
55 |     test_mask_upsampled.append(resize(np.squeeze(post_test_mask[i]),
56 |                                       (test_img_sizes[i][0], test_img_sizes[i][1]),
57 |                                       mode='constant', preserve_range=True))
58 | print('Done!')
59 | 
60 | test_ids, rles = mask_to_rle(test_mask_upsampled)
61 | 
62 | # Create submission DataFrame
63 | sub = pd.DataFrame()
64 | sub['ImageId'] = test_ids
65 | sub['EncodedPixels'] = pd.Series(rles).apply(lambda x: ' '.join(str(y) for y in x))
66 | sub.to_csv('sub-dsbowl2018_preprocess_0416_1.csv', index=False)
67 | 
68 | print("Data saved")


--------------------------------------------------------------------------------