├── inference.sh
├── assets
│   ├── slide.pdf
│   ├── model_description.png
│   └── task_description.png
├── preprocess
│   ├── target_norm.png
│   ├── utils.py
│   └── prep.py
├── model
│   ├── net.py
│   └── data_loader.py
├── README.md
├── utils.py
├── inference.py
├── train.py
├── feature_extraction.py
└── stage2.py

/inference.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python ./feature_extraction.py
3 | python ./inference.py
--------------------------------------------------------------------------------
/assets/slide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/assets/slide.pdf
--------------------------------------------------------------------------------
/preprocess/target_norm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/preprocess/target_norm.png
--------------------------------------------------------------------------------
/assets/model_description.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/assets/model_description.png
--------------------------------------------------------------------------------
/assets/task_description.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/assets/task_description.png
--------------------------------------------------------------------------------
/model/net.py:
--------------------------------------------------------------------------------
1 | import segmentation_models as sm
2 | from keras.optimizers import Adam
3 |
4 | """
5 | Network Architecture
6 | """
7 |
8 | def fpn(backbone, pretrained_weights=None):
9 | model = sm.FPN(backbone,
10 | input_shape=(256, 256, 3),
11 | classes=1,
12 | activation='sigmoid',
13 | encoder_weights=pretrained_weights)
14 |
15 | model.compile(optimizer='adam',
16 | loss=sm.losses.bce_jaccard_loss,
17 | metrics=[sm.metrics.iou_score, sm.metrics.f1_score])
18 | return model
19 |
20 |
21 | def unet(backbone, pretrained_weights=None):
22 | model = sm.Unet(backbone,
23 | input_shape=(256, 256, 3),
24 | classes=1,
25 | activation='sigmoid',
26 | encoder_weights=pretrained_weights)
27 |
28 | model.compile(optimizer='adam',
29 | loss=sm.losses.bce_jaccard_loss,
30 | metrics=[sm.metrics.iou_score, sm.metrics.f1_score])
31 |
32 | return model
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HeLP Challenge 2019 Breast Cancer 1st place solution
2 |
3 | This repository is the **1st place solution** to the **Breast Cancer Classification Task of HeLP Challenge 2019**.
4 | ![task_description](./assets/task_description.png)
5 |
6 |
7 | ## Model
8 | ![model_description](./assets/model_description.png)
9 | ### Stage 1
10 | - Preprocessing: ROI extraction, Rescale, Vahadane Stain Normalization
11 | - Pixel-wise Segmentation: Feature Pyramid Network (FPN)
12 | ### Stage 2
13 | - Feature extraction from the probability heatmap
14 | - Prediction of the final metastasis probability and tumor major axis from those features
15 |
16 | Please see [the slides](./assets/slide.pdf) for a detailed model description.
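For quick orientation, the Stage 1 components above map directly onto a handful of library calls that already appear in `utils.py` and `model/net.py`. The sketch below is a condensed, illustrative restatement of those calls — it assumes the `staintools` and `segmentation_models` packages listed under Dependencies and the ResNet-34 backbone with ImageNet weights that `train.py` uses by default.

```python
import staintools
import segmentation_models as sm

# Vahadane stain normalizer, fitted once on a reference tile (preprocess/target_norm.png)
target = staintools.read_image('./preprocess/target_norm.png')
target = staintools.LuminosityStandardizer.standardize(target)
normalizer = staintools.StainNormalizer(method='vahadane')
normalizer.fit(target)
# each extracted patch is then standardized and passed through normalizer.transform(patch)

# FPN segmentation model: a single sigmoid channel gives the per-pixel tumor probability
model = sm.FPN('resnet34',
               input_shape=(256, 256, 3),
               classes=1,
               activation='sigmoid',
               encoder_weights='imagenet')
model.compile(optimizer='adam',
              loss=sm.losses.bce_jaccard_loss,
              metrics=[sm.metrics.iou_score, sm.metrics.f1_score])
```

The per-slide probability heatmap assembled from these patch predictions is what Stage 2 summarizes into features (major-axis length, tumor ratio, max/mean/std at several scales) before the RandomForest regressors in `stage2.py` predict the final metastasis probability and major axis.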
17 | 18 | ## Dependencies 19 | - keras 20 | - segmentation_models 21 | - openslide 22 | - staintools 23 | - numpy 24 | - pandas 25 | - sklearn 26 | - skimage 27 | 28 | ## Usage 29 | 30 | ### Dataset 31 | 32 | ```bash 33 | data 34 | └── train 35 | ├── level4 36 | │ ├── Image 37 | │ │ ├── slide_001.png 38 | │ │ ├── ... 39 | │ │ └── slide_#.png 40 | │ └── Mask 41 | │ ├── mask_001.png 42 | │ ├── ... 43 | │ └── mask_#.png 44 | └── label.csv 45 | 46 | ========= After training, the directories are created as below. ========= 47 | 48 | ├── volume 49 | │ ├── dataset 50 | │ │ └── level4 51 | │ │ ├── img 52 | │ │ │ ├── slide001_patch001.png 53 | │ │ │ ├── ... 54 | │ │ │ └── slide#_patch#.png 55 | │ │ └── mask 56 | │ │ ├── mask001_patch001.png 57 | │ │ ├── ... 58 | │ │ └── mask#_patch#.png 59 | │ └── model 60 | │ └── fpn_weights.h5 61 | └── heatmap 62 | ... 63 | ``` 64 | 65 | 66 | 67 | ### Train 68 | Run the `train.py`. 69 | ```bash 70 | $ python train.py 71 | ``` 72 | ### Inference 73 | Run the `inference.sh`. 74 | ```bash 75 | $ sh inference.sh 76 | ``` 77 | 78 | ## Authors 79 | - Daeyoung Kim / [@cyc1am3n](https://github.com/cyc1am3n) 80 | - Taewoo Kim / [@Taeu](https://github.com/Taeu) 81 | - Jonghyun Choi / [@ExcelsiorCJH](https://github.com/ExcelsiorCJH) 82 | -------------------------------------------------------------------------------- /preprocess/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('..') 4 | 5 | def make_directory(): 6 | if not os.path.exists('./data/volume/train'): 7 | os.mkdir('./data/volume/train') 8 | if not os.path.exists('./data/volume/valid'): 9 | os.mkdir('./data/volume/valid') 10 | 11 | if not os.path.exists('./data/volume/train/level4'): 12 | os.mkdir('./data/volume/train/level4') 13 | if not os.path.exists('./data/volume/valid/level4'): 14 | os.mkdir('./data/volume/valid/level4') 15 | 16 | if not os.path.exists('./data/volume/train/level4/img'): 17 | os.mkdir('./data/volume/train/level4/img') 18 | if not os.path.exists('./data/volume/train/level4/mask'): 19 | os.mkdir('./data/volume/train/level4/mask') 20 | 21 | if not os.path.exists('./data/volume/valid/level4/img'): 22 | os.mkdir('./data/volume/valid/level4/img') 23 | if not os.path.exists('./data/volume/valid/level4/mask'): 24 | os.mkdir('./data/volume/valid/level4/mask') 25 | 26 | # 0: normal, 1: tumor 27 | if not os.path.exists('./data/volume/train/level4/img/0'): 28 | os.mkdir('./data/volume/train/level4/img/0') 29 | if not os.path.exists('./data/volume/train/level4/mask/0'): 30 | os.mkdir('./data/volume/train/level4/mask/0') 31 | if not os.path.exists('./data/volume/train/level4/img/1'): 32 | os.mkdir('./data/volume/train/level4/img/1') 33 | if not os.path.exists('./data/volume/train/level4/mask/1'): 34 | os.mkdir('./data/volume/train/level4/mask/1') 35 | # 0: normal, 1: tumor 36 | if not os.path.exists('./data/volume/valid/level4/img/0'): 37 | os.mkdir('./data/volume/valid/level4/img/0') 38 | if not os.path.exists('./data/volume/valid/level4/mask/0'): 39 | os.mkdir('./data/volume/valid/level4/mask/0') 40 | if not os.path.exists('./data/volume/valid/level4/img/1'): 41 | os.mkdir('./data/volume/valid/level4/img/1') 42 | if not os.path.exists('./data/volume/valid/level4/mask/1'): 43 | os.mkdir('./data/volume/valid/level4/mask/1') 44 | 45 | print('Created Directories') 46 | return None -------------------------------------------------------------------------------- /utils.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import staintools 4 | from skimage.measure import label, regionprops 5 | 6 | def stain_norm_func(target_image_path): 7 | target = staintools.read_image(target_image_path) 8 | target = staintools.LuminosityStandardizer.standardize(target) 9 | normalizer = staintools.StainNormalizer(method='vahadane') 10 | normalizer.fit(target) 11 | return normalizer 12 | 13 | def stain_patch_dir(PATCHES_DIR, slide_pathes): 14 | phase = 'train' 15 | stain_patches_save_path = PATCHES_DIR + 'train/' 16 | if len(slide_pathes) < 110 : 17 | phase = 'test1' 18 | stain_patches_save_path = PATCHES_DIR + 'test1/' 19 | elif len(slide_pathes) < 200 : 20 | phase = 'test2' 21 | stain_patches_save_path = PATCHES_DIR + 'test2/' 22 | make_directory(stain_patches_save_path) 23 | print('current phase : ',phase) 24 | return stain_patches_save_path, phase 25 | 26 | 27 | def set_directory(CKPT_DIR, MODEL_NAME): 28 | path1, path2 = os.path.split(CKPT_DIR[:-1]) 29 | if not os.path.isdir(path1): 30 | os.mkdir(path1) 31 | if not os.path.isdir(CKPT_DIR): 32 | os.mkdir(CKPT_DIR) 33 | if not os.path.isdir(CKPT_DIR + MODEL_NAME): 34 | os.mkdir(CKPT_DIR + MODEL_NAME) 35 | print('Set Directory') 36 | 37 | 38 | def get_major_axis(mask): 39 | from skimage.measure import label, regionprops 40 | 41 | # divide entire masks into each instance using connected-components labelling 42 | labels = label(mask) 43 | 44 | # iterate to calculate the length of the major axis of each instance 45 | major_axis_list = [regionprops((labels == i).astype('uint8'))[0].major_axis_length \ 46 | for i in np.unique(labels) if i != 0] 47 | 48 | # find the longest major axis 49 | if len(major_axis_list): 50 | longest_major_axis = max(major_axis_list) 51 | else: 52 | longest_major_axis = 0 53 | return longest_major_axis 54 | 55 | 56 | def predict_from_model(patch, model): 57 | """Predict which pixels are tumor. 
58 |
59 | input: patch: 256x256x3, rgb image
60 | input: model: keras model
61 | output: prediction: 256x256x1, per-pixel tumor probability
62 | """
63 |
64 | prediction = model.predict(patch.reshape(1, 256, 256, 3))
65 | prediction = prediction.reshape(256, 256)
66 | return prediction
67 |
68 | def make_directory(DIR):
69 | if not os.path.isdir(DIR):
70 | os.mkdir(DIR)
71 | print(DIR,'made!')
72 |
73 | def acc_score(truth, pred):
74 | cnt = 0
75 |
76 | for i in range(len(truth)):
77 | diff = np.abs(truth[i] - pred[i])
78 | if diff <= truth[i]*0.05 :
79 | cnt += 1
80 | return cnt / len(truth)
--------------------------------------------------------------------------------
/model/data_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import keras
7 |
8 | from keras.preprocessing.image import ImageDataGenerator
9 | from sklearn.model_selection import KFold, StratifiedKFold
10 |
11 | import warnings
12 | warnings.filterwarnings("ignore")
13 |
14 |
15 | class PatchLoader:
16 | def __init__(self, n_kfold, seed, use_norm=True, server='kakao'):
17 | self.n_kfold = n_kfold
18 | self.seed = seed
19 |
20 | if server == 'local':
21 | if use_norm:
22 | self.patches_img_path = './data/volume/dataset/level4/img_norm/'
23 | self.patches_mask_path = './data/volume/dataset/level4/mask_norm/'
24 | self.img_mask_pairs_path = './data/volume/dataset/level4/img_mask_norm_pairs.pkl'
25 | else:
26 | self.patches_img_path = './data/volume/dataset/level4/img/'
27 | self.patches_mask_path = './data/volume/dataset/level4/mask/'
28 | self.img_mask_pairs_path = './data/volume/dataset/level4/img_mask_pairs.pkl'
29 | elif server == 'kakao':
30 | if use_norm:
31 | self.patches_img_path = '/data/volume/dataset/level4/img_norm/'
32 | self.patches_mask_path = '/data/volume/dataset/level4/mask_norm/'
33 | self.img_mask_pairs_path = '/data/volume/dataset/level4/img_mask_norm_pairs.pkl'
34 | else:
35 | self.patches_img_path = '/data/volume/dataset/level4/img/'
36 | self.patches_mask_path = '/data/volume/dataset/level4/mask/'
37 | self.img_mask_pairs_path = '/data/volume/dataset/level4/img_mask_pairs.pkl'
38 |
39 |
40 | def get_all_patches(self):
41 | '''Load the slide & mask patch pairs and build a DataFrame'''
42 |
43 | with open(self.img_mask_pairs_path, 'rb') as f:
44 | img_mask_pairs = pickle.load(f)
45 |
46 | self.all_patches_sample = pd.DataFrame(img_mask_pairs, columns=['slide_path', 'mask_path'])
47 | self.all_patches_sample = self.all_patches_sample.sample(frac=1, random_state=42).reset_index(drop=True)
48 | return self.all_patches_sample
49 |
50 |
51 | def split_sample(self):
52 | kf = KFold(n_splits=self.n_kfold, shuffle=True, random_state=self.seed)
53 | folds = list(kf.split(self.all_patches_sample))
54 | return folds
55 |
56 | # K-Fold Data Generator
57 | def kfold_data_generator(slide_datagen, mask_datagen, df, batch_size=32, seed=42):
58 | slide_generator = \
59 | slide_datagen.flow_from_dataframe(df,
60 | x_col='slide_path',
61 | y_col='mask_path',
62 | seed=seed,
63 | batch_size=batch_size,
64 | shuffle=False,
65 | class_mode=None)
66 |
67 | mask_generator = \
68 | mask_datagen.flow_from_dataframe(df,
69 | x_col='mask_path',
70 | y_col='mask_path',
71 | color_mode='grayscale',
72 | seed=seed,
73 | batch_size=batch_size,
74 | shuffle=False,
75 | class_mode=None)
76 |
77 |
78 | generator = zip(slide_generator, mask_generator)
79 | for (slide, mask) in generator:
80 | mask = mask.astype(np.int8)
81 | yield slide, mask
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import random
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 | from utils import *
9 | from stage2 import *
10 |
11 | '''CONFIG'''
12 | OUTPUT_PATH = '/data/output/output.csv'
13 | SEED = np.random.randint(1000)
14 | THRESHOLD = 0.5
15 |
16 | '''stage 2 '''
17 | MODEL_NAME = 'fpn_model/'
18 |
19 | FEATURE_DIR = '/data/volume/feature/'
20 | RESULTS_PATH = '/data/volume/results/'
21 |
22 | E1_meta = 'fpn_cjh_major_taeu345.csv'
23 | E2_major = 'fpn_cjh_major_taeu345.csv'
24 |
25 | fpn_test1 = 'fpn_cjh_test1_pickcol_meta_taeu8.csv'
26 | fpn_test2 = 'fpn_cjh_test2_pickcol_meta_taeu8.csv'
27 | unet_test1 = 'unet_cjh_test1_pickcol_meta_taeu8.csv'
28 | unet_test2 = 'unet_cjh_test2_pickcol_meta_taeu8.csv'
29 |
30 |
31 | IS_PREPROCESSED = True # set both flags to True after the first run
32 | IS_FEATURE = True # set both flags to True after the first run
33 | ''''''
34 | # print(MODEL_NAME,WEIGHT_NAME)
35 |
36 | random.seed(SEED)
37 | np.random.seed(SEED)
38 |
39 | # check the metastasis
40 |
41 | # feature full path
42 | #train_feature_path = '/data/volume/feature/fpn_cjh_rescale4_train_feature2_more.csv'
43 | #test2_feature_path = '/data/volume/feature/fpn_cjh_rescale4_test2_feature2_more.csv'
44 |
45 | feature_train_path = FEATURE_DIR + 'fpn_cjh_train_feature1.csv'
46 | feature_test1_path = FEATURE_DIR + 'fpn_cjh_test1_feature1.csv'
47 | feature_test2_path = FEATURE_DIR + 'fpn_cjh_test2_feature1.csv'
48 |
49 | train_feature_path = feature_train_path
50 | test2_feature_path = '/data/volume/feature/fpn_cjh_test2_feature_snu.csv'
51 |
52 |
53 | def main():
54 | print('Start Inference!')
55 | print('!!!Stage 2 ENSEMBLE with load meta predict major!!!')
56 |
57 | ## load features
58 | pd_feature = pd.read_csv(train_feature_path,index_col = [0])
59 | ## this is for prediction using a single feature column only.
60 | best_auc_col , best_acc_col, best_acc_threshold = check_train_score(pd_feature)
61 | ## the best model of metastasis prediction 2nd stage model
62 | best_model_meta = stage2_train_meta(pd_feature)
63 | ## the best model of major-axis prediction 2nd stage model
64 | best_model_major = stage2_train_major(pd_feature)
65 | ## load meta
66 | e1_meta = pd.read_csv(RESULTS_PATH + E1_meta,index_col = [0])
67 | e2_major = pd.read_csv(RESULTS_PATH + E2_major, index_col = [0])
68 | e1_meta_list = e1_meta.metastasis.tolist()
69 | e2_major_list = e2_major.major_axis.tolist()
70 |
71 |
72 | ## predict major axis ...
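# Overview of the phase handling below: the branch selects the evaluation phase
# from the slide count. 107 slides means the test1 set (mixed AMC/SNU slides):
# FPN and U-Net metastasis scores loaded from RESULTS_PATH are averaged, and the
# major axis is read from one of the heatmap feature columns. Otherwise the
# SNU-only test2 set is assumed: the stage-2 models fitted above are applied to
# the test2 features, and major-axis predictions are divided by 1.76 (inside
# stage2_predict), presumably to compensate for the different SNU slide
# resolution. In both phases, major-axis predictions below the 500 threshold are
# zeroed before the results are written to OUTPUT_PATH.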
73 | slide_dir = '/data/test/level4/Image/' 74 | slide_pathes = sorted(os.listdir(slide_dir)) 75 | 76 | e1_list = [] 77 | e2_list = [] 78 | cols = [1,3,5,8, 12,14,16,19, 23,25,27,30, 34,36,38,41] 79 | current_test_col = cols[3] # 0 ~ 16 80 | 81 | 82 | if len(slide_pathes) == 107: 83 | # test 1 phase 84 | phase = 'test1' 85 | print('test1 phase predict...') 86 | 87 | #ensemble 88 | pd_feature = pd.read_csv(FEATURE_DIR + MODEL_NAME[:-1] + '_test1_feature1.csv',index_col = [0]) 89 | fpn_pd = pd.read_csv(RESULTS_PATH + fpn_test1,index_col = [0]) 90 | unet_pd = pd.read_csv(RESULTS_PATH + unet_test1, index_col = [0]) 91 | 92 | fpn_meta = fpn_pd.metastasis.tolist() 93 | unet_meta = unet_pd.metastasis.tolist() 94 | 95 | for i in range(len(fpn_meta)): 96 | e1_meta_list[i] = 0.5 *(fpn_meta[i] + unet_meta[i]) 97 | 98 | e2_major_list = pd_feature.iloc[:,11].tolist() 99 | 100 | for i in range(len(e2_major_list)): 101 | 102 | if e2_major_list[i] < 500 : 103 | e2_major_list[i] = 0 104 | 105 | else : 106 | # test 2 phase - only SNU dataset 107 | phase = 'test2' 108 | print('test2 phase predict...') 109 | 110 | ## load feature for test 2phase 111 | pd_feature = pd.read_csv(test2_feature_path,index_col = [0]) 112 | 113 | ## prediction by only one feature 114 | best_auc_col = 5 # 4, 5, 11, 12, 18, 19 # max probability value of the given probability heatmap 115 | best_acc_col = 16 # 0, 2, 7, 9, 14, 16 (best 로 바꿔서 제출) # major axis of the given probability heatmap 116 | best_acc_threshold = 500 117 | e1_meta_list = pd_feature.iloc[:,best_auc_col].tolist() # 118 | e2_major_list = np.array(pd_feature.iloc[:,best_acc_col].tolist()) / 1.76 119 | 120 | ## prediction by 2nd stage model 121 | e1_meta_list, e2_major_list = stage2_predict(pd_feature, best_model_meta, best_model_major) 122 | 123 | for i in range(len(e2_major_list)): 124 | if e2_major_list[i] < best_acc_threshold : 125 | e2_major_list[i] = 0 126 | """ 127 | # ensemble 128 | fpn_pd = pd.read_csv(RESULTS_PATH + fpn_test2,index_col = [0]) 129 | unet_pd = pd.read_csv(RESULTS_PATH + unet_test2, index_col = [0]) 130 | fpn_meta = fpn_pd.metastasis.tolist() 131 | unet_meta = unet_pd.metastasis.tolist() 132 | for i in range(len(e1_meta_list)): 133 | e1_meta_list[i] = 0.5 * (fpn_meta[i] + unet_meta[i]) 134 | """ 135 | 136 | 137 | total_result = [] 138 | for i, slide_path in enumerate(slide_pathes): 139 | slide_id = slide_path.split('.')[0] 140 | total_result.append([slide_id, e1_meta_list[i], e2_major_list[i]]) 141 | print(total_result[i]) 142 | 143 | result = pd.DataFrame(data=total_result, columns=['id', 'metastasis', 'major_axis']) 144 | result.to_csv(OUTPUT_PATH, index=False) 145 | 146 | print(SEED) 147 | save_path = '/data/volume/results/1_'+phase+'_final_' + str(best_auc_col)+'_'+str(best_acc_col)+'.csv' 148 | result.to_csv(save_path, index=False) 149 | print(save_path) 150 | 151 | if __name__ == "__main__": 152 | main() 153 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import time 4 | import random 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import keras 9 | 10 | from keras import models 11 | from keras.preprocessing.image import ImageDataGenerator 12 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 13 | 14 | from segmentation_models import get_preprocessing 15 | 16 | from utils import set_directory 17 | from preprocess.prep import Preprocess 18 | from 
model.net import fpn, unet
19 | from model.data_loader import PatchLoader
20 | from model.data_loader import kfold_data_generator
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser(description='parser')
24 | parser.add_argument('--model', type=str, default='fpn')
25 | parser.add_argument('--epochs', type=int, default=30)
26 | parser.add_argument('--batch_size', type=int, default=32)
27 | parser.add_argument('--patch_size', type=int, default=256)
28 | parser.add_argument('--n_folds', type=int, default=3)
29 | parser.add_argument('--preprocess',
30 | type=lambda x: True if x == 'True' else False,
31 | default=True)
32 | parser.add_argument('--stain_norm',
33 | type=lambda x: True if x == 'True' else False,
34 | default=True)
35 | parser.add_argument('--seed', type=int, default=42)
36 | parser.add_argument('--ckpt_dir', type=str, default='/data/volume/model/')
37 | parser.add_argument('--model_name', type=str, default='fpn_model/')
38 | args = parser.parse_args()
39 |
40 | TRAIN_DIR, LABEL_PATH = '/data/train', '/data/train/label.csv'
41 | CKPT_DIR, MODEL_NAME = args.ckpt_dir, args.model_name
42 | PREPROCESS = args.preprocess
43 | N_KFOLD = args.n_folds
44 |
45 | random.seed(args.seed)
46 | np.random.seed(args.seed)
47 |
48 | # check isdir
49 | set_directory(CKPT_DIR, MODEL_NAME)
50 |
51 | # preprocessing
52 | if PREPROCESS:
53 | preprocess = Preprocess(patch_size=args.patch_size,
54 | is_norm=args.stain_norm,
55 | target_norm_path='./preprocess/target_norm.png',
56 | mode='train',
57 | server='local')
58 | preprocess.save_patches()
59 | else:
60 | print('Already Preprocessed.')
61 |
62 | # set dataset
63 | patch_loader = PatchLoader(n_kfold=args.n_folds,
64 | seed=args.seed,
65 | use_norm=args.stain_norm,
66 | server='local')
67 | all_patches_sample = patch_loader.get_all_patches()
68 | folds = patch_loader.split_sample()
69 |
70 | # set generator
71 | print('Set Generator.')
72 | preprocess_input = get_preprocessing('resnet34')
73 |
74 | # Slide, Mask ImageDataGenerator
75 | train_slide_datagen = ImageDataGenerator(# rescale= 1./255,
76 | width_shift_range=[-10, 10],
77 | rotation_range=90,
78 | fill_mode='reflect',
79 | horizontal_flip=True,
80 | vertical_flip=True,
81 | preprocessing_function=preprocess_input)
82 |
83 | train_mask_datagen = ImageDataGenerator(rescale= 1./255,
84 | width_shift_range=[-10, 10],
85 | rotation_range=90,
86 | fill_mode='reflect',
87 | horizontal_flip=True,
88 | vertical_flip=True)
89 |
90 |
91 | valid_slide_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
92 | valid_mask_datagen = ImageDataGenerator(rescale= 1./255)
93 |
94 | # set model
95 | if args.model == 'fpn':
96 | model = fpn(backbone='resnet34', pretrained_weights='imagenet')
97 | else:
98 | model = unet(backbone='resnet34', pretrained_weights='imagenet')
99 |
100 | train_start_time = time.time()
101 | for f_idx, (train_idx, valid_idx) in enumerate(folds):
102 | print('*'*20, f'{f_idx}-Fold training start', '*'*20)
103 | train_df = all_patches_sample.iloc[train_idx]
104 | valid_df = all_patches_sample.iloc[valid_idx]
105 |
106 | train_slide_mask_gen = kfold_data_generator(train_slide_datagen,
107 | train_mask_datagen,
108 | df=train_df,
109 | batch_size=args.batch_size,
110 | seed=args.seed)
111 |
112 | valid_slide_mask_gen = kfold_data_generator(valid_slide_datagen,
113 | valid_mask_datagen,
114 | df=valid_df,
115 | batch_size=args.batch_size,
116 | seed=args.seed)
117 |
118 | train_steps = len(train_df) // args.batch_size
119 | valid_steps = len(valid_df) // args.batch_size
120 |
121 | # callbacks_list
122 | callbacks_list = [
123 | ModelCheckpoint(
124 | filepath=f'{CKPT_DIR}{MODEL_NAME}{f_idx+1}_fold_{args.model}_best_model.h5',
125 | monitor='val_iou_score',
126 | mode='max',
127 | save_best_only=True,
128 | verbose=1,
129 | ),
130 | ReduceLROnPlateau(
131 | monitor='val_iou_score',
132 | mode='max',
133 | factor=0.1,
134 | patience=3,
135 | verbose=1,
136 | )
137 | ]
138 |
139 | history = model.fit_generator(train_slide_mask_gen,
140 | steps_per_epoch=train_steps,
141 | validation_data=valid_slide_mask_gen,
142 | validation_steps=valid_steps,
143 | epochs=args.epochs, verbose=2,
144 | callbacks=callbacks_list)
145 |
146 | model.save(f'{CKPT_DIR}{MODEL_NAME}{args.model}_im_{f_idx+1}_fold_last_model.h5')
147 |
148 | print('*'*20, f'{f_idx}-Fold training complete', '*'*20)
149 | print('='*60)
150 |
151 | train_end_time = time.time()
152 | print('Train time : ', (train_end_time - train_start_time) / 60, 'minutes')
153 | model.save(f'{CKPT_DIR}{MODEL_NAME}{args.model}_{N_KFOLD}_fold_total_model.h5')
154 | print('model save completed')
--------------------------------------------------------------------------------
/feature_extraction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import time
4 | import random
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from keras import models
10 | import segmentation_models as sm
11 | from segmentation_models import get_preprocessing
12 |
13 | from preprocess.prep import Preprocess
14 |
15 | from utils import *
16 | from stage2 import *
17 | import staintools
18 | import openslide
19 | from openslide.deepzoom import DeepZoomGenerator
20 | from PIL import Image
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser(description='parser')
24 | parser.add_argument('--seed', type=int, default=42)
25 | parser.add_argument('--patch_size', type=int, default=256)
26 | parser.add_argument('--is_preprocessed',
27 | type=lambda x: True if x == 'True' else False,
28 | default=True)
29 | parser.add_argument('--model_name', type=str, default='fpn_model')
30 | parser.add_argument('--model_weight', type=str, default='2_fold_fpn_best_model.h5')
31 | parser.add_argument('--ckpt_dir', type=str, default='./data/volume/model/')
32 | parser.add_argument('--heatmap_dir', type=str, default='./data/volume/heatmap/')
33 | parser.add_argument('--feature_dir', type=str, default='./data/volume/feature/')
34 | parser.add_argument('--patches_dir', type=str, default='./data/volume/patches/rescale/')
35 | args = parser.parse_args()
36 |
37 | TRAIN_DIR, LABEL_PATH = './data/train', './data/train/label.csv'
38 | MODEL_NAME, HEATMAP_DIR, FEATURE_DIR, PATCHES_DIR = args.model_name, args.heatmap_dir, args.feature_dir, args.patches_dir
39 | CKPT_DIR, PATCH_SIZE, IS_PREPROCESSED = args.ckpt_dir, args.patch_size, args.is_preprocessed
40 | random.seed(args.seed)
41 | np.random.seed(args.seed)
42 |
43 | # check isdir
44 | set_directory(CKPT_DIR, MODEL_NAME)
45 | make_directory(HEATMAP_DIR)
46 | make_directory(FEATURE_DIR)
47 | make_directory(PATCHES_DIR)
48 |
49 | #load model
50 | MODEL_PATH = args.ckpt_dir + args.model_name + '/' + args.model_weight
51 | model = models.load_model(
52 | MODEL_PATH,
53 | custom_objects={
54 | 'binary_crossentropy_plus_jaccard_loss': sm.losses.bce_jaccard_loss,
55 | 'iou_score': sm.metrics.iou_score,
56 | 'f1-score': sm.metrics.f1_score
57 | }
58 | )
59 | print(MODEL_PATH,'Model loaded.')
60 |
61 | # set preprocess
62 | preprocess_input = get_preprocessing('resnet34')
63 | preprocess =
Preprocess(patch_size=PATCH_SIZE, mode='inference', server='kakao') 64 | 65 | TARGET_NORM_PATH = './preprocess/target_norm.png' 66 | normalizer = stain_norm_func(TARGET_NORM_PATH) 67 | slide_pathes = sorted(os.listdir(preprocess.slide_dir)) 68 | stain_patches_save_path, phase = stain_patch_dir(PATCHES_DIR, slide_pathes) 69 | 70 | start_time = time.time() 71 | full_feature_list = [] 72 | 73 | for i, slide_path in enumerate(slide_pathes): 74 | current_save_dir = stain_patches_save_path + slide_path[:-4] + '/' # ex) '/data/volume/patches/rescale/test1/slide_001/' 75 | 76 | if phase == 'test1' and i <= 60: # AMC dataset 77 | full_slide_path = preprocess.slide_dir + slide_path 78 | else : # SNU dataset 79 | full_slide_path = '/data/test/level0/'+ slide_path +'.mrxs' 80 | 81 | print(current_save_dir) 82 | if IS_PREPROCESSED : 83 | stain_patches_names = sorted(os.listdir(current_save_dir)) 84 | else : 85 | make_directory(current_save_dir) 86 | 87 | with openslide.open_slide(full_slide_path) as slide: 88 | if slide.dimensions[1] < 20000: 89 | print('AMC data!') 90 | patch_size = 256 91 | else : 92 | print('SNU data!') 93 | patch_size = 290 94 | 95 | 96 | slide_tiles = DeepZoomGenerator(slide, tile_size = patch_size, overlap = 0 , limit_bounds = False) 97 | if patch_size == 290: 98 | output_preds = np.zeros((int((slide.dimensions[1] / 8 + 1)/1.13), int((slide.dimensions[0] / 8 + 1)/1.13))) 99 | else: ### snu resolution 100 | output_preds = np.zeros((slide.dimensions[1],slide.dimensions[0])) 101 | print('output_preds shape : ',output_preds.shape) 102 | samples, _ = preprocess.find_patches_from_slide(slide_path = full_slide_path, mask_path = None, patch_size = patch_size) 103 | print(samples.is_tissue.value_counts()) 104 | cnt = 0 105 | for idx, batch_sample in samples.iterrows(): 106 | is_tissue = batch_sample.is_tissue 107 | x,y = batch_sample.tile_loc[::-1] 108 | if is_tissue : 109 | if patch_size == 290: 110 | img = slide_tiles.get_tile(slide_tiles.level_count-1 -3,(x,y)) # SNU -> level 3 111 | else : 112 | img = slide_tiles.get_tile(slide_tiles.level_count-1,(x,y)) 113 | if (img.size == (patch_size, patch_size)): 114 | if IS_PREPROCESSED: 115 | try : 116 | full_stain_patches_path = current_save_dir + str(idx) + '.png' 117 | cnt += 1 118 | img = Image.open(full_stain_patches_path) 119 | X = np.array(img, dtype =np.uint8) 120 | except: 121 | X = np.zeros((256,256,3)) 122 | else : 123 | if img.size[0] == 290 : 124 | img = img.resize((256,256)) 125 | X = np.array(img, dtype = np.uint8) 126 | try : 127 | X = staintools.LuminosityStandardizer.standardize(X) 128 | X = normalizer.transform(X) 129 | x_img = Image.fromarray(X) 130 | x_img.save(current_save_dir + str(idx) + '.png') 131 | except: 132 | X = np.zeros((256, 256,3)) 133 | else : 134 | try : 135 | full_stain_patches_path = current_save_dir + str(idx) + '.png' 136 | cnt += 1 137 | img = Image.open(full_stain_patches_path) 138 | X = np.array(img, dtype =np.uint8) 139 | except : 140 | X = np.zeros((256,256, 3)) 141 | 142 | X = X.astype(np.float32) 143 | X = preprocess_input(X) 144 | 145 | pred_j = predict_from_model(X, model) 146 | 147 | '''fill output_preds : full heatmap''' 148 | new_x, new_y = batch_sample.tile_loc[0] * 256, batch_sample.tile_loc[1] * 256 149 | output_preds[new_x:new_x+256, new_y:new_y+256] = pred_j 150 | '''make different level heatmaps / input : full size heatmap / output : different scale heatmap''' 151 | heatmaps_list = make_different_level_heatmaps(output_preds) 152 | '''extract feature from different level heatmaps''' 153 
| feature_list, feature_name_list = extract_feature_from_heatmaps(heatmaps_list) 154 | if i == 0: 155 | print(feature_name_list) 156 | print(feature_list) 157 | full_feature_list.append(feature_list) 158 | 159 | pd_feature = pd.DataFrame(np.array(full_feature_list), columns=feature_name_list) 160 | save_feature_path = FEATURE_DIR + MODEL_NAME +'_' +phase+'_feature.csv' 161 | pd_feature.to_csv(save_feature_path) 162 | -------------------------------------------------------------------------------- /stage2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import cross_val_score, GridSearchCV, KFold 5 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 6 | from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge,LinearRegression, Ridge, Lasso 7 | from sklearn.kernel_ridge import KernelRidge 8 | from sklearn.svm import SVR, LinearSVR 9 | from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler 10 | from sklearn.externals import joblib 11 | from PIL import Image 12 | import random 13 | from skimage.measure import block_reduce 14 | import cv2 15 | from utils import * # get_major_axis, acc_score 16 | from sklearn.metrics import roc_auc_score 17 | 18 | 19 | RESIZE_LIST = [4,16,64] 20 | 21 | def make_different_level_heatmaps(output_preds): 22 | output_preds = np.array(output_preds) 23 | heatmaps_list = [] 24 | resize_list = RESIZE_LIST 25 | for i in resize_list: 26 | heatmaps_list.append(block_reduce(output_preds, (i,i), np.mean)) 27 | return heatmaps_list 28 | 29 | def extract_feature_from_heatmaps(heatmaps_list): 30 | THRESHOLDS = [0.2,0.5] ## 31 | resize_list = RESIZE_LIST 32 | feature_list = [] 33 | feature_name_list = [] 34 | for i, heatmap in enumerate(heatmaps_list): 35 | for threshold in THRESHOLDS: 36 | test_np = (heatmap > threshold).astype(np.uint8) 37 | #kernel = np.ones((resize_list[i], resize_list[i]), np.uint8) 38 | #test_np = cv2.morphologyEx(test_np, cv2.MORPH_CLOSE, kernel) 39 | mx_i = get_major_axis(test_np) 40 | feature_name_list.append(str(resize_list[i]) + '_major_axis_t' + str(threshold)) 41 | feature_list.append(mx_i * resize_list[i]) 42 | tumor_len = np.sum(heatmap > threshold) 43 | 44 | tissue_len = np.sum(heatmap > 0.0) 45 | feature_name_list.append(str(resize_list[i]) + '_tumor_ratio_t' + str(threshold)) 46 | if tissue_len != 0: 47 | feature_list.append(tumor_len / tissue_len) 48 | else : 49 | feature_list.append(0.0) 50 | 51 | feature_name_list.append(str(resize_list[i]) + '_max') 52 | feature_name_list.append(str(resize_list[i]) + '_mean') 53 | feature_name_list.append(str(resize_list[i]) + '_std') 54 | feature_list.append(np.max(heatmap)) 55 | feature_list.append(np.mean(heatmap)) 56 | feature_list.append(np.std(heatmap)) 57 | 58 | return feature_list, feature_name_list 59 | 60 | def rmse_cv(model,X,y): 61 | rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)) 62 | return rmse 63 | 64 | 65 | ml_models = [ 66 | RandomForestRegressor(), 67 | ] 68 | 69 | ROOT_DIR = '' 70 | LABEL_PATH = ROOT_DIR + '/data/train/label.csv' ####### 71 | OUTPUT_PATH = ROOT_DIR + '/data/output/output.csv' 72 | CKPT_DIR = ROOT_DIR + '/data/volume/model/' 73 | HIST_DIR = ROOT_DIR + '/data/volume/history/' 74 | FEAT_DIR = ROOT_DIR + '/data/volume/feature/' 75 | ML_DIR = CKPT_DIR + 'stage2/' 76 | make_directory(ML_DIR) 77 | 78 | 79 | drop_columns = 
['4_major_axis_t0.5', '4_tumor_ratio_t0.5', '4_major_axis_t0.9','4_tumor_ratio_t0.9', 80 | '16_major_axis_t0.5', '16_tumor_ratio_t0.5', '16_major_axis_t0.9', '16_tumor_ratio_t0.9', 81 | '64_major_axis_t0.5','64_tumor_ratio_t0.5', '64_major_axis_t0.9', '64_tumor_ratio_t0.9', 82 | '256_major_axis_t0.5','256_tumor_ratio_t0.5', '256_major_axis_t0.9', '256_tumor_ratio_t0.9' 83 | ] 84 | drop_indexes = [2, 3, 9, 10, 16, 17, 23, 24] 85 | 86 | def stage2_train(pd_feature): 87 | print(pd_feature.columns) 88 | pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 89 | x = np.array(pd_feature.values) 90 | scaler2 = StandardScaler() 91 | x2 = scaler2.fit_transform(x) 92 | 93 | label_df = pd.read_csv(LABEL_PATH) 94 | y_meta = np.array(label_df.metastasis.tolist()) 95 | y_major_axis = np.array(label_df.major_axis.tolist()) 96 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 97 | 98 | names = [ "RF"] 99 | for i in range(len(ml_models)): 100 | model = ml_models[i] 101 | model.fit(x2,y_meta) 102 | auc_score = roc_auc_score(y_meta, model.predict(x2)) 103 | print(names[i],' roc_auc_score : ',auc_score) 104 | 105 | model.fit(x2,y_major_axis_log) 106 | pred = model.predict(x2) 107 | pred = np.exp(pred) - 1 108 | thresholds = [50, 100, 250, 500, 1000] 109 | pred_tmp = pred.copy() 110 | for thresh in thresholds: 111 | for j in range(len(pred)): 112 | if pred[j] < thresh : 113 | pred_tmp[j] = 0 114 | acc_sc = acc_score(y_major_axis, pred_tmp) 115 | print(names[i], thresh, 'thresh value, acc score : ',acc_sc) 116 | 117 | def stage2(pd_feature): 118 | ## x 119 | pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 120 | #pd_feature.drop(drop_columns, axis='columns', inplace=True) 121 | x = np.array(pd_feature.values) 122 | scaler1 = MinMaxScaler() 123 | scaler2 = StandardScaler() 124 | x1 = scaler1.fit_transform(x) 125 | x2 = scaler2.fit_transform(x) 126 | 127 | ## y 128 | label_df = pd.read_csv(LABEL_PATH) 129 | y_meta = np.array(label_df.metastasis.tolist()) 130 | y_major_axis = np.array(label_df.major_axis.tolist()) 131 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 132 | 133 | names = [ "RF"] 134 | models_ml_len = len(ml_models) 135 | random_model_index = np.random.randint(models_ml_len) 136 | random_model_meta = ml_models[random_model_index] 137 | random_model_meta.fit(x2,y_meta) 138 | auc_score = roc_auc_score(y_meta, random_model_meta.predict(x2)) 139 | print(names[random_model_index],' roc_auc_score : ',auc_score) 140 | print(random_model_meta) 141 | 142 | random_model_index = np.random.randint(models_ml_len) 143 | random_model_major = ml_models[random_model_index] 144 | random_model_major.fit(x2, y_major_axis_log) 145 | pred = random_model_major.predict(x2) 146 | pred = np.exp(pred) - 1 147 | for i in range(len(pred)): 148 | pred[i] = 0 149 | acc_sc = acc_score(y_major_axis, pred) 150 | print(random_model_major) 151 | print('major_axis all 0, acc score : ',acc_sc) 152 | 153 | return random_model_meta, random_model_major 154 | 155 | def stage2_predict(pd_feature, b_me, b_ma): 156 | 157 | pd_feature_1 = pd_feature.iloc[:,test2_pick_indexes] 158 | print('test2 meta : ',pd_feature_1.columns) 159 | x = np.array(pd_feature_1.values) 160 | scaler2 = StandardScaler() 161 | x2 = scaler2.fit_transform(x) 162 | y_me = b_me.predict(x2) 163 | 164 | print('test2 major : ',pd_feature.columns) 165 | x = np.array(pd_feature.values) 166 | scaler2 = StandardScaler() 167 | x2 = scaler2.fit_transform(x) 168 | y_ma = b_ma.predict(x2) 169 | 170 | 
y_ma = np.exp(y_ma) - 1 171 | if len(pd_feature) > 108: # for SNU dataset 172 | y_ma = y_ma / 1.76 173 | for i in range(len(y_ma)): 174 | if y_ma[i] < 500: 175 | y_ma[i] = 0 176 | 177 | return y_me, y_ma 178 | 179 | 180 | RESIZE_LIST = [4,16,64] 181 | 182 | train_pick_indexes = [1,7,8,9,10, 12,18,19,20,21, 23,29,30,31,32] 183 | test2_pick_indexes = [1,3,4,5,6, 8,9,11,12,13, 15,17,18,19,20] 184 | def stage2_train_meta(pd_feature): 185 | print('train meta : ',pd_feature.columns) 186 | 187 | #pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 188 | pd_feature = pd_feature.iloc[:,train_pick_indexes] 189 | x = np.array(pd_feature.values) 190 | scaler2 = StandardScaler() 191 | x2 = scaler2.fit_transform(x) 192 | 193 | label_df = pd.read_csv(LABEL_PATH) 194 | y_meta = np.array(label_df.metastasis.tolist()) 195 | y_major_axis = np.array(label_df.major_axis.tolist()) 196 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 197 | 198 | names = ["RFR"] 199 | for i in range(len(ml_models)): 200 | model = ml_models[i] 201 | model.fit(x2,y_meta) 202 | auc_score = roc_auc_score(y_meta, model.predict(x2)) 203 | print(names[i],' roc_auc_score : ',auc_score) 204 | 205 | print(model.feature_importances_) 206 | return model 207 | 208 | 209 | train_pick_indexes_major = [0,1,6,7,8,9,10, 11,12,17,18,19,20,21, 22,23,28,29,30,31,32] 210 | def stage2_train_major(pd_feature): 211 | 212 | pd_feature = pd_feature.iloc[:,train_pick_indexes_major] 213 | print('train major : ',pd_feature.columns) 214 | #pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 215 | x = np.array(pd_feature.values) 216 | scaler2 = StandardScaler() 217 | x2 = scaler2.fit_transform(x) 218 | 219 | label_df = pd.read_csv(LABEL_PATH) 220 | y_meta = np.array(label_df.metastasis.tolist()) 221 | y_major_axis = np.array(label_df.major_axis.tolist()) 222 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 223 | 224 | names = ["RFR"] 225 | for i in range(len(ml_models)): 226 | model = ml_models[i] 227 | model.fit(x2,y_major_axis_log) 228 | pred = model.predict(x2) 229 | pred = np.exp(pred) - 1 230 | 231 | for i in range(len(pred)): 232 | if pred[i] < 500: 233 | pred[i] = 0 234 | acc_sc = acc_score(y_major_axis, pred) 235 | print('major_axis, acc score : ',acc_sc) 236 | 237 | print(model.feature_importances_) 238 | return model 239 | 240 | def check_train_score(pd_feature): 241 | LABEL_PATH = '/data/train/label.csv' 242 | label_df = pd.read_csv(LABEL_PATH) 243 | y_meta = np.array(label_df.metastasis.tolist()) 244 | y_major_axis = np.array(label_df.major_axis.tolist()) 245 | 246 | # check train meta score 247 | cols_name = list(pd_feature.columns) 248 | best_auc = 0 249 | best_auc_col = 0 250 | for i in range(len(cols_name)): 251 | 252 | col_idx = i 253 | col_name = cols_name[col_idx] 254 | predict_meta = pd_feature.iloc[:,col_idx].tolist() 255 | 256 | auc_score = roc_auc_score(y_meta, predict_meta) 257 | print(col_name, 'AUC score : ',auc_score) 258 | 259 | if best_auc < auc_score : 260 | best_auc = auc_score 261 | best_auc_col = col_idx 262 | 263 | # check train major_axis score 264 | best_threshold = 0 265 | best_acc_sc = 0 266 | major_cols = [0,2,4,6,11,13,15,17, 22,24,26,28] 267 | for i in range(len(major_cols)): 268 | col_idx = major_cols[i] 269 | col_name = cols_name[col_idx] 270 | predict_major = np.array(pd_feature.iloc[:, col_idx].tolist()) * 1.757 271 | 272 | acc_sc = acc_score(y_major_axis, predict_major) 273 | print('--------------------------------------') 274 | 
print(col_name, 'ACC score : ',acc_sc) 275 | ## 276 | print('----------- set thresholds ----------') 277 | 278 | 279 | thresholds = [50,100,250, 300,350,400,450,500,550,600,1000] 280 | for j in range(len(thresholds)): 281 | threshold = thresholds[j] 282 | tmp_major = [] 283 | for k in range(len(predict_major)): 284 | if predict_major[k] < threshold: 285 | tmp_major.append(0) 286 | else : 287 | tmp_major.append(predict_major[k]) 288 | acc_sc = acc_score(y_major_axis, tmp_major) 289 | print(threshold, ' threshold acc_score : ',acc_sc) 290 | if acc_sc > best_acc_sc : 291 | best_acc_sc = acc_sc 292 | best_threshold = threshold 293 | best_acc_col = col_idx 294 | return best_auc_col , best_acc_col, best_threshold 295 | -------------------------------------------------------------------------------- /preprocess/prep.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import openslide 7 | import staintools 8 | 9 | from glob import glob 10 | from datetime import datetime 11 | from PIL import Image 12 | from tqdm import tqdm_notebook as tqdm 13 | from openslide.deepzoom import DeepZoomGenerator 14 | from skimage.filters import threshold_otsu, threshold_yen 15 | 16 | import warnings 17 | warnings.filterwarnings("ignore") 18 | 19 | 20 | class Preprocess: 21 | def __init__(self, 22 | patch_size=256, 23 | is_norm=False, 24 | target_norm_path=False, 25 | mode='train', 26 | server='kakao'): ##### add option for calculating auc score 27 | 28 | if mode == 'train': 29 | phase = 'train' 30 | else: 31 | phase = 'test' 32 | 33 | self.mode = mode 34 | self.patch_size = patch_size 35 | self.server = server 36 | 37 | if self.server == 'kakao': 38 | self.slide_dir = f'/data/{phase}/level4/Image/' 39 | self.mask_dir = f'/data/{phase}/level4/Mask/' 40 | self.img_mask_pairs_path = '/data/volume/dataset/level4/' 41 | self.patches_img_path = '/data/volume/dataset/level4/img/' 42 | self.patches_mask_path = '/data/volume/dataset/level4/mask/' 43 | if is_norm: 44 | self.patches_img_norm_path = '/data/volume/dataset/level4/img_norm/' 45 | self.patches_mask_norm_path = '/data/volume/dataset/level4/mask_norm/' 46 | elif self.server == 'local': 47 | self.slide_dir = f'./data/{phase}/level4/Image/' 48 | self.mask_dir = f'./data/{phase}/level4/Mask/' 49 | self.img_mask_pairs_path = './data/volume/dataset/level4/' 50 | self.patches_img_path = './data/volume/dataset/level4/img/' 51 | self.patches_mask_path = './data/volume/dataset/level4/mask/' 52 | if is_norm: 53 | self.patches_img_norm_path = './data/volume/dataset/level4/img_norm/' 54 | self.patches_mask_norm_path = './data/volume/dataset/level4/mask_norm/' 55 | 56 | if is_norm: 57 | print('*'*20, 'Color Normalization : True', '*'*20) 58 | self.is_norm = is_norm 59 | self.normalizer = self.stain_norm_func(target_norm_path) 60 | 61 | 62 | def _make_directory(self): 63 | '''학습 시킬 데이터셋(patches)을 저장하는 함수''' 64 | 65 | if self.server == 'local': 66 | dir_path = './data/volume' 67 | elif self.server == 'kakao': 68 | dir_path = '/data/volume' 69 | 70 | if not os.path.exists(f'{dir_path}/dataset'): 71 | os.mkdir(f'{dir_path}/dataset') 72 | if not os.path.exists(f'{dir_path}/dataset/level4'): 73 | os.mkdir(f'{dir_path}/dataset/level4') 74 | 75 | if self.is_norm: 76 | if not os.path.exists(f'{dir_path}/dataset/level4/img'): 77 | os.mkdir(f'{dir_path}/dataset/level4/img') 78 | if not os.path.exists(f'{dir_path}/dataset/level4/mask'): 79 | 
os.mkdir(f'{dir_path}/dataset/level4/mask') 80 | if not os.path.exists(f'{dir_path}/dataset/level4/img_norm'): 81 | os.mkdir(f'{dir_path}/dataset/level4/img_norm') 82 | if not os.path.exists(f'{dir_path}/dataset/level4/mask_norm'): 83 | os.mkdir(f'{dir_path}/dataset/level4/mask_norm') 84 | else: 85 | if not os.path.exists(f'{dir_path}/dataset/level4/img'): 86 | os.mkdir(f'{dir_path}/dataset/level4/img') 87 | if not os.path.exists(f'{dir_path}/dataset/level4/mask'): 88 | os.mkdir(f'{dir_path}/dataset/level4/mask') 89 | 90 | print('Created Directories') 91 | return None 92 | 93 | 94 | def find_patches_from_slide(self, 95 | slide_path, 96 | mask_path, 97 | patch_size=256, 98 | filter_nontissue=True): 99 | ''' 100 | Returns a DataFrame of all patches in slide 101 | Args: 102 | - slide_path: path of slide 103 | - truth_path: path of truth(mask) 104 | - patch_size: patch size for samples 105 | - filter_non_tissue: remove samples no tissue detected 106 | Returns: 107 | - samples: patches samples from slide 108 | - positive: > 0 if tumor else not tumor 0 109 | ''' 110 | 111 | with openslide.open_slide(slide_path) as slide: 112 | tiles = DeepZoomGenerator(slide, tile_size=patch_size, overlap=0, limit_bounds=False) 113 | if patch_size == 256 : 114 | size = tiles.level_tiles[tiles.level_count-1] 115 | # print(f'tile size : {size}') # (23, 58) 116 | else : 117 | size = tiles.level_tiles[tiles.level_count-1 -3] 118 | thumb_slide = slide.get_thumbnail(size) 119 | # print(f'thumb_slide size : {thumb_slide.size}') 120 | 121 | 122 | 123 | if self.mode == 'train': 124 | with openslide.open_slide(mask_path) as mask: 125 | thumb_mask = mask.get_thumbnail(size) # (23, 58) 126 | # print(f'thumb_mask size : {thumb_mask.size}') 127 | 128 | # ############## is tissue 부분 ############## 129 | slide4_grey = np.array(thumb_slide.convert('L')) 130 | binary = slide4_grey < 255 # white = 255 131 | slide4_not_white = slide4_grey[binary] # white = 255 132 | thresh = threshold_yen(slide4_not_white) 133 | # thresh = threshold_otsu(slide4_not_white) 134 | # print(f'current thersh : {thresh}') 135 | 136 | height, width = slide4_grey.shape # (height, width) 137 | for h in range(height): 138 | for w in range(width): 139 | if slide4_grey[h, w] > thresh: 140 | binary[h, w] = False 141 | 142 | # create pathces DataFrame 143 | patches = pd.DataFrame(pd.DataFrame(binary).stack()) 144 | patches['is_tissue'] = patches[0] 145 | patches = pd.DataFrame(pd.DataFrame(binary).stack(), columns=['is_tissue']) 146 | patches.loc[:, 'slide_path'] = slide_path 147 | 148 | 149 | # ############## is_tumor 부분 ############## 150 | if self.mode == 'train': 151 | truth_img_grey = np.array(thumb_mask.convert('L')) 152 | positive = truth_img_grey.mean() 153 | 154 | if positive > 0: # tumor인 경우 155 | # print('positive(tumor)') 156 | truth_not_black = truth_img_grey[truth_img_grey > 0] 157 | try: 158 | m_thresh = threshold_otsu(truth_not_black) 159 | except: 160 | m_thresh = 190 161 | patches_y = pd.DataFrame(pd.DataFrame(truth_img_grey).stack(), columns=['is_tumor']) 162 | patches_y['is_tumor'] = patches_y['is_tumor'] > m_thresh # 190 # threshold method를 사용 안한 이유? 
163 | samples = pd.concat([patches, patches_y], axis=1) # concatenate the slide patches with the mask patches_y
164 | else:
165 | # print('negative(not tumor)')
166 | samples = patches
167 | samples.loc[:, 'is_tumor'] = False
168 |
169 | if self.mode == 'test':
170 | # Inference phase
171 | positive = 0
172 | samples = patches
173 |
174 | if filter_nontissue == True: # keep only the tissue patches
175 | samples = samples[samples['is_tissue']==True]
176 |
177 | samples['tile_loc'] = samples.index.tolist()
178 | samples.reset_index(inplace=True, drop=True)
179 | # print(f"samples['is_tumor'].value_counts()\n{samples['is_tumor'].value_counts()}")
180 |
181 | return samples, positive
182 |
183 |
184 | def save_patches(self):
185 | ''' Save the extracted patches '''
186 |
187 | prepro_start_time = datetime.now()
188 | print('='*20, 'Step 1 - create patches', '='*20)
189 |
190 |
191 | # create directory if not exist
192 | self._make_directory()
193 |
194 | # create slide_path, mask_path pair
195 | slide_path_list = glob(f'{self.slide_dir}*.png')
196 | mask_path_list = glob(f'{self.mask_dir}*.png')
197 |
198 | slide_path_dict, mask_path_dict = {}, {}
199 | for slide_path, mask_path in zip(slide_path_list, mask_path_list):
200 | # slide
201 | slide_name, _ = os.path.splitext(slide_path)
202 | slide_idx = slide_name.split('_')[-1]
203 |
204 | # mask
205 | mask_name, _ = os.path.splitext(mask_path)
206 | mask_idx = mask_name.split('_')[-1]
207 |
208 | # update each dictionary
209 | slide_path_dict[slide_idx] = slide_path
210 | mask_path_dict[mask_idx] = mask_path
211 |
212 | slide_mask_path_pairs = [(idx, slide_path, mask_path_dict[idx])
213 | for idx, slide_path in slide_path_dict.items()]
214 |
215 | slide_mask_pairs = []
216 | slide_mask_norm_pairs = []
217 | for cnt, (s_idx, slide_path, mask_path) in enumerate(slide_mask_path_pairs):
218 | print(f'Extracting patches from {slide_path} ...')
219 | samples, positive = self.find_patches_from_slide(slide_path, mask_path)
220 |
221 | if positive: # tumor case
222 | samples_pos = samples[samples['is_tumor'] == True]
223 | samples_neg = samples[samples['is_tumor'] == False]
224 | total_pos = len(samples_pos)
225 | sample_num = 100
226 | if total_pos < 10:
227 | sample_num = total_pos * 5
228 | elif total_pos < 50:
229 | sample_num = total_pos * 2
230 |
231 | samples_pos = samples_pos.sample(sample_num, random_state=42, replace=True)
232 | samples_neg = samples_neg.sample(sample_num, random_state=42, replace=True)
233 | samples = samples_neg.append(samples_pos)
234 | else: # non-tumor case
235 | sample_num = 100
236 | total_neg = len(samples)
237 | if total_neg > 100:
238 | samples = samples.sample(sample_num, random_state=42, replace=True)
239 |
240 |
241 | with openslide.open_slide(slide_path) as slide:
242 | with openslide.open_slide(mask_path) as mask:
243 | slide_tiles = DeepZoomGenerator(slide, tile_size=self.patch_size, overlap=0, limit_bounds=False)
244 | mask_tiles = DeepZoomGenerator(mask, tile_size=self.patch_size, overlap=0, limit_bounds=False)
245 | for p_idx, (tile_loc, is_tumor) in enumerate(
246 | zip(samples['tile_loc'].tolist(), samples['is_tumor'].tolist())):
247 |
248 | y, x = tile_loc
249 | img = slide_tiles.get_tile(slide_tiles.level_count-1, (x, y))
250 | mask = mask_tiles.get_tile(mask_tiles.level_count-1, (x, y))
251 | slide_mask_pairs.append((f'{self.patches_img_path}slide{s_idx}_{p_idx}.png',
252 | f'{self.patches_mask_path}slide{s_idx}_{p_idx}.png'))
253 | img.save(f'{self.patches_img_path}slide{s_idx}_{p_idx}.png') # slide#_patch#.png 254
mask.save(f'{self.patches_mask_path}slide{s_idx}_{p_idx}.png') 255 | if self.is_norm: 256 | try: 257 | img = np.array(img, dtype=np.uint8) 258 | to_transform = staintools.LuminosityStandardizer.standardize(img) 259 | img_normed = self.normalizer.transform(to_transform) 260 | img_normed = Image.fromarray(img_normed) 261 | slide_mask_norm_pairs.append((f'{self.patches_img_norm_path}slide_norm{s_idx}_{p_idx}.png', 262 | f'{self.patches_mask_norm_path}slide_norm{s_idx}_{p_idx}.png')) 263 | img_normed.save(f'{self.patches_img_norm_path}slide_norm{s_idx}_{p_idx}.png') # slide#_patch#.png 264 | mask.save(f'{self.patches_mask_norm_path}slide_norm{s_idx}_{p_idx}.png') 265 | except: 266 | continue 267 | 268 | if cnt % 5 == 0: 269 | # img - mask pair 저장하기 270 | with open(f'{self.img_mask_pairs_path}img_mask_pairs_{cnt}.pkl', 'wb') as f: 271 | pickle.dump(slide_mask_pairs, f) 272 | with open(f'{self.img_mask_pairs_path}img_mask_norm_pairs_{cnt}.pkl', 'wb') as f: 273 | pickle.dump(slide_mask_norm_pairs, f) 274 | 275 | 276 | 277 | # img - mask pair 저장하기 278 | with open(f'{self.img_mask_pairs_path}img_mask_pairs.pkl', 'wb') as f: 279 | pickle.dump(slide_mask_pairs, f) 280 | with open(f'{self.img_mask_pairs_path}img_mask_norm_pairs.pkl', 'wb') as f: 281 | pickle.dump(slide_mask_norm_pairs, f) 282 | 283 | prepro_end_time = datetime.now() 284 | print('preprocessing patches img time : %.1f minutes'%((prepro_end_time - prepro_start_time).seconds/60)) 285 | print('='*50) 286 | return None 287 | 288 | 289 | def stain_norm_func(self, target_image_path): 290 | target = staintools.read_image(target_image_path) 291 | target = staintools.LuminosityStandardizer.standardize(target) 292 | normalizer = staintools.StainNormalizer(method='vahadane') 293 | normalizer.fit(target) 294 | return normalizer 295 | 296 | 297 | if __name__ == "__main__": 298 | preprocess = Preprocess(patch_size=256, 299 | is_norm=True, 300 | target_norm_path='./target_norm.png', 301 | mode='train', 302 | server='local') 303 | 304 | preprocess.save_patches() 305 | 306 | --------------------------------------------------------------------------------