├── inference.sh
├── assets
│   ├── slide.pdf
│   ├── model_description.png
│   └── task_description.png
├── preprocess
│   ├── target_norm.png
│   ├── utils.py
│   └── prep.py
├── model
│   ├── net.py
│   └── data_loader.py
├── README.md
├── utils.py
├── inference.py
├── train.py
├── feature_extraction.py
└── stage2.py

/inference.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | python ./feature_extraction.py
3 | python ./inference.py
--------------------------------------------------------------------------------
/assets/slide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/assets/slide.pdf
--------------------------------------------------------------------------------
/preprocess/target_norm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/preprocess/target_norm.png
--------------------------------------------------------------------------------
/assets/model_description.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/assets/model_description.png
--------------------------------------------------------------------------------
/assets/task_description.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cyc1am3n/HeLP2019_Breast_Cancer_1st_solution/HEAD/assets/task_description.png
--------------------------------------------------------------------------------
/model/net.py:
--------------------------------------------------------------------------------
1 | import segmentation_models as sm
2 | from keras.optimizers import Adam
3 |
4 | """
5 | Network Architecture
6 | """
7 |
8 | def fpn(backbone, pretrained_weights=None):
9 | model = sm.FPN(backbone,
10 | input_shape=(256, 256, 3),
11 | classes=1,
12 | activation='sigmoid',
13 | encoder_weights=pretrained_weights)
14 |
15 | model.compile(optimizer='adam',
16 | loss=sm.losses.bce_jaccard_loss,
17 | metrics=[sm.metrics.iou_score, sm.metrics.f1_score])
18 | return model
19 |
20 |
21 | def unet(backbone, pretrained_weights=None):
22 | model = sm.Unet(backbone,
23 | input_shape=(256, 256, 3),
24 | classes=1,
25 | activation='sigmoid',
26 | encoder_weights=pretrained_weights)
27 |
28 | model.compile(optimizer='adam',
29 | loss=sm.losses.bce_jaccard_loss,
30 | metrics=[sm.metrics.iou_score, sm.metrics.f1_score])
31 |
32 | return model
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HeLP Challenge 2019 Breast Cancer 1st place solution
2 |
3 | This repository is the **1st place solution** to the **Breast Cancer Classification Task of HeLP Challenge 2019**.
4 | ![task_description](./assets/task_description.png)
5 |
6 |
7 | ## Model
8 | ![model_description](./assets/model_description.png)
9 | ### Stage 1
10 | - Preprocessing: ROI extraction, Rescale, Vahadane Stain Normalization
11 | - Pixel-wise Segmentation: Feature Pyramid Network (FPN)
12 | ### Stage 2
13 | - Feature extraction from the probability heatmap
14 | - Prediction of the final metastasis probability and tumor major axis from those features
15 |
16 | Please see [the slides](./assets/slide.pdf) for a detailed model description.
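For quick orientation, the Stage 1 components above map directly onto a handful of library calls that already appear in `utils.py` and `model/net.py`. The sketch below is a condensed, illustrative restatement of those calls — it assumes the `staintools` and `segmentation_models` packages listed under Dependencies and the ResNet-34 backbone with ImageNet weights that `train.py` uses by default.

```python
import staintools
import segmentation_models as sm

# Vahadane stain normalizer, fitted once on a reference tile (preprocess/target_norm.png)
target = staintools.read_image('./preprocess/target_norm.png')
target = staintools.LuminosityStandardizer.standardize(target)
normalizer = staintools.StainNormalizer(method='vahadane')
normalizer.fit(target)
# each extracted patch is then standardized and passed through normalizer.transform(patch)

# FPN segmentation model: a single sigmoid channel gives the per-pixel tumor probability
model = sm.FPN('resnet34',
               input_shape=(256, 256, 3),
               classes=1,
               activation='sigmoid',
               encoder_weights='imagenet')
model.compile(optimizer='adam',
              loss=sm.losses.bce_jaccard_loss,
              metrics=[sm.metrics.iou_score, sm.metrics.f1_score])
```

The per-slide probability heatmap assembled from these patch predictions is what Stage 2 summarizes into features (major-axis length, tumor ratio, max/mean/std at several scales) before the RandomForest regressors in `stage2.py` predict the final metastasis probability and major axis.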
17 | 18 | ## Dependencies 19 | - keras 20 | - segmentation_models 21 | - openslide 22 | - staintools 23 | - numpy 24 | - pandas 25 | - sklearn 26 | - skimage 27 | 28 | ## Usage 29 | 30 | ### Dataset 31 | 32 | ```bash 33 | data 34 | └── train 35 | ├── level4 36 | │ ├── Image 37 | │ │ ├── slide_001.png 38 | │ │ ├── ... 39 | │ │ └── slide_#.png 40 | │ └── Mask 41 | │ ├── mask_001.png 42 | │ ├── ... 43 | │ └── mask_#.png 44 | └── label.csv 45 | 46 | ========= After training, the directories are created as below. ========= 47 | 48 | ├── volume 49 | │ ├── dataset 50 | │ │ └── level4 51 | │ │ ├── img 52 | │ │ │ ├── slide001_patch001.png 53 | │ │ │ ├── ... 54 | │ │ │ └── slide#_patch#.png 55 | │ │ └── mask 56 | │ │ ├── mask001_patch001.png 57 | │ │ ├── ... 58 | │ │ └── mask#_patch#.png 59 | │ └── model 60 | │ └── fpn_weights.h5 61 | └── heatmap 62 | ... 63 | ``` 64 | 65 | 66 | 67 | ### Train 68 | Run the `train.py`. 69 | ```bash 70 | $ python train.py 71 | ``` 72 | ### Inference 73 | Run the `inference.sh`. 74 | ```bash 75 | $ sh inference.sh 76 | ``` 77 | 78 | ## Authors 79 | - Daeyoung Kim / [@cyc1am3n](https://github.com/cyc1am3n) 80 | - Taewoo Kim / [@Taeu](https://github.com/Taeu) 81 | - Jonghyun Choi / [@ExcelsiorCJH](https://github.com/ExcelsiorCJH) 82 | -------------------------------------------------------------------------------- /preprocess/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | sys.path.append('..') 4 | 5 | def make_directory(): 6 | if not os.path.exists('./data/volume/train'): 7 | os.mkdir('./data/volume/train') 8 | if not os.path.exists('./data/volume/valid'): 9 | os.mkdir('./data/volume/valid') 10 | 11 | if not os.path.exists('./data/volume/train/level4'): 12 | os.mkdir('./data/volume/train/level4') 13 | if not os.path.exists('./data/volume/valid/level4'): 14 | os.mkdir('./data/volume/valid/level4') 15 | 16 | if not os.path.exists('./data/volume/train/level4/img'): 17 | os.mkdir('./data/volume/train/level4/img') 18 | if not os.path.exists('./data/volume/train/level4/mask'): 19 | os.mkdir('./data/volume/train/level4/mask') 20 | 21 | if not os.path.exists('./data/volume/valid/level4/img'): 22 | os.mkdir('./data/volume/valid/level4/img') 23 | if not os.path.exists('./data/volume/valid/level4/mask'): 24 | os.mkdir('./data/volume/valid/level4/mask') 25 | 26 | # 0: normal, 1: tumor 27 | if not os.path.exists('./data/volume/train/level4/img/0'): 28 | os.mkdir('./data/volume/train/level4/img/0') 29 | if not os.path.exists('./data/volume/train/level4/mask/0'): 30 | os.mkdir('./data/volume/train/level4/mask/0') 31 | if not os.path.exists('./data/volume/train/level4/img/1'): 32 | os.mkdir('./data/volume/train/level4/img/1') 33 | if not os.path.exists('./data/volume/train/level4/mask/1'): 34 | os.mkdir('./data/volume/train/level4/mask/1') 35 | # 0: normal, 1: tumor 36 | if not os.path.exists('./data/volume/valid/level4/img/0'): 37 | os.mkdir('./data/volume/valid/level4/img/0') 38 | if not os.path.exists('./data/volume/valid/level4/mask/0'): 39 | os.mkdir('./data/volume/valid/level4/mask/0') 40 | if not os.path.exists('./data/volume/valid/level4/img/1'): 41 | os.mkdir('./data/volume/valid/level4/img/1') 42 | if not os.path.exists('./data/volume/valid/level4/mask/1'): 43 | os.mkdir('./data/volume/valid/level4/mask/1') 44 | 45 | print('Created Directories') 46 | return None -------------------------------------------------------------------------------- /utils.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import staintools 4 | from skimage.measure import label, regionprops 5 | 6 | def stain_norm_func(target_image_path): 7 | target = staintools.read_image(target_image_path) 8 | target = staintools.LuminosityStandardizer.standardize(target) 9 | normalizer = staintools.StainNormalizer(method='vahadane') 10 | normalizer.fit(target) 11 | return normalizer 12 | 13 | def stain_patch_dir(PATCHES_DIR, slide_pathes): 14 | phase = 'train' 15 | stain_patches_save_path = PATCHES_DIR + 'train/' 16 | if len(slide_pathes) < 110 : 17 | phase = 'test1' 18 | stain_patches_save_path = PATCHES_DIR + 'test1/' 19 | elif len(slide_pathes) < 200 : 20 | phase = 'test2' 21 | stain_patches_save_path = PATCHES_DIR + 'test2/' 22 | make_directory(stain_patches_save_path) 23 | print('current phase : ',phase) 24 | return stain_patches_save_path, phase 25 | 26 | 27 | def set_directory(CKPT_DIR, MODEL_NAME): 28 | path1, path2 = os.path.split(CKPT_DIR[:-1]) 29 | if not os.path.isdir(path1): 30 | os.mkdir(path1) 31 | if not os.path.isdir(CKPT_DIR): 32 | os.mkdir(CKPT_DIR) 33 | if not os.path.isdir(CKPT_DIR + MODEL_NAME): 34 | os.mkdir(CKPT_DIR + MODEL_NAME) 35 | print('Set Directory') 36 | 37 | 38 | def get_major_axis(mask): 39 | from skimage.measure import label, regionprops 40 | 41 | # divide entire masks into each instance using connected-components labelling 42 | labels = label(mask) 43 | 44 | # iterate to calculate the length of the major axis of each instance 45 | major_axis_list = [regionprops((labels == i).astype('uint8'))[0].major_axis_length \ 46 | for i in np.unique(labels) if i != 0] 47 | 48 | # find the longest major axis 49 | if len(major_axis_list): 50 | longest_major_axis = max(major_axis_list) 51 | else: 52 | longest_major_axis = 0 53 | return longest_major_axis 54 | 55 | 56 | def predict_from_model(patch, model): 57 | """Predict which pixels are tumor. 
58 |
59 | input: patch: 256x256x3, rgb image
60 | input: model: keras model
61 | output: prediction: 256x256x1, per-pixel tumor probability
62 | """
63 |
64 | prediction = model.predict(patch.reshape(1, 256, 256, 3))
65 | prediction = prediction.reshape(256, 256)
66 | return prediction
67 |
68 | def make_directory(DIR):
69 | if not os.path.isdir(DIR):
70 | os.mkdir(DIR)
71 | print(DIR,'made!')
72 |
73 | def acc_score(truth, pred):
74 | cnt = 0
75 |
76 | for i in range(len(truth)):
77 | diff = np.abs(truth[i] - pred[i])
78 | if diff <= truth[i]*0.05 :
79 | cnt += 1
80 | return cnt / len(truth)
--------------------------------------------------------------------------------
/model/data_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 |
4 | import numpy as np
5 | import pandas as pd
6 | import keras
7 |
8 | from keras.preprocessing.image import ImageDataGenerator
9 | from sklearn.model_selection import KFold, StratifiedKFold
10 |
11 | import warnings
12 | warnings.filterwarnings("ignore")
13 |
14 |
15 | class PatchLoader:
16 | def __init__(self, n_kfold, seed, use_norm=True, server='kakao'):
17 | self.n_kfold = n_kfold
18 | self.seed = seed
19 |
20 | if server == 'local':
21 | if use_norm:
22 | self.patches_img_path = './data/volume/dataset/level4/img_norm/'
23 | self.patches_mask_path = './data/volume/dataset/level4/mask_norm/'
24 | self.img_mask_pairs_path = './data/volume/dataset/level4/img_mask_norm_pairs.pkl'
25 | else:
26 | self.patches_img_path = './data/volume/dataset/level4/img/'
27 | self.patches_mask_path = './data/volume/dataset/level4/mask/'
28 | self.img_mask_pairs_path = './data/volume/dataset/level4/img_mask_pairs.pkl'
29 | elif server == 'kakao':
30 | if use_norm:
31 | self.patches_img_path = '/data/volume/dataset/level4/img_norm/'
32 | self.patches_mask_path = '/data/volume/dataset/level4/mask_norm/'
33 | self.img_mask_pairs_path = '/data/volume/dataset/level4/img_mask_norm_pairs.pkl'
34 | else:
35 | self.patches_img_path = '/data/volume/dataset/level4/img/'
36 | self.patches_mask_path = '/data/volume/dataset/level4/mask/'
37 | self.img_mask_pairs_path = '/data/volume/dataset/level4/img_mask_pairs.pkl'
38 |
39 |
40 | def get_all_patches(self):
41 | '''Load the slide & mask patch pairs and build a DataFrame'''
42 |
43 | with open(self.img_mask_pairs_path, 'rb') as f:
44 | img_mask_pairs = pickle.load(f)
45 |
46 | self.all_patches_sample = pd.DataFrame(img_mask_pairs, columns=['slide_path', 'mask_path'])
47 | self.all_patches_sample = self.all_patches_sample.sample(frac=1, random_state=42).reset_index(drop=True)
48 | return self.all_patches_sample
49 |
50 |
51 | def split_sample(self):
52 | kf = KFold(n_splits=self.n_kfold, shuffle=True, random_state=self.seed)
53 | folds = list(kf.split(self.all_patches_sample))
54 | return folds
55 |
56 | # K-Fold Data Generator
57 | def kfold_data_generator(slide_datagen, mask_datagen, df, batch_size=32, seed=42):
58 | slide_generator = \
59 | slide_datagen.flow_from_dataframe(df,
60 | x_col='slide_path',
61 | y_col='mask_path',
62 | seed=seed,
63 | batch_size=batch_size,
64 | shuffle=False,
65 | class_mode=None)
66 |
67 | mask_generator = \
68 | mask_datagen.flow_from_dataframe(df,
69 | x_col='mask_path',
70 | y_col='mask_path',
71 | color_mode='grayscale',
72 | seed=seed,
73 | batch_size=batch_size,
74 | shuffle=False,
75 | class_mode=None)
76 |
77 |
78 | generator = zip(slide_generator, mask_generator)
79 | for (slide, mask) in generator:
80 | mask = mask.astype(np.int8)
81 | yield slide, mask
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import random
4 |
5 | import numpy as np
6 | import pandas as pd
7 |
8 | from utils import *
9 | from stage2 import *
10 |
11 | '''CONFIG'''
12 | OUTPUT_PATH = '/data/output/output.csv'
13 | SEED = np.random.randint(1000)
14 | THRESHOLD = 0.5
15 |
16 | '''stage 2 '''
17 | MODEL_NAME = 'fpn_model/'
18 |
19 | FEATURE_DIR = '/data/volume/feature/'
20 | RESULTS_PATH = '/data/volume/results/'
21 |
22 | E1_meta = 'fpn_cjh_major_taeu345.csv'
23 | E2_major = 'fpn_cjh_major_taeu345.csv'
24 |
25 | fpn_test1 = 'fpn_cjh_test1_pickcol_meta_taeu8.csv'
26 | fpn_test2 = 'fpn_cjh_test2_pickcol_meta_taeu8.csv'
27 | unet_test1 = 'unet_cjh_test1_pickcol_meta_taeu8.csv'
28 | unet_test2 = 'unet_cjh_test2_pickcol_meta_taeu8.csv'
29 |
30 |
31 | IS_PREPROCESSED = True # set both flags to True after the first run
32 | IS_FEATURE = True # set both flags to True after the first run
33 | ''''''
34 | # print(MODEL_NAME,WEIGHT_NAME)
35 |
36 | random.seed(SEED)
37 | np.random.seed(SEED)
38 |
39 | # check the metastasis
40 |
41 | # feature full path
42 | #train_feature_path = '/data/volume/feature/fpn_cjh_rescale4_train_feature2_more.csv'
43 | #test2_feature_path = '/data/volume/feature/fpn_cjh_rescale4_test2_feature2_more.csv'
44 |
45 | feature_train_path = FEATURE_DIR + 'fpn_cjh_train_feature1.csv'
46 | feature_test1_path = FEATURE_DIR + 'fpn_cjh_test1_feature1.csv'
47 | feature_test2_path = FEATURE_DIR + 'fpn_cjh_test2_feature1.csv'
48 |
49 | train_feature_path = feature_train_path
50 | test2_feature_path = '/data/volume/feature/fpn_cjh_test2_feature_snu.csv'
51 |
52 |
53 | def main():
54 | print('Start Inference!')
55 | print('!!!Stage 2 ENSEMBLE with load meta predict major!!!')
56 |
57 | ## load features
58 | pd_feature = pd.read_csv(train_feature_path,index_col = [0])
59 | ## this is for prediction using a single feature column only.
60 | best_auc_col , best_acc_col, best_acc_threshold = check_train_score(pd_feature)
61 | ## the best model of metastasis prediction 2nd stage model
62 | best_model_meta = stage2_train_meta(pd_feature)
63 | ## the best model of major-axis prediction 2nd stage model
64 | best_model_major = stage2_train_major(pd_feature)
65 | ## load meta
66 | e1_meta = pd.read_csv(RESULTS_PATH + E1_meta,index_col = [0])
67 | e2_major = pd.read_csv(RESULTS_PATH + E2_major, index_col = [0])
68 | e1_meta_list = e1_meta.metastasis.tolist()
69 | e2_major_list = e2_major.major_axis.tolist()
70 |
71 |
72 | ## predict major axis ...
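# Overview of the phase handling below: the branch selects the evaluation phase
# from the slide count. 107 slides means the test1 set (mixed AMC/SNU slides):
# FPN and U-Net metastasis scores loaded from RESULTS_PATH are averaged, and the
# major axis is read from one of the heatmap feature columns. Otherwise the
# SNU-only test2 set is assumed: the stage-2 models fitted above are applied to
# the test2 features, and major-axis predictions are divided by 1.76 (inside
# stage2_predict), presumably to compensate for the different SNU slide
# resolution. In both phases, major-axis predictions below the 500 threshold are
# zeroed before the results are written to OUTPUT_PATH.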
73 | slide_dir = '/data/test/level4/Image/' 74 | slide_pathes = sorted(os.listdir(slide_dir)) 75 | 76 | e1_list = [] 77 | e2_list = [] 78 | cols = [1,3,5,8, 12,14,16,19, 23,25,27,30, 34,36,38,41] 79 | current_test_col = cols[3] # 0 ~ 16 80 | 81 | 82 | if len(slide_pathes) == 107: 83 | # test 1 phase 84 | phase = 'test1' 85 | print('test1 phase predict...') 86 | 87 | #ensemble 88 | pd_feature = pd.read_csv(FEATURE_DIR + MODEL_NAME[:-1] + '_test1_feature1.csv',index_col = [0]) 89 | fpn_pd = pd.read_csv(RESULTS_PATH + fpn_test1,index_col = [0]) 90 | unet_pd = pd.read_csv(RESULTS_PATH + unet_test1, index_col = [0]) 91 | 92 | fpn_meta = fpn_pd.metastasis.tolist() 93 | unet_meta = unet_pd.metastasis.tolist() 94 | 95 | for i in range(len(fpn_meta)): 96 | e1_meta_list[i] = 0.5 *(fpn_meta[i] + unet_meta[i]) 97 | 98 | e2_major_list = pd_feature.iloc[:,11].tolist() 99 | 100 | for i in range(len(e2_major_list)): 101 | 102 | if e2_major_list[i] < 500 : 103 | e2_major_list[i] = 0 104 | 105 | else : 106 | # test 2 phase - only SNU dataset 107 | phase = 'test2' 108 | print('test2 phase predict...') 109 | 110 | ## load feature for test 2phase 111 | pd_feature = pd.read_csv(test2_feature_path,index_col = [0]) 112 | 113 | ## prediction by only one feature 114 | best_auc_col = 5 # 4, 5, 11, 12, 18, 19 # max probability value of the given probability heatmap 115 | best_acc_col = 16 # 0, 2, 7, 9, 14, 16 (best 로 바꿔서 제출) # major axis of the given probability heatmap 116 | best_acc_threshold = 500 117 | e1_meta_list = pd_feature.iloc[:,best_auc_col].tolist() # 118 | e2_major_list = np.array(pd_feature.iloc[:,best_acc_col].tolist()) / 1.76 119 | 120 | ## prediction by 2nd stage model 121 | e1_meta_list, e2_major_list = stage2_predict(pd_feature, best_model_meta, best_model_major) 122 | 123 | for i in range(len(e2_major_list)): 124 | if e2_major_list[i] < best_acc_threshold : 125 | e2_major_list[i] = 0 126 | """ 127 | # ensemble 128 | fpn_pd = pd.read_csv(RESULTS_PATH + fpn_test2,index_col = [0]) 129 | unet_pd = pd.read_csv(RESULTS_PATH + unet_test2, index_col = [0]) 130 | fpn_meta = fpn_pd.metastasis.tolist() 131 | unet_meta = unet_pd.metastasis.tolist() 132 | for i in range(len(e1_meta_list)): 133 | e1_meta_list[i] = 0.5 * (fpn_meta[i] + unet_meta[i]) 134 | """ 135 | 136 | 137 | total_result = [] 138 | for i, slide_path in enumerate(slide_pathes): 139 | slide_id = slide_path.split('.')[0] 140 | total_result.append([slide_id, e1_meta_list[i], e2_major_list[i]]) 141 | print(total_result[i]) 142 | 143 | result = pd.DataFrame(data=total_result, columns=['id', 'metastasis', 'major_axis']) 144 | result.to_csv(OUTPUT_PATH, index=False) 145 | 146 | print(SEED) 147 | save_path = '/data/volume/results/1_'+phase+'_final_' + str(best_auc_col)+'_'+str(best_acc_col)+'.csv' 148 | result.to_csv(save_path, index=False) 149 | print(save_path) 150 | 151 | if __name__ == "__main__": 152 | main() 153 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import time 4 | import random 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import keras 9 | 10 | from keras import models 11 | from keras.preprocessing.image import ImageDataGenerator 12 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 13 | 14 | from segmentation_models import get_preprocessing 15 | 16 | from utils import set_directory 17 | from preprocess.prep import Preprocess 18 | from 
model.net import fpn, unet
19 | from model.data_loader import PatchLoader
20 | from model.data_loader import kfold_data_generator
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser(description='parser')
24 | parser.add_argument('--model', type=str, default='fpn')
25 | parser.add_argument('--epochs', type=int, default=30)
26 | parser.add_argument('--batch_size', type=int, default=32)
27 | parser.add_argument('--patch_size', type=int, default=256)
28 | parser.add_argument('--n_folds', type=int, default=3)
29 | parser.add_argument('--preprocess',
30 | type=lambda x: True if x == 'True' else False,
31 | default=True)
32 | parser.add_argument('--stain_norm',
33 | type=lambda x: True if x == 'True' else False,
34 | default=True)
35 | parser.add_argument('--seed', type=int, default=42)
36 | parser.add_argument('--ckpt_dir', type=str, default='/data/volume/model/')
37 | parser.add_argument('--model_name', type=str, default='fpn_model/')
38 | args = parser.parse_args()
39 |
40 | TRAIN_DIR, LABEL_PATH = '/data/train', '/data/train/label.csv'
41 | CKPT_DIR, MODEL_NAME = args.ckpt_dir, args.model_name
42 | PREPROCESS = args.preprocess
43 | N_KFOLD = args.n_folds
44 |
45 | random.seed(args.seed)
46 | np.random.seed(args.seed)
47 |
48 | # check isdir
49 | set_directory(CKPT_DIR, MODEL_NAME)
50 |
51 | # preprocessing
52 | if PREPROCESS:
53 | preprocess = Preprocess(patch_size=args.patch_size,
54 | is_norm=args.stain_norm,
55 | target_norm_path='./preprocess/target_norm.png',
56 | mode='train',
57 | server='local')
58 | preprocess.save_patches()
59 | else:
60 | print('Already Preprocessed.')
61 |
62 | # set dataset
63 | patch_loader = PatchLoader(n_kfold=args.n_folds,
64 | seed=args.seed,
65 | use_norm=args.stain_norm,
66 | server='local')
67 | all_patches_sample = patch_loader.get_all_patches()
68 | folds = patch_loader.split_sample()
69 |
70 | # set generator
71 | print('Set Generator.')
72 | preprocess_input = get_preprocessing('resnet34')
73 |
74 | # Slide, Mask ImageDataGenerator
75 | train_slide_datagen = ImageDataGenerator(# rescale= 1./255,
76 | width_shift_range=[-10, 10],
77 | rotation_range=90,
78 | fill_mode='reflect',
79 | horizontal_flip=True,
80 | vertical_flip=True,
81 | preprocessing_function=preprocess_input)
82 |
83 | train_mask_datagen = ImageDataGenerator(rescale= 1./255,
84 | width_shift_range=[-10, 10],
85 | rotation_range=90,
86 | fill_mode='reflect',
87 | horizontal_flip=True,
88 | vertical_flip=True)
89 |
90 |
91 | valid_slide_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
92 | valid_mask_datagen = ImageDataGenerator(rescale= 1./255)
93 |
94 | # set model
95 | if args.model == 'fpn':
96 | model = fpn(backbone='resnet34', pretrained_weights='imagenet')
97 | else:
98 | model = unet(backbone='resnet34', pretrained_weights='imagenet')
99 |
100 | train_start_time = time.time()
101 | for f_idx, (train_idx, valid_idx) in enumerate(folds):
102 | print('*'*20, f'{f_idx}-Fold training start', '*'*20)
103 | train_df = all_patches_sample.iloc[train_idx]
104 | valid_df = all_patches_sample.iloc[valid_idx]
105 |
106 | train_slide_mask_gen = kfold_data_generator(train_slide_datagen,
107 | train_mask_datagen,
108 | df=train_df,
109 | batch_size=args.batch_size,
110 | seed=args.seed)
111 |
112 | valid_slide_mask_gen = kfold_data_generator(valid_slide_datagen,
113 | valid_mask_datagen,
114 | df=valid_df,
115 | batch_size=args.batch_size,
116 | seed=args.seed)
117 |
118 | train_steps = len(train_df) // args.batch_size
119 | valid_steps = len(valid_df) // args.batch_size
120 |
121 | # callbacks_list
122 | callbacks_list = [
123 | ModelCheckpoint(
124 | filepath=f'{CKPT_DIR}{MODEL_NAME}{f_idx+1}_fold_{args.model}_best_model.h5',
125 | monitor='val_iou_score',
126 | mode='max',
127 | save_best_only=True,
128 | verbose=1,
129 | ),
130 | ReduceLROnPlateau(
131 | monitor='val_iou_score',
132 | mode='max',
133 | factor=0.1,
134 | patience=3,
135 | verbose=1,
136 | )
137 | ]
138 |
139 | history = model.fit_generator(train_slide_mask_gen,
140 | steps_per_epoch=train_steps,
141 | validation_data=valid_slide_mask_gen,
142 | validation_steps=valid_steps,
143 | epochs=args.epochs, verbose=2,
144 | callbacks=callbacks_list)
145 |
146 | model.save(f'{CKPT_DIR}{MODEL_NAME}{args.model}_im_{f_idx+1}_fold_last_model.h5')
147 |
148 | print('*'*20, f'{f_idx}-Fold training complete', '*'*20)
149 | print('='*60)
150 |
151 | train_end_time = time.time()
152 | print('Train time : ', (train_end_time - train_start_time) / 60, 'minutes')
153 | model.save(f'{CKPT_DIR}{MODEL_NAME}{args.model}_{N_KFOLD}_fold_total_model.h5')
154 | print('model save completed')
--------------------------------------------------------------------------------
/feature_extraction.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import time
4 | import random
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from keras import models
10 | import segmentation_models as sm
11 | from segmentation_models import get_preprocessing
12 |
13 | from preprocess.prep import Preprocess
14 |
15 | from utils import *
16 | from stage2 import *
17 | import staintools
18 | import openslide
19 | from openslide.deepzoom import DeepZoomGenerator
20 | from PIL import Image
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser(description='parser')
24 | parser.add_argument('--seed', type=int, default=42)
25 | parser.add_argument('--patch_size', type=int, default=256)
26 | parser.add_argument('--is_preprocessed',
27 | type=lambda x: True if x == 'True' else False,
28 | default=True)
29 | parser.add_argument('--model_name', type=str, default='fpn_model')
30 | parser.add_argument('--model_weight', type=str, default='2_fold_fpn_best_model.h5')
31 | parser.add_argument('--ckpt_dir', type=str, default='./data/volume/model/')
32 | parser.add_argument('--heatmap_dir', type=str, default='./data/volume/heatmap/')
33 | parser.add_argument('--feature_dir', type=str, default='./data/volume/feature/')
34 | parser.add_argument('--patches_dir', type=str, default='./data/volume/patches/rescale/')
35 | args = parser.parse_args()
36 |
37 | TRAIN_DIR, LABEL_PATH = './data/train', './data/train/label.csv'
38 | MODEL_NAME, HEATMAP_DIR, FEATURE_DIR, PATCHES_DIR = args.model_name, args.heatmap_dir, args.feature_dir, args.patches_dir
39 | CKPT_DIR, PATCH_SIZE, IS_PREPROCESSED = args.ckpt_dir, args.patch_size, args.is_preprocessed
40 | random.seed(args.seed)
41 | np.random.seed(args.seed)
42 |
43 | # check isdir
44 | set_directory(CKPT_DIR, MODEL_NAME)
45 | make_directory(HEATMAP_DIR)
46 | make_directory(FEATURE_DIR)
47 | make_directory(PATCHES_DIR)
48 |
49 | #load model
50 | MODEL_PATH = args.ckpt_dir + args.model_name + '/' + args.model_weight
51 | model = models.load_model(
52 | MODEL_PATH,
53 | custom_objects={
54 | 'binary_crossentropy_plus_jaccard_loss': sm.losses.bce_jaccard_loss,
55 | 'iou_score': sm.metrics.iou_score,
56 | 'f1-score': sm.metrics.f1_score
57 | }
58 | )
59 | print(MODEL_PATH,'Model loaded.')
60 |
61 | # set preprocess
62 | preprocess_input = get_preprocessing('resnet34')
63 | preprocess =
Preprocess(patch_size=PATCH_SIZE, mode='inference', server='kakao') 64 | 65 | TARGET_NORM_PATH = './preprocess/target_norm.png' 66 | normalizer = stain_norm_func(TARGET_NORM_PATH) 67 | slide_pathes = sorted(os.listdir(preprocess.slide_dir)) 68 | stain_patches_save_path, phase = stain_patch_dir(PATCHES_DIR, slide_pathes) 69 | 70 | start_time = time.time() 71 | full_feature_list = [] 72 | 73 | for i, slide_path in enumerate(slide_pathes): 74 | current_save_dir = stain_patches_save_path + slide_path[:-4] + '/' # ex) '/data/volume/patches/rescale/test1/slide_001/' 75 | 76 | if phase == 'test1' and i <= 60: # AMC dataset 77 | full_slide_path = preprocess.slide_dir + slide_path 78 | else : # SNU dataset 79 | full_slide_path = '/data/test/level0/'+ slide_path +'.mrxs' 80 | 81 | print(current_save_dir) 82 | if IS_PREPROCESSED : 83 | stain_patches_names = sorted(os.listdir(current_save_dir)) 84 | else : 85 | make_directory(current_save_dir) 86 | 87 | with openslide.open_slide(full_slide_path) as slide: 88 | if slide.dimensions[1] < 20000: 89 | print('AMC data!') 90 | patch_size = 256 91 | else : 92 | print('SNU data!') 93 | patch_size = 290 94 | 95 | 96 | slide_tiles = DeepZoomGenerator(slide, tile_size = patch_size, overlap = 0 , limit_bounds = False) 97 | if patch_size == 290: 98 | output_preds = np.zeros((int((slide.dimensions[1] / 8 + 1)/1.13), int((slide.dimensions[0] / 8 + 1)/1.13))) 99 | else: ### snu resolution 100 | output_preds = np.zeros((slide.dimensions[1],slide.dimensions[0])) 101 | print('output_preds shape : ',output_preds.shape) 102 | samples, _ = preprocess.find_patches_from_slide(slide_path = full_slide_path, mask_path = None, patch_size = patch_size) 103 | print(samples.is_tissue.value_counts()) 104 | cnt = 0 105 | for idx, batch_sample in samples.iterrows(): 106 | is_tissue = batch_sample.is_tissue 107 | x,y = batch_sample.tile_loc[::-1] 108 | if is_tissue : 109 | if patch_size == 290: 110 | img = slide_tiles.get_tile(slide_tiles.level_count-1 -3,(x,y)) # SNU -> level 3 111 | else : 112 | img = slide_tiles.get_tile(slide_tiles.level_count-1,(x,y)) 113 | if (img.size == (patch_size, patch_size)): 114 | if IS_PREPROCESSED: 115 | try : 116 | full_stain_patches_path = current_save_dir + str(idx) + '.png' 117 | cnt += 1 118 | img = Image.open(full_stain_patches_path) 119 | X = np.array(img, dtype =np.uint8) 120 | except: 121 | X = np.zeros((256,256,3)) 122 | else : 123 | if img.size[0] == 290 : 124 | img = img.resize((256,256)) 125 | X = np.array(img, dtype = np.uint8) 126 | try : 127 | X = staintools.LuminosityStandardizer.standardize(X) 128 | X = normalizer.transform(X) 129 | x_img = Image.fromarray(X) 130 | x_img.save(current_save_dir + str(idx) + '.png') 131 | except: 132 | X = np.zeros((256, 256,3)) 133 | else : 134 | try : 135 | full_stain_patches_path = current_save_dir + str(idx) + '.png' 136 | cnt += 1 137 | img = Image.open(full_stain_patches_path) 138 | X = np.array(img, dtype =np.uint8) 139 | except : 140 | X = np.zeros((256,256, 3)) 141 | 142 | X = X.astype(np.float32) 143 | X = preprocess_input(X) 144 | 145 | pred_j = predict_from_model(X, model) 146 | 147 | '''fill output_preds : full heatmap''' 148 | new_x, new_y = batch_sample.tile_loc[0] * 256, batch_sample.tile_loc[1] * 256 149 | output_preds[new_x:new_x+256, new_y:new_y+256] = pred_j 150 | '''make different level heatmaps / input : full size heatmap / output : different scale heatmap''' 151 | heatmaps_list = make_different_level_heatmaps(output_preds) 152 | '''extract feature from different level heatmaps''' 153 
| feature_list, feature_name_list = extract_feature_from_heatmaps(heatmaps_list) 154 | if i == 0: 155 | print(feature_name_list) 156 | print(feature_list) 157 | full_feature_list.append(feature_list) 158 | 159 | pd_feature = pd.DataFrame(np.array(full_feature_list), columns=feature_name_list) 160 | save_feature_path = FEATURE_DIR + MODEL_NAME +'_' +phase+'_feature.csv' 161 | pd_feature.to_csv(save_feature_path) 162 | -------------------------------------------------------------------------------- /stage2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.model_selection import cross_val_score, GridSearchCV, KFold 5 | from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 6 | from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge,LinearRegression, Ridge, Lasso 7 | from sklearn.kernel_ridge import KernelRidge 8 | from sklearn.svm import SVR, LinearSVR 9 | from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler 10 | from sklearn.externals import joblib 11 | from PIL import Image 12 | import random 13 | from skimage.measure import block_reduce 14 | import cv2 15 | from utils import * # get_major_axis, acc_score 16 | from sklearn.metrics import roc_auc_score 17 | 18 | 19 | RESIZE_LIST = [4,16,64] 20 | 21 | def make_different_level_heatmaps(output_preds): 22 | output_preds = np.array(output_preds) 23 | heatmaps_list = [] 24 | resize_list = RESIZE_LIST 25 | for i in resize_list: 26 | heatmaps_list.append(block_reduce(output_preds, (i,i), np.mean)) 27 | return heatmaps_list 28 | 29 | def extract_feature_from_heatmaps(heatmaps_list): 30 | THRESHOLDS = [0.2,0.5] ## 31 | resize_list = RESIZE_LIST 32 | feature_list = [] 33 | feature_name_list = [] 34 | for i, heatmap in enumerate(heatmaps_list): 35 | for threshold in THRESHOLDS: 36 | test_np = (heatmap > threshold).astype(np.uint8) 37 | #kernel = np.ones((resize_list[i], resize_list[i]), np.uint8) 38 | #test_np = cv2.morphologyEx(test_np, cv2.MORPH_CLOSE, kernel) 39 | mx_i = get_major_axis(test_np) 40 | feature_name_list.append(str(resize_list[i]) + '_major_axis_t' + str(threshold)) 41 | feature_list.append(mx_i * resize_list[i]) 42 | tumor_len = np.sum(heatmap > threshold) 43 | 44 | tissue_len = np.sum(heatmap > 0.0) 45 | feature_name_list.append(str(resize_list[i]) + '_tumor_ratio_t' + str(threshold)) 46 | if tissue_len != 0: 47 | feature_list.append(tumor_len / tissue_len) 48 | else : 49 | feature_list.append(0.0) 50 | 51 | feature_name_list.append(str(resize_list[i]) + '_max') 52 | feature_name_list.append(str(resize_list[i]) + '_mean') 53 | feature_name_list.append(str(resize_list[i]) + '_std') 54 | feature_list.append(np.max(heatmap)) 55 | feature_list.append(np.mean(heatmap)) 56 | feature_list.append(np.std(heatmap)) 57 | 58 | return feature_list, feature_name_list 59 | 60 | def rmse_cv(model,X,y): 61 | rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)) 62 | return rmse 63 | 64 | 65 | ml_models = [ 66 | RandomForestRegressor(), 67 | ] 68 | 69 | ROOT_DIR = '' 70 | LABEL_PATH = ROOT_DIR + '/data/train/label.csv' ####### 71 | OUTPUT_PATH = ROOT_DIR + '/data/output/output.csv' 72 | CKPT_DIR = ROOT_DIR + '/data/volume/model/' 73 | HIST_DIR = ROOT_DIR + '/data/volume/history/' 74 | FEAT_DIR = ROOT_DIR + '/data/volume/feature/' 75 | ML_DIR = CKPT_DIR + 'stage2/' 76 | make_directory(ML_DIR) 77 | 78 | 79 | drop_columns = 
['4_major_axis_t0.5', '4_tumor_ratio_t0.5', '4_major_axis_t0.9','4_tumor_ratio_t0.9', 80 | '16_major_axis_t0.5', '16_tumor_ratio_t0.5', '16_major_axis_t0.9', '16_tumor_ratio_t0.9', 81 | '64_major_axis_t0.5','64_tumor_ratio_t0.5', '64_major_axis_t0.9', '64_tumor_ratio_t0.9', 82 | '256_major_axis_t0.5','256_tumor_ratio_t0.5', '256_major_axis_t0.9', '256_tumor_ratio_t0.9' 83 | ] 84 | drop_indexes = [2, 3, 9, 10, 16, 17, 23, 24] 85 | 86 | def stage2_train(pd_feature): 87 | print(pd_feature.columns) 88 | pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 89 | x = np.array(pd_feature.values) 90 | scaler2 = StandardScaler() 91 | x2 = scaler2.fit_transform(x) 92 | 93 | label_df = pd.read_csv(LABEL_PATH) 94 | y_meta = np.array(label_df.metastasis.tolist()) 95 | y_major_axis = np.array(label_df.major_axis.tolist()) 96 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 97 | 98 | names = [ "RF"] 99 | for i in range(len(ml_models)): 100 | model = ml_models[i] 101 | model.fit(x2,y_meta) 102 | auc_score = roc_auc_score(y_meta, model.predict(x2)) 103 | print(names[i],' roc_auc_score : ',auc_score) 104 | 105 | model.fit(x2,y_major_axis_log) 106 | pred = model.predict(x2) 107 | pred = np.exp(pred) - 1 108 | thresholds = [50, 100, 250, 500, 1000] 109 | pred_tmp = pred.copy() 110 | for thresh in thresholds: 111 | for j in range(len(pred)): 112 | if pred[j] < thresh : 113 | pred_tmp[j] = 0 114 | acc_sc = acc_score(y_major_axis, pred_tmp) 115 | print(names[i], thresh, 'thresh value, acc score : ',acc_sc) 116 | 117 | def stage2(pd_feature): 118 | ## x 119 | pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 120 | #pd_feature.drop(drop_columns, axis='columns', inplace=True) 121 | x = np.array(pd_feature.values) 122 | scaler1 = MinMaxScaler() 123 | scaler2 = StandardScaler() 124 | x1 = scaler1.fit_transform(x) 125 | x2 = scaler2.fit_transform(x) 126 | 127 | ## y 128 | label_df = pd.read_csv(LABEL_PATH) 129 | y_meta = np.array(label_df.metastasis.tolist()) 130 | y_major_axis = np.array(label_df.major_axis.tolist()) 131 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 132 | 133 | names = [ "RF"] 134 | models_ml_len = len(ml_models) 135 | random_model_index = np.random.randint(models_ml_len) 136 | random_model_meta = ml_models[random_model_index] 137 | random_model_meta.fit(x2,y_meta) 138 | auc_score = roc_auc_score(y_meta, random_model_meta.predict(x2)) 139 | print(names[random_model_index],' roc_auc_score : ',auc_score) 140 | print(random_model_meta) 141 | 142 | random_model_index = np.random.randint(models_ml_len) 143 | random_model_major = ml_models[random_model_index] 144 | random_model_major.fit(x2, y_major_axis_log) 145 | pred = random_model_major.predict(x2) 146 | pred = np.exp(pred) - 1 147 | for i in range(len(pred)): 148 | pred[i] = 0 149 | acc_sc = acc_score(y_major_axis, pred) 150 | print(random_model_major) 151 | print('major_axis all 0, acc score : ',acc_sc) 152 | 153 | return random_model_meta, random_model_major 154 | 155 | def stage2_predict(pd_feature, b_me, b_ma): 156 | 157 | pd_feature_1 = pd_feature.iloc[:,test2_pick_indexes] 158 | print('test2 meta : ',pd_feature_1.columns) 159 | x = np.array(pd_feature_1.values) 160 | scaler2 = StandardScaler() 161 | x2 = scaler2.fit_transform(x) 162 | y_me = b_me.predict(x2) 163 | 164 | print('test2 major : ',pd_feature.columns) 165 | x = np.array(pd_feature.values) 166 | scaler2 = StandardScaler() 167 | x2 = scaler2.fit_transform(x) 168 | y_ma = b_ma.predict(x2) 169 | 170 | 
y_ma = np.exp(y_ma) - 1 171 | if len(pd_feature) > 108: # for SNU dataset 172 | y_ma = y_ma / 1.76 173 | for i in range(len(y_ma)): 174 | if y_ma[i] < 500: 175 | y_ma[i] = 0 176 | 177 | return y_me, y_ma 178 | 179 | 180 | RESIZE_LIST = [4,16,64] 181 | 182 | train_pick_indexes = [1,7,8,9,10, 12,18,19,20,21, 23,29,30,31,32] 183 | test2_pick_indexes = [1,3,4,5,6, 8,9,11,12,13, 15,17,18,19,20] 184 | def stage2_train_meta(pd_feature): 185 | print('train meta : ',pd_feature.columns) 186 | 187 | #pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 188 | pd_feature = pd_feature.iloc[:,train_pick_indexes] 189 | x = np.array(pd_feature.values) 190 | scaler2 = StandardScaler() 191 | x2 = scaler2.fit_transform(x) 192 | 193 | label_df = pd.read_csv(LABEL_PATH) 194 | y_meta = np.array(label_df.metastasis.tolist()) 195 | y_major_axis = np.array(label_df.major_axis.tolist()) 196 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 197 | 198 | names = ["RFR"] 199 | for i in range(len(ml_models)): 200 | model = ml_models[i] 201 | model.fit(x2,y_meta) 202 | auc_score = roc_auc_score(y_meta, model.predict(x2)) 203 | print(names[i],' roc_auc_score : ',auc_score) 204 | 205 | print(model.feature_importances_) 206 | return model 207 | 208 | 209 | train_pick_indexes_major = [0,1,6,7,8,9,10, 11,12,17,18,19,20,21, 22,23,28,29,30,31,32] 210 | def stage2_train_major(pd_feature): 211 | 212 | pd_feature = pd_feature.iloc[:,train_pick_indexes_major] 213 | print('train major : ',pd_feature.columns) 214 | #pd_feature.drop(pd_feature.columns[drop_indexes], axis='columns', inplace=True) 215 | x = np.array(pd_feature.values) 216 | scaler2 = StandardScaler() 217 | x2 = scaler2.fit_transform(x) 218 | 219 | label_df = pd.read_csv(LABEL_PATH) 220 | y_meta = np.array(label_df.metastasis.tolist()) 221 | y_major_axis = np.array(label_df.major_axis.tolist()) 222 | y_major_axis_log = np.log(y_major_axis + 1) # 나중에 꼭 e 과 1 빼주기 223 | 224 | names = ["RFR"] 225 | for i in range(len(ml_models)): 226 | model = ml_models[i] 227 | model.fit(x2,y_major_axis_log) 228 | pred = model.predict(x2) 229 | pred = np.exp(pred) - 1 230 | 231 | for i in range(len(pred)): 232 | if pred[i] < 500: 233 | pred[i] = 0 234 | acc_sc = acc_score(y_major_axis, pred) 235 | print('major_axis, acc score : ',acc_sc) 236 | 237 | print(model.feature_importances_) 238 | return model 239 | 240 | def check_train_score(pd_feature): 241 | LABEL_PATH = '/data/train/label.csv' 242 | label_df = pd.read_csv(LABEL_PATH) 243 | y_meta = np.array(label_df.metastasis.tolist()) 244 | y_major_axis = np.array(label_df.major_axis.tolist()) 245 | 246 | # check train meta score 247 | cols_name = list(pd_feature.columns) 248 | best_auc = 0 249 | best_auc_col = 0 250 | for i in range(len(cols_name)): 251 | 252 | col_idx = i 253 | col_name = cols_name[col_idx] 254 | predict_meta = pd_feature.iloc[:,col_idx].tolist() 255 | 256 | auc_score = roc_auc_score(y_meta, predict_meta) 257 | print(col_name, 'AUC score : ',auc_score) 258 | 259 | if best_auc < auc_score : 260 | best_auc = auc_score 261 | best_auc_col = col_idx 262 | 263 | # check train major_axis score 264 | best_threshold = 0 265 | best_acc_sc = 0 266 | major_cols = [0,2,4,6,11,13,15,17, 22,24,26,28] 267 | for i in range(len(major_cols)): 268 | col_idx = major_cols[i] 269 | col_name = cols_name[col_idx] 270 | predict_major = np.array(pd_feature.iloc[:, col_idx].tolist()) * 1.757 271 | 272 | acc_sc = acc_score(y_major_axis, predict_major) 273 | print('--------------------------------------') 274 | 
print(col_name, 'ACC score : ',acc_sc) 275 | ## 276 | print('----------- set thresholds ----------') 277 | 278 | 279 | thresholds = [50,100,250, 300,350,400,450,500,550,600,1000] 280 | for j in range(len(thresholds)): 281 | threshold = thresholds[j] 282 | tmp_major = [] 283 | for k in range(len(predict_major)): 284 | if predict_major[k] < threshold: 285 | tmp_major.append(0) 286 | else : 287 | tmp_major.append(predict_major[k]) 288 | acc_sc = acc_score(y_major_axis, tmp_major) 289 | print(threshold, ' threshold acc_score : ',acc_sc) 290 | if acc_sc > best_acc_sc : 291 | best_acc_sc = acc_sc 292 | best_threshold = threshold 293 | best_acc_col = col_idx 294 | return best_auc_col , best_acc_col, best_threshold 295 | -------------------------------------------------------------------------------- /preprocess/prep.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import openslide 7 | import staintools 8 | 9 | from glob import glob 10 | from datetime import datetime 11 | from PIL import Image 12 | from tqdm import tqdm_notebook as tqdm 13 | from openslide.deepzoom import DeepZoomGenerator 14 | from skimage.filters import threshold_otsu, threshold_yen 15 | 16 | import warnings 17 | warnings.filterwarnings("ignore") 18 | 19 | 20 | class Preprocess: 21 | def __init__(self, 22 | patch_size=256, 23 | is_norm=False, 24 | target_norm_path=False, 25 | mode='train', 26 | server='kakao'): ##### add option for calculating auc score 27 | 28 | if mode == 'train': 29 | phase = 'train' 30 | else: 31 | phase = 'test' 32 | 33 | self.mode = mode 34 | self.patch_size = patch_size 35 | self.server = server 36 | 37 | if self.server == 'kakao': 38 | self.slide_dir = f'/data/{phase}/level4/Image/' 39 | self.mask_dir = f'/data/{phase}/level4/Mask/' 40 | self.img_mask_pairs_path = '/data/volume/dataset/level4/' 41 | self.patches_img_path = '/data/volume/dataset/level4/img/' 42 | self.patches_mask_path = '/data/volume/dataset/level4/mask/' 43 | if is_norm: 44 | self.patches_img_norm_path = '/data/volume/dataset/level4/img_norm/' 45 | self.patches_mask_norm_path = '/data/volume/dataset/level4/mask_norm/' 46 | elif self.server == 'local': 47 | self.slide_dir = f'./data/{phase}/level4/Image/' 48 | self.mask_dir = f'./data/{phase}/level4/Mask/' 49 | self.img_mask_pairs_path = './data/volume/dataset/level4/' 50 | self.patches_img_path = './data/volume/dataset/level4/img/' 51 | self.patches_mask_path = './data/volume/dataset/level4/mask/' 52 | if is_norm: 53 | self.patches_img_norm_path = './data/volume/dataset/level4/img_norm/' 54 | self.patches_mask_norm_path = './data/volume/dataset/level4/mask_norm/' 55 | 56 | if is_norm: 57 | print('*'*20, 'Color Normalization : True', '*'*20) 58 | self.is_norm = is_norm 59 | self.normalizer = self.stain_norm_func(target_norm_path) 60 | 61 | 62 | def _make_directory(self): 63 | '''학습 시킬 데이터셋(patches)을 저장하는 함수''' 64 | 65 | if self.server == 'local': 66 | dir_path = './data/volume' 67 | elif self.server == 'kakao': 68 | dir_path = '/data/volume' 69 | 70 | if not os.path.exists(f'{dir_path}/dataset'): 71 | os.mkdir(f'{dir_path}/dataset') 72 | if not os.path.exists(f'{dir_path}/dataset/level4'): 73 | os.mkdir(f'{dir_path}/dataset/level4') 74 | 75 | if self.is_norm: 76 | if not os.path.exists(f'{dir_path}/dataset/level4/img'): 77 | os.mkdir(f'{dir_path}/dataset/level4/img') 78 | if not os.path.exists(f'{dir_path}/dataset/level4/mask'): 79 | 
os.mkdir(f'{dir_path}/dataset/level4/mask') 80 | if not os.path.exists(f'{dir_path}/dataset/level4/img_norm'): 81 | os.mkdir(f'{dir_path}/dataset/level4/img_norm') 82 | if not os.path.exists(f'{dir_path}/dataset/level4/mask_norm'): 83 | os.mkdir(f'{dir_path}/dataset/level4/mask_norm') 84 | else: 85 | if not os.path.exists(f'{dir_path}/dataset/level4/img'): 86 | os.mkdir(f'{dir_path}/dataset/level4/img') 87 | if not os.path.exists(f'{dir_path}/dataset/level4/mask'): 88 | os.mkdir(f'{dir_path}/dataset/level4/mask') 89 | 90 | print('Created Directories') 91 | return None 92 | 93 | 94 | def find_patches_from_slide(self, 95 | slide_path, 96 | mask_path, 97 | patch_size=256, 98 | filter_nontissue=True): 99 | ''' 100 | Returns a DataFrame of all patches in slide 101 | Args: 102 | - slide_path: path of slide 103 | - truth_path: path of truth(mask) 104 | - patch_size: patch size for samples 105 | - filter_non_tissue: remove samples no tissue detected 106 | Returns: 107 | - samples: patches samples from slide 108 | - positive: > 0 if tumor else not tumor 0 109 | ''' 110 | 111 | with openslide.open_slide(slide_path) as slide: 112 | tiles = DeepZoomGenerator(slide, tile_size=patch_size, overlap=0, limit_bounds=False) 113 | if patch_size == 256 : 114 | size = tiles.level_tiles[tiles.level_count-1] 115 | # print(f'tile size : {size}') # (23, 58) 116 | else : 117 | size = tiles.level_tiles[tiles.level_count-1 -3] 118 | thumb_slide = slide.get_thumbnail(size) 119 | # print(f'thumb_slide size : {thumb_slide.size}') 120 | 121 | 122 | 123 | if self.mode == 'train': 124 | with openslide.open_slide(mask_path) as mask: 125 | thumb_mask = mask.get_thumbnail(size) # (23, 58) 126 | # print(f'thumb_mask size : {thumb_mask.size}') 127 | 128 | # ############## is tissue 부분 ############## 129 | slide4_grey = np.array(thumb_slide.convert('L')) 130 | binary = slide4_grey < 255 # white = 255 131 | slide4_not_white = slide4_grey[binary] # white = 255 132 | thresh = threshold_yen(slide4_not_white) 133 | # thresh = threshold_otsu(slide4_not_white) 134 | # print(f'current thersh : {thresh}') 135 | 136 | height, width = slide4_grey.shape # (height, width) 137 | for h in range(height): 138 | for w in range(width): 139 | if slide4_grey[h, w] > thresh: 140 | binary[h, w] = False 141 | 142 | # create pathces DataFrame 143 | patches = pd.DataFrame(pd.DataFrame(binary).stack()) 144 | patches['is_tissue'] = patches[0] 145 | patches = pd.DataFrame(pd.DataFrame(binary).stack(), columns=['is_tissue']) 146 | patches.loc[:, 'slide_path'] = slide_path 147 | 148 | 149 | # ############## is_tumor 부분 ############## 150 | if self.mode == 'train': 151 | truth_img_grey = np.array(thumb_mask.convert('L')) 152 | positive = truth_img_grey.mean() 153 | 154 | if positive > 0: # tumor인 경우 155 | # print('positive(tumor)') 156 | truth_not_black = truth_img_grey[truth_img_grey > 0] 157 | try: 158 | m_thresh = threshold_otsu(truth_not_black) 159 | except: 160 | m_thresh = 190 161 | patches_y = pd.DataFrame(pd.DataFrame(truth_img_grey).stack(), columns=['is_tumor']) 162 | patches_y['is_tumor'] = patches_y['is_tumor'] > m_thresh # 190 # threshold method를 사용 안한 이유? 
163 | samples = pd.concat([patches, patches_y], axis=1) # concatenate the slide patches with the mask patches_y
164 | else:
165 | # print('negative(not tumor)')
166 | samples = patches
167 | samples.loc[:, 'is_tumor'] = False
168 |
169 | if self.mode == 'test':
170 | # Inference phase
171 | positive = 0
172 | samples = patches
173 |
174 | if filter_nontissue == True: # keep only the tissue patches
175 | samples = samples[samples['is_tissue']==True]
176 |
177 | samples['tile_loc'] = samples.index.tolist()
178 | samples.reset_index(inplace=True, drop=True)
179 | # print(f"samples['is_tumor'].value_counts()\n{samples['is_tumor'].value_counts()}")
180 |
181 | return samples, positive
182 |
183 |
184 | def save_patches(self):
185 | ''' Save the extracted patches '''
186 |
187 | prepro_start_time = datetime.now()
188 | print('='*20, 'Step 1 - create patches', '='*20)
189 |
190 |
191 | # create directory if not exist
192 | self._make_directory()
193 |
194 | # create slide_path, mask_path pair
195 | slide_path_list = glob(f'{self.slide_dir}*.png')
196 | mask_path_list = glob(f'{self.mask_dir}*.png')
197 |
198 | slide_path_dict, mask_path_dict = {}, {}
199 | for slide_path, mask_path in zip(slide_path_list, mask_path_list):
200 | # slide
201 | slide_name, _ = os.path.splitext(slide_path)
202 | slide_idx = slide_name.split('_')[-1]
203 |
204 | # mask
205 | mask_name, _ = os.path.splitext(mask_path)
206 | mask_idx = mask_name.split('_')[-1]
207 |
208 | # update each dictionary
209 | slide_path_dict[slide_idx] = slide_path
210 | mask_path_dict[mask_idx] = mask_path
211 |
212 | slide_mask_path_pairs = [(idx, slide_path, mask_path_dict[idx])
213 | for idx, slide_path in slide_path_dict.items()]
214 |
215 | slide_mask_pairs = []
216 | slide_mask_norm_pairs = []
217 | for cnt, (s_idx, slide_path, mask_path) in enumerate(slide_mask_path_pairs):
218 | print(f'Extracting patches from {slide_path} ...')
219 | samples, positive = self.find_patches_from_slide(slide_path, mask_path)
220 |
221 | if positive: # tumor case
222 | samples_pos = samples[samples['is_tumor'] == True]
223 | samples_neg = samples[samples['is_tumor'] == False]
224 | total_pos = len(samples_pos)
225 | sample_num = 100
226 | if total_pos < 10:
227 | sample_num = total_pos * 5
228 | elif total_pos < 50:
229 | sample_num = total_pos * 2
230 |
231 | samples_pos = samples_pos.sample(sample_num, random_state=42, replace=True)
232 | samples_neg = samples_neg.sample(sample_num, random_state=42, replace=True)
233 | samples = samples_neg.append(samples_pos)
234 | else: # non-tumor case
235 | sample_num = 100
236 | total_neg = len(samples)
237 | if total_neg > 100:
238 | samples = samples.sample(sample_num, random_state=42, replace=True)
239 |
240 |
241 | with openslide.open_slide(slide_path) as slide:
242 | with openslide.open_slide(mask_path) as mask:
243 | slide_tiles = DeepZoomGenerator(slide, tile_size=self.patch_size, overlap=0, limit_bounds=False)
244 | mask_tiles = DeepZoomGenerator(mask, tile_size=self.patch_size, overlap=0, limit_bounds=False)
245 | for p_idx, (tile_loc, is_tumor) in enumerate(
246 | zip(samples['tile_loc'].tolist(), samples['is_tumor'].tolist())):
247 |
248 | y, x = tile_loc
249 | img = slide_tiles.get_tile(slide_tiles.level_count-1, (x, y))
250 | mask = mask_tiles.get_tile(mask_tiles.level_count-1, (x, y))
251 | slide_mask_pairs.append((f'{self.patches_img_path}slide{s_idx}_{p_idx}.png',
252 | f'{self.patches_mask_path}slide{s_idx}_{p_idx}.png'))
253 | img.save(f'{self.patches_img_path}slide{s_idx}_{p_idx}.png') # slide#_patch#.png 254
mask.save(f'{self.patches_mask_path}slide{s_idx}_{p_idx}.png') 255 | if self.is_norm: 256 | try: 257 | img = np.array(img, dtype=np.uint8) 258 | to_transform = staintools.LuminosityStandardizer.standardize(img) 259 | img_normed = self.normalizer.transform(to_transform) 260 | img_normed = Image.fromarray(img_normed) 261 | slide_mask_norm_pairs.append((f'{self.patches_img_norm_path}slide_norm{s_idx}_{p_idx}.png', 262 | f'{self.patches_mask_norm_path}slide_norm{s_idx}_{p_idx}.png')) 263 | img_normed.save(f'{self.patches_img_norm_path}slide_norm{s_idx}_{p_idx}.png') # slide#_patch#.png 264 | mask.save(f'{self.patches_mask_norm_path}slide_norm{s_idx}_{p_idx}.png') 265 | except: 266 | continue 267 | 268 | if cnt % 5 == 0: 269 | # img - mask pair 저장하기 270 | with open(f'{self.img_mask_pairs_path}img_mask_pairs_{cnt}.pkl', 'wb') as f: 271 | pickle.dump(slide_mask_pairs, f) 272 | with open(f'{self.img_mask_pairs_path}img_mask_norm_pairs_{cnt}.pkl', 'wb') as f: 273 | pickle.dump(slide_mask_norm_pairs, f) 274 | 275 | 276 | 277 | # img - mask pair 저장하기 278 | with open(f'{self.img_mask_pairs_path}img_mask_pairs.pkl', 'wb') as f: 279 | pickle.dump(slide_mask_pairs, f) 280 | with open(f'{self.img_mask_pairs_path}img_mask_norm_pairs.pkl', 'wb') as f: 281 | pickle.dump(slide_mask_norm_pairs, f) 282 | 283 | prepro_end_time = datetime.now() 284 | print('preprocessing patches img time : %.1f minutes'%((prepro_end_time - prepro_start_time).seconds/60)) 285 | print('='*50) 286 | return None 287 | 288 | 289 | def stain_norm_func(self, target_image_path): 290 | target = staintools.read_image(target_image_path) 291 | target = staintools.LuminosityStandardizer.standardize(target) 292 | normalizer = staintools.StainNormalizer(method='vahadane') 293 | normalizer.fit(target) 294 | return normalizer 295 | 296 | 297 | if __name__ == "__main__": 298 | preprocess = Preprocess(patch_size=256, 299 | is_norm=True, 300 | target_norm_path='./target_norm.png', 301 | mode='train', 302 | server='local') 303 | 304 | preprocess.save_patches() 305 | 306 | --------------------------------------------------------------------------------