├── data ├── Mask │ └── .gitkeep ├── Meta │ ├── .gitkeep │ └── meta_info.csv ├── Clean │ ├── .gitkeep │ ├── Image │ │ └── .gitkeep │ └── Mask │ │ └── .gitkeep └── Image │ └── .gitkeep ├── LIDC-IDRI └── .gitignore ├── requirements.txt ├── figures └── output_segment.png ├── lung.conf ├── config_file_create.py ├── utils.py ├── README.md ├── prepare_dataset.py └── notebook ├── make_label.ipynb └── .ipynb_checkpoints └── make_label-checkpoint.ipynb /data/Mask/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/Meta/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /LIDC-IDRI/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/Clean/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/Image/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/Clean/Image/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/Clean/Mask/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | MedPy==0.4.0 2 | pylidc==0.2.1 3 | tqdm==4.42.1 
-------------------------------------------------------------------------------- /data/Meta/meta_info.csv: -------------------------------------------------------------------------------- 1 | patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean 2 | -------------------------------------------------------------------------------- /figures/output_segment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaeho3690/LIDC-IDRI-Preprocessing/HEAD/figures/output_segment.png -------------------------------------------------------------------------------- /lung.conf: -------------------------------------------------------------------------------- 1 | [prepare_dataset] 2 | lidc_dicom_path = ./LIDC-IDRI 3 | mask_path = ./data/Mask 4 | image_path = ./data/Image 5 | clean_path_image = ./data/Clean/Image 6 | clean_path_mask = ./data/Clean/Mask 7 | meta_path = ./data/Meta/ 8 | mask_threshold = 8 9 | 10 | [pylidc] 11 | confidence_level = 0.5 12 | padding_size = 512 13 | 14 | -------------------------------------------------------------------------------- /config_file_create.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | 3 | if __name__ == "__main__": 4 | # This python file creates a configuartion file. 
Change the below directories for your application 5 | 6 | config = ConfigParser() 7 | 8 | # prepare_dataset.py configuration 9 | config['prepare_dataset'] = { 10 | #Path To LIDC Dataset 11 | 'LIDC_DICOM_PATH': './LIDC-IDRI', 12 | # Directory to save the output files 13 | # Directory for masks 14 | 'MASK_PATH':'./data/Mask', 15 | # Directory for images 16 | 'IMAGE_PATH':'./data/Image', 17 | # To save images and mask that doesn't contain any nodule or cancer 18 | # These images will be used later to evaluate our model 19 | 'CLEAN_PATH_IMAGE':'./data/Clean/Image', 20 | 'CLEAN_PATH_MASK':'./data/Clean/Mask', 21 | # CSV file containing nodule information, malignancy, train test split 22 | 'META_PATH': './data/Meta/', 23 | # Mask Threshold is the np.sum(MASK) threshold. Some Masks are too small. We remove these small images,masks as they might act as outliers 24 | # The threshold 8 was decided by empirical evaluation. 25 | 'Mask_Threshold':8 26 | } 27 | 28 | 29 | # This is the configuration file for pylidc library 30 | config['pylidc'] = { 31 | # Confidence level determines the overlap between the 4 doctors who have made annotation 32 | 'confidence_level': 0.5, 33 | # 512 determines the size of the image 34 | 'padding_size': 512 35 | } 36 | 37 | # Create the configuration file in lung.conf 38 | with open('./lung.conf', 'w') as f: 39 | config.write(f) 40 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import numpy as np 4 | 5 | from medpy.filter.smoothing import anisotropic_diffusion 6 | from scipy.ndimage import median_filter 7 | from skimage import measure, morphology 8 | import scipy.ndimage as ndimage 9 | from sklearn.cluster import KMeans 10 | 11 | def is_dir_path(string): 12 | if os.path.isdir(string): 13 | return string 14 | else: 15 | raise NotADirectoryError(string) 16 | 17 | def segment_lung(img): 18 
| #function sourced from https://www.kaggle.com/c/data-science-bowl-2017#tutorial 19 | """ 20 | This segments the Lung Image(Don't get confused with lung nodule segmentation) 21 | """ 22 | mean = np.mean(img) 23 | std = np.std(img) 24 | img = img-mean 25 | img = img/std 26 | 27 | middle = img[100:400,100:400] 28 | mean = np.mean(middle) 29 | max = np.max(img) 30 | min = np.min(img) 31 | #remove the underflow bins 32 | img[img==max]=mean 33 | img[img==min]=mean 34 | 35 | #apply median filter 36 | img= median_filter(img,size=3) 37 | #apply anistropic non-linear diffusion filter- This removes noise without blurring the nodule boundary 38 | img= anisotropic_diffusion(img) 39 | 40 | kmeans = KMeans(n_clusters=2).fit(np.reshape(middle,[np.prod(middle.shape),1])) 41 | centers = sorted(kmeans.cluster_centers_.flatten()) 42 | threshold = np.mean(centers) 43 | thresh_img = np.where(img40 and B[2]<472: 53 | good_labels.append(prop.label) 54 | mask = np.ndarray([512,512],dtype=np.int8) 55 | mask[:] = 0 56 | # 57 | # The mask here is the mask for the lungs--not the nodes 58 | # After just the lungs are left, we do another large dilation 59 | # in order to fill in and out the lung mask 60 | # 61 | for N in good_labels: 62 | mask = mask + np.where(labels==N,1,0) 63 | mask = morphology.dilation(mask,np.ones([10,10])) # one last dilation 64 | # mask consists of 1 and 0. 
Thus by multiplying with the original image, sections with 1 will remain 65 | return mask*img 66 | 67 | def count_params(model): 68 | return sum(p.numel() for p in model.parameters() if p.requires_grad) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LIDC Preprocessing with Pylidc library 2 | [Medium Link](https://medium.com/@jaeho3690/how-to-start-your-very-first-lung-cancer-detection-project-using-python-part-1-3ab490964aae) 3 | 4 | This repository preprocesses the LIDC-IDRI dataset. We use the pylidc library to save nodule images in the .npy file format. 5 | The code file structure is as below 6 | 7 | ``` 8 | +-- LIDC-IDRI 9 | | # This file should contain the original LIDC dataset 10 | +-- data 11 | | # This file contains the preprocessed data 12 | | |-- _Clean 13 | | +-- Image 14 | | +-- Mask 15 | | |-- Image 16 | | +-- LIDC-IDRI-0001 17 | | +-- LIDC-IDRI-0002 18 | | +-- ... 19 | | |-- Mask 20 | | +-- LIDC-IDRI-0001 21 | | +-- LIDC-IDRI-0002 22 | | +-- ... 23 | | |-- Meta 24 | | +-- meta.csv 25 | +-- figures 26 | | # Save figures here 27 | +-- notebook 28 | | # This notebook file edits the meta.csv file to make indexing easier 29 | +-- config_file_create.py 30 | | # Creates configuration file. You can edit the hyperparameters of the Pylidc library here 31 | +-- prepare_dataset.py 32 | | # Run this file to preprocess the LIDC-IDRI dicom files. Results would be saved in the data folder 33 | +-- utils.py 34 | # Utility script 35 | 36 | ``` 37 | ![Segmented Image](/figures/output_segment.png) 38 | ## 1.Download LIDC-IDRI dataset 39 | First you would have to download the whole LIDC-IDRI [dataset](https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI). 40 | On the website, you will see the Data Access section. You would need to click the Search button to specify the image modality. 
41 | I clicked on CT only and downloaded a total of 1010 patients. 42 | 43 | ## 2. Set up pylidc library 44 | You would need to set up the pylidc library for preprocessing. There are instructions in the [documentation](https://pylidc.github.io/install.html). 45 | Make sure to create the configuration file as stated in the instructions. Right now I am using library version 0.2.1 46 | 47 | ## 3. Explanation for each python file 48 | ```bash 49 | python config_file_create.py 50 | ``` 51 | This python script contains the configuration settings for the directories. Change the directory settings to where you want to save your output files. Without modification, it will automatically save the preprocessed files in the data folder. 52 | Running this script will create a configuration file 'lung.conf' 53 | 54 | The utils.py script contains a function to segment the lung. Segmenting the lung and segmenting the nodule are two different things. Segmenting the lung leaves the lung region only, while segmenting the nodule is finding prospective lung nodule regions in the lung. Don't get confused. 55 | 56 | ```bash 57 | python prepare_dataset.py 58 | ``` 59 | This python script will create the image and mask files and save them to the data folder. The script will also create a meta_info.csv file containing information about whether the nodule is 60 | cancerous. In the LIDC Dataset, each nodule is annotated by a maximum of 4 doctors. Each doctor has annotated the malignancy of each nodule on a scale of 1 to 5. 61 | I have chosen the median high label for each nodule as the final malignancy. The meta_info.csv data contains all the information and will be used later in the classification stage. 62 | This prepare_dataset.py looks for the lung.conf file. The configuration file should be in the same directory. Running this script will output .npy files for each slice with a size of 512*512 63 | 64 | To make a train/val/test split, run the Jupyter notebook in the notebook folder. 
This will create additional clean_meta.csv and meta.csv files containing information about the nodules and the train/val/test split. 65 | 66 | A nodule may contain several slices of images. Some researchers have treated each of these slices as independent of one another. 67 | However, I believe that these image slices should not be seen as independent from adjacent slice images. 68 | Thus, I have tried to keep all images of the same nodule in the same split. Although this approach reduces the accuracy of test results, it seems to be the honest approach. 69 | 70 | 71 | 72 | ## 4. Data folder 73 | The data folder stores all the output images and masks. 74 | Inside the data folder there are 4 subfolders. 75 | 76 | ### 1. Clean 77 | 78 | The Clean folder contains two subfolders: the Image and Mask folders. 79 | Some patients don't have nodules. In the actual implementation, a person will have more slices of image without a nodule. To evaluate our generalization in real-world applications, we save lung images without nodules for testing purposes. 80 | These images will be used in the test set. 81 | 82 | ### 2. Image 83 | 84 | The Image folder contains the segmented lung .npy files, in one folder per patient 85 | 86 | ### 3. Mask 87 | 88 | The Mask folder contains the mask files for the nodule. 89 | 90 | ### 4. Meta 91 | 92 | The Meta folder contains the meta.csv file. The csv file contains information about each image slice: malignancy, whether the image should be used in train/val/test for the whole process, etc. 93 | 94 | 95 | ## 5. Contributing and Acknowledgement 96 | I started this lung cancer detection project a year ago. I was really a newbie to python. I didn't even understand what a directory setting was at the time! However, I had to complete this project 97 | for some personal reasons. I looked through Google and other GitHub repositories. But most of them were too hard to understand and the code itself lacked information. 
I hope my codes here could help 98 | other researchers first starting to do lung cancer detection projects. Please give a star if you found this repository useful. 99 | 100 | here is the link of github where I learned a lot from. Some of the codes are sourced from below. 101 | 1. https://github.com/mikejhuang/LungNoduleDetectionClassification 102 | 103 | -------------------------------------------------------------------------------- /prepare_dataset.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from pathlib import Path 4 | import glob 5 | from configparser import ConfigParser 6 | import pandas as pd 7 | import numpy as np 8 | import warnings 9 | import pylidc as pl 10 | from tqdm import tqdm 11 | from statistics import median_high 12 | 13 | from utils import is_dir_path,segment_lung 14 | from pylidc.utils import consensus 15 | from PIL import Image 16 | 17 | warnings.filterwarnings(action='ignore') 18 | 19 | # Read the configuration file generated from config_file_create.py 20 | parser = ConfigParser() 21 | parser.read('lung.conf') 22 | 23 | #Get Directory setting 24 | DICOM_DIR = is_dir_path(parser.get('prepare_dataset','LIDC_DICOM_PATH')) 25 | MASK_DIR = is_dir_path(parser.get('prepare_dataset','MASK_PATH')) 26 | IMAGE_DIR = is_dir_path(parser.get('prepare_dataset','IMAGE_PATH')) 27 | CLEAN_DIR_IMAGE = is_dir_path(parser.get('prepare_dataset','CLEAN_PATH_IMAGE')) 28 | CLEAN_DIR_MASK = is_dir_path(parser.get('prepare_dataset','CLEAN_PATH_MASK')) 29 | META_DIR = is_dir_path(parser.get('prepare_dataset','META_PATH')) 30 | 31 | #Hyper Parameter setting for prepare dataset function 32 | mask_threshold = parser.getint('prepare_dataset','Mask_Threshold') 33 | 34 | #Hyper Parameter setting for pylidc 35 | confidence_level = parser.getfloat('pylidc','confidence_level') 36 | padding = parser.getint('pylidc','padding_size') 37 | 38 | class MakeDataSet: 39 | def __init__(self, LIDC_Patients_list, 
IMAGE_DIR, MASK_DIR,CLEAN_DIR_IMAGE,CLEAN_DIR_MASK,META_DIR, mask_threshold, padding, confidence_level=0.5): 40 | self.IDRI_list = LIDC_Patients_list 41 | self.img_path = IMAGE_DIR 42 | self.mask_path = MASK_DIR 43 | self.clean_path_img = CLEAN_DIR_IMAGE 44 | self.clean_path_mask = CLEAN_DIR_MASK 45 | self.meta_path = META_DIR 46 | self.mask_threshold = mask_threshold 47 | self.c_level = confidence_level 48 | self.padding = [(padding,padding),(padding,padding),(0,0)] 49 | self.meta = pd.DataFrame(index=[],columns=['patient_id','nodule_no','slice_no','original_image','mask_image','malignancy','is_cancer','is_clean']) 50 | 51 | 52 | def calculate_malignancy(self,nodule): 53 | # Calculate the malignancy of a nodule with the annotations made by 4 doctors. Return median high of the annotated cancer, True or False label for cancer 54 | # if median high is above 3, we return a label True for cancer 55 | # if it is below 3, we return a label False for non-cancer 56 | # if it is 3, we return ambiguous 57 | list_of_malignancy =[] 58 | for annotation in nodule: 59 | list_of_malignancy.append(annotation.malignancy) 60 | 61 | malignancy = median_high(list_of_malignancy) 62 | if malignancy > 3: 63 | return malignancy,True 64 | elif malignancy < 3: 65 | return malignancy, False 66 | else: 67 | return malignancy, 'Ambiguous' 68 | def save_meta(self,meta_list): 69 | """Saves the information of nodule to csv file""" 70 | tmp = pd.Series(meta_list,index=['patient_id','nodule_no','slice_no','original_image','mask_image','malignancy','is_cancer','is_clean']) 71 | self.meta = self.meta.append(tmp,ignore_index=True) 72 | 73 | def prepare_dataset(self): 74 | # This is to name each image and mask 75 | prefix = [str(x).zfill(3) for x in range(1000)] 76 | 77 | # Make directory 78 | if not os.path.exists(self.img_path): 79 | os.makedirs(self.img_path) 80 | if not os.path.exists(self.mask_path): 81 | os.makedirs(self.mask_path) 82 | if not os.path.exists(self.clean_path_img): 83 | 
os.makedirs(self.clean_path_img) 84 | if not os.path.exists(self.clean_path_mask): 85 | os.makedirs(self.clean_path_mask) 86 | if not os.path.exists(self.meta_path): 87 | os.makedirs(self.meta_path) 88 | 89 | IMAGE_DIR = Path(self.img_path) 90 | MASK_DIR = Path(self.mask_path) 91 | CLEAN_DIR_IMAGE = Path(self.clean_path_img) 92 | CLEAN_DIR_MASK = Path(self.clean_path_mask) 93 | 94 | 95 | 96 | for patient in tqdm(self.IDRI_list): 97 | pid = patient #LIDC-IDRI-0001~ 98 | scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first() 99 | nodules_annotation = scan.cluster_annotations() 100 | vol = scan.to_volume() 101 | print("Patient ID: {} Dicom Shape: {} Number of Annotated Nodules: {}".format(pid,vol.shape,len(nodules_annotation))) 102 | 103 | patient_image_dir = IMAGE_DIR / pid 104 | patient_mask_dir = MASK_DIR / pid 105 | Path(patient_image_dir).mkdir(parents=True, exist_ok=True) 106 | Path(patient_mask_dir).mkdir(parents=True, exist_ok=True) 107 | 108 | if len(nodules_annotation) > 0: 109 | # Patients with nodules 110 | for nodule_idx, nodule in enumerate(nodules_annotation): 111 | # Call nodule images. Each Patient will have at maximum 4 annotations as there are only 4 doctors 112 | # This current for loop iterates over total number of nodules in a single patient 113 | mask, cbbox, masks = consensus(nodule,self.c_level,self.padding) 114 | lung_np_array = vol[cbbox] 115 | 116 | # We calculate the malignancy information 117 | malignancy, cancer_label = self.calculate_malignancy(nodule) 118 | 119 | for nodule_slice in range(mask.shape[2]): 120 | # This second for loop iterates over each single nodule. 121 | # There are some mask sizes that are too small. These may hinder training. 122 | if np.sum(mask[:,:,nodule_slice]) <= self.mask_threshold: 123 | continue 124 | # Segment Lung part only 125 | lung_segmented_np_array = segment_lung(lung_np_array[:,:,nodule_slice]) 126 | # I am not sure why but some values are stored as -0. 
<- this may result in datatype error in pytorch training # Not sure 127 | lung_segmented_np_array[lung_segmented_np_array==-0] =0 128 | # This itereates through the slices of a single nodule 129 | # Naming of each file: NI= Nodule Image, MA= Mask Original 130 | nodule_name = "{}_NI{}_slice{}".format(pid[-4:],prefix[nodule_idx],prefix[nodule_slice]) 131 | mask_name = "{}_MA{}_slice{}".format(pid[-4:],prefix[nodule_idx],prefix[nodule_slice]) 132 | meta_list = [pid[-4:],nodule_idx,prefix[nodule_slice],nodule_name,mask_name,malignancy,cancer_label,False] 133 | 134 | self.save_meta(meta_list) 135 | np.save(patient_image_dir / nodule_name,lung_segmented_np_array) 136 | np.save(patient_mask_dir / mask_name,mask[:,:,nodule_slice]) 137 | else: 138 | print("Clean Dataset",pid) 139 | patient_clean_dir_image = CLEAN_DIR_IMAGE / pid 140 | patient_clean_dir_mask = CLEAN_DIR_MASK / pid 141 | Path(patient_clean_dir_image).mkdir(parents=True, exist_ok=True) 142 | Path(patient_clean_dir_mask).mkdir(parents=True, exist_ok=True) 143 | #There are patients that don't have nodule at all. Meaning, its a clean dataset. 
We need to use this for validation 144 | for slice in range(vol.shape[2]): 145 | if slice >50: 146 | break 147 | lung_segmented_np_array = segment_lung(vol[:,:,slice]) 148 | lung_segmented_np_array[lung_segmented_np_array==-0] =0 149 | lung_mask = np.zeros_like(lung_segmented_np_array) 150 | 151 | #CN= CleanNodule, CM = CleanMask 152 | nodule_name = "{}/{}_CN001_slice{}".format(pid,pid[-4:],prefix[slice]) 153 | mask_name = "{}/{}_CM001_slice{}".format(pid,pid[-4:],prefix[slice]) 154 | meta_list = [pid[-4:],slice,prefix[slice],nodule_name,mask_name,0,False,True] 155 | self.save_meta(meta_list) 156 | np.save(patient_clean_dir_image / nodule_name, lung_segmented_np_array) 157 | np.save(patient_clean_dir_mask / mask_name, lung_mask) 158 | 159 | 160 | 161 | print("Saved Meta data") 162 | self.meta.to_csv(self.meta_path+'meta_info.csv',index=False) 163 | 164 | 165 | 166 | if __name__ == '__main__': 167 | # I found out that simply using os.listdir() includes the gitignore file 168 | LIDC_IDRI_list= [f for f in os.listdir(DICOM_DIR) if not f.startswith('.')] 169 | LIDC_IDRI_list.sort() 170 | 171 | 172 | test= MakeDataSet(LIDC_IDRI_list,IMAGE_DIR,MASK_DIR,CLEAN_DIR_IMAGE,CLEAN_DIR_MASK,META_DIR,mask_threshold,padding,confidence_level) 173 | test.prepare_dataset() 174 | -------------------------------------------------------------------------------- /notebook/make_label.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Create label for training in future" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from sklearn.model_selection import train_test_split" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | 
"meta = pd.read_csv('../meta_info.csv')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
patient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_clean
0100LIDC-IDRI-0001/0001_NI000_slice000LIDC-IDRI-0001/0001_MA000_slice0005TrueFalse
1101LIDC-IDRI-0001/0001_NI000_slice001LIDC-IDRI-0001/0001_MA000_slice0015TrueFalse
2102LIDC-IDRI-0001/0001_NI000_slice002LIDC-IDRI-0001/0001_MA000_slice0025TrueFalse
3103LIDC-IDRI-0001/0001_NI000_slice003LIDC-IDRI-0001/0001_MA000_slice0035TrueFalse
4104LIDC-IDRI-0001/0001_NI000_slice004LIDC-IDRI-0001/0001_MA000_slice0045TrueFalse
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " patient_id nodule_no slice_no original_image \\\n", 128 | "0 1 0 0 LIDC-IDRI-0001/0001_NI000_slice000 \n", 129 | "1 1 0 1 LIDC-IDRI-0001/0001_NI000_slice001 \n", 130 | "2 1 0 2 LIDC-IDRI-0001/0001_NI000_slice002 \n", 131 | "3 1 0 3 LIDC-IDRI-0001/0001_NI000_slice003 \n", 132 | "4 1 0 4 LIDC-IDRI-0001/0001_NI000_slice004 \n", 133 | "\n", 134 | " mask_image malignancy is_cancer is_clean \n", 135 | "0 LIDC-IDRI-0001/0001_MA000_slice000 5 True False \n", 136 | "1 LIDC-IDRI-0001/0001_MA000_slice001 5 True False \n", 137 | "2 LIDC-IDRI-0001/0001_MA000_slice002 5 True False \n", 138 | "3 LIDC-IDRI-0001/0001_MA000_slice003 5 True False \n", 139 | "4 LIDC-IDRI-0001/0001_MA000_slice004 5 True False " 140 | ] 141 | }, 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "meta.head()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 4, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# NI= Nodule Image, MA = Mask Original , CN = Clean Nodule , CM = Clean Mask" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "def is_nodule(row):\n", 167 | " if row[20:22] =='NI':\n", 168 | " return True\n", 169 | " else:\n", 170 | " return False" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "meta['is_nodule']= meta['original_image'].apply(lambda row: is_nodule(row))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Lets separate Clean meta and meta data\n", 189 | "clean_meta = meta[meta['is_nodule']==False]\n", 190 | "clean_meta.reset_index(inplace=True)\n", 191 | "meta = meta[meta['is_nodule']==True]\n", 192 | 
"meta.reset_index(inplace=True)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/html": [ 203 | "
\n", 204 | "\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_nodule
02982800LIDC-IDRI-0028/0028_CN001_slice000LIDC-IDRI-0028/0028_CM001_slice0000FalseTrueFalse
12992811LIDC-IDRI-0028/0028_CN001_slice001LIDC-IDRI-0028/0028_CM001_slice0010FalseTrueFalse
\n", 262 | "
" 263 | ], 264 | "text/plain": [ 265 | " index patient_id nodule_no slice_no original_image \\\n", 266 | "0 298 28 0 0 LIDC-IDRI-0028/0028_CN001_slice000 \n", 267 | "1 299 28 1 1 LIDC-IDRI-0028/0028_CN001_slice001 \n", 268 | "\n", 269 | " mask_image malignancy is_cancer is_clean \\\n", 270 | "0 LIDC-IDRI-0028/0028_CM001_slice000 0 False True \n", 271 | "1 LIDC-IDRI-0028/0028_CM001_slice001 0 False True \n", 272 | "\n", 273 | " is_nodule \n", 274 | "0 False \n", 275 | "1 False " 276 | ] 277 | }, 278 | "execution_count": 8, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "clean_meta.head(2)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 9, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/html": [ 295 | "
\n", 296 | "\n", 309 | "\n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_nodule
00100LIDC-IDRI-0001/0001_NI000_slice000LIDC-IDRI-0001/0001_MA000_slice0005TrueFalseTrue
11101LIDC-IDRI-0001/0001_NI000_slice001LIDC-IDRI-0001/0001_MA000_slice0015TrueFalseTrue
\n", 354 | "
" 355 | ], 356 | "text/plain": [ 357 | " index patient_id nodule_no slice_no original_image \\\n", 358 | "0 0 1 0 0 LIDC-IDRI-0001/0001_NI000_slice000 \n", 359 | "1 1 1 0 1 LIDC-IDRI-0001/0001_NI000_slice001 \n", 360 | "\n", 361 | " mask_image malignancy is_cancer is_clean \\\n", 362 | "0 LIDC-IDRI-0001/0001_MA000_slice000 5 True False \n", 363 | "1 LIDC-IDRI-0001/0001_MA000_slice001 5 True False \n", 364 | "\n", 365 | " is_nodule \n", 366 | "0 True \n", 367 | "1 True " 368 | ] 369 | }, 370 | "execution_count": 9, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "meta.head(2)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 10, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "def is_train(row,train,val,test):\n", 386 | " if row in train:\n", 387 | " return 'Train'\n", 388 | " elif row in val:\n", 389 | " return 'Validation'\n", 390 | " else:\n", 391 | " return 'Test'" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 11, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "\n", 401 | "clean_patient_id = list(np.unique(clean_meta['patient_id']))\n", 402 | "meta_patient_id = list(np.unique(meta['patient_id']))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 12, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "def create_label_segmentation(meta):\n", 412 | " patient_id = list(np.unique(meta['patient_id']))\n", 413 | " train_patient , test_patient = train_test_split(patient_id,test_size= 0.2)\n", 414 | " train_patient, val_patient = train_test_split(train_patient,test_size= 0.25)\n", 415 | " print(len(train_patient),len(val_patient),len(test_patient))\n", 416 | " \n", 417 | " meta['data_split']= meta['patient_id'].apply(lambda row : is_train(row,train_patient,val_patient,test_patient))\n", 418 | " \n", 419 | " return meta" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | 
"execution_count": 13, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": [ 431 | "504 168 168\n", 432 | "81 27 27\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "# We need to train/test split independently for clean_meta, meta\n", 438 | "meta = create_label_segmentation(meta)\n", 439 | "clean_meta = create_label_segmentation(clean_meta)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 14, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "text/html": [ 450 | "
\n", 451 | "\n", 464 | "\n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_noduledata_split
00100LIDC-IDRI-0001/0001_NI000_slice000LIDC-IDRI-0001/0001_MA000_slice0005TrueFalseTrueTrain
11101LIDC-IDRI-0001/0001_NI000_slice001LIDC-IDRI-0001/0001_MA000_slice0015TrueFalseTrueTrain
22102LIDC-IDRI-0001/0001_NI000_slice002LIDC-IDRI-0001/0001_MA000_slice0025TrueFalseTrueTrain
33103LIDC-IDRI-0001/0001_NI000_slice003LIDC-IDRI-0001/0001_MA000_slice0035TrueFalseTrueTrain
44104LIDC-IDRI-0001/0001_NI000_slice004LIDC-IDRI-0001/0001_MA000_slice0045TrueFalseTrueTrain
\n", 554 | "
" 555 | ], 556 | "text/plain": [ 557 | " index patient_id nodule_no slice_no original_image \\\n", 558 | "0 0 1 0 0 LIDC-IDRI-0001/0001_NI000_slice000 \n", 559 | "1 1 1 0 1 LIDC-IDRI-0001/0001_NI000_slice001 \n", 560 | "2 2 1 0 2 LIDC-IDRI-0001/0001_NI000_slice002 \n", 561 | "3 3 1 0 3 LIDC-IDRI-0001/0001_NI000_slice003 \n", 562 | "4 4 1 0 4 LIDC-IDRI-0001/0001_NI000_slice004 \n", 563 | "\n", 564 | " mask_image malignancy is_cancer is_clean \\\n", 565 | "0 LIDC-IDRI-0001/0001_MA000_slice000 5 True False \n", 566 | "1 LIDC-IDRI-0001/0001_MA000_slice001 5 True False \n", 567 | "2 LIDC-IDRI-0001/0001_MA000_slice002 5 True False \n", 568 | "3 LIDC-IDRI-0001/0001_MA000_slice003 5 True False \n", 569 | "4 LIDC-IDRI-0001/0001_MA000_slice004 5 True False \n", 570 | "\n", 571 | " is_nodule data_split \n", 572 | "0 True Train \n", 573 | "1 True Train \n", 574 | "2 True Train \n", 575 | "3 True Train \n", 576 | "4 True Train " 577 | ] 578 | }, 579 | "execution_count": 14, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "meta.head()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 15, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "data": { 595 | "text/html": [ 596 | "
\n", 597 | "\n", 610 | "\n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_noduledata_split
02982800LIDC-IDRI-0028/0028_CN001_slice000LIDC-IDRI-0028/0028_CM001_slice0000FalseTrueFalseTrain
12992811LIDC-IDRI-0028/0028_CN001_slice001LIDC-IDRI-0028/0028_CM001_slice0010FalseTrueFalseTrain
23002822LIDC-IDRI-0028/0028_CN001_slice002LIDC-IDRI-0028/0028_CM001_slice0020FalseTrueFalseTrain
33012833LIDC-IDRI-0028/0028_CN001_slice003LIDC-IDRI-0028/0028_CM001_slice0030FalseTrueFalseTrain
43022844LIDC-IDRI-0028/0028_CN001_slice004LIDC-IDRI-0028/0028_CM001_slice0040FalseTrueFalseTrain
\n", 700 | "
" 701 | ], 702 | "text/plain": [ 703 | " index patient_id nodule_no slice_no original_image \\\n", 704 | "0 298 28 0 0 LIDC-IDRI-0028/0028_CN001_slice000 \n", 705 | "1 299 28 1 1 LIDC-IDRI-0028/0028_CN001_slice001 \n", 706 | "2 300 28 2 2 LIDC-IDRI-0028/0028_CN001_slice002 \n", 707 | "3 301 28 3 3 LIDC-IDRI-0028/0028_CN001_slice003 \n", 708 | "4 302 28 4 4 LIDC-IDRI-0028/0028_CN001_slice004 \n", 709 | "\n", 710 | " mask_image malignancy is_cancer is_clean \\\n", 711 | "0 LIDC-IDRI-0028/0028_CM001_slice000 0 False True \n", 712 | "1 LIDC-IDRI-0028/0028_CM001_slice001 0 False True \n", 713 | "2 LIDC-IDRI-0028/0028_CM001_slice002 0 False True \n", 714 | "3 LIDC-IDRI-0028/0028_CM001_slice003 0 False True \n", 715 | "4 LIDC-IDRI-0028/0028_CM001_slice004 0 False True \n", 716 | "\n", 717 | " is_nodule data_split \n", 718 | "0 False Train \n", 719 | "1 False Train \n", 720 | "2 False Train \n", 721 | "3 False Train \n", 722 | "4 False Train " 723 | ] 724 | }, 725 | "execution_count": 15, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "clean_meta.head()" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 16, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [ 740 | "# Clean Meta only stores meta information of patients without nodules." 
741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 17, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "meta.to_csv('/home/LUNG_DATA/meta_csv/meta.csv')\n", 750 | "clean_meta.to_csv('/home/LUNG_DATA/meta_csv/clean_meta.csv')" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [] 759 | } 760 | ], 761 | "metadata": { 762 | "kernelspec": { 763 | "display_name": "Python 3", 764 | "language": "python", 765 | "name": "python3" 766 | }, 767 | "language_info": { 768 | "codemirror_mode": { 769 | "name": "ipython", 770 | "version": 3 771 | }, 772 | "file_extension": ".py", 773 | "mimetype": "text/x-python", 774 | "name": "python", 775 | "nbconvert_exporter": "python", 776 | "pygments_lexer": "ipython3", 777 | "version": "3.6.10" 778 | } 779 | }, 780 | "nbformat": 4, 781 | "nbformat_minor": 4 782 | } 783 | -------------------------------------------------------------------------------- /notebook/.ipynb_checkpoints/make_label-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Create label for training in future" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from sklearn.model_selection import train_test_split" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "meta = pd.read_csv('../meta_info.csv')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/html": [ 38 | "
\n", 39 | "\n", 52 | "\n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
patient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_clean
0100LIDC-IDRI-0001/0001_NI000_slice000LIDC-IDRI-0001/0001_MA000_slice0005TrueFalse
1101LIDC-IDRI-0001/0001_NI000_slice001LIDC-IDRI-0001/0001_MA000_slice0015TrueFalse
2102LIDC-IDRI-0001/0001_NI000_slice002LIDC-IDRI-0001/0001_MA000_slice0025TrueFalse
3103LIDC-IDRI-0001/0001_NI000_slice003LIDC-IDRI-0001/0001_MA000_slice0035TrueFalse
4104LIDC-IDRI-0001/0001_NI000_slice004LIDC-IDRI-0001/0001_MA000_slice0045TrueFalse
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " patient_id nodule_no slice_no original_image \\\n", 128 | "0 1 0 0 LIDC-IDRI-0001/0001_NI000_slice000 \n", 129 | "1 1 0 1 LIDC-IDRI-0001/0001_NI000_slice001 \n", 130 | "2 1 0 2 LIDC-IDRI-0001/0001_NI000_slice002 \n", 131 | "3 1 0 3 LIDC-IDRI-0001/0001_NI000_slice003 \n", 132 | "4 1 0 4 LIDC-IDRI-0001/0001_NI000_slice004 \n", 133 | "\n", 134 | " mask_image malignancy is_cancer is_clean \n", 135 | "0 LIDC-IDRI-0001/0001_MA000_slice000 5 True False \n", 136 | "1 LIDC-IDRI-0001/0001_MA000_slice001 5 True False \n", 137 | "2 LIDC-IDRI-0001/0001_MA000_slice002 5 True False \n", 138 | "3 LIDC-IDRI-0001/0001_MA000_slice003 5 True False \n", 139 | "4 LIDC-IDRI-0001/0001_MA000_slice004 5 True False " 140 | ] 141 | }, 142 | "execution_count": 3, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "meta.head()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 4, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# NI= Nodule Image, MA = Mask Original , CN = Clean Nodule , CM = Clean Mask" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "def is_nodule(row):\n", 167 | " if row[20:22] =='NI':\n", 168 | " return True\n", 169 | " else:\n", 170 | " return False" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "meta['is_nodule']= meta['original_image'].apply(lambda row: is_nodule(row))" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# Lets separate Clean meta and meta data\n", 189 | "clean_meta = meta[meta['is_nodule']==False]\n", 190 | "clean_meta.reset_index(inplace=True)\n", 191 | "meta = meta[meta['is_nodule']==True]\n", 192 | 
"meta.reset_index(inplace=True)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/html": [ 203 | "
\n", 204 | "\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_nodule
02982800LIDC-IDRI-0028/0028_CN001_slice000LIDC-IDRI-0028/0028_CM001_slice0000FalseTrueFalse
12992811LIDC-IDRI-0028/0028_CN001_slice001LIDC-IDRI-0028/0028_CM001_slice0010FalseTrueFalse
\n", 262 | "
" 263 | ], 264 | "text/plain": [ 265 | " index patient_id nodule_no slice_no original_image \\\n", 266 | "0 298 28 0 0 LIDC-IDRI-0028/0028_CN001_slice000 \n", 267 | "1 299 28 1 1 LIDC-IDRI-0028/0028_CN001_slice001 \n", 268 | "\n", 269 | " mask_image malignancy is_cancer is_clean \\\n", 270 | "0 LIDC-IDRI-0028/0028_CM001_slice000 0 False True \n", 271 | "1 LIDC-IDRI-0028/0028_CM001_slice001 0 False True \n", 272 | "\n", 273 | " is_nodule \n", 274 | "0 False \n", 275 | "1 False " 276 | ] 277 | }, 278 | "execution_count": 8, 279 | "metadata": {}, 280 | "output_type": "execute_result" 281 | } 282 | ], 283 | "source": [ 284 | "clean_meta.head(2)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 9, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/html": [ 295 | "
\n", 296 | "\n", 309 | "\n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_nodule
00100LIDC-IDRI-0001/0001_NI000_slice000LIDC-IDRI-0001/0001_MA000_slice0005TrueFalseTrue
11101LIDC-IDRI-0001/0001_NI000_slice001LIDC-IDRI-0001/0001_MA000_slice0015TrueFalseTrue
\n", 354 | "
" 355 | ], 356 | "text/plain": [ 357 | " index patient_id nodule_no slice_no original_image \\\n", 358 | "0 0 1 0 0 LIDC-IDRI-0001/0001_NI000_slice000 \n", 359 | "1 1 1 0 1 LIDC-IDRI-0001/0001_NI000_slice001 \n", 360 | "\n", 361 | " mask_image malignancy is_cancer is_clean \\\n", 362 | "0 LIDC-IDRI-0001/0001_MA000_slice000 5 True False \n", 363 | "1 LIDC-IDRI-0001/0001_MA000_slice001 5 True False \n", 364 | "\n", 365 | " is_nodule \n", 366 | "0 True \n", 367 | "1 True " 368 | ] 369 | }, 370 | "execution_count": 9, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "meta.head(2)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": 10, 382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "def is_train(row,train,val,test):\n", 386 | " if row in train:\n", 387 | " return 'Train'\n", 388 | " elif row in val:\n", 389 | " return 'Validation'\n", 390 | " else:\n", 391 | " return 'Test'" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 11, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "\n", 401 | "clean_patient_id = list(np.unique(clean_meta['patient_id']))\n", 402 | "meta_patient_id = list(np.unique(meta['patient_id']))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 12, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "def create_label_segmentation(meta):\n", 412 | " patient_id = list(np.unique(meta['patient_id']))\n", 413 | " train_patient , test_patient = train_test_split(patient_id,test_size= 0.2)\n", 414 | " train_patient, val_patient = train_test_split(train_patient,test_size= 0.25)\n", 415 | " print(len(train_patient),len(val_patient),len(test_patient))\n", 416 | " \n", 417 | " meta['data_split']= meta['patient_id'].apply(lambda row : is_train(row,train_patient,val_patient,test_patient))\n", 418 | " \n", 419 | " return meta" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | 
"execution_count": 13, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": [ 431 | "504 168 168\n", 432 | "81 27 27\n" 433 | ] 434 | } 435 | ], 436 | "source": [ 437 | "# We need to train/test split independently for clean_meta, meta\n", 438 | "meta = create_label_segmentation(meta)\n", 439 | "clean_meta = create_label_segmentation(clean_meta)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 14, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "text/html": [ 450 | "
\n", 451 | "\n", 464 | "\n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_noduledata_split
00100LIDC-IDRI-0001/0001_NI000_slice000LIDC-IDRI-0001/0001_MA000_slice0005TrueFalseTrueTrain
11101LIDC-IDRI-0001/0001_NI000_slice001LIDC-IDRI-0001/0001_MA000_slice0015TrueFalseTrueTrain
22102LIDC-IDRI-0001/0001_NI000_slice002LIDC-IDRI-0001/0001_MA000_slice0025TrueFalseTrueTrain
33103LIDC-IDRI-0001/0001_NI000_slice003LIDC-IDRI-0001/0001_MA000_slice0035TrueFalseTrueTrain
44104LIDC-IDRI-0001/0001_NI000_slice004LIDC-IDRI-0001/0001_MA000_slice0045TrueFalseTrueTrain
\n", 554 | "
" 555 | ], 556 | "text/plain": [ 557 | " index patient_id nodule_no slice_no original_image \\\n", 558 | "0 0 1 0 0 LIDC-IDRI-0001/0001_NI000_slice000 \n", 559 | "1 1 1 0 1 LIDC-IDRI-0001/0001_NI000_slice001 \n", 560 | "2 2 1 0 2 LIDC-IDRI-0001/0001_NI000_slice002 \n", 561 | "3 3 1 0 3 LIDC-IDRI-0001/0001_NI000_slice003 \n", 562 | "4 4 1 0 4 LIDC-IDRI-0001/0001_NI000_slice004 \n", 563 | "\n", 564 | " mask_image malignancy is_cancer is_clean \\\n", 565 | "0 LIDC-IDRI-0001/0001_MA000_slice000 5 True False \n", 566 | "1 LIDC-IDRI-0001/0001_MA000_slice001 5 True False \n", 567 | "2 LIDC-IDRI-0001/0001_MA000_slice002 5 True False \n", 568 | "3 LIDC-IDRI-0001/0001_MA000_slice003 5 True False \n", 569 | "4 LIDC-IDRI-0001/0001_MA000_slice004 5 True False \n", 570 | "\n", 571 | " is_nodule data_split \n", 572 | "0 True Train \n", 573 | "1 True Train \n", 574 | "2 True Train \n", 575 | "3 True Train \n", 576 | "4 True Train " 577 | ] 578 | }, 579 | "execution_count": 14, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "meta.head()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 15, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "data": { 595 | "text/html": [ 596 | "
\n", 597 | "\n", 610 | "\n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | "
indexpatient_idnodule_noslice_nooriginal_imagemask_imagemalignancyis_canceris_cleanis_noduledata_split
02982800LIDC-IDRI-0028/0028_CN001_slice000LIDC-IDRI-0028/0028_CM001_slice0000FalseTrueFalseTrain
12992811LIDC-IDRI-0028/0028_CN001_slice001LIDC-IDRI-0028/0028_CM001_slice0010FalseTrueFalseTrain
23002822LIDC-IDRI-0028/0028_CN001_slice002LIDC-IDRI-0028/0028_CM001_slice0020FalseTrueFalseTrain
33012833LIDC-IDRI-0028/0028_CN001_slice003LIDC-IDRI-0028/0028_CM001_slice0030FalseTrueFalseTrain
43022844LIDC-IDRI-0028/0028_CN001_slice004LIDC-IDRI-0028/0028_CM001_slice0040FalseTrueFalseTrain
\n", 700 | "
" 701 | ], 702 | "text/plain": [ 703 | " index patient_id nodule_no slice_no original_image \\\n", 704 | "0 298 28 0 0 LIDC-IDRI-0028/0028_CN001_slice000 \n", 705 | "1 299 28 1 1 LIDC-IDRI-0028/0028_CN001_slice001 \n", 706 | "2 300 28 2 2 LIDC-IDRI-0028/0028_CN001_slice002 \n", 707 | "3 301 28 3 3 LIDC-IDRI-0028/0028_CN001_slice003 \n", 708 | "4 302 28 4 4 LIDC-IDRI-0028/0028_CN001_slice004 \n", 709 | "\n", 710 | " mask_image malignancy is_cancer is_clean \\\n", 711 | "0 LIDC-IDRI-0028/0028_CM001_slice000 0 False True \n", 712 | "1 LIDC-IDRI-0028/0028_CM001_slice001 0 False True \n", 713 | "2 LIDC-IDRI-0028/0028_CM001_slice002 0 False True \n", 714 | "3 LIDC-IDRI-0028/0028_CM001_slice003 0 False True \n", 715 | "4 LIDC-IDRI-0028/0028_CM001_slice004 0 False True \n", 716 | "\n", 717 | " is_nodule data_split \n", 718 | "0 False Train \n", 719 | "1 False Train \n", 720 | "2 False Train \n", 721 | "3 False Train \n", 722 | "4 False Train " 723 | ] 724 | }, 725 | "execution_count": 15, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "clean_meta.head()" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 16, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [ 740 | "# Clean Meta only stores meta information of patients without nodules." 
741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 17, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "meta.to_csv('/home/LUNG_DATA/meta_csv/meta.csv')\n", 750 | "clean_meta.to_csv('/home/LUNG_DATA/meta_csv/clean_meta.csv')" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [] 759 | } 760 | ], 761 | "metadata": { 762 | "kernelspec": { 763 | "display_name": "Python 3", 764 | "language": "python", 765 | "name": "python3" 766 | }, 767 | "language_info": { 768 | "codemirror_mode": { 769 | "name": "ipython", 770 | "version": 3 771 | }, 772 | "file_extension": ".py", 773 | "mimetype": "text/x-python", 774 | "name": "python", 775 | "nbconvert_exporter": "python", 776 | "pygments_lexer": "ipython3", 777 | "version": "3.6.10" 778 | } 779 | }, 780 | "nbformat": 4, 781 | "nbformat_minor": 4 782 | } 783 | --------------------------------------------------------------------------------