├── data
    ├── Mask
    │   └── .gitkeep
    ├── Meta
    │   ├── .gitkeep
    │   └── meta_info.csv
    ├── Clean
    │   ├── .gitkeep
    │   ├── Image
    │   │   └── .gitkeep
    │   └── Mask
    │   │   └── .gitkeep
    └── Image
    │   └── .gitkeep
├── LIDC-IDRI
    └── .gitignore
├── requirements.txt
├── figures
    └── output_segment.png
├── lung.conf
├── config_file_create.py
├── utils.py
├── README.md
├── prepare_dataset.py
└── notebook
    ├── make_label.ipynb
    └── .ipynb_checkpoints
        └── make_label-checkpoint.ipynb


/data/Mask/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/Meta/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/LIDC-IDRI/.gitignore:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/Clean/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/Image/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/Clean/Image/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/Clean/Mask/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | MedPy==0.4.0
2 | pylidc==0.2.1
3 | tqdm==4.42.1


--------------------------------------------------------------------------------
/data/Meta/meta_info.csv:
--------------------------------------------------------------------------------
1 | patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_clean
2 | 


--------------------------------------------------------------------------------
/figures/output_segment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaeho3690/LIDC-IDRI-Preprocessing/HEAD/figures/output_segment.png


--------------------------------------------------------------------------------
/lung.conf:
--------------------------------------------------------------------------------
 1 | [prepare_dataset]
 2 | lidc_dicom_path = ./LIDC-IDRI
 3 | mask_path = ./data/Mask
 4 | image_path = ./data/Image
 5 | clean_path_image = ./data/Clean/Image
 6 | clean_path_mask = ./data/Clean/Mask
 7 | meta_path = ./data/Meta/
 8 | mask_threshold = 8
 9 | 
10 | [pylidc]
11 | confidence_level = 0.5
12 | padding_size = 512
13 | 
14 | 


--------------------------------------------------------------------------------
/config_file_create.py:
--------------------------------------------------------------------------------
 1 | from configparser import ConfigParser
 2 | 
 3 | if __name__ == "__main__":
 4 |     # This python file creates a configuartion file. Change the below directories for your application
 5 | 
 6 |     config = ConfigParser()
 7 | 
 8 |     # prepare_dataset.py configuration
 9 |     config['prepare_dataset'] = {
10 |         #Path To LIDC Dataset
11 |         'LIDC_DICOM_PATH': './LIDC-IDRI',
12 |         # Directory to save the output files
13 |         # Directory for masks
14 |         'MASK_PATH':'./data/Mask',
15 |         # Directory for images
16 |         'IMAGE_PATH':'./data/Image',
17 |         # To save images and mask that doesn't contain any nodule or cancer
18 |         # These images will be used later to evaluate our model
19 |         'CLEAN_PATH_IMAGE':'./data/Clean/Image',
20 |         'CLEAN_PATH_MASK':'./data/Clean/Mask',
21 |         # CSV file containing nodule information, malignancy, train test split
22 |         'META_PATH': './data/Meta/',
23 |         # Mask Threshold is the np.sum(MASK) threshold. Some Masks are too small. We remove these small images,masks as they might act as outliers
24 |         # The threshold 8 was decided by empirical evaluation.
25 |         'Mask_Threshold':8
26 |     }
27 | 
28 | 
29 |     # This is the configuration file for pylidc library
30 |     config['pylidc'] = {
31 |         # Confidence level determines the overlap between the 4 doctors who have made annotation
32 |         'confidence_level': 0.5,
33 |         # 512 determines the size of the image
34 |         'padding_size': 512
35 |     }
36 | 
37 |     # Create the configuration file in lung.conf
38 |     with open('./lung.conf', 'w') as f:
39 |           config.write(f)
40 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | import numpy as np
 4 | 
 5 | from medpy.filter.smoothing import anisotropic_diffusion
 6 | from scipy.ndimage import median_filter
 7 | from skimage import measure, morphology
 8 | import scipy.ndimage as ndimage
 9 | from sklearn.cluster import KMeans
10 | 
def is_dir_path(string):
    """Return ``string`` unchanged if it names an existing directory.

    Raises:
        NotADirectoryError: when ``string`` is not an existing directory;
            the offending path is the exception argument.
    """
    if not os.path.isdir(string):
        raise NotADirectoryError(string)
    return string
16 | 
def segment_lung(img):
    """
    Segment the lung region of a single 2-D slice and return the masked slice.

    This is lung segmentation, not lung-nodule segmentation.
    Function sourced from https://www.kaggle.com/c/data-science-bowl-2017#tutorial

    Args:
        img: 2-D array holding one slice. The input is not modified.
            NOTE(review): the centre crop [100:400, 100:400] and the bounding
            box limits below look tuned for 512x512 slices -- confirm before
            relying on other sizes.

    Returns:
        The normalised, filtered slice multiplied by a 0/1 lung mask, so
        everything outside the detected lung regions is zero.
    """
    # Standardise intensities; this creates a new array, leaving the caller's data untouched
    normalized = (img - np.mean(img)) / np.std(img)

    # Centre crop used to estimate the intensity threshold. It is a view, so
    # the in-place replacement of extreme values below is visible through it.
    middle = normalized[100:400, 100:400]
    middle_mean = np.mean(middle)
    highest = np.max(normalized)
    lowest = np.min(normalized)
    # remove the underflow bins
    normalized[normalized == highest] = middle_mean
    normalized[normalized == lowest] = middle_mean

    # apply median filter
    filtered = median_filter(normalized, size=3)
    # apply anisotropic non-linear diffusion filter - this removes noise
    # without blurring the nodule boundary
    filtered = anisotropic_diffusion(filtered)

    # Two-cluster k-means on the (unfiltered) centre crop; the threshold is
    # the midpoint between the two cluster centres.
    kmeans = KMeans(n_clusters=2).fit(np.reshape(middle, [np.prod(middle.shape), 1]))
    threshold = np.mean(kmeans.cluster_centers_)
    thresh_img = np.where(filtered < threshold, 1.0, 0.0)  # threshold the image

    # Erode to drop small specks, then dilate to merge what is left
    eroded = morphology.erosion(thresh_img, np.ones([4, 4]))
    dilation = morphology.dilation(eroded, np.ones([10, 10]))
    labels = measure.label(dilation)

    # Keep only regions whose bounding box (min_row, min_col, max_row, max_col)
    # is neither too large nor touching the top/bottom margins.
    good_labels = []
    for prop in measure.regionprops(labels):
        min_row, min_col, max_row, max_col = prop.bbox
        if (max_row - min_row < 475 and max_col - min_col < 475
                and min_row > 40 and max_row < 472):
            good_labels.append(prop.label)

    #
    #  The mask here is the mask for the lungs--not the nodes.
    #  It is shaped like the label image rather than a fixed 512x512, so the
    #  final multiplication cannot fail on a shape mismatch.
    #  After just the lungs are left, we do another large dilation
    #  in order to fill in and out the lung mask
    #
    mask = np.isin(labels, good_labels).astype(np.int8)
    mask = morphology.dilation(mask, np.ones([10, 10]))  # one last dilation
    # mask consists of 1 and 0, so multiplying keeps only the sections marked 1
    return mask * filtered
66 | 
def count_params(model):
    """Return the total element count over ``model.parameters()`` with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # LIDC Preprocessing with Pylidc library
  2 | [Medium Link](https://medium.com/@jaeho3690/how-to-start-your-very-first-lung-cancer-detection-project-using-python-part-1-3ab490964aae)
  3 | 
  4 | This repository preprocesses the LIDC-IDRI dataset. We use the pylidc library to save nodule images in the .npy file format.
  5 | The code file structure is as below
  6 | 
  7 | ```
  8 | +-- LIDC-IDRI
  9 | |    # This folder should contain the original LIDC dataset
 10 | +-- data
 11 | |    # This folder contains the preprocessed data
 12 | |   |-- Clean
 13 | |       +-- Image
 14 | |       +-- Mask
 15 | |   |-- Image
 16 | |       +-- LIDC-IDRI-0001
 17 | |       +-- LIDC-IDRI-0002
 18 | |       +-- ...
 19 | |   |-- Mask
 20 | |       +-- LIDC-IDRI-0001
 21 | |       +-- LIDC-IDRI-0002
 22 | |       +-- ...
 23 | |   |-- Meta
 24 | |       +-- meta.csv
 25 | +-- figures
 26 | |    # Save figures here
 27 | +-- notebook
 28 | |    # This notebook file edits the meta.csv file to make indexing easier
 29 | +-- config_file_create.py
 30 | |    # Creates configuration file. You can edit the hyperparameters of the Pylidc library here
 31 | +-- prepare_dataset.py
 32 | |    # Run this file to preprocess the LIDC-IDRI dicom files. Results would be saved in the data folder
 33 | +-- utils.py
 34 |      # Utility script
 35 | 
 36 | ```
 37 | ![Segmented Image](/figures/output_segment.png)
 38 | ## 1.Download LIDC-IDRI dataset
 39 | First you would have to download the whole LIDC-IDRI [dataset](https://wiki.cancerimagingarchive.net/display/Public/LIDC-IDRI).
 40 | On the website, you will see the Data Access section. Click the Search button to specify the image modality.
 41 | I clicked on CT only and downloaded total of 1010 patients.
 42 | 
 43 | ## 2. Set up pylidc library
 44 | You would need to set up the pylidc library for preprocessing. There is an instruction in the [documentation](https://pylidc.github.io/install.html).
 45 | Make sure to create the configuration file as stated in the instruction. Right now I am using library version 0.2.1
 46 | 
 47 | ## 3. Explanation for each python file
 48 | ```bash
 49 | python config_file_create.py
 50 | ```
 51 | This python script contains the configuration setting for the directories. Change the directories settings to where you want to save your output files. Without modification, it will automatically save the preprocessed file in the data folder.
 52 | Running this script will create a configuration file 'lung.conf'
 53 | 
 54 | The utils.py script contains a function to segment the lung. Segmenting the lung and segmenting the nodule are two different things: segmenting the lung leaves only the lung region, while segmenting the nodule means finding prospective lung nodule regions within the lung. Don't get confused. 
 55 | 
 56 | ```bash
 57 | python prepare_dataset.py
 58 | ```
 59 | This python script will create the image, mask files and save them to the data folder. The script will also create a meta_info.csv file containing information about whether the nodule is
 60 | cancerous. In the LIDC Dataset, each nodule is annotated by at most 4 doctors. Each doctor has rated the malignancy of each nodule on a scale of 1 to 5. 
 61 | I have chosen the median high label for each nodule as the final malignancy. The meta_info.csv data contains all the information and will be used later in the classification stage.
 62 | This prepare_dataset.py looks for the lung.conf file. The configuration file should be in the same directory. Running this script will output .npy files for each slice with a size of 512*512
 63 | 
 64 | To make a train/ val/ test split run the jupyter file in notebook folder. This will create an additional clean_meta.csv, meta.csv containing information about the nodules, train/val/test split.
 65 | 
 66 | A nodule may span several image slices. Some researchers have treated each of these slices as independent of one another. 
 67 | However, I believe that these image slices should not be seen as independent of the adjacent slices. 
 68 | Thus, I have tried to keep all slices of the same nodule in the same split. Although this approach reduces the accuracy of test results, it seems to be the honest approach.
 69 | 
 70 | 
 71 | 
 72 | ## 4. Data folder
 73 | The data folder stores all the output images and masks.
 74 | Inside the data folder there are 4 subfolders. 
 75 | 
 76 | ### 1. Clean
 77 | 
 78 | The Clean folder contains two subfolders. Image and Mask folders.
 79 | Some patients don't have nodules. In the actual implementation, a person will have more slices of image without a nodule. To evaluate our generalization on real world application, we save lung images without nodules for testing purpose.
 80 | These images will be used in the test set.
 81 | 
 82 | ### 2. Image
 83 | 
 84 | The Image folder contains the segmented lung .npy files, stored in one folder per patient.
 85 | 
 86 | ### 3. Mask
 87 | 
 88 | The Mask folder contains the mask files for the nodule.
 89 | 
 90 | ### 4. Meta
 91 | 
 92 | The Meta folder contains the meta.csv file. The csv file contains information of each slice of image: Malignancy, whether the image should be used in train/val/test for the whole process, etc.
 93 | 
 94 | 
 95 | ## 5. Contributing and Acknowledgement
 96 | I started this Lung cancer detection project a year ago. I was really a newbie to python. I didn't even understand what a directory setting is at the time! However, I had to complete this project
 97 | for some personal reasons. I looked through google and other githubs. But most of them were too hard to understand and the code itself lacked information. I hope my codes here could help
 98 | other researchers first starting to do lung cancer detection projects. Please give a star if you found this repository useful.
 99 | 
100 | Here is the link to the GitHub repository that I learned a lot from. Some of the code is sourced from it.
101 | 1. https://github.com/mikejhuang/LungNoduleDetectionClassification
102 | 
103 | 


--------------------------------------------------------------------------------
/prepare_dataset.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | from pathlib import Path
  4 | import glob
  5 | from configparser import ConfigParser
  6 | import pandas as pd
  7 | import numpy as np
  8 | import warnings
  9 | import pylidc as pl
 10 | from tqdm import tqdm
 11 | from statistics import median_high
 12 | 
 13 | from utils import is_dir_path,segment_lung
 14 | from pylidc.utils import consensus
 15 | from PIL import Image
 16 | 
 17 | warnings.filterwarnings(action='ignore')
 18 | 
 19 | # Read the configuration file generated from config_file_create.py
 20 | parser = ConfigParser()
 21 | parser.read('lung.conf')
 22 | 
 23 | #Get Directory setting
 24 | DICOM_DIR = is_dir_path(parser.get('prepare_dataset','LIDC_DICOM_PATH'))
 25 | MASK_DIR = is_dir_path(parser.get('prepare_dataset','MASK_PATH'))
 26 | IMAGE_DIR = is_dir_path(parser.get('prepare_dataset','IMAGE_PATH'))
 27 | CLEAN_DIR_IMAGE = is_dir_path(parser.get('prepare_dataset','CLEAN_PATH_IMAGE'))
 28 | CLEAN_DIR_MASK = is_dir_path(parser.get('prepare_dataset','CLEAN_PATH_MASK'))
 29 | META_DIR = is_dir_path(parser.get('prepare_dataset','META_PATH'))
 30 | 
 31 | #Hyper Parameter setting for prepare dataset function
 32 | mask_threshold = parser.getint('prepare_dataset','Mask_Threshold')
 33 | 
 34 | #Hyper Parameter setting for pylidc
 35 | confidence_level = parser.getfloat('pylidc','confidence_level')
 36 | padding = parser.getint('pylidc','padding_size')
 37 | 
 38 | class MakeDataSet:
 39 |     def __init__(self, LIDC_Patients_list, IMAGE_DIR, MASK_DIR,CLEAN_DIR_IMAGE,CLEAN_DIR_MASK,META_DIR, mask_threshold, padding, confidence_level=0.5):
 40 |         self.IDRI_list = LIDC_Patients_list
 41 |         self.img_path = IMAGE_DIR
 42 |         self.mask_path = MASK_DIR
 43 |         self.clean_path_img = CLEAN_DIR_IMAGE
 44 |         self.clean_path_mask = CLEAN_DIR_MASK
 45 |         self.meta_path = META_DIR
 46 |         self.mask_threshold = mask_threshold
 47 |         self.c_level = confidence_level
 48 |         self.padding = [(padding,padding),(padding,padding),(0,0)]
 49 |         self.meta = pd.DataFrame(index=[],columns=['patient_id','nodule_no','slice_no','original_image','mask_image','malignancy','is_cancer','is_clean'])
 50 | 
 51 | 
 52 |     def calculate_malignancy(self,nodule):
 53 |         # Calculate the malignancy of a nodule with the annotations made by 4 doctors. Return median high of the annotated cancer, True or False label for cancer
 54 |         # if median high is above 3, we return a label True for cancer
 55 |         # if it is below 3, we return a label False for non-cancer
 56 |         # if it is 3, we return ambiguous
 57 |         list_of_malignancy =[]
 58 |         for annotation in nodule:
 59 |             list_of_malignancy.append(annotation.malignancy)
 60 | 
 61 |         malignancy = median_high(list_of_malignancy)
 62 |         if  malignancy > 3:
 63 |             return malignancy,True
 64 |         elif malignancy < 3:
 65 |             return malignancy, False
 66 |         else:
 67 |             return malignancy, 'Ambiguous'
 68 |     def save_meta(self,meta_list):
 69 |         """Saves the information of nodule to csv file"""
 70 |         tmp = pd.Series(meta_list,index=['patient_id','nodule_no','slice_no','original_image','mask_image','malignancy','is_cancer','is_clean'])
 71 |         self.meta = self.meta.append(tmp,ignore_index=True)
 72 | 
 73 |     def prepare_dataset(self):
 74 |         # This is to name each image and mask
 75 |         prefix = [str(x).zfill(3) for x in range(1000)]
 76 | 
 77 |         # Make directory
 78 |         if not os.path.exists(self.img_path):
 79 |             os.makedirs(self.img_path)
 80 |         if not os.path.exists(self.mask_path):
 81 |             os.makedirs(self.mask_path)
 82 |         if not os.path.exists(self.clean_path_img):
 83 |             os.makedirs(self.clean_path_img)
 84 |         if not os.path.exists(self.clean_path_mask):
 85 |             os.makedirs(self.clean_path_mask)
 86 |         if not os.path.exists(self.meta_path):
 87 |             os.makedirs(self.meta_path)
 88 | 
 89 |         IMAGE_DIR = Path(self.img_path)
 90 |         MASK_DIR = Path(self.mask_path)
 91 |         CLEAN_DIR_IMAGE = Path(self.clean_path_img)
 92 |         CLEAN_DIR_MASK = Path(self.clean_path_mask)
 93 | 
 94 | 
 95 | 
 96 |         for patient in tqdm(self.IDRI_list):
 97 |             pid = patient #LIDC-IDRI-0001~
 98 |             scan = pl.query(pl.Scan).filter(pl.Scan.patient_id == pid).first()
 99 |             nodules_annotation = scan.cluster_annotations()
100 |             vol = scan.to_volume()
101 |             print("Patient ID: {} Dicom Shape: {} Number of Annotated Nodules: {}".format(pid,vol.shape,len(nodules_annotation)))
102 | 
103 |             patient_image_dir = IMAGE_DIR / pid
104 |             patient_mask_dir = MASK_DIR / pid
105 |             Path(patient_image_dir).mkdir(parents=True, exist_ok=True)
106 |             Path(patient_mask_dir).mkdir(parents=True, exist_ok=True)
107 | 
108 |             if len(nodules_annotation) > 0:
109 |                 # Patients with nodules
110 |                 for nodule_idx, nodule in enumerate(nodules_annotation):
111 |                 # Call nodule images. Each Patient will have at maximum 4 annotations as there are only 4 doctors
112 |                 # This current for loop iterates over total number of nodules in a single patient
113 |                     mask, cbbox, masks = consensus(nodule,self.c_level,self.padding)
114 |                     lung_np_array = vol[cbbox]
115 | 
116 |                     # We calculate the malignancy information
117 |                     malignancy, cancer_label = self.calculate_malignancy(nodule)
118 | 
119 |                     for nodule_slice in range(mask.shape[2]):
120 |                         # This second for loop iterates over each single nodule.
121 |                         # There are some mask sizes that are too small. These may hinder training.
122 |                         if np.sum(mask[:,:,nodule_slice]) <= self.mask_threshold:
123 |                             continue
124 |                         # Segment Lung part only
125 |                         lung_segmented_np_array = segment_lung(lung_np_array[:,:,nodule_slice])
126 |                         # I am not sure why but some values are stored as -0. <- this may result in datatype error in pytorch training # Not sure
127 |                         lung_segmented_np_array[lung_segmented_np_array==-0] =0
128 |                         # This itereates through the slices of a single nodule
129 |                         # Naming of each file: NI= Nodule Image, MA= Mask Original
130 |                         nodule_name = "{}_NI{}_slice{}".format(pid[-4:],prefix[nodule_idx],prefix[nodule_slice])
131 |                         mask_name = "{}_MA{}_slice{}".format(pid[-4:],prefix[nodule_idx],prefix[nodule_slice])
132 |                         meta_list = [pid[-4:],nodule_idx,prefix[nodule_slice],nodule_name,mask_name,malignancy,cancer_label,False]
133 | 
134 |                         self.save_meta(meta_list)
135 |                         np.save(patient_image_dir / nodule_name,lung_segmented_np_array)
136 |                         np.save(patient_mask_dir / mask_name,mask[:,:,nodule_slice])
137 |             else:
138 |                 print("Clean Dataset",pid)
139 |                 patient_clean_dir_image = CLEAN_DIR_IMAGE / pid
140 |                 patient_clean_dir_mask = CLEAN_DIR_MASK / pid
141 |                 Path(patient_clean_dir_image).mkdir(parents=True, exist_ok=True)
142 |                 Path(patient_clean_dir_mask).mkdir(parents=True, exist_ok=True)
143 |                 #There are patients that don't have nodule at all. Meaning, its a clean dataset. We need to use this for validation
144 |                 for slice in range(vol.shape[2]):
145 |                     if slice >50:
146 |                         break
147 |                     lung_segmented_np_array = segment_lung(vol[:,:,slice])
148 |                     lung_segmented_np_array[lung_segmented_np_array==-0] =0
149 |                     lung_mask = np.zeros_like(lung_segmented_np_array)
150 | 
151 |                     #CN= CleanNodule, CM = CleanMask
152 |                     nodule_name = "{}/{}_CN001_slice{}".format(pid,pid[-4:],prefix[slice])
153 |                     mask_name = "{}/{}_CM001_slice{}".format(pid,pid[-4:],prefix[slice])
154 |                     meta_list = [pid[-4:],slice,prefix[slice],nodule_name,mask_name,0,False,True]
155 |                     self.save_meta(meta_list)
156 |                     np.save(patient_clean_dir_image / nodule_name, lung_segmented_np_array)
157 |                     np.save(patient_clean_dir_mask / mask_name, lung_mask)
158 | 
159 | 
160 | 
161 |         print("Saved Meta data")
162 |         self.meta.to_csv(self.meta_path+'meta_info.csv',index=False)
163 | 
164 | 
165 | 
if __name__ == '__main__':
    # os.listdir() also returns hidden entries such as the .gitignore file,
    # so anything starting with a dot is filtered out.
    patient_ids = sorted(
        entry for entry in os.listdir(DICOM_DIR) if not entry.startswith('.')
    )

    dataset_builder = MakeDataSet(
        patient_ids,
        IMAGE_DIR,
        MASK_DIR,
        CLEAN_DIR_IMAGE,
        CLEAN_DIR_MASK,
        META_DIR,
        mask_threshold,
        padding,
        confidence_level,
    )
    dataset_builder.prepare_dataset()
174 | 


--------------------------------------------------------------------------------
/notebook/make_label.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Create label for training in future"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import pandas as pd\n",
 17 |     "import numpy as np\n",
 18 |     "from sklearn.model_selection import train_test_split"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "meta = pd.read_csv('../meta_info.csv')"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 3,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "data": {
 37 |       "text/html": [
 38 |        "<div>\n",
 39 |        "<style scoped>\n",
 40 |        "    .dataframe tbody tr th:only-of-type {\n",
 41 |        "        vertical-align: middle;\n",
 42 |        "    }\n",
 43 |        "\n",
 44 |        "    .dataframe tbody tr th {\n",
 45 |        "        vertical-align: top;\n",
 46 |        "    }\n",
 47 |        "\n",
 48 |        "    .dataframe thead th {\n",
 49 |        "        text-align: right;\n",
 50 |        "    }\n",
 51 |        "</style>\n",
 52 |        "<table border=\"1\" class=\"dataframe\">\n",
 53 |        "  <thead>\n",
 54 |        "    <tr style=\"text-align: right;\">\n",
 55 |        "      <th></th>\n",
 56 |        "      <th>patient_id</th>\n",
 57 |        "      <th>nodule_no</th>\n",
 58 |        "      <th>slice_no</th>\n",
 59 |        "      <th>original_image</th>\n",
 60 |        "      <th>mask_image</th>\n",
 61 |        "      <th>malignancy</th>\n",
 62 |        "      <th>is_cancer</th>\n",
 63 |        "      <th>is_clean</th>\n",
 64 |        "    </tr>\n",
 65 |        "  </thead>\n",
 66 |        "  <tbody>\n",
 67 |        "    <tr>\n",
 68 |        "      <th>0</th>\n",
 69 |        "      <td>1</td>\n",
 70 |        "      <td>0</td>\n",
 71 |        "      <td>0</td>\n",
 72 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice000</td>\n",
 73 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice000</td>\n",
 74 |        "      <td>5</td>\n",
 75 |        "      <td>True</td>\n",
 76 |        "      <td>False</td>\n",
 77 |        "    </tr>\n",
 78 |        "    <tr>\n",
 79 |        "      <th>1</th>\n",
 80 |        "      <td>1</td>\n",
 81 |        "      <td>0</td>\n",
 82 |        "      <td>1</td>\n",
 83 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice001</td>\n",
 84 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice001</td>\n",
 85 |        "      <td>5</td>\n",
 86 |        "      <td>True</td>\n",
 87 |        "      <td>False</td>\n",
 88 |        "    </tr>\n",
 89 |        "    <tr>\n",
 90 |        "      <th>2</th>\n",
 91 |        "      <td>1</td>\n",
 92 |        "      <td>0</td>\n",
 93 |        "      <td>2</td>\n",
 94 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice002</td>\n",
 95 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice002</td>\n",
 96 |        "      <td>5</td>\n",
 97 |        "      <td>True</td>\n",
 98 |        "      <td>False</td>\n",
 99 |        "    </tr>\n",
100 |        "    <tr>\n",
101 |        "      <th>3</th>\n",
102 |        "      <td>1</td>\n",
103 |        "      <td>0</td>\n",
104 |        "      <td>3</td>\n",
105 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice003</td>\n",
106 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice003</td>\n",
107 |        "      <td>5</td>\n",
108 |        "      <td>True</td>\n",
109 |        "      <td>False</td>\n",
110 |        "    </tr>\n",
111 |        "    <tr>\n",
112 |        "      <th>4</th>\n",
113 |        "      <td>1</td>\n",
114 |        "      <td>0</td>\n",
115 |        "      <td>4</td>\n",
116 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice004</td>\n",
117 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice004</td>\n",
118 |        "      <td>5</td>\n",
119 |        "      <td>True</td>\n",
120 |        "      <td>False</td>\n",
121 |        "    </tr>\n",
122 |        "  </tbody>\n",
123 |        "</table>\n",
124 |        "</div>"
125 |       ],
126 |       "text/plain": [
127 |        "   patient_id  nodule_no  slice_no                      original_image  \\\n",
128 |        "0           1          0         0  LIDC-IDRI-0001/0001_NI000_slice000   \n",
129 |        "1           1          0         1  LIDC-IDRI-0001/0001_NI000_slice001   \n",
130 |        "2           1          0         2  LIDC-IDRI-0001/0001_NI000_slice002   \n",
131 |        "3           1          0         3  LIDC-IDRI-0001/0001_NI000_slice003   \n",
132 |        "4           1          0         4  LIDC-IDRI-0001/0001_NI000_slice004   \n",
133 |        "\n",
134 |        "                           mask_image  malignancy is_cancer  is_clean  \n",
135 |        "0  LIDC-IDRI-0001/0001_MA000_slice000           5      True     False  \n",
136 |        "1  LIDC-IDRI-0001/0001_MA000_slice001           5      True     False  \n",
137 |        "2  LIDC-IDRI-0001/0001_MA000_slice002           5      True     False  \n",
138 |        "3  LIDC-IDRI-0001/0001_MA000_slice003           5      True     False  \n",
139 |        "4  LIDC-IDRI-0001/0001_MA000_slice004           5      True     False  "
140 |       ]
141 |      },
142 |      "execution_count": 3,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "meta.head()"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 4,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "# NI= Nodule Image, MA = Mask Original , CN = Clean Nodule , CM = Clean Mask"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 5,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "def is_nodule(row):\n",
167 |     "    if row[20:22] =='NI':\n",
168 |     "        return True\n",
169 |     "    else:\n",
170 |     "        return False"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 6,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "meta['is_nodule']= meta['original_image'].apply(lambda row: is_nodule(row))"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 7,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "# Lets separate Clean meta and meta data\n",
189 |     "clean_meta = meta[meta['is_nodule']==False]\n",
190 |     "clean_meta.reset_index(inplace=True)\n",
191 |     "meta = meta[meta['is_nodule']==True]\n",
192 |     "meta.reset_index(inplace=True)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 8,
198 |    "metadata": {},
199 |    "outputs": [
200 |     {
201 |      "data": {
202 |       "text/html": [
203 |        "<div>\n",
204 |        "<style scoped>\n",
205 |        "    .dataframe tbody tr th:only-of-type {\n",
206 |        "        vertical-align: middle;\n",
207 |        "    }\n",
208 |        "\n",
209 |        "    .dataframe tbody tr th {\n",
210 |        "        vertical-align: top;\n",
211 |        "    }\n",
212 |        "\n",
213 |        "    .dataframe thead th {\n",
214 |        "        text-align: right;\n",
215 |        "    }\n",
216 |        "</style>\n",
217 |        "<table border=\"1\" class=\"dataframe\">\n",
218 |        "  <thead>\n",
219 |        "    <tr style=\"text-align: right;\">\n",
220 |        "      <th></th>\n",
221 |        "      <th>index</th>\n",
222 |        "      <th>patient_id</th>\n",
223 |        "      <th>nodule_no</th>\n",
224 |        "      <th>slice_no</th>\n",
225 |        "      <th>original_image</th>\n",
226 |        "      <th>mask_image</th>\n",
227 |        "      <th>malignancy</th>\n",
228 |        "      <th>is_cancer</th>\n",
229 |        "      <th>is_clean</th>\n",
230 |        "      <th>is_nodule</th>\n",
231 |        "    </tr>\n",
232 |        "  </thead>\n",
233 |        "  <tbody>\n",
234 |        "    <tr>\n",
235 |        "      <th>0</th>\n",
236 |        "      <td>298</td>\n",
237 |        "      <td>28</td>\n",
238 |        "      <td>0</td>\n",
239 |        "      <td>0</td>\n",
240 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice000</td>\n",
241 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice000</td>\n",
242 |        "      <td>0</td>\n",
243 |        "      <td>False</td>\n",
244 |        "      <td>True</td>\n",
245 |        "      <td>False</td>\n",
246 |        "    </tr>\n",
247 |        "    <tr>\n",
248 |        "      <th>1</th>\n",
249 |        "      <td>299</td>\n",
250 |        "      <td>28</td>\n",
251 |        "      <td>1</td>\n",
252 |        "      <td>1</td>\n",
253 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice001</td>\n",
254 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice001</td>\n",
255 |        "      <td>0</td>\n",
256 |        "      <td>False</td>\n",
257 |        "      <td>True</td>\n",
258 |        "      <td>False</td>\n",
259 |        "    </tr>\n",
260 |        "  </tbody>\n",
261 |        "</table>\n",
262 |        "</div>"
263 |       ],
264 |       "text/plain": [
265 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
266 |        "0    298          28          0         0  LIDC-IDRI-0028/0028_CN001_slice000   \n",
267 |        "1    299          28          1         1  LIDC-IDRI-0028/0028_CN001_slice001   \n",
268 |        "\n",
269 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
270 |        "0  LIDC-IDRI-0028/0028_CM001_slice000           0     False      True   \n",
271 |        "1  LIDC-IDRI-0028/0028_CM001_slice001           0     False      True   \n",
272 |        "\n",
273 |        "   is_nodule  \n",
274 |        "0      False  \n",
275 |        "1      False  "
276 |       ]
277 |      },
278 |      "execution_count": 8,
279 |      "metadata": {},
280 |      "output_type": "execute_result"
281 |     }
282 |    ],
283 |    "source": [
284 |     "clean_meta.head(2)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 9,
290 |    "metadata": {},
291 |    "outputs": [
292 |     {
293 |      "data": {
294 |       "text/html": [
295 |        "<div>\n",
296 |        "<style scoped>\n",
297 |        "    .dataframe tbody tr th:only-of-type {\n",
298 |        "        vertical-align: middle;\n",
299 |        "    }\n",
300 |        "\n",
301 |        "    .dataframe tbody tr th {\n",
302 |        "        vertical-align: top;\n",
303 |        "    }\n",
304 |        "\n",
305 |        "    .dataframe thead th {\n",
306 |        "        text-align: right;\n",
307 |        "    }\n",
308 |        "</style>\n",
309 |        "<table border=\"1\" class=\"dataframe\">\n",
310 |        "  <thead>\n",
311 |        "    <tr style=\"text-align: right;\">\n",
312 |        "      <th></th>\n",
313 |        "      <th>index</th>\n",
314 |        "      <th>patient_id</th>\n",
315 |        "      <th>nodule_no</th>\n",
316 |        "      <th>slice_no</th>\n",
317 |        "      <th>original_image</th>\n",
318 |        "      <th>mask_image</th>\n",
319 |        "      <th>malignancy</th>\n",
320 |        "      <th>is_cancer</th>\n",
321 |        "      <th>is_clean</th>\n",
322 |        "      <th>is_nodule</th>\n",
323 |        "    </tr>\n",
324 |        "  </thead>\n",
325 |        "  <tbody>\n",
326 |        "    <tr>\n",
327 |        "      <th>0</th>\n",
328 |        "      <td>0</td>\n",
329 |        "      <td>1</td>\n",
330 |        "      <td>0</td>\n",
331 |        "      <td>0</td>\n",
332 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice000</td>\n",
333 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice000</td>\n",
334 |        "      <td>5</td>\n",
335 |        "      <td>True</td>\n",
336 |        "      <td>False</td>\n",
337 |        "      <td>True</td>\n",
338 |        "    </tr>\n",
339 |        "    <tr>\n",
340 |        "      <th>1</th>\n",
341 |        "      <td>1</td>\n",
342 |        "      <td>1</td>\n",
343 |        "      <td>0</td>\n",
344 |        "      <td>1</td>\n",
345 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice001</td>\n",
346 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice001</td>\n",
347 |        "      <td>5</td>\n",
348 |        "      <td>True</td>\n",
349 |        "      <td>False</td>\n",
350 |        "      <td>True</td>\n",
351 |        "    </tr>\n",
352 |        "  </tbody>\n",
353 |        "</table>\n",
354 |        "</div>"
355 |       ],
356 |       "text/plain": [
357 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
358 |        "0      0           1          0         0  LIDC-IDRI-0001/0001_NI000_slice000   \n",
359 |        "1      1           1          0         1  LIDC-IDRI-0001/0001_NI000_slice001   \n",
360 |        "\n",
361 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
362 |        "0  LIDC-IDRI-0001/0001_MA000_slice000           5      True     False   \n",
363 |        "1  LIDC-IDRI-0001/0001_MA000_slice001           5      True     False   \n",
364 |        "\n",
365 |        "   is_nodule  \n",
366 |        "0       True  \n",
367 |        "1       True  "
368 |       ]
369 |      },
370 |      "execution_count": 9,
371 |      "metadata": {},
372 |      "output_type": "execute_result"
373 |     }
374 |    ],
375 |    "source": [
376 |     "meta.head(2)"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 10,
382 |    "metadata": {},
383 |    "outputs": [],
384 |    "source": [
385 |     "def is_train(row,train,val,test):\n",
386 |     "    if row in train:\n",
387 |     "        return 'Train'\n",
388 |     "    elif row in val:\n",
389 |     "        return 'Validation'\n",
390 |     "    else:\n",
391 |     "        return 'Test'"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": 11,
397 |    "metadata": {},
398 |    "outputs": [],
399 |    "source": [
400 |     "\n",
401 |     "clean_patient_id = list(np.unique(clean_meta['patient_id']))\n",
402 |     "meta_patient_id = list(np.unique(meta['patient_id']))"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 12,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "def create_label_segmentation(meta):\n",
412 |     "    patient_id = list(np.unique(meta['patient_id']))\n",
413 |     "    train_patient , test_patient = train_test_split(patient_id,test_size= 0.2)\n",
414 |     "    train_patient, val_patient = train_test_split(train_patient,test_size= 0.25)\n",
415 |     "    print(len(train_patient),len(val_patient),len(test_patient))\n",
416 |     "    \n",
417 |     "    meta['data_split']= meta['patient_id'].apply(lambda row : is_train(row,train_patient,val_patient,test_patient))\n",
418 |     "    \n",
419 |     "    return meta"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 13,
425 |    "metadata": {},
426 |    "outputs": [
427 |     {
428 |      "name": "stdout",
429 |      "output_type": "stream",
430 |      "text": [
431 |       "504 168 168\n",
432 |       "81 27 27\n"
433 |      ]
434 |     }
435 |    ],
436 |    "source": [
437 |     "# We need to split train/validation/test independently for meta and clean_meta\n",
438 |     "meta = create_label_segmentation(meta)\n",
439 |     "clean_meta = create_label_segmentation(clean_meta)"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 14,
445 |    "metadata": {},
446 |    "outputs": [
447 |     {
448 |      "data": {
449 |       "text/html": [
450 |        "<div>\n",
451 |        "<style scoped>\n",
452 |        "    .dataframe tbody tr th:only-of-type {\n",
453 |        "        vertical-align: middle;\n",
454 |        "    }\n",
455 |        "\n",
456 |        "    .dataframe tbody tr th {\n",
457 |        "        vertical-align: top;\n",
458 |        "    }\n",
459 |        "\n",
460 |        "    .dataframe thead th {\n",
461 |        "        text-align: right;\n",
462 |        "    }\n",
463 |        "</style>\n",
464 |        "<table border=\"1\" class=\"dataframe\">\n",
465 |        "  <thead>\n",
466 |        "    <tr style=\"text-align: right;\">\n",
467 |        "      <th></th>\n",
468 |        "      <th>index</th>\n",
469 |        "      <th>patient_id</th>\n",
470 |        "      <th>nodule_no</th>\n",
471 |        "      <th>slice_no</th>\n",
472 |        "      <th>original_image</th>\n",
473 |        "      <th>mask_image</th>\n",
474 |        "      <th>malignancy</th>\n",
475 |        "      <th>is_cancer</th>\n",
476 |        "      <th>is_clean</th>\n",
477 |        "      <th>is_nodule</th>\n",
478 |        "      <th>data_split</th>\n",
479 |        "    </tr>\n",
480 |        "  </thead>\n",
481 |        "  <tbody>\n",
482 |        "    <tr>\n",
483 |        "      <th>0</th>\n",
484 |        "      <td>0</td>\n",
485 |        "      <td>1</td>\n",
486 |        "      <td>0</td>\n",
487 |        "      <td>0</td>\n",
488 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice000</td>\n",
489 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice000</td>\n",
490 |        "      <td>5</td>\n",
491 |        "      <td>True</td>\n",
492 |        "      <td>False</td>\n",
493 |        "      <td>True</td>\n",
494 |        "      <td>Train</td>\n",
495 |        "    </tr>\n",
496 |        "    <tr>\n",
497 |        "      <th>1</th>\n",
498 |        "      <td>1</td>\n",
499 |        "      <td>1</td>\n",
500 |        "      <td>0</td>\n",
501 |        "      <td>1</td>\n",
502 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice001</td>\n",
503 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice001</td>\n",
504 |        "      <td>5</td>\n",
505 |        "      <td>True</td>\n",
506 |        "      <td>False</td>\n",
507 |        "      <td>True</td>\n",
508 |        "      <td>Train</td>\n",
509 |        "    </tr>\n",
510 |        "    <tr>\n",
511 |        "      <th>2</th>\n",
512 |        "      <td>2</td>\n",
513 |        "      <td>1</td>\n",
514 |        "      <td>0</td>\n",
515 |        "      <td>2</td>\n",
516 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice002</td>\n",
517 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice002</td>\n",
518 |        "      <td>5</td>\n",
519 |        "      <td>True</td>\n",
520 |        "      <td>False</td>\n",
521 |        "      <td>True</td>\n",
522 |        "      <td>Train</td>\n",
523 |        "    </tr>\n",
524 |        "    <tr>\n",
525 |        "      <th>3</th>\n",
526 |        "      <td>3</td>\n",
527 |        "      <td>1</td>\n",
528 |        "      <td>0</td>\n",
529 |        "      <td>3</td>\n",
530 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice003</td>\n",
531 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice003</td>\n",
532 |        "      <td>5</td>\n",
533 |        "      <td>True</td>\n",
534 |        "      <td>False</td>\n",
535 |        "      <td>True</td>\n",
536 |        "      <td>Train</td>\n",
537 |        "    </tr>\n",
538 |        "    <tr>\n",
539 |        "      <th>4</th>\n",
540 |        "      <td>4</td>\n",
541 |        "      <td>1</td>\n",
542 |        "      <td>0</td>\n",
543 |        "      <td>4</td>\n",
544 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice004</td>\n",
545 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice004</td>\n",
546 |        "      <td>5</td>\n",
547 |        "      <td>True</td>\n",
548 |        "      <td>False</td>\n",
549 |        "      <td>True</td>\n",
550 |        "      <td>Train</td>\n",
551 |        "    </tr>\n",
552 |        "  </tbody>\n",
553 |        "</table>\n",
554 |        "</div>"
555 |       ],
556 |       "text/plain": [
557 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
558 |        "0      0           1          0         0  LIDC-IDRI-0001/0001_NI000_slice000   \n",
559 |        "1      1           1          0         1  LIDC-IDRI-0001/0001_NI000_slice001   \n",
560 |        "2      2           1          0         2  LIDC-IDRI-0001/0001_NI000_slice002   \n",
561 |        "3      3           1          0         3  LIDC-IDRI-0001/0001_NI000_slice003   \n",
562 |        "4      4           1          0         4  LIDC-IDRI-0001/0001_NI000_slice004   \n",
563 |        "\n",
564 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
565 |        "0  LIDC-IDRI-0001/0001_MA000_slice000           5      True     False   \n",
566 |        "1  LIDC-IDRI-0001/0001_MA000_slice001           5      True     False   \n",
567 |        "2  LIDC-IDRI-0001/0001_MA000_slice002           5      True     False   \n",
568 |        "3  LIDC-IDRI-0001/0001_MA000_slice003           5      True     False   \n",
569 |        "4  LIDC-IDRI-0001/0001_MA000_slice004           5      True     False   \n",
570 |        "\n",
571 |        "   is_nodule data_split  \n",
572 |        "0       True      Train  \n",
573 |        "1       True      Train  \n",
574 |        "2       True      Train  \n",
575 |        "3       True      Train  \n",
576 |        "4       True      Train  "
577 |       ]
578 |      },
579 |      "execution_count": 14,
580 |      "metadata": {},
581 |      "output_type": "execute_result"
582 |     }
583 |    ],
584 |    "source": [
585 |     "meta.head()"
586 |    ]
587 |   },
588 |   {
589 |    "cell_type": "code",
590 |    "execution_count": 15,
591 |    "metadata": {},
592 |    "outputs": [
593 |     {
594 |      "data": {
595 |       "text/html": [
596 |        "<div>\n",
597 |        "<style scoped>\n",
598 |        "    .dataframe tbody tr th:only-of-type {\n",
599 |        "        vertical-align: middle;\n",
600 |        "    }\n",
601 |        "\n",
602 |        "    .dataframe tbody tr th {\n",
603 |        "        vertical-align: top;\n",
604 |        "    }\n",
605 |        "\n",
606 |        "    .dataframe thead th {\n",
607 |        "        text-align: right;\n",
608 |        "    }\n",
609 |        "</style>\n",
610 |        "<table border=\"1\" class=\"dataframe\">\n",
611 |        "  <thead>\n",
612 |        "    <tr style=\"text-align: right;\">\n",
613 |        "      <th></th>\n",
614 |        "      <th>index</th>\n",
615 |        "      <th>patient_id</th>\n",
616 |        "      <th>nodule_no</th>\n",
617 |        "      <th>slice_no</th>\n",
618 |        "      <th>original_image</th>\n",
619 |        "      <th>mask_image</th>\n",
620 |        "      <th>malignancy</th>\n",
621 |        "      <th>is_cancer</th>\n",
622 |        "      <th>is_clean</th>\n",
623 |        "      <th>is_nodule</th>\n",
624 |        "      <th>data_split</th>\n",
625 |        "    </tr>\n",
626 |        "  </thead>\n",
627 |        "  <tbody>\n",
628 |        "    <tr>\n",
629 |        "      <th>0</th>\n",
630 |        "      <td>298</td>\n",
631 |        "      <td>28</td>\n",
632 |        "      <td>0</td>\n",
633 |        "      <td>0</td>\n",
634 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice000</td>\n",
635 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice000</td>\n",
636 |        "      <td>0</td>\n",
637 |        "      <td>False</td>\n",
638 |        "      <td>True</td>\n",
639 |        "      <td>False</td>\n",
640 |        "      <td>Train</td>\n",
641 |        "    </tr>\n",
642 |        "    <tr>\n",
643 |        "      <th>1</th>\n",
644 |        "      <td>299</td>\n",
645 |        "      <td>28</td>\n",
646 |        "      <td>1</td>\n",
647 |        "      <td>1</td>\n",
648 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice001</td>\n",
649 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice001</td>\n",
650 |        "      <td>0</td>\n",
651 |        "      <td>False</td>\n",
652 |        "      <td>True</td>\n",
653 |        "      <td>False</td>\n",
654 |        "      <td>Train</td>\n",
655 |        "    </tr>\n",
656 |        "    <tr>\n",
657 |        "      <th>2</th>\n",
658 |        "      <td>300</td>\n",
659 |        "      <td>28</td>\n",
660 |        "      <td>2</td>\n",
661 |        "      <td>2</td>\n",
662 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice002</td>\n",
663 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice002</td>\n",
664 |        "      <td>0</td>\n",
665 |        "      <td>False</td>\n",
666 |        "      <td>True</td>\n",
667 |        "      <td>False</td>\n",
668 |        "      <td>Train</td>\n",
669 |        "    </tr>\n",
670 |        "    <tr>\n",
671 |        "      <th>3</th>\n",
672 |        "      <td>301</td>\n",
673 |        "      <td>28</td>\n",
674 |        "      <td>3</td>\n",
675 |        "      <td>3</td>\n",
676 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice003</td>\n",
677 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice003</td>\n",
678 |        "      <td>0</td>\n",
679 |        "      <td>False</td>\n",
680 |        "      <td>True</td>\n",
681 |        "      <td>False</td>\n",
682 |        "      <td>Train</td>\n",
683 |        "    </tr>\n",
684 |        "    <tr>\n",
685 |        "      <th>4</th>\n",
686 |        "      <td>302</td>\n",
687 |        "      <td>28</td>\n",
688 |        "      <td>4</td>\n",
689 |        "      <td>4</td>\n",
690 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice004</td>\n",
691 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice004</td>\n",
692 |        "      <td>0</td>\n",
693 |        "      <td>False</td>\n",
694 |        "      <td>True</td>\n",
695 |        "      <td>False</td>\n",
696 |        "      <td>Train</td>\n",
697 |        "    </tr>\n",
698 |        "  </tbody>\n",
699 |        "</table>\n",
700 |        "</div>"
701 |       ],
702 |       "text/plain": [
703 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
704 |        "0    298          28          0         0  LIDC-IDRI-0028/0028_CN001_slice000   \n",
705 |        "1    299          28          1         1  LIDC-IDRI-0028/0028_CN001_slice001   \n",
706 |        "2    300          28          2         2  LIDC-IDRI-0028/0028_CN001_slice002   \n",
707 |        "3    301          28          3         3  LIDC-IDRI-0028/0028_CN001_slice003   \n",
708 |        "4    302          28          4         4  LIDC-IDRI-0028/0028_CN001_slice004   \n",
709 |        "\n",
710 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
711 |        "0  LIDC-IDRI-0028/0028_CM001_slice000           0     False      True   \n",
712 |        "1  LIDC-IDRI-0028/0028_CM001_slice001           0     False      True   \n",
713 |        "2  LIDC-IDRI-0028/0028_CM001_slice002           0     False      True   \n",
714 |        "3  LIDC-IDRI-0028/0028_CM001_slice003           0     False      True   \n",
715 |        "4  LIDC-IDRI-0028/0028_CM001_slice004           0     False      True   \n",
716 |        "\n",
717 |        "   is_nodule data_split  \n",
718 |        "0      False      Train  \n",
719 |        "1      False      Train  \n",
720 |        "2      False      Train  \n",
721 |        "3      False      Train  \n",
722 |        "4      False      Train  "
723 |       ]
724 |      },
725 |      "execution_count": 15,
726 |      "metadata": {},
727 |      "output_type": "execute_result"
728 |     }
729 |    ],
730 |    "source": [
731 |     "clean_meta.head()"
732 |    ]
733 |   },
734 |   {
735 |    "cell_type": "code",
736 |    "execution_count": 16,
737 |    "metadata": {},
738 |    "outputs": [],
739 |    "source": [
740 |     "# clean_meta stores only the meta information of patients without nodules."
741 |    ]
742 |   },
743 |   {
744 |    "cell_type": "code",
745 |    "execution_count": 17,
746 |    "metadata": {},
747 |    "outputs": [],
748 |    "source": [
749 |     "meta.to_csv('/home/LUNG_DATA/meta_csv/meta.csv')\n",
750 |     "clean_meta.to_csv('/home/LUNG_DATA/meta_csv/clean_meta.csv')"
751 |    ]
752 |   },
753 |   {
754 |    "cell_type": "code",
755 |    "execution_count": null,
756 |    "metadata": {},
757 |    "outputs": [],
758 |    "source": []
759 |   }
760 |  ],
761 |  "metadata": {
762 |   "kernelspec": {
763 |    "display_name": "Python 3",
764 |    "language": "python",
765 |    "name": "python3"
766 |   },
767 |   "language_info": {
768 |    "codemirror_mode": {
769 |     "name": "ipython",
770 |     "version": 3
771 |    },
772 |    "file_extension": ".py",
773 |    "mimetype": "text/x-python",
774 |    "name": "python",
775 |    "nbconvert_exporter": "python",
776 |    "pygments_lexer": "ipython3",
777 |    "version": "3.6.10"
778 |   }
779 |  },
780 |  "nbformat": 4,
781 |  "nbformat_minor": 4
782 | }
783 | 


--------------------------------------------------------------------------------
/notebook/.ipynb_checkpoints/make_label-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Create labels for future training"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import pandas as pd\n",
 17 |     "import numpy as np\n",
 18 |     "from sklearn.model_selection import train_test_split"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "meta = pd.read_csv('../meta_info.csv')"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 3,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "data": {
 37 |       "text/html": [
 38 |        "<div>\n",
 39 |        "<style scoped>\n",
 40 |        "    .dataframe tbody tr th:only-of-type {\n",
 41 |        "        vertical-align: middle;\n",
 42 |        "    }\n",
 43 |        "\n",
 44 |        "    .dataframe tbody tr th {\n",
 45 |        "        vertical-align: top;\n",
 46 |        "    }\n",
 47 |        "\n",
 48 |        "    .dataframe thead th {\n",
 49 |        "        text-align: right;\n",
 50 |        "    }\n",
 51 |        "</style>\n",
 52 |        "<table border=\"1\" class=\"dataframe\">\n",
 53 |        "  <thead>\n",
 54 |        "    <tr style=\"text-align: right;\">\n",
 55 |        "      <th></th>\n",
 56 |        "      <th>patient_id</th>\n",
 57 |        "      <th>nodule_no</th>\n",
 58 |        "      <th>slice_no</th>\n",
 59 |        "      <th>original_image</th>\n",
 60 |        "      <th>mask_image</th>\n",
 61 |        "      <th>malignancy</th>\n",
 62 |        "      <th>is_cancer</th>\n",
 63 |        "      <th>is_clean</th>\n",
 64 |        "    </tr>\n",
 65 |        "  </thead>\n",
 66 |        "  <tbody>\n",
 67 |        "    <tr>\n",
 68 |        "      <th>0</th>\n",
 69 |        "      <td>1</td>\n",
 70 |        "      <td>0</td>\n",
 71 |        "      <td>0</td>\n",
 72 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice000</td>\n",
 73 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice000</td>\n",
 74 |        "      <td>5</td>\n",
 75 |        "      <td>True</td>\n",
 76 |        "      <td>False</td>\n",
 77 |        "    </tr>\n",
 78 |        "    <tr>\n",
 79 |        "      <th>1</th>\n",
 80 |        "      <td>1</td>\n",
 81 |        "      <td>0</td>\n",
 82 |        "      <td>1</td>\n",
 83 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice001</td>\n",
 84 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice001</td>\n",
 85 |        "      <td>5</td>\n",
 86 |        "      <td>True</td>\n",
 87 |        "      <td>False</td>\n",
 88 |        "    </tr>\n",
 89 |        "    <tr>\n",
 90 |        "      <th>2</th>\n",
 91 |        "      <td>1</td>\n",
 92 |        "      <td>0</td>\n",
 93 |        "      <td>2</td>\n",
 94 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice002</td>\n",
 95 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice002</td>\n",
 96 |        "      <td>5</td>\n",
 97 |        "      <td>True</td>\n",
 98 |        "      <td>False</td>\n",
 99 |        "    </tr>\n",
100 |        "    <tr>\n",
101 |        "      <th>3</th>\n",
102 |        "      <td>1</td>\n",
103 |        "      <td>0</td>\n",
104 |        "      <td>3</td>\n",
105 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice003</td>\n",
106 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice003</td>\n",
107 |        "      <td>5</td>\n",
108 |        "      <td>True</td>\n",
109 |        "      <td>False</td>\n",
110 |        "    </tr>\n",
111 |        "    <tr>\n",
112 |        "      <th>4</th>\n",
113 |        "      <td>1</td>\n",
114 |        "      <td>0</td>\n",
115 |        "      <td>4</td>\n",
116 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice004</td>\n",
117 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice004</td>\n",
118 |        "      <td>5</td>\n",
119 |        "      <td>True</td>\n",
120 |        "      <td>False</td>\n",
121 |        "    </tr>\n",
122 |        "  </tbody>\n",
123 |        "</table>\n",
124 |        "</div>"
125 |       ],
126 |       "text/plain": [
127 |        "   patient_id  nodule_no  slice_no                      original_image  \\\n",
128 |        "0           1          0         0  LIDC-IDRI-0001/0001_NI000_slice000   \n",
129 |        "1           1          0         1  LIDC-IDRI-0001/0001_NI000_slice001   \n",
130 |        "2           1          0         2  LIDC-IDRI-0001/0001_NI000_slice002   \n",
131 |        "3           1          0         3  LIDC-IDRI-0001/0001_NI000_slice003   \n",
132 |        "4           1          0         4  LIDC-IDRI-0001/0001_NI000_slice004   \n",
133 |        "\n",
134 |        "                           mask_image  malignancy is_cancer  is_clean  \n",
135 |        "0  LIDC-IDRI-0001/0001_MA000_slice000           5      True     False  \n",
136 |        "1  LIDC-IDRI-0001/0001_MA000_slice001           5      True     False  \n",
137 |        "2  LIDC-IDRI-0001/0001_MA000_slice002           5      True     False  \n",
138 |        "3  LIDC-IDRI-0001/0001_MA000_slice003           5      True     False  \n",
139 |        "4  LIDC-IDRI-0001/0001_MA000_slice004           5      True     False  "
140 |       ]
141 |      },
142 |      "execution_count": 3,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "meta.head()"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 4,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "# NI = Nodule Image, MA = Mask Original, CN = Clean Nodule, CM = Clean Mask"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 5,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "def is_nodule(row):\n",
167 |     "    if row[20:22] =='NI':\n",
168 |     "        return True\n",
169 |     "    else:\n",
170 |     "        return False"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 6,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "meta['is_nodule']= meta['original_image'].apply(lambda row: is_nodule(row))"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 7,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "# Let's separate the clean rows (clean_meta) from the nodule rows (meta)\n",
189 |     "clean_meta = meta[meta['is_nodule']==False]\n",
190 |     "clean_meta.reset_index(inplace=True)\n",
191 |     "meta = meta[meta['is_nodule']==True]\n",
192 |     "meta.reset_index(inplace=True)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 8,
198 |    "metadata": {},
199 |    "outputs": [
200 |     {
201 |      "data": {
202 |       "text/html": [
203 |        "<div>\n",
204 |        "<style scoped>\n",
205 |        "    .dataframe tbody tr th:only-of-type {\n",
206 |        "        vertical-align: middle;\n",
207 |        "    }\n",
208 |        "\n",
209 |        "    .dataframe tbody tr th {\n",
210 |        "        vertical-align: top;\n",
211 |        "    }\n",
212 |        "\n",
213 |        "    .dataframe thead th {\n",
214 |        "        text-align: right;\n",
215 |        "    }\n",
216 |        "</style>\n",
217 |        "<table border=\"1\" class=\"dataframe\">\n",
218 |        "  <thead>\n",
219 |        "    <tr style=\"text-align: right;\">\n",
220 |        "      <th></th>\n",
221 |        "      <th>index</th>\n",
222 |        "      <th>patient_id</th>\n",
223 |        "      <th>nodule_no</th>\n",
224 |        "      <th>slice_no</th>\n",
225 |        "      <th>original_image</th>\n",
226 |        "      <th>mask_image</th>\n",
227 |        "      <th>malignancy</th>\n",
228 |        "      <th>is_cancer</th>\n",
229 |        "      <th>is_clean</th>\n",
230 |        "      <th>is_nodule</th>\n",
231 |        "    </tr>\n",
232 |        "  </thead>\n",
233 |        "  <tbody>\n",
234 |        "    <tr>\n",
235 |        "      <th>0</th>\n",
236 |        "      <td>298</td>\n",
237 |        "      <td>28</td>\n",
238 |        "      <td>0</td>\n",
239 |        "      <td>0</td>\n",
240 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice000</td>\n",
241 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice000</td>\n",
242 |        "      <td>0</td>\n",
243 |        "      <td>False</td>\n",
244 |        "      <td>True</td>\n",
245 |        "      <td>False</td>\n",
246 |        "    </tr>\n",
247 |        "    <tr>\n",
248 |        "      <th>1</th>\n",
249 |        "      <td>299</td>\n",
250 |        "      <td>28</td>\n",
251 |        "      <td>1</td>\n",
252 |        "      <td>1</td>\n",
253 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice001</td>\n",
254 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice001</td>\n",
255 |        "      <td>0</td>\n",
256 |        "      <td>False</td>\n",
257 |        "      <td>True</td>\n",
258 |        "      <td>False</td>\n",
259 |        "    </tr>\n",
260 |        "  </tbody>\n",
261 |        "</table>\n",
262 |        "</div>"
263 |       ],
264 |       "text/plain": [
265 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
266 |        "0    298          28          0         0  LIDC-IDRI-0028/0028_CN001_slice000   \n",
267 |        "1    299          28          1         1  LIDC-IDRI-0028/0028_CN001_slice001   \n",
268 |        "\n",
269 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
270 |        "0  LIDC-IDRI-0028/0028_CM001_slice000           0     False      True   \n",
271 |        "1  LIDC-IDRI-0028/0028_CM001_slice001           0     False      True   \n",
272 |        "\n",
273 |        "   is_nodule  \n",
274 |        "0      False  \n",
275 |        "1      False  "
276 |       ]
277 |      },
278 |      "execution_count": 8,
279 |      "metadata": {},
280 |      "output_type": "execute_result"
281 |     }
282 |    ],
283 |    "source": [
284 |     "clean_meta.head(2)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 9,
290 |    "metadata": {},
291 |    "outputs": [
292 |     {
293 |      "data": {
294 |       "text/html": [
295 |        "<div>\n",
296 |        "<style scoped>\n",
297 |        "    .dataframe tbody tr th:only-of-type {\n",
298 |        "        vertical-align: middle;\n",
299 |        "    }\n",
300 |        "\n",
301 |        "    .dataframe tbody tr th {\n",
302 |        "        vertical-align: top;\n",
303 |        "    }\n",
304 |        "\n",
305 |        "    .dataframe thead th {\n",
306 |        "        text-align: right;\n",
307 |        "    }\n",
308 |        "</style>\n",
309 |        "<table border=\"1\" class=\"dataframe\">\n",
310 |        "  <thead>\n",
311 |        "    <tr style=\"text-align: right;\">\n",
312 |        "      <th></th>\n",
313 |        "      <th>index</th>\n",
314 |        "      <th>patient_id</th>\n",
315 |        "      <th>nodule_no</th>\n",
316 |        "      <th>slice_no</th>\n",
317 |        "      <th>original_image</th>\n",
318 |        "      <th>mask_image</th>\n",
319 |        "      <th>malignancy</th>\n",
320 |        "      <th>is_cancer</th>\n",
321 |        "      <th>is_clean</th>\n",
322 |        "      <th>is_nodule</th>\n",
323 |        "    </tr>\n",
324 |        "  </thead>\n",
325 |        "  <tbody>\n",
326 |        "    <tr>\n",
327 |        "      <th>0</th>\n",
328 |        "      <td>0</td>\n",
329 |        "      <td>1</td>\n",
330 |        "      <td>0</td>\n",
331 |        "      <td>0</td>\n",
332 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice000</td>\n",
333 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice000</td>\n",
334 |        "      <td>5</td>\n",
335 |        "      <td>True</td>\n",
336 |        "      <td>False</td>\n",
337 |        "      <td>True</td>\n",
338 |        "    </tr>\n",
339 |        "    <tr>\n",
340 |        "      <th>1</th>\n",
341 |        "      <td>1</td>\n",
342 |        "      <td>1</td>\n",
343 |        "      <td>0</td>\n",
344 |        "      <td>1</td>\n",
345 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice001</td>\n",
346 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice001</td>\n",
347 |        "      <td>5</td>\n",
348 |        "      <td>True</td>\n",
349 |        "      <td>False</td>\n",
350 |        "      <td>True</td>\n",
351 |        "    </tr>\n",
352 |        "  </tbody>\n",
353 |        "</table>\n",
354 |        "</div>"
355 |       ],
356 |       "text/plain": [
357 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
358 |        "0      0           1          0         0  LIDC-IDRI-0001/0001_NI000_slice000   \n",
359 |        "1      1           1          0         1  LIDC-IDRI-0001/0001_NI000_slice001   \n",
360 |        "\n",
361 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
362 |        "0  LIDC-IDRI-0001/0001_MA000_slice000           5      True     False   \n",
363 |        "1  LIDC-IDRI-0001/0001_MA000_slice001           5      True     False   \n",
364 |        "\n",
365 |        "   is_nodule  \n",
366 |        "0       True  \n",
367 |        "1       True  "
368 |       ]
369 |      },
370 |      "execution_count": 9,
371 |      "metadata": {},
372 |      "output_type": "execute_result"
373 |     }
374 |    ],
375 |    "source": [
376 |     "meta.head(2)"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": 10,
382 |    "metadata": {},
383 |    "outputs": [],
384 |    "source": [
385 |     "def is_train(row, train, val, test):\n",
386 |     "    \"\"\"Return 'Train' if `row` is in `train`, 'Validation' if it is in `val`, otherwise 'Test'.\"\"\"\n",
387 |     "    # `test` is never consulted: anything outside `train` and `val` is labelled 'Test'.\n",
388 |     "    if row in train:\n",
389 |     "        return 'Train'\n",
390 |     "    # Only reached when `row` is not a training id.\n",
391 |     "    return 'Validation' if row in val else 'Test'"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": 11,
397 |    "metadata": {},
398 |    "outputs": [],
399 |    "source": [
400 |     "\n",
401 |     "clean_patient_id = list(np.unique(clean_meta['patient_id']))\n",
402 |     "meta_patient_id = list(np.unique(meta['patient_id']))"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 12,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "def create_label_segmentation(meta, random_state=None):\n",
412 |     "    \"\"\"Add a patient-level `data_split` column ('Train'/'Validation'/'Test') to `meta` in place and return it; pass `random_state` for a reproducible split.\"\"\"\n",
413 |     "    patient_id = list(np.unique(meta['patient_id']))\n",
414 |     "    train_patient, test_patient = train_test_split(patient_id, test_size=0.2, random_state=random_state)  # hold out 20% of patients for test\n",
415 |     "    train_patient, val_patient = train_test_split(train_patient, test_size=0.25, random_state=random_state)  # 25% of the rest -> ~60/20/20 overall\n",
416 |     "    print(len(train_patient), len(val_patient), len(test_patient))\n",
417 |     "    train_ids, val_ids = set(train_patient), set(val_patient)  # O(1) membership tests for the per-row lookup below\n",
418 |     "    meta['data_split'] = meta['patient_id'].apply(lambda row: is_train(row, train_ids, val_ids, test_patient))\n",
419 |     "    return meta"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 13,
425 |    "metadata": {},
426 |    "outputs": [
427 |     {
428 |      "name": "stdout",
429 |      "output_type": "stream",
430 |      "text": [
431 |       "504 168 168\n",
432 |       "81 27 27\n"
433 |      ]
434 |     }
435 |    ],
436 |    "source": [
437 |     "# Split meta and clean_meta independently: each call draws its own patient-level train/validation/test split.\n",
438 |     "meta = create_label_segmentation(meta)\n",
439 |     "clean_meta = create_label_segmentation(clean_meta)"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 14,
445 |    "metadata": {},
446 |    "outputs": [
447 |     {
448 |      "data": {
449 |       "text/html": [
450 |        "<div>\n",
451 |        "<style scoped>\n",
452 |        "    .dataframe tbody tr th:only-of-type {\n",
453 |        "        vertical-align: middle;\n",
454 |        "    }\n",
455 |        "\n",
456 |        "    .dataframe tbody tr th {\n",
457 |        "        vertical-align: top;\n",
458 |        "    }\n",
459 |        "\n",
460 |        "    .dataframe thead th {\n",
461 |        "        text-align: right;\n",
462 |        "    }\n",
463 |        "</style>\n",
464 |        "<table border=\"1\" class=\"dataframe\">\n",
465 |        "  <thead>\n",
466 |        "    <tr style=\"text-align: right;\">\n",
467 |        "      <th></th>\n",
468 |        "      <th>index</th>\n",
469 |        "      <th>patient_id</th>\n",
470 |        "      <th>nodule_no</th>\n",
471 |        "      <th>slice_no</th>\n",
472 |        "      <th>original_image</th>\n",
473 |        "      <th>mask_image</th>\n",
474 |        "      <th>malignancy</th>\n",
475 |        "      <th>is_cancer</th>\n",
476 |        "      <th>is_clean</th>\n",
477 |        "      <th>is_nodule</th>\n",
478 |        "      <th>data_split</th>\n",
479 |        "    </tr>\n",
480 |        "  </thead>\n",
481 |        "  <tbody>\n",
482 |        "    <tr>\n",
483 |        "      <th>0</th>\n",
484 |        "      <td>0</td>\n",
485 |        "      <td>1</td>\n",
486 |        "      <td>0</td>\n",
487 |        "      <td>0</td>\n",
488 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice000</td>\n",
489 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice000</td>\n",
490 |        "      <td>5</td>\n",
491 |        "      <td>True</td>\n",
492 |        "      <td>False</td>\n",
493 |        "      <td>True</td>\n",
494 |        "      <td>Train</td>\n",
495 |        "    </tr>\n",
496 |        "    <tr>\n",
497 |        "      <th>1</th>\n",
498 |        "      <td>1</td>\n",
499 |        "      <td>1</td>\n",
500 |        "      <td>0</td>\n",
501 |        "      <td>1</td>\n",
502 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice001</td>\n",
503 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice001</td>\n",
504 |        "      <td>5</td>\n",
505 |        "      <td>True</td>\n",
506 |        "      <td>False</td>\n",
507 |        "      <td>True</td>\n",
508 |        "      <td>Train</td>\n",
509 |        "    </tr>\n",
510 |        "    <tr>\n",
511 |        "      <th>2</th>\n",
512 |        "      <td>2</td>\n",
513 |        "      <td>1</td>\n",
514 |        "      <td>0</td>\n",
515 |        "      <td>2</td>\n",
516 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice002</td>\n",
517 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice002</td>\n",
518 |        "      <td>5</td>\n",
519 |        "      <td>True</td>\n",
520 |        "      <td>False</td>\n",
521 |        "      <td>True</td>\n",
522 |        "      <td>Train</td>\n",
523 |        "    </tr>\n",
524 |        "    <tr>\n",
525 |        "      <th>3</th>\n",
526 |        "      <td>3</td>\n",
527 |        "      <td>1</td>\n",
528 |        "      <td>0</td>\n",
529 |        "      <td>3</td>\n",
530 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice003</td>\n",
531 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice003</td>\n",
532 |        "      <td>5</td>\n",
533 |        "      <td>True</td>\n",
534 |        "      <td>False</td>\n",
535 |        "      <td>True</td>\n",
536 |        "      <td>Train</td>\n",
537 |        "    </tr>\n",
538 |        "    <tr>\n",
539 |        "      <th>4</th>\n",
540 |        "      <td>4</td>\n",
541 |        "      <td>1</td>\n",
542 |        "      <td>0</td>\n",
543 |        "      <td>4</td>\n",
544 |        "      <td>LIDC-IDRI-0001/0001_NI000_slice004</td>\n",
545 |        "      <td>LIDC-IDRI-0001/0001_MA000_slice004</td>\n",
546 |        "      <td>5</td>\n",
547 |        "      <td>True</td>\n",
548 |        "      <td>False</td>\n",
549 |        "      <td>True</td>\n",
550 |        "      <td>Train</td>\n",
551 |        "    </tr>\n",
552 |        "  </tbody>\n",
553 |        "</table>\n",
554 |        "</div>"
555 |       ],
556 |       "text/plain": [
557 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
558 |        "0      0           1          0         0  LIDC-IDRI-0001/0001_NI000_slice000   \n",
559 |        "1      1           1          0         1  LIDC-IDRI-0001/0001_NI000_slice001   \n",
560 |        "2      2           1          0         2  LIDC-IDRI-0001/0001_NI000_slice002   \n",
561 |        "3      3           1          0         3  LIDC-IDRI-0001/0001_NI000_slice003   \n",
562 |        "4      4           1          0         4  LIDC-IDRI-0001/0001_NI000_slice004   \n",
563 |        "\n",
564 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
565 |        "0  LIDC-IDRI-0001/0001_MA000_slice000           5      True     False   \n",
566 |        "1  LIDC-IDRI-0001/0001_MA000_slice001           5      True     False   \n",
567 |        "2  LIDC-IDRI-0001/0001_MA000_slice002           5      True     False   \n",
568 |        "3  LIDC-IDRI-0001/0001_MA000_slice003           5      True     False   \n",
569 |        "4  LIDC-IDRI-0001/0001_MA000_slice004           5      True     False   \n",
570 |        "\n",
571 |        "   is_nodule data_split  \n",
572 |        "0       True      Train  \n",
573 |        "1       True      Train  \n",
574 |        "2       True      Train  \n",
575 |        "3       True      Train  \n",
576 |        "4       True      Train  "
577 |       ]
578 |      },
579 |      "execution_count": 14,
580 |      "metadata": {},
581 |      "output_type": "execute_result"
582 |     }
583 |    ],
584 |    "source": [
585 |     "meta.head()"
586 |    ]
587 |   },
588 |   {
589 |    "cell_type": "code",
590 |    "execution_count": 15,
591 |    "metadata": {},
592 |    "outputs": [
593 |     {
594 |      "data": {
595 |       "text/html": [
596 |        "<div>\n",
597 |        "<style scoped>\n",
598 |        "    .dataframe tbody tr th:only-of-type {\n",
599 |        "        vertical-align: middle;\n",
600 |        "    }\n",
601 |        "\n",
602 |        "    .dataframe tbody tr th {\n",
603 |        "        vertical-align: top;\n",
604 |        "    }\n",
605 |        "\n",
606 |        "    .dataframe thead th {\n",
607 |        "        text-align: right;\n",
608 |        "    }\n",
609 |        "</style>\n",
610 |        "<table border=\"1\" class=\"dataframe\">\n",
611 |        "  <thead>\n",
612 |        "    <tr style=\"text-align: right;\">\n",
613 |        "      <th></th>\n",
614 |        "      <th>index</th>\n",
615 |        "      <th>patient_id</th>\n",
616 |        "      <th>nodule_no</th>\n",
617 |        "      <th>slice_no</th>\n",
618 |        "      <th>original_image</th>\n",
619 |        "      <th>mask_image</th>\n",
620 |        "      <th>malignancy</th>\n",
621 |        "      <th>is_cancer</th>\n",
622 |        "      <th>is_clean</th>\n",
623 |        "      <th>is_nodule</th>\n",
624 |        "      <th>data_split</th>\n",
625 |        "    </tr>\n",
626 |        "  </thead>\n",
627 |        "  <tbody>\n",
628 |        "    <tr>\n",
629 |        "      <th>0</th>\n",
630 |        "      <td>298</td>\n",
631 |        "      <td>28</td>\n",
632 |        "      <td>0</td>\n",
633 |        "      <td>0</td>\n",
634 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice000</td>\n",
635 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice000</td>\n",
636 |        "      <td>0</td>\n",
637 |        "      <td>False</td>\n",
638 |        "      <td>True</td>\n",
639 |        "      <td>False</td>\n",
640 |        "      <td>Train</td>\n",
641 |        "    </tr>\n",
642 |        "    <tr>\n",
643 |        "      <th>1</th>\n",
644 |        "      <td>299</td>\n",
645 |        "      <td>28</td>\n",
646 |        "      <td>1</td>\n",
647 |        "      <td>1</td>\n",
648 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice001</td>\n",
649 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice001</td>\n",
650 |        "      <td>0</td>\n",
651 |        "      <td>False</td>\n",
652 |        "      <td>True</td>\n",
653 |        "      <td>False</td>\n",
654 |        "      <td>Train</td>\n",
655 |        "    </tr>\n",
656 |        "    <tr>\n",
657 |        "      <th>2</th>\n",
658 |        "      <td>300</td>\n",
659 |        "      <td>28</td>\n",
660 |        "      <td>2</td>\n",
661 |        "      <td>2</td>\n",
662 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice002</td>\n",
663 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice002</td>\n",
664 |        "      <td>0</td>\n",
665 |        "      <td>False</td>\n",
666 |        "      <td>True</td>\n",
667 |        "      <td>False</td>\n",
668 |        "      <td>Train</td>\n",
669 |        "    </tr>\n",
670 |        "    <tr>\n",
671 |        "      <th>3</th>\n",
672 |        "      <td>301</td>\n",
673 |        "      <td>28</td>\n",
674 |        "      <td>3</td>\n",
675 |        "      <td>3</td>\n",
676 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice003</td>\n",
677 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice003</td>\n",
678 |        "      <td>0</td>\n",
679 |        "      <td>False</td>\n",
680 |        "      <td>True</td>\n",
681 |        "      <td>False</td>\n",
682 |        "      <td>Train</td>\n",
683 |        "    </tr>\n",
684 |        "    <tr>\n",
685 |        "      <th>4</th>\n",
686 |        "      <td>302</td>\n",
687 |        "      <td>28</td>\n",
688 |        "      <td>4</td>\n",
689 |        "      <td>4</td>\n",
690 |        "      <td>LIDC-IDRI-0028/0028_CN001_slice004</td>\n",
691 |        "      <td>LIDC-IDRI-0028/0028_CM001_slice004</td>\n",
692 |        "      <td>0</td>\n",
693 |        "      <td>False</td>\n",
694 |        "      <td>True</td>\n",
695 |        "      <td>False</td>\n",
696 |        "      <td>Train</td>\n",
697 |        "    </tr>\n",
698 |        "  </tbody>\n",
699 |        "</table>\n",
700 |        "</div>"
701 |       ],
702 |       "text/plain": [
703 |        "   index  patient_id  nodule_no  slice_no                      original_image  \\\n",
704 |        "0    298          28          0         0  LIDC-IDRI-0028/0028_CN001_slice000   \n",
705 |        "1    299          28          1         1  LIDC-IDRI-0028/0028_CN001_slice001   \n",
706 |        "2    300          28          2         2  LIDC-IDRI-0028/0028_CN001_slice002   \n",
707 |        "3    301          28          3         3  LIDC-IDRI-0028/0028_CN001_slice003   \n",
708 |        "4    302          28          4         4  LIDC-IDRI-0028/0028_CN001_slice004   \n",
709 |        "\n",
710 |        "                           mask_image  malignancy is_cancer  is_clean  \\\n",
711 |        "0  LIDC-IDRI-0028/0028_CM001_slice000           0     False      True   \n",
712 |        "1  LIDC-IDRI-0028/0028_CM001_slice001           0     False      True   \n",
713 |        "2  LIDC-IDRI-0028/0028_CM001_slice002           0     False      True   \n",
714 |        "3  LIDC-IDRI-0028/0028_CM001_slice003           0     False      True   \n",
715 |        "4  LIDC-IDRI-0028/0028_CM001_slice004           0     False      True   \n",
716 |        "\n",
717 |        "   is_nodule data_split  \n",
718 |        "0      False      Train  \n",
719 |        "1      False      Train  \n",
720 |        "2      False      Train  \n",
721 |        "3      False      Train  \n",
722 |        "4      False      Train  "
723 |       ]
724 |      },
725 |      "execution_count": 15,
726 |      "metadata": {},
727 |      "output_type": "execute_result"
728 |     }
729 |    ],
730 |    "source": [
731 |     "clean_meta.head()"
732 |    ]
733 |   },
734 |   {
735 |    "cell_type": "code",
736 |    "execution_count": 16,
737 |    "metadata": {},
738 |    "outputs": [],
739 |    "source": [
740 |     "# Clean Meta only stores meta information of patients without nodules."
741 |    ]
742 |   },
743 |   {
744 |    "cell_type": "code",
745 |    "execution_count": 17,
746 |    "metadata": {},
747 |    "outputs": [],
748 |    "source": [
749 |     "meta.to_csv('/home/LUNG_DATA/meta_csv/meta.csv')\n",
750 |     "clean_meta.to_csv('/home/LUNG_DATA/meta_csv/clean_meta.csv')"
751 |    ]
752 |   },
753 |   {
754 |    "cell_type": "code",
755 |    "execution_count": null,
756 |    "metadata": {},
757 |    "outputs": [],
758 |    "source": []
759 |   }
760 |  ],
761 |  "metadata": {
762 |   "kernelspec": {
763 |    "display_name": "Python 3",
764 |    "language": "python",
765 |    "name": "python3"
766 |   },
767 |   "language_info": {
768 |    "codemirror_mode": {
769 |     "name": "ipython",
770 |     "version": 3
771 |    },
772 |    "file_extension": ".py",
773 |    "mimetype": "text/x-python",
774 |    "name": "python",
775 |    "nbconvert_exporter": "python",
776 |    "pygments_lexer": "ipython3",
777 |    "version": "3.6.10"
778 |   }
779 |  },
780 |  "nbformat": 4,
781 |  "nbformat_minor": 4
782 | }
783 | 


--------------------------------------------------------------------------------