├── AzureChestXRay_AMLWB
│   ├── Code
│   │   ├── src
│   │   │   ├── azure_chestxray_pytorch_utils.py
│   │   │   ├── finding_lungs
│   │   │   │   ├── non_PA_AP_view_samples.png
│   │   │   │   ├── rotated_images_samples.png
│   │   │   │   ├── blacklist_rotated_images.csv
│   │   │   │   ├── blacklist_non_PA_AP_view.csv
│   │   │   │   ├── finding_lungs_DL_approach.py
│   │   │   │   └── blacklist_other_images_with_lower_quality.csv
│   │   │   ├── azure_chestxray_keras_utils.py
│   │   │   ├── azure_chestxray_utils.py
│   │   │   ├── azure_chestxray_cam.py
│   │   │   └── score_image_and_cam.py
│   │   ├── docker
│   │   │   └── Dockerfile
│   │   ├── Deployment_Guide.md
│   │   ├── 01_DataPrep
│   │   │   └── 001_get_data.ipynb
│   │   └── 02_Model
│   │       ├── 000_preprocess.ipynb
│   │       ├── .ipynb_checkpoints
│   │       │   └── 000_preprocess-checkpoint.ipynb
│   │       ├── 010_train.ipynb
│   │       ├── 020_evaluate.ipynb
│   │       └── 060_Train_pyTorch.ipynb
│   └── aml_config
│       ├── gpucomputecontext.runconfig
│       ├── gpucomputecontext.compute
│       └── conda_dependencies_gpu.yml
├── LICENSE
├── README.md
└── .gitignore

/AzureChestXRay_AMLWB/Code/src/azure_chestxray_pytorch_utils.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/finding_lungs/non_PA_AP_view_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tikyau/AzureChestXRay/master/AzureChestXRay_AMLWB/Code/src/finding_lungs/non_PA_AP_view_samples.png
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/finding_lungs/rotated_images_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tikyau/AzureChestXRay/master/AzureChestXRay_AMLWB/Code/src/finding_lungs/rotated_images_samples.png
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/aml_config/gpucomputecontext.runconfig:
--------------------------------------------------------------------------------
1 | ArgumentVector:
2 |   - $file
3 | CondaDependenciesFile: aml_config/conda_dependencies_gpu.yml
4 | EnvironmentVariables: null
5 | Framework: Python
6 | PrepareEnvironment: true
7 | SparkDependenciesFile: aml_config/spark_dependencies.yml
8 | Target: gpucomputecontext
9 | TrackedRun: true
10 | UseSampling: true
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/finding_lungs/blacklist_rotated_images.csv:
--------------------------------------------------------------------------------
1 | 00001255_007.png
2 | 00001814_001.png
3 | 00002180_000.png
4 | 00002815_003.png
5 | 00003693_005.png
6 | 00005823_000.png
7 | 00007188_002.png
8 | 00008051_036.png
9 | 00008468_003.png
10 | 00009889_023.png
11 | 00009984_001.png
12 | 00011460_066.png
13 | 00013299_000.png
14 | 00013431_000.png
15 | 00017258_011.png
16 | 00017606_037.png
17 | 00019620_001.png
18 | 00026701_001.png
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/aml_config/gpucomputecontext.compute:
--------------------------------------------------------------------------------
1 | address: ghiordanxrgpuvm02.westus2.cloudapp.azure.com
2 | #baseDockerImage: microsoft/mmlspark:plus-0.9.9
3 | baseDockerImage: georgedockeraccount/utils_with_amlwb_base_gpu:azcopyenabled
4 | nvidiaDocker: true
5 | 
6 | #nativeSharedDirectory: ~/.azureml/share/
7 | nativeSharedDirectory: /datadrive01/amlwbShare/
8 | 
9 | 
password: AzureMlSecret=gpucomputecontext#loginvm0011#56bdbc19d02f4df08a50f94cfc8ec9ef 10 | sharedVolumes: true 11 | type: remotedocker 12 | username: loginvm0011 13 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # for CPU compute contexts 2 | #FROM microsoft/mmlspark:plus-0.9.9 3 | 4 | # for GPU compute contexts 5 | FROM microsoft/mmlspark:plus-gpu-0.9.9 6 | 7 | ENV PREVUSER=$USER 8 | USER root 9 | 10 | # install AzCopy on Linux 11 | # https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux?toc=%2fazure%2fstorage%2fblobs%2ftoc.json 12 | RUN apt-get update && apt-get install -y apt-transport-https wget rsync git 13 | RUN curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg && \ 14 | mv microsoft.gpg /etc/apt/trusted.gpg.d/microsoft.gpg &&\ 15 | sh -c 'echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod xenial main" > /etc/apt/sources.list.d/dotnetdev.list' && \ 16 | apt-get update && \ 17 | apt-get install -y --no-install-recommends && \ 18 | apt-get install -y dotnet-sdk-2.0.2 && \ 19 | wget -O azcopy.tar.gz https://aka.ms/downloadazcopyprlinux && \ 20 | tar -xf azcopy.tar.gz && \ 21 | ./install.sh 22 | 23 | 24 | USER $PREVUSER 25 | 26 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/azure_chestxray_keras_utils.py: -------------------------------------------------------------------------------- 1 | ### Copyright (C) Microsoft Corporation. 2 | 3 | from keras.layers import Dense 4 | from keras.models import Model 5 | from keras_contrib.applications.densenet import DenseNetImageNet121 6 | import keras_contrib 7 | 8 | def build_model(crt_densenet_function): 9 | """ 10 | 11 | Returns: a model with specified weights 12 | 13 | """ 14 | # define the model, use pre-trained weights for image_net 15 | base_model = crt_densenet_function(input_shape=(224, 224, 3), 16 | weights='imagenet', 17 | include_top=False, 18 | pooling='avg') 19 | 20 | x = base_model.output 21 | predictions = Dense(14, activation='sigmoid')(x) 22 | model = Model(inputs=base_model.input, outputs=predictions) 23 | return model 24 | 25 | if __name__=="__main__": 26 | model = build_model(DenseNetImageNet121) 27 | print(model.summary()) 28 | model = build_model(keras_contrib.applications.densenet.DenseNetImageNet201) 29 | print(model.summary()) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/finding_lungs/blacklist_non_PA_AP_view.csv: -------------------------------------------------------------------------------- 1 | 00000591_003.png 2 | 00001136_001.png 3 | 00001153_005.png 4 | 00001602_000.png 5 | 00001803_003.png 6 | 00002097_000.png 7 | 00002117_003.png 8 | 00002354_000.png 9 | 00002592_003.png 10 | 00002639_009.png 11 | 00003023_000.png 12 | 00003094_002.png 13 | 00004533_004.png 14 | 00004808_001.png 15 | 00004906_000.png 16 | 00005192_001.png 17 | 00005260_000.png 18 | 00005286_001.png 19 | 00006462_008.png 20 | 00006836_008.png 21 | 00006851_004.png 22 | 00007113_001.png 23 | 00007152_006.png 24 | 00007160_002.png 25 | 00007454_001.png 26 | 00007482_010.png 27 | 00007716_007.png 28 | 00008016_000.png 29 | 00008082_000.png 30 | 00009198_002.png 31 | 00009368_010.png 32 | 00009368_011.png 33 | 00009584_002.png 34 | 00009889_038.png 35 | 00010007_121.png 36 | 00010065_000.png 37 | 00012249_001.png 38 | 00012388_002.png 39 | 00012907_007.png 40 | 00013160_000.png 41 | 00013670_137.png 42 | 00013714_001.png 43 | 00014294_015.png 44 | 00014675_034.png 45 | 00014963_000.png 46 | 00015054_000.png 47 | 00015078_007.png 48 | 00016233_004.png 49 | 00016637_000.png 50 | 00017753_022.png 51 | 00017915_003.png 52 | 00020373_002.png 53 | 00020644_000.png 54 | 00025381_004.png 55 | 00026806_000.png 56 | 00029476_000.png 57 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/aml_config/conda_dependencies_gpu.yml: -------------------------------------------------------------------------------- 1 | # Conda environment specification. The dependencies defined in this file will 2 | # be automatically provisioned for managed runs. These include runs against 3 | # the localdocker, remotedocker, and cluster compute targets. 4 | 5 | # Note that this file is NOT used to automatically manage dependencies for the 6 | # local compute target. To provision these dependencies locally, run: 7 | # conda env update --file conda_dependencies.yml 8 | 9 | # Details about the Conda environment file format: 10 | # https://conda.io/docs/using/envs.html#create-environment-file-by-hand 11 | 12 | # For managing Spark packages and configuration, see spark_dependencies.yml. 13 | 14 | # Version of this configuration file's structure and semantics in AzureML. 15 | # This directive is stored in a comment to preserve the Conda file structure. 16 | # [AzureMlVersion] = 2 17 | 18 | name: project_environment 19 | channels: 20 | # - conda-forge 21 | - pytorch #soumith 22 | dependencies: 23 | # The python interpreter version. 24 | # Currently Azure ML Workbench only supports 3.5.2. 25 | - python=3.5.2 26 | - tqdm 27 | - opencv 28 | - h5py 29 | - scikit-learn 30 | - nomkl 31 | # - pytorch=0.3.0 32 | # - cuda80 33 | # - torchvision 34 | 35 | # Required for Jupyter Notebooks. 
36 | - ipykernel=4.6.1 37 | 38 | - pip: 39 | # Required packages for AzureML execution, history, and data preparation. 40 | - --index-url https://azuremldownloads.azureedge.net/python-repository/preview 41 | - --extra-index-url https://pypi.python.org/simple 42 | - azureml-requirements 43 | 44 | # The API for Azure Machine Learning Model Management Service. 45 | # Details: https://github.com/Azure/Machine-Learning-Operationalization 46 | - azure-ml-api-sdk==0.1.0a11 47 | - git+git://github.com/fchollet/keras.git 48 | - git+https://github.com/ahundt/keras-contrib.git 49 | - tensorflow-gpu==1.4 50 | - matplotlib 51 | - numpy==1.14.0 52 | - git+https://github.com/aleju/imgaug 53 | - http://download.pytorch.org/whl/cu80/torch-0.3.1-cp35-cp35m-linux_x86_64.whl 54 | # - http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-manylinux1_x86_64.whl 55 | - torchvision 56 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/azure_chestxray_utils.py: -------------------------------------------------------------------------------- 1 | ### Copyright (C) Microsoft Corporation. 2 | 3 | import os 4 | import numpy as np 5 | 6 | class chestxray_consts(object): 7 | DISEASE_list = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 8 | 'Pneumothorax', 9 | 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural Thickening', 'Hernia'] 10 | 11 | PRETRAINED_DENSENET201_IMAGENET_CHESTXRAY_MODEL_FILE_NAME = 'chexnet_14_weights_multigpu_contribmodel_121layer_712split_epoch_011_val_loss_153.9783.hdf5' 12 | FULLY_PRETRAINED_MODEL_DIR_list = [ 'fully_trained_models'] 13 | 14 | 15 | CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT = 224 16 | CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH = 224 17 | 18 | BASE_INPUT_DIR_list = ['chestxray', 'data', 'ChestX-ray8'] 19 | BASE_OUTPUT_DIR_list = ['chestxray', 'output'] 20 | CREDENTIALS_DIR_list = ['code', 'notShared'] 21 | 22 | SRC_DIR_list = ['Code', 'src'] 23 | ChestXray_IMAGES_DIR_list = ['ChestXray-NIHCC'] 24 | ChestXray_OTHER_DATA_DIR_list = ['ChestXray-NIHCC_other'] 25 | PROCESSED_IMAGES_DIR_list = ['processed_npy14'] 26 | DATA_PARTITIONS_DIR_list = ['data_partitions'] 27 | MODEL_WEIGHTS_DIR_list = [ 'weights_tmpdir'] 28 | 29 | def __setattr__(self, *_): 30 | raise TypeError 31 | 32 | 33 | # os agnostic 'ls' function 34 | def get_files_in_dir(crt_dir): 35 | return( [f for f in os.listdir(crt_dir) if os.path.isfile(os.path.join(crt_dir, f))]) 36 | 37 | 38 | 39 | def normalize_nd_array(crt_array): 40 | # Normalised [0,1] 41 | crt_array = crt_array - np.min(crt_array) 42 | return(crt_array/np.ptp(crt_array)) 43 | 44 | def print_image_stats_by_channel(crt_image): 45 | print('min:') 46 | print(np.amin(crt_image[:,:,0]), 47 | np.amin(crt_image[:,:,1]), 48 | np.amin(crt_image[:,:,2])) 49 | print('max:') 50 | print(np.amax(crt_image[:,:,0]), 51 | np.amax(crt_image[:,:,1]), 52 | np.amax(crt_image[:,:,2])) 53 | 54 | 55 | 56 | if __name__=="__main__": 57 | prj_consts = chestxray_consts() 58 | print('model_expected_image_height = ', prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT) 59 | print('model_expected_image_width = ', prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH) 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repository contains the code for the blog post: [Using Microsoft AI to Build a Lung-Disease Prediction Model using 
Chest X-Ray Images](https://blogs.technet.microsoft.com/machinelearning/2018/03/07/using-microsoft-ai-to-build-a-lung-disease-prediction-model-using-chest-x-ray-images/), by Xiaoyong Zhu, George Iordanescu, Ilia Karmanov, data scientists from Microsoft, and Mazen Zawaideh, radiology resident from the University of Washington Medical Center.
3 | 
4 | In this repository, we provide the Keras code (the 000_preprocess, 010_train, and 020_evaluate Jupyter notebooks under AzureChestXRay_AMLWB\Code\02_Model) and the PyTorch code (AzureChestXRay_AMLWB\Code\02_Model\060_Train_pyTorch.ipynb). You should be able to run the code from scratch and reproduce the results below, either on the Azure Machine Learning platform or on your own GPU machine.
5 | 
6 | # Get Started
7 | 
8 | ## Installing additional packages
9 | 
10 | If you are using Azure Machine Learning as the training platform, all the dependencies will be installed for you. However, if you are running in your own environment, you also need to install the [keras-contrib](https://github.com/keras-team/keras-contrib) package (e.g., pip install git+https://github.com/keras-team/keras-contrib.git) to run the Keras code.
11 | 
12 | If you are trying out the lung detection algorithm, you need to install a few other additional libraries. Please refer to the README.md file under the folder AzureChestXRay\AzureChestXRay_AMLWB\Code\src\finding_lungs for more details.
13 | 
14 | ## Running the code
15 | To run the code, you need to get the NIH Chest X-ray Dataset from here: https://nihcc.app.box.com/v/ChestXray-NIHCC. You need all the image files (everything under the `images` folder in the NIH dataset), the Data_Entry_2017.csv file, as well as the bounding-box data BBox_List_2017.csv. You might also want to remove a few low-quality images (please refer to the subfolder AzureChestXRay_AMLWB\Code\src\finding_lungs for more details).
16 | 
17 | 
18 | 
19 | # Tools and Platforms
20 | - Deep Learning VMs with GPU acceleration are used as the compute environment
21 | - Azure Machine Learning is used as a managed machine learning service for project management, run history and version control, and model deployment
22 | 
23 | # Results
24 | 
25 | We obtained the following results; the average AUROC across all 14 diseases is around 0.845.
26 | 
27 | | Disease      | AUC Score | Disease            | AUC Score |
28 | |--------------|-----------|--------------------|-----------|
29 | | Atelectasis  | 0.828543  | Pneumothorax       | 0.881838  |
30 | | Cardiomegaly | 0.891449  | Consolidation      | 0.721818  |
31 | | Effusion     | 0.817697  | Edema              | 0.868002  |
32 | | Infiltration | 0.907302  | Emphysema          | 0.787202  |
33 | | Mass         | 0.895815  | Fibrosis           | 0.826822  |
34 | | Nodule       | 0.907841  | Pleural Thickening | 0.793416  |
35 | | Pneumonia    | 0.817601  | Hernia             | 0.889089  |
36 | 
37 | 
38 | # Criticisms
39 | There are several discussions in the community on the efficacy of using NLP to mine the disease labels and how it might lead to poor label quality (for example, [here](https://lukeoakdenrayner.wordpress.com/2018/01/24/chexnet-an-in-depth-review/), as well as in [this article on Medium](https://medium.com/@paras42/dear-mythical-editor-radiologist-level-pneumonia-in-chexnet-c91041223526)). However, even with noisy labels, deep learning models are sometimes still able to achieve good classification performance.
40 | 
41 | # Referenced papers
42 | - The original CheXNet work, described on the [Stanford ML Group website](https://stanfordmlgroup.github.io/projects/chexnet/) as well as in their [paper](https://arxiv.org/abs/1711.05225).
43 | - http://cs231n.stanford.edu/reports/2017/pdfs/527.pdf for pre-processing the data
44 | - https://arxiv.org/abs/1711.08760 for some other thoughts on the model architecture and the relationship between different diseases
45 | - Baseline result: https://arxiv.org/abs/1705.02315
46 | - Image localization: http://arxiv.org/abs/1512.04150
47 | 
48 | # Conclusion, acknowledgement, and thanks
49 | Some of the pre-processing code for Keras is borrowed from [the dr.b repository](https://github.com/taoddiao/dr.b).
50 | 
51 | We hope this repository will be helpful in your research projects; please let us know if you have any questions or feedback. Pull requests are also welcome!
52 | 
53 | We also would like to thank Pranav Rajpurkar and Jeremy Irvin from Stanford for answering our questions about their implementation, as well as Wee Hyong Tok, Danielle Dean, Hanna Kim, and Ivan Tarapov from Microsoft for reviewing the blog post and providing their feedback.
54 | 
55 | # Disclaimer
56 | The source code, tools, and discussion in this repository are provided to assist data scientists in understanding the potential for developing deep learning-driven intelligent applications using Azure AI services, and they are intended for research and development use only. The x-ray image pathology classification system is not intended for use in clinical diagnosis or clinical decision-making or for any other clinical use. The performance of this model for clinical use has not been established.
57 | 
58 | # Contributing
59 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
60 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
61 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
62 | 
63 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
64 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
65 | provided by the bot. You will only need to do this once across all repos using our CLA.
66 | 
67 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
68 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
69 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
70 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | 33 | # Visual Studio Code 34 | .vscode/ 35 | # Uncomment if you have tasks that create the project's static files in wwwroot 36 | #wwwroot/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | *_i.c 58 | *_p.c 59 | *_i.h 60 | *.ilk 61 | *.meta 62 | *.obj 63 | *.pch 64 | *.pdb 65 | *.pgc 66 | *.pgd 67 | *.rsp 68 | *.sbr 69 | *.tlb 70 | *.tli 71 | *.tlh 72 | *.tmp 73 | *.tmp_proj 74 | *.log 75 | *.vspscc 76 | *.vssscc 77 | .builds 78 | *.pidb 79 | *.svclog 80 | *.scc 81 | 82 | # Chutzpah Test files 83 | _Chutzpah* 84 | 85 | # Visual C++ cache files 86 | ipch/ 87 | *.aps 88 | *.ncb 89 | *.opendb 90 | *.opensdf 91 | *.sdf 92 | *.cachefile 93 | *.VC.db 94 | *.VC.VC.opendb 95 | 96 | # Visual Studio profiler 97 | *.psess 98 | *.vsp 99 | *.vspx 100 | *.sap 101 | 102 | # TFS 2012 Local Workspace 103 | $tf/ 104 | 105 | # Guidance Automation Toolkit 106 | *.gpState 107 | 108 | # ReSharper is a .NET coding add-in 109 | _ReSharper*/ 110 | *.[Rr]e[Ss]harper 111 | *.DotSettings.user 112 | 113 | # JustCode is a .NET coding add-in 114 | .JustCode 115 | 116 | # TeamCity is a build add-in 117 | _TeamCity* 118 | 119 | # DotCover is a Code Coverage Tool 120 | *.dotCover 121 | 122 | # Visual Studio code coverage results 123 | *.coverage 124 | *.coveragexml 125 | 126 | # NCrunch 127 | _NCrunch_* 128 | .*crunch*.local.xml 129 | nCrunchTemp_* 130 | 131 | # MightyMoose 132 | *.mm.* 133 | AutoTest.Net/ 134 | 135 | # Web workbench (sass) 136 | .sass-cache/ 137 | 138 | # Installshield output folder 139 | [Ee]xpress/ 140 | 141 | # DocProject is a documentation generator add-in 142 | DocProject/buildhelp/ 143 | DocProject/Help/*.HxT 144 | DocProject/Help/*.HxC 145 | DocProject/Help/*.hhc 146 | DocProject/Help/*.hhk 147 | DocProject/Help/*.hhp 148 | DocProject/Help/Html2 149 | DocProject/Help/html 150 | 151 | # Click-Once directory 152 | publish/ 153 | 154 | # Publish Web Output 155 | *.[Pp]ublish.xml 156 | *.azurePubxml 157 | # TODO: Comment the next line if you want to checkin your web deploy settings 158 | # but database connection strings (with potential passwords) will be unencrypted 159 | *.pubxml 160 | *.publishproj 161 | 162 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 163 | # checkin your Azure Web App publish settings, but sensitive information contained 164 | # in these scripts will be unencrypted 165 | PublishScripts/ 166 | 167 | # NuGet Packages 168 | *.nupkg 169 | # The packages folder can be ignored because of Package Restore 170 | **/packages/* 171 | # except build/, which is used as an MSBuild target. 
172 | !**/packages/build/ 173 | # Uncomment if necessary however generally it will be regenerated when needed 174 | #!**/packages/repositories.config 175 | # NuGet v3's project.json files produces more ignorable files 176 | *.nuget.props 177 | *.nuget.targets 178 | 179 | # Microsoft Azure Build Output 180 | csx/ 181 | *.build.csdef 182 | 183 | # Microsoft Azure Emulator 184 | ecf/ 185 | rcf/ 186 | 187 | # Windows Store app package directories and files 188 | AppPackages/ 189 | BundleArtifacts/ 190 | Package.StoreAssociation.xml 191 | _pkginfo.txt 192 | 193 | # Visual Studio cache files 194 | # files ending in .cache can be ignored 195 | *.[Cc]ache 196 | # but keep track of directories ending in .cache 197 | !*.[Cc]ache/ 198 | 199 | # Others 200 | ClientBin/ 201 | ~$* 202 | *~ 203 | *.dbmdl 204 | *.dbproj.schemaview 205 | *.jfm 206 | *.pfx 207 | *.publishsettings 208 | orleans.codegen.cs 209 | 210 | # Since there are multiple workflows, uncomment next line to ignore bower_components 211 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 212 | #bower_components/ 213 | 214 | # RIA/Silverlight projects 215 | Generated_Code/ 216 | 217 | # Backup & report files from converting an old project file 218 | # to a newer Visual Studio version. Backup files are not needed, 219 | # because we have git ;-) 220 | _UpgradeReport_Files/ 221 | Backup*/ 222 | UpgradeLog*.XML 223 | UpgradeLog*.htm 224 | 225 | # SQL Server files 226 | *.mdf 227 | *.ldf 228 | *.ndf 229 | 230 | # Business Intelligence projects 231 | *.rdl.data 232 | *.bim.layout 233 | *.bim_*.settings 234 | 235 | # Microsoft Fakes 236 | FakesAssemblies/ 237 | 238 | # GhostDoc plugin setting file 239 | *.GhostDoc.xml 240 | 241 | # Node.js Tools for Visual Studio 242 | .ntvs_analysis.dat 243 | node_modules/ 244 | 245 | # Typescript v1 declaration files 246 | typings/ 247 | 248 | # Visual Studio 6 build log 249 | *.plg 250 | 251 | # Visual Studio 6 workspace options file 252 | *.opt 253 | 254 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
255 | *.vbw 256 | 257 | # Visual Studio LightSwitch build output 258 | **/*.HTMLClient/GeneratedArtifacts 259 | **/*.DesktopClient/GeneratedArtifacts 260 | **/*.DesktopClient/ModelManifest.xml 261 | **/*.Server/GeneratedArtifacts 262 | **/*.Server/ModelManifest.xml 263 | _Pvt_Extensions 264 | 265 | # Paket dependency manager 266 | .paket/paket.exe 267 | paket-files/ 268 | 269 | # FAKE - F# Make 270 | .fake/ 271 | 272 | # JetBrains Rider 273 | .idea/ 274 | *.sln.iml 275 | 276 | # CodeRush 277 | .cr/ 278 | 279 | # Python Tools for Visual Studio (PTVS) 280 | __pycache__/ 281 | *.pyc 282 | 283 | # Cake - Uncomment if you are using it 284 | # tools/** 285 | # !tools/packages.config 286 | 287 | # Telerik's JustMock configuration file 288 | *.jmconfig 289 | 290 | # BizTalk build output 291 | *.btp.cs 292 | *.btm.cs 293 | *.odx.cs 294 | *.xsd.cs 295 | 296 | # Jupyter Notebooks 297 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/finding_lungs/finding_lungs_DL_approach.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | import pandas as pd 6 | from keras.models import load_model 7 | from skimage import exposure, img_as_float 8 | from skimage import transform 9 | import numpy as np 10 | 11 | from skimage import measure 12 | import lungs_finder as lf 13 | import cv2 14 | 15 | # for lung detection 16 | left_edge = 0 17 | right_edge = 256 18 | top_edge = 0 19 | bottom_edge = 256 20 | margin = 12 21 | 22 | row_size = 256 23 | col_size = 256 24 | 25 | # Path to csv-file. File should contain X-ray filenames as first column, 26 | # mask filenames as second column. 27 | 28 | out_folder_matched_img = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "images_centered") 29 | out_folder_mismatched_image = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", 30 | "images_centered_mismatched_by_both") 31 | csv_path = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "Data_Entry_2017.csv") 32 | # Path to the folder with images. Images will be read from path + path_from_csv 33 | img_path = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "images") 34 | mis_detected_csv_path = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "mis_detected.csv") 35 | df = pd.read_csv(csv_path) 36 | 37 | # Load test data 38 | im_shape = (256, 256) 39 | 40 | # Load model 41 | # plt.figure(figsize=(10, 10)) 42 | model_name = './trained_model.hdf5.bak' 43 | UNet = load_model(model_name) 44 | 45 | threshold = 0.85 46 | # list to save the mis detected images 47 | image_misdetect_list = [] 48 | 49 | 50 | def finding_lungs_non_DL_approach_and_save(image, file_name): 51 | # print(row.columns.values) 52 | # file_name = row[0] 53 | # print("line is", file_name, image.shape) 54 | # when reading from txt there is something in the end so we need to eliminate that 55 | # image = cv2.imread(os.path.join("Z:\\", "Data", "ChestXRay", "images", file_name), 0) 56 | 57 | img_height = image.shape[0] 58 | img_width = image.shape[1] 59 | # Get both lungs image. It uses HOG as main method, 60 | # but if HOG found nothing it uses HAAR or LBP. 
61 | found_lungs = lf.get_lungs(image) 62 | 63 | # this can be written in a more concise way but we just keep it a bit redundant for easy reading 64 | if found_lungs is not None and found_lungs.shape[0] > img_height / 2 and found_lungs.shape[1] > img_width / 2: 65 | # print(found_lungs.shape) 66 | found_lungs_resized = cv2.resize(found_lungs, im_shape) 67 | # cv2.imshow(file_name, found_lungs) 68 | # code = cv2.waitKey(0) 69 | cv2.imwrite(os.path.join(out_folder_matched_img, file_name), found_lungs_resized) 70 | return True 71 | else: 72 | cv2.imwrite(os.path.join(out_folder_mismatched_image, file_name), cv2.resize(image, im_shape)) 73 | return False 74 | 75 | 76 | for index, item in df.iterrows(): 77 | # X, y = loadDataGeneral(current_df, path, im_shape) 78 | raw_img = cv2.imread(os.path.join(img_path, item['Image Index'])) 79 | 80 | img = img_as_float(raw_img)[:, :, 0] 81 | img = transform.resize(img, im_shape) 82 | img = exposure.equalize_hist(img) 83 | # img = np.expand_dims(img, -1) 84 | img -= img.mean() 85 | img /= img.std() 86 | 87 | file_name = item['Image Index'] 88 | X = np.expand_dims(img, axis=0) 89 | X = np.expand_dims(X, axis=-1) 90 | n_test = X.shape[0] 91 | inp_shape = X[0].shape 92 | 93 | # img = exposure.rescale_intensity(np.squeeze(X), out_range=(0, 1)) 94 | 95 | # print("size of img is", img.shape) 96 | prediction = UNet.predict(X)[..., 0].reshape(inp_shape[:2]) 97 | 98 | thresh_img = np.where(prediction > threshold, 1.0, 0.0) # threshold the image 99 | 100 | labels = measure.label(thresh_img) # Different labels are displayed in different colors 101 | label_vals = np.unique(labels) 102 | # print(label_vals) 103 | regions = measure.regionprops(labels) 104 | good_labels = [] 105 | global_B_box = [] 106 | for prop in regions: 107 | B = prop.bbox 108 | if B[2] - B[0] > row_size / 4 and B[3] - B[1] > col_size / 6: # make sure size of lung to avoid small areas 109 | good_labels.append(prop.label) 110 | global_B_box.append(B) 111 | 112 | # print(len(good_labels)) 113 | 114 | DL_failed_detect_flag = False 115 | if len(good_labels) == 2: 116 | 117 | left_edge = np.clip(min(global_B_box[0][1] - margin, global_B_box[1][1] - margin), a_min=0, a_max=256) 118 | right_edge = np.clip(max(global_B_box[0][3] + margin, global_B_box[1][3] + margin), a_min=0, a_max=256) 119 | top_edge = np.clip(min(global_B_box[0][0] - margin, global_B_box[1][0] - margin), a_min=0, a_max=256) 120 | bottom_edge = np.clip(max(global_B_box[0][2] + margin * 3, global_B_box[1][2] + margin * 4), a_min=0, 121 | a_max=256) # leave more margins at the bottom 122 | else: 123 | # print(file_name) 124 | 125 | DL_failed_detect_flag = True 126 | 127 | if DL_failed_detect_flag: 128 | img_name = os.path.join(out_folder_mismatched_image, file_name) 129 | if not finding_lungs_non_DL_approach_and_save(raw_img, file_name): 130 | # save file name only if both methods are not detected 131 | image_misdetect_list.append(file_name) 132 | print(file_name) 133 | else: 134 | img_name = os.path.join(out_folder_matched_img, file_name) 135 | cropped = cv2.resize(raw_img, im_shape)[top_edge:bottom_edge, left_edge:right_edge] 136 | # print(cropped) 137 | resized_cropped = cv2.resize(cropped, im_shape) 138 | cv2.imwrite(img_name, resized_cropped) 139 | 140 | # if mis_detected_flag: 141 | # mis_detected_flag = False 142 | # fig, ax = plt.subplots(2, 2, figsize=[12, 12]) 143 | # ax[0, 0].set_title("Original " + file_name) 144 | # ax[0, 0].imshow(raw_img, cmap='gray') 145 | # ax[0, 0].axis('off') 146 | # ax[0, 1].set_title("Threshold " + 
file_name)
147 |     # ax[0, 1].imshow(thresh_img, cmap='gray')
148 |     # # ax[0, 1].imshow(prediction, cmap='gray')
149 |     # ax[0, 1].axis('off')
150 |     # ax[1, 0].set_title("Color Labels " + file_name)
151 |     # ax[1, 0].imshow(labels)
152 |     # ax[1, 0].axis('off')
153 |     # ax[1, 1].set_title("Apply Mask on Original " + file_name)
154 |     #
155 |     # ax[1, 1].imshow(resized_cropped, cmap='gray')
156 |     # ax[1, 1].axis('off')
157 | 
158 |     if index > 112120:  # for debugging purposes
159 |         break
160 | 
161 |     # periodically checkpoint the list of images both detectors missed;
162 |     # use a separate variable so the DataFrame being iterated is not shadowed
163 |     if index % 100 == 0:
164 |         misdetect_df = pd.DataFrame({'col': image_misdetect_list})
165 |         misdetect_df.to_csv(mis_detected_csv_path, header=False, index=False)
166 | 
167 | misdetect_df = pd.DataFrame({'col': image_misdetect_list})
168 | misdetect_df.to_csv(mis_detected_csv_path, header=False, index=False)
169 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/Deployment_Guide.md:
--------------------------------------------------------------------------------
1 | Install Azure Machine Learning Workbench (AMLWB): https://docs.microsoft.com/en-us/azure/machine-learning/preview/quickstart-installation
2 | 1. Set up your environment:
3 | Open an AMLWB CLI window and follow this [guide](https://docs.microsoft.com/en-us/azure/machine-learning/preview/tutorial-classifying-iris-part-2#execute-scripts-in-the-azure-machine-learning-cli-window):
4 | REM log in by using the aka.ms/devicelogin site
5 | az login
6 | 
7 | REM lists all Azure subscriptions you have access to (make sure the right subscription is selected; see the isDefault column)
8 | az account list -o table
9 | 
10 | REM sets the current Azure subscription to the one you want to use
11 | az account set -s <subscription id>
12 | 
13 | REM verifies that your current subscription is set correctly
14 | az account show
15 | 
16 | REM Create an experimentation account and an Azure ML workspace using the portal
17 | 
18 | REM Use the AMLWB to create a new project
19 | 
20 | REM Copy the \Code\ structure and files (.ipynb and .py files) into the new experiment folder
21 | 
22 | 
23 | 
24 | 2. Create compute contexts on remote VMs:
25 | 
26 | 2.1 Using the Azure portal:
27 | -->Deploy a Linux VM (e.g. a Linux [DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro)).
28 | -->For best results, use a deep learning Linux VM (https://azuremarketplace.microsoft.com/en-us/marketplace/apps/microsoft-ads.dsvm-deep-learning).
29 | -->You may need a [GPU VM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) for training, and a second CPU VM for testing operationalization.
30 | 
31 | --> Make sure the VM FQDN is set up and that the disk sizes cover the data requirements. Default data disks do not survive machine reboot events; to keep the data available between machine reboots, either make the OS disk larger or attach an external Azure VHD.
32 | 
33 | --> use https://docs.microsoft.com/en-us/azure/machine-learning/preview/known-issues-and-troubleshooting-guide#vm-disk-is-full to resize the VM disk if needed:
34 | # Deallocate the VM (stopping will not work)
35 | $ az vm deallocate --resource-group myResourceGroup --name myVM
36 | # Update the disk size
37 | $ az disk update --resource-group myResourceGroup --name myVM --size-gb 250
38 | # Start the VM
39 | $ az vm start --resource-group myResourceGroup --name myVM
40 | 
41 | --> ssh into the remote VM and create the folder structure that AMLWB will use to map host locations to directories inside the running AMLWB containers:
42 | loginuser@deeplearninggpuvm:~$ sudo mkdir -p /datadrive01/amlwbShare
43 | loginuser@deeplearninggpuvm:~$ sudo chmod ugo=rwx /datadrive01/amlwbShare/
44 | loginuser@deeplearninggpuvm2:~$ ls -l /datadrive01/
45 | total 4
46 | drwxrwxrwx 2 root root 4096 Feb 5 18:33 amlwbShare
47 | 
48 | 2.2 Get the NIH chest x-ray images
49 | Go to https://nihcc.app.box.com/v/ChestXray-NIHCC
50 | Store the images from the images dir (https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/37178474737) as unarchived files in a blob storage account.
51 | They will be downloaded later by Code\01_DataPrep\001_get_data.ipynb, and will land in a dir on the remote VM created above:
52 | loginuser@deeplearninggpuvm2:~$ ls /datadrive01/amlwbShare/crt_ea/grt_work_space/crt_experiment/chestxray/data/ChestX-ray8/ChestXray-NIHCC/ | head -2
53 | 00000001_000.png
54 | 00000001_001.png
55 | 
56 | 2.3 Get the patient-image map file
57 | Manually download the NIH data file Data_Entry_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC) into this dir on the remote VM created above:
58 | loginuser@deeplearninggpuvm2:~$ ls -l /datadrive01/amlwbShare/crt_ea/grt_work_space/crt_experiment/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other
59 | total 7680
60 | -rw-rw-r-- 1 loginvm0011 loginvm0011 7861152 Feb 7 02:54 Data_Entry_2017.csv
61 | Data_Entry_2017.csv is the patient-to-image map and will be used by \Code\02_Model\000_preprocess.ipynb to create the train/validate/test partitions; a sketch of the patient-level splitting idea follows below.
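For intuition, here is a minimal sketch of that patient-level splitting idea (hypothetical illustration, not code from this repo; the actual partitioning logic lives in 000_preprocess.ipynb). It assumes Data_Entry_2017.csv has its standard 'Patient ID' and 'Image Index' columns and uses an illustrative 7:1:2 train/validate/test ratio; splitting by patient keeps all images of one patient in a single partition, which avoids leakage between the partitions:

    # patient_split_sketch.py -- illustrative only; see 000_preprocess.ipynb for the real partitioning
    import numpy as np
    import pandas as pd

    df = pd.read_csv('Data_Entry_2017.csv')
    patients = df['Patient ID'].unique()
    np.random.RandomState(0).shuffle(patients)  # reproducible in-place shuffle of patient IDs

    n = len(patients)
    train_ids = set(patients[:int(0.7 * n)])               # ~70% of patients for training
    valid_ids = set(patients[int(0.7 * n):int(0.8 * n)])   # ~10% for validation
    test_ids = set(patients[int(0.8 * n):])                # ~20% for testing

    # map the patient partitions back to image file names
    train_images = df[df['Patient ID'].isin(train_ids)]['Image Index'].tolist()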
62 | 
63 | 
64 | 2.4 Create and prepare the AMLWB compute contexts
65 | -->in the AMLWB CLI, create an AMLWB compute context:
66 | az ml computetarget attach remotedocker --name <compute context name> --address <VM FQDN> --username <username> --password <password>
67 | the command above will create \aml_config\<compute context name>.runconfig and \aml_config\<compute context name>.compute files that control the AMLWB compute contexts
68 | 
69 | -->Check the existing compute targets:
70 | az ml computetarget list
71 | 
72 | For CPU compute contexts:
73 | -->edit the <compute context name>.runconfig file:
74 | CondaDependenciesFile: aml_config/conda_dependencies_o16n.yml
75 | Framework: Python
76 | PrepareEnvironment: true
77 | -->edit the <compute context name>.compute file:
78 | baseDockerImage: georgedockeraccount/utils_with_amlwb_base_cpu:azcopyenabled
79 | nativeSharedDirectory: /data/datadrive01/amlwbShare/
80 | sharedVolumes: true
81 | 
82 | For GPU compute contexts:
83 | -->edit the <compute context name>.runconfig file:
84 | CondaDependenciesFile: aml_config/conda_dependencies_gpu.yml
85 | Framework: Python
86 | PrepareEnvironment: true
87 | -->edit the <compute context name>.compute file:
88 | baseDockerImage: georgedockeraccount/utils_with_amlwb_base_gpu:azcopyenabled
89 | nativeSharedDirectory: /data/datadrive01/amlwbShare/
90 | nvidiaDocker: true
91 | sharedVolumes: true
92 | 
93 | -->go back to the CLI:
94 | az ml experiment prepare -c <compute context name>
95 | -> while the preparation is running, you can check on the Linux host machine how docker is running:
96 | sudo docker images
97 | sudo docker ps -a
98 | 
99 | E.g.:
100 | loginuser@deeplearninggpuvm:~$ sudo docker images
101 | REPOSITORY                                      TAG             IMAGE ID       CREATED          SIZE
102 | azureml_88865f7583e9e1fd502a32a7717aa1f0        latest          a35a05a9b295   16 minutes ago   7.21GB
103 | georgedockeraccount/utils_with_amlwb_base_gpu   azcopyenabled   2e6da7a1351c   4 weeks ago      3.89GB
104 | 
105 | -> see \Code\docker\Dockerfile for details on how the docker images have been created.
106 | 
107 | 
108 | 3. Run the experiments:
109 | \Code\01_DataPrep\001_get_data.ipynb : copies the NIH chest x-ray data from blob storage to the local host
110 | \Code\02_Model\000_preprocess.ipynb : creates the train/validate/test partitions and saves all NIH chest x-ray images as numpy arrays on disk
111 | \Code\02_Model\010_train.ipynb : trains a DenseNet model (pre-trained on ImageNet) on the NIH chest x-ray data
112 | \Code\02_Model\040_cam_simple.ipynb : shows CAM visualizations
113 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/azure_chestxray_cam.py:
--------------------------------------------------------------------------------
1 | ### Copyright (C) Microsoft Corporation.
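# Class Activation Mapping (CAM) helpers (cf. http://arxiv.org/abs/1512.04150):
# get_score_and_cam_picture() weights the last convolutional feature maps by the
# final Dense layer's weights for the predicted class and sums them into a
# heatmap of the image regions that drove the prediction; process_cam_image()
# blends that heatmap over the original x-ray for display.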
2 | 3 | import keras.backend as K 4 | import sys, os, io 5 | import numpy as np 6 | import cv2 7 | 8 | import matplotlib 9 | matplotlib.use('agg') 10 | 11 | paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))] 12 | def add_path_to_sys_path(path_to_append): 13 | if not (any(path_to_append in paths for paths in sys.path)): 14 | sys.path.append(path_to_append) 15 | [add_path_to_sys_path(crt_path) for crt_path in paths_to_append] 16 | 17 | import azure_chestxray_utils 18 | 19 | 20 | def get_score_and_cam_picture(cv2_input_image, DenseNetImageNet121_model): 21 | # based on https://github.com/jacobgil/keras-cam/blob/master/cam.py 22 | width, height, _ = cv2_input_image.shape 23 | class_weights = DenseNetImageNet121_model.layers[-1].get_weights()[0] 24 | final_conv_layer = DenseNetImageNet121_model.layers[-3] 25 | get_output = K.function([DenseNetImageNet121_model.layers[0].input], 26 | [final_conv_layer.output, \ 27 | DenseNetImageNet121_model.layers[-1].output]) 28 | [conv_outputs, prediction] = get_output([cv2_input_image[None,:,:,:]]) 29 | conv_outputs = conv_outputs[0, :, :, :] 30 | prediction = prediction[0,:] 31 | 32 | #Create the class activation map. 33 | predicted_disease = np.argmax(prediction) 34 | cam = np.zeros(dtype = np.float32, shape = conv_outputs.shape[:2]) 35 | for i, w in enumerate(class_weights[:, predicted_disease]): 36 | cam += w * conv_outputs[:, :, i] 37 | 38 | return prediction, cam, predicted_disease 39 | 40 | 41 | def process_cam_image(crt_cam_image, xray_image, crt_alpha = .5): 42 | im_width, im_height, _ = xray_image.shape 43 | crt_cam_image = cv2.resize(crt_cam_image, (im_width, im_height), \ 44 | interpolation=cv2.INTER_CUBIC) 45 | 46 | # do some gamma enhancement, e is too much 47 | crt_cam_image = np.power(1.1, crt_cam_image) 48 | crt_cam_image = azure_chestxray_utils.normalize_nd_array(crt_cam_image) 49 | # crt_cam_image[np.where(crt_cam_image < 0.5)] = 0 50 | crt_cam_image = 255*crt_cam_image 51 | 52 | # make cam an rgb image 53 | empty_image_channel = np.zeros(dtype = np.float32, shape = crt_cam_image.shape[:2]) 54 | crt_cam_image = cv2.merge((crt_cam_image,empty_image_channel,empty_image_channel)) 55 | 56 | blended_image = cv2.addWeighted(xray_image.astype('uint8'),crt_alpha,\ 57 | crt_cam_image.astype('uint8'),(1-crt_alpha),0) 58 | return(blended_image) 59 | 60 | def plot_cam_results(crt_blended_image, crt_cam_image, crt_xray_image, map_caption): 61 | import matplotlib.pyplot as plt 62 | 63 | fig = plt.figure(figsize = (15,7)) 64 | 65 | ax1 = fig.add_subplot(2, 3, 1) 66 | ax1.imshow(crt_xray_image, cmap = 'gray', interpolation = 'bicubic') 67 | ax1.set_title('Orig X Ray') 68 | plt.axis('off') 69 | 70 | ax2 = fig.add_subplot(2,3, 2) 71 | cam_plot = ax2.imshow(crt_cam_image, cmap=plt.get_cmap('OrRd'), interpolation = 'bicubic') 72 | plt.colorbar(cam_plot, ax=ax2) 73 | ax2.set_title('Activation Map') 74 | plt.axis('off') 75 | 76 | ax3 = fig.add_subplot(2,3, 3) 77 | blended_plot = ax3.imshow(crt_blended_image, interpolation = 'bicubic') 78 | plt.colorbar(cam_plot, ax=ax3) 79 | ax3.set_title(map_caption) 80 | plt.axis('off') 81 | 82 | # serialize blended image plot padded in the x/y-direction 83 | image_as_BytesIO = io.BytesIO() 84 | x_direction_pad = 1.05;y_direction_pad=1.2 85 | extent = ax3.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) 86 | fig.savefig(image_as_BytesIO, 87 | bbox_inches=extent.expanded(x_direction_pad, 88 | y_direction_pad), 89 | format='png') 90 | image_as_BytesIO.seek(0) 91 | 
return(image_as_BytesIO)
92 | 
93 | 
94 | 
95 | def process_xray_image(crt_xray_image, DenseNetImageNet121_model):
96 | 
97 |     # print(crt_xray_image.shape)
98 |     crt_xray_image = azure_chestxray_utils.normalize_nd_array(crt_xray_image)
99 |     crt_xray_image = 255*crt_xray_image
100 |     crt_xray_image = crt_xray_image.astype('uint8')
101 | 
102 |     crt_predictions, crt_cam_image, predicted_disease_index = \
103 |         get_score_and_cam_picture(crt_xray_image,
104 |                                   DenseNetImageNet121_model)
105 | 
106 |     prj_consts = azure_chestxray_utils.chestxray_consts()
107 |     likely_disease = prj_consts.DISEASE_list[predicted_disease_index]
108 |     likely_disease_prob = 100*crt_predictions[predicted_disease_index]
109 |     likely_disease_prob_ratio = 100*crt_predictions[predicted_disease_index]/sum(crt_predictions)
110 |     print('predictions: ', crt_predictions)
111 |     print('likely disease: ', likely_disease)
112 |     print('likely disease prob: ', likely_disease_prob)
113 |     print('likely disease prob ratio: ', likely_disease_prob_ratio)
114 | 
115 |     crt_blended_image = process_cam_image(crt_cam_image, crt_xray_image)
116 |     plot_cam_results(crt_blended_image, crt_cam_image, crt_xray_image,
117 |                      str(likely_disease) + ' ' +
118 |                      "{0:.1f}".format(likely_disease_prob) + '% (weight ' +
119 |                      "{0:.1f}".format(likely_disease_prob_ratio) + '%)')
120 | 
121 | def process_nih_data(nih_data_files, NIH_data_dir, DenseNetImageNet121_model):
122 |     for crt_image in nih_data_files:
123 |         # print(crt_image)
124 |         prj_consts = azure_chestxray_utils.chestxray_consts()
125 | 
126 |         crt_xray_image = cv2.imread(os.path.join(NIH_data_dir, crt_image))
127 |         crt_xray_image = cv2.resize(crt_xray_image,
128 |                                     (prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT,
129 |                                      prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH)) \
130 |             .astype(np.float32)
131 | 
132 |         process_xray_image(crt_xray_image, DenseNetImageNet121_model)
133 | 
134 | if __name__=="__main__":
135 |     # FIXME
136 |     # add example/test code here
137 |     # NB: `data_dir` must point at the NIH images, and `model` must be a trained
138 |     # Keras model (e.g. built via azure_chestxray_keras_utils.build_model with
139 |     # fine-tuned weights loaded) before this demo code can run.
140 |     NIH_annotated_Cardiomegaly = ['00005066_030.png']
141 |     data_dir = ''
142 |     cv2_image = cv2.imread(os.path.join(data_dir, NIH_annotated_Cardiomegaly[0]))
143 | 
144 |     azure_chestxray_utils.print_image_stats_by_channel(cv2_image)
145 |     cv2_image = azure_chestxray_utils.normalize_nd_array(cv2_image)
146 |     cv2_image = 255*cv2_image
147 |     cv2_image = cv2_image.astype('uint8')
148 |     azure_chestxray_utils.print_image_stats_by_channel(cv2_image)
149 | 
150 |     predictions, cam_image, predicted_disease_index = get_score_and_cam_picture(cv2_image, model)
151 |     print(predictions)
152 |     prj_consts = azure_chestxray_utils.chestxray_consts()
153 |     print(prj_consts.DISEASE_list[predicted_disease_index])
154 |     print('likely disease: ', prj_consts.DISEASE_list[predicted_disease_index])
155 |     print('likely disease prob ratio: ', \
156 |           predictions[predicted_disease_index]/sum(predictions))
157 |     blended_image = process_cam_image(cam_image, cv2_image)
158 |     plot_cam_results(blended_image, cam_image, cv2_image, \
159 |                      prj_consts.DISEASE_list[predicted_disease_index])
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/score_image_and_cam.py:
--------------------------------------------------------------------------------
1 | # This script generates the scoring and schema files.
2 | # It creates the schema, and holds the init() and run() functions needed to
3 | # operationalize the chest x-ray model.
4 | 
5 | 
6 | import os, sys, pickle, base64
7 | import keras.models
8 | import keras.layers
9 | import keras_contrib.applications.densenet
10 | import pandas as pd
11 | import numpy as np
12 | 
import azure_chestxray_utils, azure_chestxray_cam 13 | 14 | #################################### 15 | # Parameters 16 | #################################### 17 | global chest_XRay_model 18 | global as_string_b64encoded_pickled_data_column_name 19 | as_string_b64encoded_pickled_data_column_name = 'encoded_image' 20 | global densenet_weights_file_name 21 | # densenet_weights_file_name = 'weights_only_chestxray_model_14_weights_712split_epoch_029_val_loss_147.7599.hdf5' 22 | densenet_weights_file_name = 'weights_only_chestxray_model_14_weights_712split_epoch_029_val_loss_147.7599 - Copy.hdf5' 23 | 24 | # Import data collection library. Only supported for docker mode. 25 | # Functionality will be ignored when package isn't found 26 | try: 27 | from azureml.datacollector import ModelDataCollector 28 | except ImportError: 29 | print("Data collection is currently only supported in docker mode. May be disabled for local mode.") 30 | # Mocking out model data collector functionality 31 | class ModelDataCollector(object): 32 | def nop(*args, **kw): pass 33 | def __getattr__(self, _): return self.nop 34 | def __init__(self, *args, **kw): return None 35 | pass 36 | 37 | #################################### 38 | # Utils 39 | #################################### 40 | def as_string_b64encoded_pickled(input_object): 41 | #b64encode returns bytes class, make it string by calling .decode('utf-8') 42 | return (base64.b64encode(pickle.dumps(input_object))).decode('utf-8') 43 | 44 | def unpickled_b64decoded_as_bytes(input_object): 45 | if input_object.startswith('b\''): 46 | input_object = input_object[2:-1] 47 | # make string bytes 48 | input_object = input_object.encode('utf-8') 49 | #decode and the unpickle the bytes to recover original object 50 | return (pickle.loads(base64.b64decode(input_object))) 51 | 52 | def get_image_score_and_serialized_cam(crt_cv2_image, crt_chest_XRay_model): 53 | prj_consts = azure_chestxray_utils.chestxray_consts() 54 | crt_cv2_image = azure_chestxray_utils.normalize_nd_array(crt_cv2_image) 55 | crt_cv2_image = 255*crt_cv2_image 56 | crt_cv2_image=crt_cv2_image.astype('uint8') 57 | predictions, cam_image, predicted_disease_index = \ 58 | azure_chestxray_cam.get_score_and_cam_picture(crt_cv2_image, crt_chest_XRay_model) 59 | blended_image = azure_chestxray_cam.process_cam_image(cam_image, crt_cv2_image) 60 | serialized_image = azure_chestxray_cam.plot_cam_results(blended_image, cam_image, crt_cv2_image, \ 61 | prj_consts.DISEASE_list[predicted_disease_index]) 62 | return predictions, serialized_image 63 | 64 | #################################### 65 | # API functions 66 | #################################### 67 | 68 | # Prepare the web service definition by authoring 69 | # init() and run() functions. Test the functions 70 | # before deploying the web service. 
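# A sketch of the scoring contract implied by the code below (a reading of
# run(), not official service documentation): the client pickles a cv2/numpy
# image, base64-encodes it, and sends it as the 'encoded_image' column of a
# one-row pandas DataFrame; run() decodes it, scores it, and returns a JSON
# string with the disease scores ("chestXrayScore") and an encoded CAM figure
# ("chestXrayCAM").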
71 | def init(): 72 | try: 73 | print("init() method: Python version: " + str(sys.version)) 74 | print("crt Dir: " + os.getcwd()) 75 | 76 | import pip 77 | # pip.get_installed_distributions() 78 | myDistr = pip.get_installed_distributions() 79 | type(myDistr) 80 | for crtDist in myDistr: 81 | print(crtDist) 82 | 83 | # load the model file 84 | global chest_XRay_model 85 | chest_XRay_model = azure_chestxray_utils.build_DenseNetImageNet201_model() 86 | chest_XRay_model.load_weights(densenet_weights_file_name) 87 | print('Densenet model loaded') 88 | 89 | except Exception as e: 90 | print("Exception in init:") 91 | print(str(e)) 92 | 93 | def run(input_df): 94 | try: 95 | import json 96 | 97 | debugCounter = 0 98 | print("run() method: Python version: " + str(sys.version) ); print('Step '+str(debugCounter));debugCounter+=1 99 | 100 | print ('\ninput_df shape {}'.format(input_df.shape)) 101 | print(list(input_df)) 102 | print(input_df) 103 | 104 | input_df = input_df[as_string_b64encoded_pickled_data_column_name][0]; print('Step '+str(debugCounter));debugCounter+=1 105 | input_cv2_image = unpickled_b64decoded_as_bytes(input_df); print('Step '+str(debugCounter));debugCounter+=1 106 | 107 | #finally scoring 108 | predictions, serialized_cam_image = get_image_score_and_serialized_cam(input_cv2_image, chest_XRay_model) 109 | #predictions = chest_XRay_model.predict(input_cv2_image[None,:,:,:]) 110 | 111 | # prediction_dc.collect(ADScores) 112 | outDict = {"chestXrayScore": str(predictions), "chestXrayCAM":as_string_b64encoded_pickled(serialized_cam_image)} 113 | return json.dumps(outDict) 114 | except Exception as e: 115 | return(str(e)) 116 | 117 | 118 | #################################### 119 | # main function can be used for test and demo 120 | #################################### 121 | def main(): 122 | from azureml.api.schema.dataTypes import DataTypes 123 | from azureml.api.schema.sampleDefinition import SampleDefinition 124 | from azureml.api.realtime.services import generate_schema 125 | 126 | print('Entered main function:') 127 | print(os.getcwd()) 128 | 129 | amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] 130 | print(amlWBSharedDir) 131 | 132 | def get_files_in_dir(crt_dir): 133 | return( [f for f in os.listdir(crt_dir) if os.path.isfile(os.path.join(crt_dir, f))]) 134 | 135 | fully_trained_weights_dir=os.path.join( 136 | amlWBSharedDir, 137 | os.path.join(*(['chestxray', 'output', 'trained_models_weights']))) 138 | crt_models = get_files_in_dir(fully_trained_weights_dir) 139 | print(fully_trained_weights_dir) 140 | print(crt_models) 141 | 142 | test_images_dir=os.path.join( 143 | amlWBSharedDir, 144 | os.path.join(*(['chestxray', 'data', 'ChestX-ray8', 'test_images']))) 145 | test_images = get_files_in_dir(test_images_dir) 146 | print(test_images_dir) 147 | print(len(test_images)) 148 | 149 | # score in local mode (i.e. 
here in main function)
150 |     model = azure_chestxray_utils.build_DenseNetImageNet201_model()
151 |     model.load_weights(os.path.join(
152 |         fully_trained_weights_dir, densenet_weights_file_name))
153 | 
154 |     print('Model weights loaded!')
155 | 
156 |     import cv2
157 |     cv2_image = cv2.imread(os.path.join(test_images_dir, test_images[0]))
158 |     x, serialized_cam_image = get_image_score_and_serialized_cam(cv2_image, model)
159 |     file_bytes = np.asarray(bytearray(serialized_cam_image.read()), dtype=np.uint8)
160 |     recovered_image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
161 | 
162 |     # x = model.predict(cv2_image[None,:,:,:])
163 |     print(test_images[0])
164 |     print(x)
165 |     print(recovered_image.shape)
166 | 
167 |     # score in local mode (i.e. here in main function) using encoded data
168 |     encoded_image = as_string_b64encoded_pickled(cv2_image)
169 |     df_for_api = pd.DataFrame(data=[[encoded_image]], columns=[as_string_b64encoded_pickled_data_column_name])
170 |     del encoded_image
171 |     del cv2_image
172 |     del serialized_cam_image
173 | 
174 |     input_df = df_for_api[as_string_b64encoded_pickled_data_column_name][0]
175 |     input_cv2_image = unpickled_b64decoded_as_bytes(input_df)
176 |     x, serialized_cam_image = get_image_score_and_serialized_cam(input_cv2_image, model)
177 |     file_bytes = np.asarray(bytearray(serialized_cam_image.read()), dtype=np.uint8)
178 |     recovered_image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
179 | 
180 |     # x = model.predict(input_cv2_image[None,:,:,:])
181 |     print('After encoding and decoding:')
182 |     print(x)
183 |     print(recovered_image.shape)
184 | 
185 |     del model
186 | 
187 |     # now create the post-deployment env, i.e. score using init() and run()
188 |     crt_dir = os.getcwd()
189 |     working_dir = os.path.join(crt_dir, 'tmp_cam_deploy')
190 |     if not os.path.exists(working_dir):
191 |         os.makedirs(working_dir)
192 | 
193 |     import shutil
194 |     shutil.copyfile(
195 |         os.path.join(fully_trained_weights_dir, densenet_weights_file_name),
196 |         os.path.join(working_dir, densenet_weights_file_name))
197 | 
198 |     os.chdir(working_dir)
199 | 
200 |     # Turn on data collection debug mode to view output in stdout
201 |     os.environ["AML_MODEL_DC_DEBUG"] = 'true'
202 | 
203 |     # Test the output of the functions
204 |     init()
205 |     print("Result: " + run(df_for_api))
206 | 
207 |     # Generate the schema
208 |     data_for_schema = {"input_df": SampleDefinition(DataTypes.PANDAS, df_for_api)}
209 |     schema_file = os.path.join(fully_trained_weights_dir, 'chest_XRay_cam_service_schema.json')
210 |     generate_schema(run_func=run, inputs=data_for_schema, filepath=schema_file)
211 |     print("Schema saved in " + schema_file)
212 | 
213 | 
214 | if __name__ == "__main__":
215 |     main()
216 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/01_DataPrep/001_get_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Copies data from blob to local host\n",
8 |     "\n",
9 |     "##### Copyright (C) Microsoft Corporation. 
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 11, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "\n", 21 | "from IPython.core.interactiveshell import InteractiveShell\n", 22 | "InteractiveShell.ast_node_interactivity = \"all\"" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 12, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "'/azureml-share/'" 34 | ] 35 | }, 36 | "execution_count": 12, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 43 | "\n", 44 | "import os\n", 45 | "try:\n", 46 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 47 | "except:\n", 48 | " amlWBSharedDir = ''\n", 49 | " print('not using aml services?')\n", 50 | " \n", 51 | "amlWBSharedDir" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 13, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Use the Azure Machine Learning data collector to log various metrics\n", 61 | "\n", 62 | "from azureml.logging import get_azureml_logger\n", 63 | "logger = get_azureml_logger()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 14, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Use Azure Machine Learning history magic to control history collection\n", 73 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 74 | "# %azureml history on" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 15, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "[None]" 86 | ] 87 | }, 88 | "execution_count": 15, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "# import utlity functions\n", 95 | "\n", 96 | "import sys, os\n", 97 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 98 | "def add_path_to_sys_path(path_to_append):\n", 99 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 100 | " sys.path.append(path_to_append)\n", 101 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 102 | "\n", 103 | "import azure_chestxray_utils" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 16, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "'/azureml-run'" 115 | ] 116 | }, 117 | "execution_count": 16, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | }, 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "'/azureml-share/chestxray/data/ChestX-ray8'" 125 | ] 126 | }, 127 | "execution_count": 16, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | }, 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "'/azureml-share/code/notShared'" 135 | ] 136 | }, 137 | "execution_count": 16, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | }, 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 145 | ] 146 | }, 147 | "execution_count": 16, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "# create the file path variables \n", 154 | "# create nih_chest_xray_data_dir (in container dir mapped to a host dir for data 
persistence), \n", 155 | "# where data will be copied from blob\n", 156 | "\n", 157 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 158 | "\n", 159 | "os.getcwd()\n", 160 | "\n", 161 | "\n", 162 | "data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 163 | "credential_info_path=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.CREDENTIALS_DIR_list)))\n", 164 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 165 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 166 | "\n", 167 | "!mkdir -p {data_base_input_dir}\n", 168 | "!mkdir -p {credential_info_path} \n", 169 | "!mkdir -p {nih_chest_xray_data_dir}\n", 170 | "\n", 171 | "data_base_input_dir\n", 172 | "credential_info_path\n", 173 | "nih_chest_xray_data_dir" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 17, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Credentials file found at /azureml-share/code/notShared/get_data_access_secrets.py\n" 186 | ] 187 | }, 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "'/azureml-share/code/notShared/get_data_access_secrets.py'" 192 | ] 193 | }, 194 | "execution_count": 17, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "# Create this function in file '{credential_info_path}/get_data_access_secrets.py'\n", 201 | "# def get_blob_credentials():\n", 202 | "# dataBlob = 'https://somedatastore.blob.core.windows.net/somecontainer'\n", 203 | "# sourceKey = 'somesourceKey'\n", 204 | "# return dataBlob, sourceKey\n", 205 | "# \n", 206 | "# For example:\n", 207 | "CredentialsFileName = os.path.join(credential_info_path, 'get_data_access_secrets.py') \n", 208 | "crt_container = 'https://somedatastore.blob.core.windows.net/somecontainer' \n", 209 | "crt_source_Key = 'somesourceKey'\n", 210 | "import os.path \n", 211 | "if not os.path.isfile(CredentialsFileName): \n", 212 | " print('Credentials file not found, will be written!') \n", 213 | " with open(CredentialsFileName,'w') as myFile:\n", 214 | " myFile.write((\"def get_blob_credentials():\\n\\t\" +\n", 215 | " \"dataBlob = '\" + crt_container + \"'\\n\\t\" +\n", 216 | " \"sourceKey = '\" + crt_source_Key + \"'\\n\\t\" +\n", 217 | " \"return dataBlob, sourceKey\\n\" )) \n", 218 | " print(('Credentials file written at ' +CredentialsFileName)) \n", 219 | "else: \n", 220 | " print(('Credentials file found at ' + CredentialsFileName))\n", 221 | " \n", 222 | "# check CredentialsFileName existence and print content if needed\n", 223 | "\n", 224 | "credentials_file_name = os.path.join(*([credential_info_path, 'get_data_access_secrets.py']))\n", 225 | "credentials_file_name\n", 226 | "# !cat {credentials_file_name}" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 18, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "# read credentials info\n", 236 | "\n", 237 | "import sys\n", 238 | "def prepend_path_to_sys_path(path_to_append):\n", 239 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 240 | " sys.path.append(path_to_append)\n", 241 | "prepend_path_to_sys_path(credential_info_path) \n", 242 | "import get_data_access_secrets \n", 243 | "crt_container, crt_key = get_data_access_secrets.get_blob_credentials()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 19, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 
| "#check azcopy is available. Uncomment second line to print _response_ if needed\n", 253 | "\n", 254 | "response = !azcopy\n", 255 | "# response" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 20, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# copy data to local host dir\n", 265 | "# add { --source-key {crt_key} } if needed\n", 266 | "\n", 267 | "answer = !yes | azcopy \\\n", 268 | " --source {crt_container} \\\n", 269 | " --destination {nih_chest_xray_data_dir} \\\n", 270 | " --recursive" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 21, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "['[2018/02/06 05:19:05] Transfer summary:',\n", 282 | " '-----------------',\n", 283 | " 'Total files transferred: 112120',\n", 284 | " 'Transfer successfully: 112120',\n", 285 | " 'Transfer skipped: 0',\n", 286 | " 'Transfer failed: 0',\n", 287 | " 'Elapsed time: 00.00:09:57']" 288 | ] 289 | }, 290 | "execution_count": 21, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "answer[-7:]" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 22, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 308 | ] 309 | }, 310 | "execution_count": 22, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | }, 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "112120\r\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "nih_chest_xray_data_dir\n", 324 | "!find $nih_chest_xray_data_dir -type f | wc -l" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 23, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# run below command in amlwb cli to save current sessin as html\n", 334 | "# jupyter nbconvert --to html .\\Code\\01_DataPrep\\001_get_data.ipynb" 335 | ] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "azure_chestxray_lung_disease gpucomputecontext", 341 | "language": "python", 342 | "name": "azure_chestxray_lung_disease_gpucomputecontext" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.5.2" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 2 359 | } 360 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/02_Model/000_preprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data preprocessing\n", 8 | "\n", 9 | "##### Copyright (C) Microsoft Corporation. 
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 13, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 13, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Use the Azure Machine Learning data collector to log various metrics\n", 60 | "from azureml.logging import get_azureml_logger\n", 61 | "logger = get_azureml_logger()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 15, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Use Azure Machine Learning history magic to control history collection\n", 71 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 72 | "# %azureml history on" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 16, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[None]" 84 | ] 85 | }, 86 | "execution_count": 16, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# import utlity functions\n", 93 | "\n", 94 | "import sys, os\n", 95 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 96 | "def add_path_to_sys_path(path_to_append):\n", 97 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 98 | " sys.path.append(path_to_append)\n", 99 | "\n", 100 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 101 | "\n", 102 | "import azure_chestxray_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "#### Path variables" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "'/azureml-share/chestxray/data/ChestX-ray8'" 121 | ] 122 | }, 123 | "execution_count": 17, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "'/azureml-share/chestxray/output'" 131 | ] 132 | }, 133 | "execution_count": 17, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "# create base directories for the file path variables \n", 140 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 141 | "\n", 142 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 143 | "\n", 144 | "data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 145 | "data_base_output_dir=os.path.join(amlWBSharedDir, 
os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 146 | "\n", 147 | "data_base_input_dir\n", 148 | "data_base_output_dir\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 18, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 160 | ] 161 | }, 162 | "execution_count": 18, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | }, 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "orig images number:['112120'] \n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "# chest xray images are in nih_chest_xray_data_dir\n", 176 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 177 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 178 | "nih_chest_xray_data_dir\n", 179 | "\n", 180 | "# check if we have all 112120 images in nih_chest_xray_data_dir\n", 181 | "orig_images_no = !find $nih_chest_xray_data_dir -type f | wc -l\n", 182 | "print(\"orig images number:{} \".format(orig_images_no))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 19, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'" 194 | ] 195 | }, 196 | "execution_count": 19, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "BBox_List_2017.csv Data_Entry_2017.csv blacklist.csv\r\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "# check if we have the patients file list Data_Entry_2017.csv and BBox_List_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC)\n", 210 | "# blacklist.csv is generated by data scientists with no medical background\n", 211 | "\n", 212 | "other_data_dir=os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))\n", 213 | "other_data_dir\n", 214 | "# !mkdir -p {other_data_dir}\n", 215 | "!ls $other_data_dir\n", 216 | "\n", 217 | "# data is split into train/test/validation partitions\n", 218 | "data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 219 | "!mkdir -p {data_partitions_dir}" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 20, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "import pickle\n", 229 | "import random\n", 230 | "import re\n", 231 | "import tqdm\n", 232 | "\n", 233 | "import cv2\n", 234 | "import numpy as np\n", 235 | "import pandas as pd\n", 236 | "import sklearn.model_selection " 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "#### Train/Validation/Test Data partitioning \n", 244 | " - remove the images listed in blacklist.csv, which have low quality. \n", 245 | " - remove the NIH bounding box patients, since we will save those patients for later validation use. \n", 246 | " - divide the data into train/valid/test sets using a 7:1:2 ratio." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 21, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "len of original patient id is 30805\n", 259 | "len of cleaned patient id is 30079\n", 260 | "len of unique patient id with annotated data 726\n", 261 | "len of patient id with annotated data 984\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# remove NIH manually annotated data (ground truth with heavy pathologies, no healthy patients) \n", 267 | "# exclude what visually looks like bad images to data scientists with no medical background\n", 268 | "# todo\n", 269 | "# This should probably be a generic function\n", 270 | "\n", 271 | "\n", 272 | "total_patient_number = 30805\n", 273 | "NIH_annotated_file = 'BBox_List_2017.csv' # exclude from train pathology annotated by radiologists \n", 274 | "manually_selected_bad_images_file = 'blacklist.csv' # exclude what visually looks like bad images\n", 275 | "\n", 276 | "patient_id_original = [i for i in range(1,total_patient_number + 1)]\n", 277 | "\n", 278 | "# ignored images list is used later, since this is not a patient ID level issue\n", 279 | "ignored_images_set = set()\n", 280 | "with open(os.path.join(other_data_dir, manually_selected_bad_images_file), 'r') as f:\n", 281 | " for line in f:\n", 282 | " # delete the last char which is \\n\n", 283 | " ignored_images_set.add(line[:-1])\n", 284 | " if int(line[:-9]) >= 30805:\n", 285 | " print(line[:-1])\n", 286 | "\n", 287 | "bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))\n", 288 | "bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)\n", 289 | "\n", 290 | "bbox_patient_index_list = []\n", 291 | "for index, item in bbox_patient_index_df.iteritems():\n", 292 | " bbox_patient_index_list.append(int(item))\n", 293 | "\n", 294 | "patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))\n", 295 | "print(\"len of original patient id is\", len(patient_id_original))\n", 296 | "print(\"len of cleaned patient id is\", len(patient_id))\n", 297 | "print(\"len of unique patient id with annotated data\", \n", 298 | " len(list(set(bbox_patient_index_list))))\n", 299 | "print(\"len of patient id with annotated data\",bbox_df.shape[0])\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 22, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "first ten patient ids are [24303, 16035, 4967, 28624, 5378, 20335, 17069, 12271, 16975, 4469]\n", 312 | "train:21563 valid:3081 test:6161\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "random.seed(0)\n", 318 | "random.shuffle(patient_id)\n", 319 | "\n", 320 | "print(\"first ten patient ids are\", patient_id[:10])\n", 321 | "\n", 322 | "# training:valid:test=7:1:2\n", 323 | "patient_id_train = patient_id[:int(total_patient_number * 0.7)]\n", 324 | "patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]\n", 325 | "# get the rest of the patient_id as the test set\n", 326 | "patient_id_test = patient_id[int(total_patient_number * 0.8):]\n", 327 | "patient_id_test.extend(bbox_patient_index_list)\n", 328 | "patient_id_test = list(set(patient_id_test))\n", 329 | "\n", 330 | "\n", 331 | "print(\"train:{} valid:{} test:{}\".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))\n", 332 | "\n", 333 | "# test_set = test_set+left_out_patient_id\n", 334 | "# 
print(\"train:{} valid:{} test:{}\".format(len(train_set), len(valid_set), len(test_set)))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 23, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Add a few more project constants\n", 344 | "\n", 345 | "pathologies_name_list = prj_consts.DISEASE_list\n", 346 | "NIH_patients_and_labels_file = 'Data_Entry_2017.csv'" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "#### Finally do preprocessing\n", 354 | "Save labels and partitions" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 25, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def process_data(current_df, patient_ids):\n", 373 | " image_name_index = []\n", 374 | " image_labels = {}\n", 375 | " for individual_patient in tqdm.tqdm(patient_ids):\n", 376 | " for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():\n", 377 | " processed_image_name = row['Image Index']\n", 378 | " if processed_image_name in ignored_images_set:\n", 379 | " pass\n", 380 | " else:\n", 381 | " image_name_index.append(processed_image_name)\n", 382 | " image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)\n", 383 | " for disease_index, ele in enumerate(pathologies_name_list):\n", 384 | " if re.search(ele, row['Finding Labels'], re.IGNORECASE):\n", 385 | " image_labels[processed_image_name][disease_index] = 1\n", 386 | " else:\n", 387 | " # redundant code but just to make it more readable\n", 388 | " image_labels[processed_image_name][disease_index] = 0\n", 389 | " # print(\"processed\", row['Image Index'])\n", 390 | " return image_name_index, image_labels\n" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 26, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stderr", 400 | "output_type": "stream", 401 | "text": [ 402 | "100%|██████████| 21563/21563 [00:35<00:00, 606.57it/s]\n", 403 | "100%|██████████| 3081/3081 [00:05<00:00, 614.10it/s]\n", 404 | "100%|██████████| 6161/6161 [00:13<00:00, 449.08it/s]\n" 405 | ] 406 | }, 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "train, valid, test image number is: 68508 9495 32893\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "# # create and save train/test/validation partitions list\n", 417 | "\n", 418 | "train_data_index, train_labels = process_data(labels_df, patient_id_train)\n", 419 | "valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)\n", 420 | "test_data_index, test_labels = process_data(labels_df, patient_id_test)\n", 421 | "\n", 422 | "print(\"train, valid, test image number is:\", len(train_data_index), len(valid_data_index), len(test_data_index))\n", 423 | "\n", 424 | "# save the data\n", 425 | "labels_all = {}\n", 426 | "labels_all.update(train_labels)\n", 427 | "labels_all.update(valid_labels)\n", 428 | "labels_all.update(test_labels)\n", 429 | "\n", 430 | "partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}\n", 431 | "\n", 432 | "with open(os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle'), 'wb') as f:\n", 433 | " pickle.dump(labels_all, f)\n", 434 | "\n", 435 | "with 
open(os.path.join(data_partitions_dir,'partition14_unormalized_cleaned.pickle'), 'wb') as f:\n", 436 | " pickle.dump(partition_dict, f)\n", 437 | " \n", 438 | "# also save the patient id partitions for pytorch training \n", 439 | "with open(os.path.join(data_partitions_dir,'train_test_valid_data_partitions.pickle'), 'wb') as f:\n", 440 | " pickle.dump([patient_id_train,patient_id_valid,\n", 441 | " patient_id_test,\n", 442 | " list(set(bbox_patient_index_list))], f) \n" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 27, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "dict" 454 | ] 455 | }, 456 | "execution_count": 27, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | }, 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "{'00001256_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 464 | " '00010535_020.png': array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 465 | " '00017170_004.png': array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 466 | " '00017906_025.png': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 467 | " '00030353_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)}" 468 | ] 469 | }, 470 | "execution_count": 27, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "# sanity check, see train labels\n", 477 | "\n", 478 | "type(train_labels)\n", 479 | "{k: train_labels[k] for k in list(train_labels)[:5]}" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 28, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# jupyter nbconvert --to html .\\Code\\02_Model\\000_preprocess.ipynb" 489 | ] 490 | } 491 | ], 492 | "metadata": { 493 | "kernelspec": { 494 | "display_name": "Python 3", 495 | "language": "python", 496 | "name": "python3" 497 | }, 498 | "language_info": { 499 | "codemirror_mode": { 500 | "name": "ipython", 501 | "version": 3 502 | }, 503 | "file_extension": ".py", 504 | "mimetype": "text/x-python", 505 | "name": "python", 506 | "nbconvert_exporter": "python", 507 | "pygments_lexer": "ipython3", 508 | "version": "3.6.3" 509 | } 510 | }, 511 | "nbformat": 4, 512 | "nbformat_minor": 2 513 | } 514 | --------------------------------------------------------------------------------
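One detail of the preprocessing above worth making explicit before the training notebook: each image's 'Finding Labels' string (a '|'-separated list such as 'Cardiomegaly|Effusion') is mapped to a 14-dimensional multi-hot vector, one slot per pathology. A minimal self-contained sketch of that encoding (the pathology order below is illustrative; the repo takes the authoritative list from prj_consts.DISEASE_list):

import re
import numpy as np

pathologies = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration',
               'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation',
               'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia']

def encode_finding_labels(finding_labels):
    # set vec[i] = 1 for every pathology name found in the label string
    vec = np.zeros(len(pathologies), dtype=np.uint8)
    for i, name in enumerate(pathologies):
        if re.search(name, finding_labels, re.IGNORECASE):
            vec[i] = 1
    return vec

print(encode_finding_labels('Cardiomegaly|Effusion'))
# -> [0 1 1 0 0 0 0 0 0 0 0 0 0 0] (given the illustrative order above)

'No Finding' rows match none of the pathology names and therefore encode to the all-zeros vector, which is how healthy images end up labeled in the pickles saved above.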
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 13, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 13, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Use the Azure Machine Learning data collector to log various metrics\n", 60 | "from azureml.logging import get_azureml_logger\n", 61 | "logger = get_azureml_logger()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 15, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Use Azure Machine Learning history magic to control history collection\n", 71 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 72 | "# %azureml history on" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 16, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[None]" 84 | ] 85 | }, 86 | "execution_count": 16, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# import utlity functions\n", 93 | "\n", 94 | "import sys, os\n", 95 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 96 | "def add_path_to_sys_path(path_to_append):\n", 97 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 98 | " sys.path.append(path_to_append)\n", 99 | "\n", 100 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 101 | "\n", 102 | "import azure_chestxray_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "#### Path variables" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "'/azureml-share/chestxray/data/ChestX-ray8'" 121 | ] 122 | }, 123 | "execution_count": 17, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "'/azureml-share/chestxray/output'" 131 | ] 132 | }, 133 | "execution_count": 17, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "# create base directories for the file path variables \n", 140 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 141 | "\n", 142 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 143 | "\n", 144 | "data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 145 | "data_base_output_dir=os.path.join(amlWBSharedDir, 
os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 146 | "\n", 147 | "data_base_input_dir\n", 148 | "data_base_output_dir\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 18, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 160 | ] 161 | }, 162 | "execution_count": 18, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | }, 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "orig images number:['112120'] \n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "# chest xray images are in nih_chest_xray_data_dir\n", 176 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 177 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 178 | "nih_chest_xray_data_dir\n", 179 | "\n", 180 | "# check if we have all 112120 images in nih_chest_xray_data_dir\n", 181 | "orig_images_no = !find $nih_chest_xray_data_dir -type f | wc -l\n", 182 | "print(\"orig images number:{} \".format(orig_images_no))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 19, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'" 194 | ] 195 | }, 196 | "execution_count": 19, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "BBox_List_2017.csv Data_Entry_2017.csv blacklist.csv\r\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "# check if we have patients file list Data_Entry_2017.csv and BBox_List_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC)\n", 210 | "# blacklist.csv is genrated by data scientists with no medical background\n", 211 | "\n", 212 | "other_data_dir=os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))\n", 213 | "other_data_dir\n", 214 | "# !mkdir -p {other_data_dir}\n", 215 | "!ls $other_data_dir\n", 216 | "\n", 217 | "# data is split into train/test/validation partitions\n", 218 | "data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 219 | "!mkdir -p {data_partitions_dir}" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 20, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "import pickle\n", 229 | "import random\n", 230 | "import re\n", 231 | "import tqdm\n", 232 | "\n", 233 | "import cv2\n", 234 | "import numpy as np\n", 235 | "import pandas as pd\n", 236 | "import sklearn.model_selection " 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "#### Train/Validation/Test Data partitioning \n", 244 | " - remove the images in the blacklist.csv where the image has low quality. \n", 245 | " - remove the NIH bounding box patients since we will save those patients for later validation use. \n", 246 | " - We will also divide data into train/valid/test dataset using a 7:1:2 ratio." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 21, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "len of original patient id is 30805\n", 259 | "len of cleaned patient id is 30079\n", 260 | "len of unique patient id with annotated data 726\n", 261 | "len of patient id with annotated data 984\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# remove NIH manually annotated data (groung truth with heavy pathologies, no healthy patients) \n", 267 | "# exclude what visusally looks like bad images to data scientists with no medical background\n", 268 | "# todo\n", 269 | "# This should prob be a generic function\n", 270 | "\n", 271 | "\n", 272 | "total_patient_number = 30805\n", 273 | "NIH_annotated_file = 'BBox_List_2017.csv' # exclude from train pathology annotated by radiologists \n", 274 | "manually_selected_bad_images_file = 'blacklist.csv'# exclude what viusally looks like bad images\n", 275 | "\n", 276 | "patient_id_original = [i for i in range(1,total_patient_number + 1)]\n", 277 | "\n", 278 | "# ignored images list is used later, since this is not a patient ID level issue\n", 279 | "ignored_images_set = set()\n", 280 | "with open(os.path.join(other_data_dir, manually_selected_bad_images_file), 'r') as f:\n", 281 | " for line in f:\n", 282 | " # delete the last char which is \\n\n", 283 | " ignored_images_set.add(line[:-1])\n", 284 | " if int(line[:-9]) >= 30805:\n", 285 | " print(line[:-1])\n", 286 | "\n", 287 | "bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))\n", 288 | "bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)\n", 289 | "\n", 290 | "bbox_patient_index_list = []\n", 291 | "for index, item in bbox_patient_index_df.iteritems():\n", 292 | " bbox_patient_index_list.append(int(item))\n", 293 | "\n", 294 | "patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))\n", 295 | "print(\"len of original patient id is\", len(patient_id_original))\n", 296 | "print(\"len of cleaned patient id is\", len(patient_id))\n", 297 | "print(\"len of unique patient id with annotated data\", \n", 298 | " len(list(set(bbox_patient_index_list))))\n", 299 | "print(\"len of patient id with annotated data\",bbox_df.shape[0])\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 22, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "first ten patient ids are [24303, 16035, 4967, 28624, 5378, 20335, 17069, 12271, 16975, 4469]\n", 312 | "train:21563 valid:3081 test:6161\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "random.seed(0)\n", 318 | "random.shuffle(patient_id)\n", 319 | "\n", 320 | "print(\"first ten patient ids are\", patient_id[:10])\n", 321 | "\n", 322 | "# training:valid:test=7:1:2\n", 323 | "patient_id_train = patient_id[:int(total_patient_number * 0.7)]\n", 324 | "patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]\n", 325 | "# get the rest of the patient_id as the test set\n", 326 | "patient_id_test = patient_id[int(total_patient_number * 0.8):]\n", 327 | "patient_id_test.extend(bbox_patient_index_list)\n", 328 | "patient_id_test = list(set(patient_id_test))\n", 329 | "\n", 330 | "\n", 331 | "print(\"train:{} valid:{} test:{}\".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))\n", 332 | "\n", 333 | "# test_set = test_set+left_out_patient_id\n", 334 | "# 
print(\"train:{} valid:{} test:{}\".format(len(train_set), len(valid_set), len(test_set)))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 23, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Add a few more project constants\n", 344 | "\n", 345 | "pathologies_name_list = prj_consts.DISEASE_list\n", 346 | "NIH_patients_and_labels_file = 'Data_Entry_2017.csv'" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "#### Finally do preprocessing\n", 354 | "Save labels and partitions" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 25, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def process_data(current_df, patient_ids):\n", 373 | " image_name_index = []\n", 374 | " image_labels = {}\n", 375 | " for individual_patient in tqdm.tqdm(patient_ids):\n", 376 | " for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():\n", 377 | " processed_image_name = row['Image Index']\n", 378 | " if processed_image_name in ignored_images_set:\n", 379 | " pass\n", 380 | " else:\n", 381 | " image_name_index.append(processed_image_name)\n", 382 | " image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)\n", 383 | " for disease_index, ele in enumerate(pathologies_name_list):\n", 384 | " if re.search(ele, row['Finding Labels'], re.IGNORECASE):\n", 385 | " image_labels[processed_image_name][disease_index] = 1\n", 386 | " else:\n", 387 | " # redundant code but just to make it more readable\n", 388 | " image_labels[processed_image_name][disease_index] = 0\n", 389 | " # print(\"processed\", row['Image Index'])\n", 390 | " return image_name_index, image_labels\n" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 26, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stderr", 400 | "output_type": "stream", 401 | "text": [ 402 | "100%|██████████| 21563/21563 [00:35<00:00, 606.57it/s]\n", 403 | "100%|██████████| 3081/3081 [00:05<00:00, 614.10it/s]\n", 404 | "100%|██████████| 6161/6161 [00:13<00:00, 449.08it/s]\n" 405 | ] 406 | }, 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "train, valid, test image number is: 68508 9495 32893\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "# # create and save train/test/validation partitions list\n", 417 | "\n", 418 | "train_data_index, train_labels = process_data(labels_df, patient_id_train)\n", 419 | "valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)\n", 420 | "test_data_index, test_labels = process_data(labels_df, patient_id_test)\n", 421 | "\n", 422 | "print(\"train, valid, test image number is:\", len(train_data_index), len(valid_data_index), len(test_data_index))\n", 423 | "\n", 424 | "# save the data\n", 425 | "labels_all = {}\n", 426 | "labels_all.update(train_labels)\n", 427 | "labels_all.update(valid_labels)\n", 428 | "labels_all.update(test_labels)\n", 429 | "\n", 430 | "partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}\n", 431 | "\n", 432 | "with open(os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle'), 'wb') as f:\n", 433 | " pickle.dump(labels_all, f)\n", 434 | "\n", 435 | "with 
/AzureChestXRay_AMLWB/Code/02_Model/010_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train\n", 8 | "\n", 9 | "##### Copyright (C) Microsoft Corporation. \n",
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Use the Azure Machine Learning data collector to log various metrics\n", 60 | "from azureml.logging import get_azureml_logger\n", 61 | "logger = get_azureml_logger()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Use Azure Machine Learning history magic to control history collection\n", 71 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 72 | "# %azureml history on" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[None]" 84 | ] 85 | }, 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# import utlity functions\n", 93 | "\n", 94 | "import sys, os\n", 95 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 96 | "def add_path_to_sys_path(path_to_append):\n", 97 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 98 | " sys.path.append(path_to_append)\n", 99 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 100 | "\n", 101 | "import azure_chestxray_utils\n", 102 | "# import azure_chestxray_keras_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# create the file path variables \n", 112 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 113 | "\n", 114 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 115 | "\n", 116 | "data_base_input_dir=os.path.join(amlWBSharedDir, \n", 117 | " os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 118 | "data_base_output_dir=os.path.join(amlWBSharedDir, \n", 119 | " os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 120 | "\n", 121 | "\n", 122 | "# data used for training\n", 123 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 124 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 125 | "\n", 126 | "data_partitions_dir=os.path.join(data_base_output_dir, \n", 127 | " os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 128 | "partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')\n", 129 | "label_path = 
os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "'/azureml-share/chestxray/output/weights_tmpdir'" 141 | ] 142 | }, 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | }, 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "total 0\r\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# global variables\n", 157 | "\n", 158 | "weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list))) \n", 159 | "!mkdir -p {weights_dir}\n", 160 | "weights_dir\n", 161 | "!ls -l {weights_dir}\n", 162 | "\n", 163 | "# weights_path = os.path.join(\n", 164 | "# weights_dir, \n", 165 | "# prj_consts.PRETRAINED_DENSENET201_IMAGENET_CHESTXRAY_MODEL_FILE_NAME)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "name": "stderr", 175 | "output_type": "stream", 176 | "text": [ 177 | "Using TensorFlow backend.\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "import os\n", 183 | "\n", 184 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\" # see issue #152\n", 185 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1\"\n", 186 | "\n", 187 | "import imgaug as ia\n", 188 | "from imgaug import augmenters as iaa\n", 189 | "ia.seed(1)\n", 190 | "\n", 191 | "import cv2\n", 192 | "import keras.backend as K\n", 193 | "from keras.optimizers import Adam\n", 194 | "from keras.callbacks import ReduceLROnPlateau, Callback, ModelCheckpoint\n", 195 | "import numpy as np\n", 196 | "import pickle\n", 197 | "from keras_contrib.applications.densenet import DenseNetImageNet121\n", 198 | "from keras.layers import Dense\n", 199 | "from keras.models import Model\n", 200 | "from keras.utils import multi_gpu_model\n", 201 | "from tensorflow.python.client import device_lib\n", 202 | "import warnings\n", 203 | "from keras.utils import Sequence\n", 204 | "import tensorflow as tf" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "For testing purposes, we run just 1 epoch. One epoch takes around 25 minutes on 2 K80 GPUs, and the model usually needs around 30~50 epochs to converge." 
212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 9, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# set force_restart = False to continue a previous training session; set it to True to start from scratch\n", 221 | "force_restart = False\n", 222 | "\n", 223 | "initial_lr = 0.001\n", 224 | "resized_height = 224\n", 225 | "resized_width = 224\n", 226 | "# resized_height = prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT\n", 227 | "# resized_width = prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH\n", 228 | "num_channel = 3\n", 229 | "num_classes = 14\n", 230 | "epochs = 1 #200" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "def get_available_gpus():\n", 240 | " \"\"\"\n", 241 | "\n", 242 | " Returns: list of the GPU device names available in the system\n", 243 | "\n", 244 | " \"\"\"\n", 245 | " local_device_protos = device_lib.list_local_devices()\n", 246 | " return [x.name for x in local_device_protos if x.device_type == 'GPU']\n", 247 | "\n", 248 | "\n", 249 | "# get number of available GPUs\n", 250 | "num_gpu = len(get_available_gpus())\n", 251 | "\n", 252 | "# keras multi_gpu_model slices the data across different GPUs. see https://keras.io/utils/#multi_gpu_model for more details.\n", 253 | "batch_size = 48 * num_gpu\n", 254 | "\n", 255 | "\n", 256 | "# we use the Keras multi-GPU model, so we need to make sure the batch_size is divisible by num_gpu." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 11, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# device_lib.list_local_devices()\n", 266 | "# !nvidia-smi" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 12, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# we use the Keras multi-GPU model, so we need to make sure the batch_size is divisible by num_gpu.\n", 276 | "\n", 277 | "# multi GPU model checkpoint. copied from https://github.com/keras-team/keras/issues/8463\n", 278 | "class MultiGPUCheckpointCallback(Callback):\n", 279 | "\n", 280 | " def __init__(self, filepath, base_model, monitor='val_loss', verbose=0,\n", 281 | " save_best_only=False, save_weights_only=False,\n", 282 | " mode='auto', period=1):\n", 283 | " super(MultiGPUCheckpointCallback, self).__init__()\n", 284 | " self.base_model = base_model\n", 285 | " self.monitor = monitor\n", 286 | " self.verbose = verbose\n", 287 | " self.filepath = filepath\n", 288 | " self.save_best_only = save_best_only\n", 289 | " self.save_weights_only = save_weights_only\n", 290 | " self.period = period\n", 291 | " self.epochs_since_last_save = 0\n", 292 | "\n", 293 | " if mode not in ['auto', 'min', 'max']:\n", 294 | " warnings.warn('ModelCheckpoint mode %s is unknown, '\n", 295 | " 'fallback to auto mode.' 
% (mode),\n", 296 | " RuntimeWarning)\n", 297 | " mode = 'auto'\n", 298 | "\n", 299 | " if mode == 'min':\n", 300 | " self.monitor_op = np.less\n", 301 | " self.best = np.Inf\n", 302 | " elif mode == 'max':\n", 303 | " self.monitor_op = np.greater\n", 304 | " self.best = -np.Inf\n", 305 | " else:\n", 306 | " if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):\n", 307 | " self.monitor_op = np.greater\n", 308 | " self.best = -np.Inf\n", 309 | " else:\n", 310 | " self.monitor_op = np.less\n", 311 | " self.best = np.Inf\n", 312 | "\n", 313 | " def on_epoch_end(self, epoch, logs=None):\n", 314 | " logs = logs or {}\n", 315 | " self.epochs_since_last_save += 1\n", 316 | " if self.epochs_since_last_save >= self.period:\n", 317 | " self.epochs_since_last_save = 0\n", 318 | " filepath = self.filepath.format(epoch=epoch + 1, **logs)\n", 319 | " if self.save_best_only:\n", 320 | " current = logs.get(self.monitor)\n", 321 | " if current is None:\n", 322 | " warnings.warn('Can save best model only with %s available, '\n", 323 | " 'skipping.' % (self.monitor), RuntimeWarning)\n", 324 | " else:\n", 325 | " if self.monitor_op(current, self.best):\n", 326 | " if self.verbose > 0:\n", 327 | " print('Epoch %05d: %s improved from %0.5f to %0.5f,'\n", 328 | " ' saving model to %s'\n", 329 | " % (epoch + 1, self.monitor, self.best,\n", 330 | " current, filepath))\n", 331 | " self.best = current\n", 332 | " if self.save_weights_only:\n", 333 | " self.base_model.save_weights(filepath, overwrite=True)\n", 334 | " else:\n", 335 | " self.base_model.save(filepath, overwrite=True)\n", 336 | " else:\n", 337 | " if self.verbose > 0:\n", 338 | " print('Epoch %05d: %s did not improve' %\n", 339 | " (epoch + 1, self.monitor))\n", 340 | " else:\n", 341 | " if self.verbose > 0:\n", 342 | " print('Epoch %05d: saving model to %s' % (epoch + 1, filepath))\n", 343 | " if self.save_weights_only:\n", 344 | " self.base_model.save_weights(filepath, overwrite=True)\n", 345 | " else:\n", 346 | " self.base_model.save(filepath, overwrite=True)\n", 347 | "\n", 348 | "\n", 349 | "seq = iaa.Sequential([\n", 350 | " iaa.Fliplr(0.5), # horizontal flips\n", 351 | " iaa.Affine(rotate=(-15, 15)), # random rotate image\n", 352 | " iaa.Affine(scale=(0.8, 1.1)), # randomly scale the image\n", 353 | "], random_order=True) # apply augmenters in random order\n", 354 | "\n", 355 | "\n", 356 | "# generator for train and validation data\n", 357 | "# use the Sequence class per issue https://github.com/keras-team/keras/issues/1638\n", 358 | "class DataGenSequence(Sequence):\n", 359 | " def __init__(self, labels, image_file_index, current_state):\n", 360 | " self.batch_size = batch_size\n", 361 | " self.labels = labels\n", 362 | " self.img_file_index = image_file_index\n", 363 | " self.current_state = current_state\n", 364 | " self.len = len(self.img_file_index) // self.batch_size\n", 365 | " print(\"for DataGenSequence\", current_state, \"total rows are:\", len(self.img_file_index), \", len is\", self.len)\n", 366 | "\n", 367 | " def __len__(self):\n", 368 | " return self.len\n", 369 | "\n", 370 | " def __getitem__(self, idx):\n", 371 | " # print(\"loading data segmentation\", idx)\n", 372 | " # make sure each batch size has the same amount of data\n", 373 | " current_batch = self.img_file_index[idx * self.batch_size: (idx + 1) * self.batch_size]\n", 374 | " X = np.empty((self.batch_size, resized_height, resized_width, num_channel))\n", 375 | " y = np.empty((self.batch_size, num_classes))\n", 376 | "\n", 377 | " for i, image_name in 
enumerate(current_batch):\n", 378 | " path = os.path.join(nih_chest_xray_data_dir, image_name)\n", 379 | " # loading data\n", 380 | "\n", 381 | " img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float32)\n", 382 | " X[i, :, :, :] = img\n", 383 | " y[i, :] = labels[image_name]\n", 384 | "\n", 385 | " # only do random flipping in training status\n", 386 | " if self.current_state == 'train':\n", 387 | " x_augmented = seq.augment_images(X)\n", 388 | " else:\n", 389 | " x_augmented = X\n", 390 | "\n", 391 | " return x_augmented, y\n", 392 | "\n", 393 | "\n", 394 | "# loss function\n", 395 | "def unweighted_binary_crossentropy(y_true, y_pred):\n", 396 | " \"\"\"\n", 397 | " Args:\n", 398 | " y_true: true labels\n", 399 | " y_pred: predicted labels\n", 400 | "\n", 401 | " Returns: the sum of binary cross entropy loss across all the classes\n", 402 | "\n", 403 | " \"\"\"\n", 404 | " return K.sum(K.binary_crossentropy(y_true, y_pred))\n", 405 | "\n", 406 | "\n", 407 | "def build_model():\n", 408 | " \"\"\"\n", 409 | "\n", 410 | " Returns: a model with specified weights\n", 411 | "\n", 412 | " \"\"\"\n", 413 | " # define the model, use pre-trained weights for image_net\n", 414 | " base_model = DenseNetImageNet121(input_shape=(224, 224, 3),\n", 415 | " weights='imagenet',\n", 416 | " include_top=False,\n", 417 | " pooling='avg')\n", 418 | "\n", 419 | " x = base_model.output\n", 420 | " predictions = Dense(14, activation='sigmoid')(x)\n", 421 | " model = Model(inputs=base_model.input, outputs=predictions)\n", 422 | " return model" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 13, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "using 2 GPUs\n", 435 | "Downloading data from https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32-no-top.h5\n", 436 | "33202176/33199896 [==============================] - 8s 0us/step\n", 437 | "Weights for the model were loaded successfully\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "if num_gpu > 1:\n", 443 | " print(\"using\", num_gpu, \"GPUs\")\n", 444 | " # build model\n", 445 | " with tf.device('/cpu:0'):\n", 446 | " model_single_gpu = build_model()\n", 447 | " # model_single_gpu.load_weights(weights_path)\n", 448 | "\n", 449 | " # convert to multi-gpu model\n", 450 | " model_multi_gpu = multi_gpu_model(model_single_gpu, gpus=num_gpu)\n", 451 | " model_checkpoint = MultiGPUCheckpointCallback(\n", 452 | " os.path.join(weights_dir, 'azure_chest_xray_14_weights_712split_epoch_{epoch:03d}_val_loss_{val_loss:.4f}.hdf5'),\n", 453 | " model_single_gpu, monitor='val_loss', save_weights_only=False)\n", 454 | "\n", 455 | " \n", 456 | "\n", 457 | "else:\n", 458 | " print(\"using single GPU\")\n", 459 | " model_multi_gpu = build_model()\n", 460 | " model_checkpoint = ModelCheckpoint(\n", 461 | " os.path.join(weights_dir, 'azure_chest_xray_14_weights_712split_epoch_{epoch:03d}_val_loss_{val_loss:.4f}.hdf5'),\n", 462 | " monitor='val_loss', save_weights_only=False)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 14, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "for DataGenSequence train total rows are: 68508 , len is 713\n", 475 | "for DataGenSequence validation total rows are: 9495 , len is 98\n", 476 | "Epoch 1/1\n", 477 | "713/713 [==============================] - 1275s 2s/step - loss: 214.9958 - 
val_loss: 225.4705\n" 478 | ] 479 | }, 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "" 484 | ] 485 | }, 486 | "execution_count": 14, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "num_workers = 10 * num_gpu\n", 493 | "\n", 494 | "model_multi_gpu.compile(optimizer=Adam(lr=initial_lr), loss=unweighted_binary_crossentropy)\n", 495 | "\n", 496 | "reduce_lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_lr=1e-6)\n", 497 | "\n", 498 | "callbacks = [model_checkpoint, reduce_lr_on_plateau]\n", 499 | "\n", 500 | "with open(label_path, 'rb') as f:\n", 501 | " labels = pickle.load(f)\n", 502 | "\n", 503 | "with open(partition_path, 'rb') as f:\n", 504 | " partition = pickle.load(f)\n", 505 | "\n", 506 | "model_multi_gpu.fit_generator(generator=DataGenSequence(labels, partition['train'], current_state='train'),\n", 507 | " epochs=epochs,\n", 508 | " verbose=1,\n", 509 | " callbacks=callbacks,\n", 510 | " workers=num_workers,\n", 511 | " # max_queue_size=32,\n", 512 | " # shuffle=False,\n", 513 | " validation_data=DataGenSequence(labels, partition['valid'], current_state='validation')\n", 514 | " # validation_steps=1\n", 515 | " )" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 15, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "# jupyter nbconvert --to html .\\Code\\02_Model\\010_train.ipynb" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "kernelspec": { 530 | "display_name": "azure_chestxray_lung_disease gpucomputecontext", 531 | "language": "python", 532 | "name": "azure_chestxray_lung_disease_gpucomputecontext" 533 | }, 534 | "language_info": { 535 | "codemirror_mode": { 536 | "name": "ipython", 537 | "version": 3 538 | }, 539 | "file_extension": ".py", 540 | "mimetype": "text/x-python", 541 | "name": "python", 542 | "nbconvert_exporter": "python", 543 | "pygments_lexer": "ipython3", 544 | "version": "3.5.2" 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 2 549 | } 550 | --------------------------------------------------------------------------------
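The evaluation notebook that follows loads fully trained weights and scores the held-out test partition. The customary summary metric for ChestX-ray14 models is per-pathology ROC AUC; a minimal sklearn-based sketch of that computation (array shapes are assumptions, not the notebook's exact code):

import numpy as np
from sklearn.metrics import roc_auc_score

def per_class_auc(y_true, y_scores, class_names):
    # y_true: (num_images, 14) binary ground-truth labels
    # y_scores: (num_images, 14) sigmoid outputs from model.predict(...)
    return {name: roc_auc_score(y_true[:, i], y_scores[:, i])
            for i, name in enumerate(class_names)}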
Define amlWBSharedDir as \"/shared_folder_on_host/amlwb_exp_acc/amlwb_work_space/amlwb_experiment\"')\n",
45 |     "    \n",
46 |     "amlWBSharedDir"
47 |    ]
48 |   },
49 |   {
50 |    "cell_type": "code",
51 |    "execution_count": 61,
52 |    "metadata": {},
53 |    "outputs": [],
54 |    "source": [
55 |     "# # Use the Azure Machine Learning data collector to log various metrics\n",
56 |     "# from azureml.logging import get_azureml_logger\n",
57 |     "# logger = get_azureml_logger()"
58 |    ]
59 |   },
60 |   {
61 |    "cell_type": "code",
62 |    "execution_count": 62,
63 |    "metadata": {},
64 |    "outputs": [],
65 |    "source": [
66 |     "# Use Azure Machine Learning history magic to control history collection\n",
67 |     "# History is off by default, options are \"on\", \"off\", or \"show\"\n",
68 |     "# %azureml history on"
69 |    ]
70 |   },
71 |   {
72 |    "cell_type": "code",
73 |    "execution_count": 63,
74 |    "metadata": {},
75 |    "outputs": [
76 |     {
77 |      "data": {
78 |       "text/plain": [
79 |        "[None]"
80 |       ]
81 |      },
82 |      "execution_count": 63,
83 |      "metadata": {},
84 |      "output_type": "execute_result"
85 |     }
86 |    ],
87 |    "source": [
88 |     "# import utility functions\n",
89 |     "\n",
90 |     "import sys, os\n",
91 |     "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n",
92 |     "def add_path_to_sys_path(path_to_append):\n",
93 |     "    if not (any(path_to_append in paths for paths in sys.path)):\n",
94 |     "        sys.path.append(path_to_append)\n",
95 |     "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n",
96 |     "\n",
97 |     "import azure_chestxray_utils\n",
98 |     "import azure_chestxray_keras_utils"
99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 64,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# create the file path variables \n",
108 |     "# paths are typically container level dirs mapped to a host dir for data persistence.\n",
109 |     "\n",
110 |     "prj_consts = azure_chestxray_utils.chestxray_consts()\n",
111 |     "\n",
112 |     "data_base_input_dir=os.path.join(amlWBSharedDir, \n",
113 |     "                                 os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n",
114 |     "data_base_output_dir=os.path.join(amlWBSharedDir, \n",
115 |     "                                  os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 65,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "data": {
125 |       "text/plain": [
126 |        "'/azureml-share/chestxray/output/weights_tmpdir'"
127 |       ]
128 |      },
129 |      "execution_count": 65,
130 |      "metadata": {},
131 |      "output_type": "execute_result"
132 |     },
133 |     {
134 |      "name": "stdout",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "total 0\r\n"
138 |      ]
139 |     },
140 |     {
141 |      "data": {
142 |       "text/plain": [
143 |        "'/azureml-share/chestxray/output/fully_trained_models'"
144 |       ]
145 |      },
146 |      "execution_count": 65,
147 |      "metadata": {},
148 |      "output_type": "execute_result"
149 |     },
150 |     {
151 |      "name": "stdout",
152 |      "output_type": "stream",
153 |      "text": [
154 |       "total 86320\r\n",
155 |       "-rw-rw-r-- 1 1003 1003 30097832 Feb 14 04:37 azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\r\n",
156 |       "-rw-r--r-- 1 root root 29143128 Feb 14 04:55 weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\r\n",
157 |       "-rw-rw-r-- 1 1003 1003 29142168 Feb 7 06:16 weights_only_azure_chest_xray__14_weights_712split_epoch_029_val_loss_147.7599.hdf5\r\n"
158 |      ]
159 |     }
160 |    ],
161 |    "source": [
162 |     "# global variables\n",
163 |     "\n",
164 |     "# location of trained model weights; quality will be dependent on train 
data size\n", 165 | "# and number of epochs among other things\n", 166 | "weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list))) \n", 167 | "weights_dir\n", 168 | "!ls -l {weights_dir}\n", 169 | "\n", 170 | "# \"quality\" models, fully trained on all training data\n", 171 | "fully_trained_weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.FULLY_PRETRAINED_MODEL_DIR_list))) \n", 172 | "fully_trained_weights_dir\n", 173 | "!ls -l {fully_trained_weights_dir}\n", 174 | "\n", 175 | "\n", 176 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 177 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 178 | "\n", 179 | "data_partitions_dir=os.path.join(data_base_output_dir, \n", 180 | " os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 181 | "label_path = os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle')\n", 182 | "partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 66, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# # extract and save the weights from a full model\n", 192 | "\n", 193 | "# import keras_contrib\n", 194 | "# from keras.models import load_model\n", 195 | "# model_file_name = 'azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5'\n", 196 | "# model = load_model(os.path.join(fully_trained_weights_dir, model_file_name))\n", 197 | "# model.save_weights(os.path.join(fully_trained_weights_dir, 'weights_only_'+model_file_name))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 67, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "['/azureml-share/chestxray/output/fully_trained_models/azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5',\n", 209 | " '/azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5',\n", 210 | " '/azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray__14_weights_712split_epoch_029_val_loss_147.7599.hdf5']" 211 | ] 212 | }, 213 | "execution_count": 67, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "# get long (full path) model file name\n", 220 | "\n", 221 | "all_models=!ls {os.path.join(fully_trained_weights_dir, '*.hdf5')}\n", 222 | "all_models\n", 223 | "models_file_name= [os.path.join(fully_trained_weights_dir, \n", 224 | " 'weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5')]" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 68, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "import os\n", 234 | "import pickle\n", 235 | "\n", 236 | "import cv2\n", 237 | "import numpy as np\n", 238 | "import pandas as pd\n", 239 | "from keras.models import load_model\n", 240 | "from keras.utils import Sequence\n", 241 | "from sklearn import metrics\n", 242 | "\n", 243 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\" # see issue #152\n", 244 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1\"\n", 245 | "\n", 246 | "from tensorflow.python.client import device_lib\n", 247 | "\n", 248 | "resized_height = 224\n", 249 | "resized_width = 224\n", 250 | "num_channel = 3\n", 251 | "num_classes = 14\n", 252 | "batch_size = 512 #512" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 
257 |    "execution_count": 69,
258 |    "metadata": {},
259 |    "outputs": [
260 |     {
261 |      "data": {
262 |       "text/plain": [
263 |        "['/device:GPU:0', '/device:GPU:1']"
264 |       ]
265 |      },
266 |      "execution_count": 69,
267 |      "metadata": {},
268 |      "output_type": "execute_result"
269 |     },
270 |     {
271 |      "name": "stdout",
272 |      "output_type": "stream",
273 |      "text": [
274 |       "num of GPUs: 2\n"
275 |      ]
276 |     }
277 |    ],
278 |    "source": [
279 |     "def get_available_gpus():\n",
280 |     "    \"\"\"\n",
281 |     "    Returns: the list of GPU device names available in the system\n",
282 |     "    \"\"\"\n",
283 |     "    local_device_protos = device_lib.list_local_devices()\n",
284 |     "    return [x.name for x in local_device_protos if x.device_type == 'GPU']\n",
285 |     "\n",
286 |     "get_available_gpus()\n",
287 |     "# get number of available GPUs\n",
288 |     "print(\"num of GPUs:\", len(get_available_gpus()))"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 70,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "# device_lib.list_local_devices()\n",
298 |     "# !nvidia-smi"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": 71,
304 |    "metadata": {},
305 |    "outputs": [
306 |     {
307 |      "name": "stdout",
308 |      "output_type": "stream",
309 |      "text": [
310 |       "num of GPUs: 2\n"
311 |      ]
312 |     },
313 |     {
314 |      "data": {
315 |       "text/plain": [
316 |        "['Atelectasis',\n",
317 |        " 'Cardiomegaly',\n",
318 |        " 'Effusion',\n",
319 |        " 'Infiltration',\n",
320 |        " 'Mass',\n",
321 |        " 'Nodule',\n",
322 |        " 'Pneumonia',\n",
323 |        " 'Pneumothorax',\n",
324 |        " 'Consolidation',\n",
325 |        " 'Edema',\n",
326 |        " 'Emphysema',\n",
327 |        " 'Fibrosis',\n",
328 |        " 'Pleural Thickening',\n",
329 |        " 'Hernia']"
330 |       ]
331 |      },
332 |      "execution_count": 71,
333 |      "metadata": {},
334 |      "output_type": "execute_result"
335 |     }
336 |    ],
337 |    "source": [
338 |     "num_gpu = len(get_available_gpus())\n",
339 |     "# number of available GPUs\n",
340 |     "print(\"num of GPUs:\", num_gpu)\n",
341 |     "\n",
342 |     "pathologies_name_list = prj_consts.DISEASE_list\n",
343 |     "pathologies_name_list\n",
344 |     "\n",
345 |     "stanford_result = [0.8094, 0.9248, 0.8638, 0.7345, 0.8676, 0.7802, 0.7680, 0.8887, 0.7901, 0.8878, 0.9371, 0.8047,\n",
346 |     "                   0.8062, 0.9164]\n",
347 |     "\n",
348 |     "\n",
349 |     "with open(label_path, 'rb') as f:\n",
350 |     "    labels = pickle.load(f)\n",
351 |     "\n",
352 |     "with open(partition_path, 'rb') as f:\n",
353 |     "    partition = pickle.load(f)"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 72,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "\n",
363 |     "# generator for train and validation data\n",
364 |     "# use the Sequence class per issue https://github.com/keras-team/keras/issues/1638\n",
365 |     "class DataGenSequence(Sequence):\n",
366 |     "    def __init__(self, labels, image_file_index, current_state):\n",
367 |     "        self.batch_size = batch_size\n",
368 |     "        self.labels = labels\n",
369 |     "        self.img_file_index = image_file_index\n",
370 |     "        self.current_state = current_state\n",
371 |     "        self.len = len(self.img_file_index) // self.batch_size\n",
372 |     "        print(\"for DataGenSequence\", current_state, \"total rows are:\", len(self.img_file_index), \", len is\", self.len)\n",
373 |     "\n",
374 |     "    def __len__(self):\n",
375 |     "        return self.len\n",
376 |     "\n",
377 |     "    def __getitem__(self, idx):\n",
378 |     "        # print(\"loading data segmentation\", idx)\n",
379 |     "        # make sure each batch size has the same amount of data\n",
380 |     "        current_batch = self.img_file_index[idx * 
self.batch_size: (idx + 1) * self.batch_size]\n", 381 | " X = np.empty((self.batch_size, resized_height, resized_width, num_channel))\n", 382 | " y = np.empty((self.batch_size, num_classes))\n", 383 | "\n", 384 | " for i, image_name in enumerate(current_batch):\n", 385 | " path = os.path.join(nih_chest_xray_data_dir, image_name)\n", 386 | "\n", 387 | " # loading data\n", 388 | "\n", 389 | " img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float32)\n", 390 | " X[i, :, :, :] = img\n", 391 | " y[i, :] = labels[image_name]\n", 392 | "\n", 393 | " # only do random flipping in training status\n", 394 | " if self.current_state == 'train':\n", 395 | " # this is different from the training code\n", 396 | " x_augmented = X\n", 397 | " else:\n", 398 | " x_augmented = X\n", 399 | "\n", 400 | " return x_augmented, y" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 73, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "32893" 412 | ] 413 | }, 414 | "execution_count": 73, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "len(partition['test'])" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 74, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "len of result is 32768\n", 433 | "/azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\n", 434 | "Weights for the model were loaded successfully\n", 435 | "evaluation for model /azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\n", 436 | "for DataGenSequence test total rows are: 32893 , len is 64\n", 437 | "64/64 [==============================] - 469s 7s/step\n", 438 | "result shape (32768, 14)\n", 439 | " Disease Our AUC Score Stanford AUC Score Delta\n", 440 | "0 Atelectasis 0.823191 0.8094 -0.013791\n", 441 | "1 Cardiomegaly 0.933519 0.9248 -0.008719\n", 442 | "2 Effusion 0.883184 0.8638 -0.019384\n", 443 | "3 Infiltration 0.744561 0.7345 -0.010061\n", 444 | "4 Mass 0.859510 0.8676 0.008090\n", 445 | "5 Nodule 0.783997 0.7802 -0.003797\n", 446 | "6 Pneumonia 0.801597 0.7680 -0.033597\n", 447 | "7 Pneumothorax 0.830550 0.8887 0.058150\n", 448 | "8 Consolidation 0.813993 0.7901 -0.023893\n", 449 | "9 Edema 0.896173 0.8878 -0.008373\n", 450 | "10 Emphysema 0.849184 0.9371 0.087916\n", 451 | "11 Fibrosis 0.882463 0.8047 -0.077763\n", 452 | "12 Pleural Thickening 1.000000 0.8062 -0.193800\n", 453 | "13 Hernia 0.916395 0.9164 0.000005\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "import keras_contrib\n", 459 | "\n", 460 | "# load test data\n", 461 | "X_test = np.empty((len(partition['test']), 224, 224, 3), dtype=np.float32)\n", 462 | "y_test = np.empty((len(partition['test']) - len(partition['test']) % batch_size, 14), dtype=np.float32)\n", 463 | "\n", 464 | "for i, npy in enumerate(partition['test']):\n", 465 | " if (i < len(y_test)):\n", 466 | " # round to batch_size\n", 467 | " y_test[i, :] = labels[npy]\n", 468 | "\n", 469 | "print(\"len of result is\", len(y_test))\n", 470 | "y_pred_list = np.empty((len(models_file_name), len(partition['test']), 14), dtype=np.float32)\n", 471 | "\n", 472 | "# individual models\n", 473 | "for index, current_model_file in enumerate(models_file_name):\n", 474 | " print(current_model_file)\n", 475 | "# 
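note (added): the commented-out call below would deserialize the full saved model;\n",
    "# the active line instead rebuilds the DenseNet graph and loads the saved weights only:\n",
    "# 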
model = load_model(current_model_file)\n", 476 | " model = azure_chestxray_keras_utils.build_model(keras_contrib.applications.densenet.DenseNetImageNet121); model.load_weights(current_model_file)\n", 477 | "\n", 478 | " print('evaluation for model', current_model_file)\n", 479 | " # y_pred = model.predict(X_test)\n", 480 | "\n", 481 | " y_pred = model.predict_generator(generator=DataGenSequence(labels, partition['test'], current_state='test'),\n", 482 | " workers=32, verbose=1, max_queue_size=1)\n", 483 | " print(\"result shape\", y_pred.shape)\n", 484 | " \n", 485 | " # add one fake row of ones in both test and pred values to avoid:\n", 486 | " # ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.\n", 487 | " y_test = np.insert(y_test, 0, np.ones((y_test.shape[1],)), 0)\n", 488 | " y_pred = np.insert(y_pred, 0, np.ones((y_pred.shape[1],)), 0)\n", 489 | "\n", 490 | " df = pd.DataFrame(columns=['Disease', 'Our AUC Score', 'Stanford AUC Score'])\n", 491 | " for d in range(14):\n", 492 | " df.loc[d] = [pathologies_name_list[d],\n", 493 | " metrics.roc_auc_score(y_test[:, d], y_pred[:, d]),\n", 494 | " stanford_result[d]]\n", 495 | "\n", 496 | " df['Delta'] = df['Stanford AUC Score'] - df['Our AUC Score']\n", 497 | " df.to_csv(current_model_file + \".csv\", index=False)\n", 498 | " print(df)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 78, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "(1, 32893, 14)" 510 | ] 511 | }, 512 | "execution_count": 78, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | }, 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "numpy.ndarray" 520 | ] 521 | }, 522 | "execution_count": 78, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | }, 526 | { 527 | "data": { 528 | "text/plain": [ 529 | "(32769, 14)" 530 | ] 531 | }, 532 | "execution_count": 78, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | }, 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", 540 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n", 541 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],\n", 542 | " dtype=float32)" 543 | ] 544 | }, 545 | "execution_count": 78, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | }, 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "array([4004, 1189, 5187, 7044, 2089, 2090, 593, 2450, 1850, 808, 933,\n", 553 | " 493, 1, 79])" 554 | ] 555 | }, 556 | "execution_count": 78, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "y_pred_list.shape\n", 563 | "type(y_test[:, d])\n", 564 | "y_test.shape\n", 565 | "y_test[:3,]\n", 566 | "y_test.sum(axis=0).astype(int)\n" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 76, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "# C:\\repos\\ChestXRay\\TDSP>jupyter nbconvert --to html .\\Code\\02_Model\\020_evaluate.ipynb\n", 576 | "# [NbConvertApp] Converting notebook .\\Code\\01_DataPrep\\001_get_data.ipynb to html\n", 577 | "# [NbConvertApp] Writing 263414 bytes to .\\Code\\01_DataPrep\\001_get_data.html" 578 | ] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "azure_chestxray_lung_disease gpucomputecontext", 584 | "language": "python", 585 | "name": "azure_chestxray_lung_disease_gpucomputecontext" 586 | }, 587 | "language_info": { 
588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.5.2" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/finding_lungs/blacklist_other_images_with_lower_quality.csv: -------------------------------------------------------------------------------- 1 | 00000032_013.png 2 | 00000032_023.png 3 | 00000032_024.png 4 | 00000032_055.png 5 | 00000032_058.png 6 | 00000116_007.png 7 | 00000244_000.png 8 | 00000244_002.png 9 | 00000248_013.png 10 | 00000248_018.png 11 | 00000248_019.png 12 | 00000467_013.png 13 | 00000468_060.png 14 | 00000565_000.png 15 | 00000583_005.png 16 | 00000583_007.png 17 | 00000583_009.png 18 | 00000583_019.png 19 | 00000583_024.png 20 | 00000627_030.png 21 | 00000627_036.png 22 | 00000703_000.png 23 | 00000831_008.png 24 | 00000929_000.png 25 | 00000929_001.png 26 | 00000980_004.png 27 | 00001029_003.png 28 | 00001075_016.png 29 | 00001075_020.png 30 | 00001122_016.png 31 | 00001122_017.png 32 | 00001153_006.png 33 | 00001157_002.png 34 | 00001179_000.png 35 | 00001179_001.png 36 | 00001181_000.png 37 | 00001223_000.png 38 | 00001223_001.png 39 | 00001249_004.png 40 | 00001255_012.png 41 | 00001255_035.png 42 | 00001267_001.png 43 | 00001278_009.png 44 | 00001278_011.png 45 | 00001437_038.png 46 | 00001501_002.png 47 | 00001564_000.png 48 | 00001577_003.png 49 | 00001595_000.png 50 | 00001595_001.png 51 | 00001595_002.png 52 | 00001686_000.png 53 | 00001686_001.png 54 | 00001736_005.png 55 | 00001736_007.png 56 | 00001736_010.png 57 | 00001736_014.png 58 | 00001736_018.png 59 | 00001736_021.png 60 | 00001736_025.png 61 | 00001736_026.png 62 | 00001736_027.png 63 | 00001787_002.png 64 | 00001787_010.png 65 | 00001814_004.png 66 | 00001836_014.png 67 | 00001855_000.png 68 | 00001855_004.png 69 | 00001855_009.png 70 | 00001855_010.png 71 | 00001855_011.png 72 | 00001855_012.png 73 | 00001855_014.png 74 | 00001855_016.png 75 | 00001855_018.png 76 | 00001855_020.png 77 | 00001855_021.png 78 | 00001855_022.png 79 | 00001855_023.png 80 | 00001855_024.png 81 | 00001855_025.png 82 | 00001855_026.png 83 | 00001855_027.png 84 | 00001855_028.png 85 | 00001855_029.png 86 | 00001855_030.png 87 | 00001855_032.png 88 | 00001855_033.png 89 | 00001855_034.png 90 | 00001855_035.png 91 | 00001855_037.png 92 | 00001952_000.png 93 | 00001952_001.png 94 | 00001952_002.png 95 | 00001952_007.png 96 | 00001952_008.png 97 | 00001986_010.png 98 | 00002072_003.png 99 | 00002072_004.png 100 | 00002072_009.png 101 | 00002072_010.png 102 | 00002072_011.png 103 | 00002072_014.png 104 | 00002072_015.png 105 | 00002072_018.png 106 | 00002072_019.png 107 | 00002208_001.png 108 | 00002359_018.png 109 | 00002366_001.png 110 | 00002366_002.png 111 | 00002437_036.png 112 | 00002437_037.png 113 | 00002529_007.png 114 | 00002529_023.png 115 | 00002529_025.png 116 | 00002529_030.png 117 | 00002545_001.png 118 | 00002582_007.png 119 | 00002594_001.png 120 | 00002633_023.png 121 | 00002636_000.png 122 | 00002659_003.png 123 | 00002675_005.png 124 | 00002733_000.png 125 | 00002763_023.png 126 | 00002763_024.png 127 | 00002892_004.png 128 | 00002896_000.png 129 | 00003004_000.png 130 | 00003005_005.png 131 | 00003029_018.png 132 | 00003059_000.png 133 | 00003060_000.png 
134 | 00003094_000.png 135 | 00003094_003.png 136 | 00003094_004.png 137 | 00003094_005.png 138 | 00003158_001.png 139 | 00003186_003.png 140 | 00003369_001.png 141 | 00003465_000.png 142 | 00003465_001.png 143 | 00003465_002.png 144 | 00003465_003.png 145 | 00003465_004.png 146 | 00003465_005.png 147 | 00003465_006.png 148 | 00003465_007.png 149 | 00003465_008.png 150 | 00003523_036.png 151 | 00004276_000.png 152 | 00004285_000.png 153 | 00004309_006.png 154 | 00004344_025.png 155 | 00004360_020.png 156 | 00004360_023.png 157 | 00004472_000.png 158 | 00004545_000.png 159 | 00004660_000.png 160 | 00004672_001.png 161 | 00004703_000.png 162 | 00004706_001.png 163 | 00004792_000.png 164 | 00004808_014.png 165 | 00004808_094.png 166 | 00004811_000.png 167 | 00004928_006.png 168 | 00005051_000.png 169 | 00005094_009.png 170 | 00005201_001.png 171 | 00005204_001.png 172 | 00005220_012.png 173 | 00005220_015.png 174 | 00005254_003.png 175 | 00005254_004.png 176 | 00005254_008.png 177 | 00005298_013.png 178 | 00005360_002.png 179 | 00005573_004.png 180 | 00005699_005.png 181 | 00005712_008.png 182 | 00005746_008.png 183 | 00005748_000.png 184 | 00005750_015.png 185 | 00005750_016.png 186 | 00005750_017.png 187 | 00005877_000.png 188 | 00005937_000.png 189 | 00005975_001.png 190 | 00006008_015.png 191 | 00006015_000.png 192 | 00006015_003.png 193 | 00006039_022.png 194 | 00006054_001.png 195 | 00006127_000.png 196 | 00006209_001.png 197 | 00006220_002.png 198 | 00006220_003.png 199 | 00006220_004.png 200 | 00006220_005.png 201 | 00006220_006.png 202 | 00006220_009.png 203 | 00006271_002.png 204 | 00006271_078.png 205 | 00006271_093.png 206 | 00006294_004.png 207 | 00006296_011.png 208 | 00006381_009.png 209 | 00006391_001.png 210 | 00006415_000.png 211 | 00006446_012.png 212 | 00006585_007.png 213 | 00006754_008.png 214 | 00006838_000.png 215 | 00006850_019.png 216 | 00006870_000.png 217 | 00006904_007.png 218 | 00006906_029.png 219 | 00006906_031.png 220 | 00006906_032.png 221 | 00006917_000.png 222 | 00006960_022.png 223 | 00007001_001.png 224 | 00007018_034.png 225 | 00007018_035.png 226 | 00007108_006.png 227 | 00007152_008.png 228 | 00007217_005.png 229 | 00007269_000.png 230 | 00007276_001.png 231 | 00007276_002.png 232 | 00007322_003.png 233 | 00007322_005.png 234 | 00007322_009.png 235 | 00007322_020.png 236 | 00007371_000.png 237 | 00007438_000.png 238 | 00007500_000.png 239 | 00007500_001.png 240 | 00007545_000.png 241 | 00007558_004.png 242 | 00007558_007.png 243 | 00007624_036.png 244 | 00007830_000.png 245 | 00007830_001.png 246 | 00007830_004.png 247 | 00007830_005.png 248 | 00007830_007.png 249 | 00007830_010.png 250 | 00007973_000.png 251 | 00007985_000.png 252 | 00008051_039.png 253 | 00008051_050.png 254 | 00008051_051.png 255 | 00008295_010.png 256 | 00008297_008.png 257 | 00008297_013.png 258 | 00008297_016.png 259 | 00008314_000.png 260 | 00008463_001.png 261 | 00008522_057.png 262 | 00008549_000.png 263 | 00008640_000.png 264 | 00008701_008.png 265 | 00008911_006.png 266 | 00008993_000.png 267 | 00009218_020.png 268 | 00009218_022.png 269 | 00009282_000.png 270 | 00009465_004.png 271 | 00009508_004.png 272 | 00009551_008.png 273 | 00009551_022.png 274 | 00009573_000.png 275 | 00009608_045.png 276 | 00009613_005.png 277 | 00009621_000.png 278 | 00009621_001.png 279 | 00009621_002.png 280 | 00009621_003.png 281 | 00009621_004.png 282 | 00009621_005.png 283 | 00009621_006.png 284 | 00009621_007.png 285 | 00009702_006.png 286 | 00009727_012.png 287 | 00009727_013.png 288 | 
00009727_014.png 289 | 00009727_018.png 290 | 00009727_019.png 291 | 00009727_020.png 292 | 00009727_022.png 293 | 00009727_023.png 294 | 00009727_027.png 295 | 00009727_028.png 296 | 00009876_002.png 297 | 00009886_000.png 298 | 00009892_007.png 299 | 00009892_046.png 300 | 00009911_004.png 301 | 00009953_016.png 302 | 00010007_053.png 303 | 00010007_060.png 304 | 00010007_071.png 305 | 00010007_074.png 306 | 00010007_082.png 307 | 00010007_103.png 308 | 00010012_018.png 309 | 00010012_026.png 310 | 00010092_007.png 311 | 00010092_043.png 312 | 00010124_000.png 313 | 00010294_007.png 314 | 00010352_054.png 315 | 00010352_074.png 316 | 00010360_004.png 317 | 00010384_005.png 318 | 00010405_000.png 319 | 00010405_001.png 320 | 00010415_000.png 321 | 00010435_002.png 322 | 00010544_016.png 323 | 00010544_027.png 324 | 00010544_030.png 325 | 00010693_027.png 326 | 00010698_001.png 327 | 00010698_013.png 328 | 00010761_000.png 329 | 00010773_014.png 330 | 00010773_025.png 331 | 00010790_039.png 332 | 00010790_043.png 333 | 00010790_045.png 334 | 00010792_004.png 335 | 00010805_002.png 336 | 00010805_003.png 337 | 00010805_004.png 338 | 00010805_005.png 339 | 00010805_006.png 340 | 00010805_008.png 341 | 00010805_009.png 342 | 00010805_010.png 343 | 00010805_011.png 344 | 00010805_013.png 345 | 00010805_015.png 346 | 00010805_016.png 347 | 00010805_017.png 348 | 00010805_018.png 349 | 00010805_019.png 350 | 00010805_020.png 351 | 00010805_023.png 352 | 00010805_025.png 353 | 00010805_037.png 354 | 00010805_038.png 355 | 00010805_040.png 356 | 00010805_043.png 357 | 00010805_045.png 358 | 00010805_046.png 359 | 00010805_047.png 360 | 00010805_048.png 361 | 00010805_050.png 362 | 00010828_017.png 363 | 00010843_000.png 364 | 00010887_027.png 365 | 00010960_001.png 366 | 00010960_002.png 367 | 00010995_006.png 368 | 00010995_008.png 369 | 00011007_000.png 370 | 00011021_012.png 371 | 00011064_000.png 372 | 00011164_007.png 373 | 00011237_095.png 374 | 00011237_108.png 375 | 00011379_002.png 376 | 00011379_003.png 377 | 00011379_004.png 378 | 00011379_005.png 379 | 00011379_006.png 380 | 00011379_013.png 381 | 00011379_018.png 382 | 00011379_019.png 383 | 00011379_022.png 384 | 00011379_039.png 385 | 00011379_041.png 386 | 00011379_043.png 387 | 00011379_045.png 388 | 00011379_046.png 389 | 00011379_047.png 390 | 00011386_000.png 391 | 00011391_016.png 392 | 00011391_031.png 393 | 00011391_032.png 394 | 00011391_039.png 395 | 00011391_041.png 396 | 00011391_043.png 397 | 00011391_047.png 398 | 00011391_055.png 399 | 00011436_009.png 400 | 00011461_002.png 401 | 00011553_002.png 402 | 00011553_003.png 403 | 00011553_004.png 404 | 00011553_005.png 405 | 00011553_006.png 406 | 00011553_007.png 407 | 00011553_009.png 408 | 00011553_010.png 409 | 00011553_011.png 410 | 00011553_012.png 411 | 00011553_013.png 412 | 00011553_014.png 413 | 00011553_015.png 414 | 00011553_016.png 415 | 00011553_017.png 416 | 00011553_018.png 417 | 00011553_019.png 418 | 00011553_020.png 419 | 00011553_022.png 420 | 00011553_023.png 421 | 00011553_024.png 422 | 00011553_025.png 423 | 00011553_026.png 424 | 00011553_027.png 425 | 00011553_028.png 426 | 00011553_029.png 427 | 00011553_030.png 428 | 00011553_031.png 429 | 00011553_032.png 430 | 00011553_033.png 431 | 00011553_034.png 432 | 00011553_035.png 433 | 00011553_036.png 434 | 00011553_037.png 435 | 00011553_038.png 436 | 00011553_040.png 437 | 00011553_041.png 438 | 00011553_046.png 439 | 00011553_047.png 440 | 00011673_000.png 441 | 00011677_001.png 442 | 
00011677_002.png 443 | 00011702_024.png 444 | 00011702_062.png 445 | 00011731_003.png 446 | 00011769_000.png 447 | 00011925_047.png 448 | 00011925_049.png 449 | 00011925_051.png 450 | 00011925_053.png 451 | 00011925_055.png 452 | 00011925_068.png 453 | 00011925_071.png 454 | 00011925_078.png 455 | 00011947_000.png 456 | 00011985_008.png 457 | 00012141_013.png 458 | 00012159_002.png 459 | 00012162_001.png 460 | 00012276_007.png 461 | 00012276_009.png 462 | 00012276_010.png 463 | 00012276_013.png 464 | 00012276_017.png 465 | 00012276_018.png 466 | 00012368_002.png 467 | 00012470_011.png 468 | 00012470_012.png 469 | 00012515_002.png 470 | 00012591_000.png 471 | 00012605_000.png 472 | 00012605_001.png 473 | 00012628_017.png 474 | 00012628_060.png 475 | 00012648_001.png 476 | 00012654_001.png 477 | 00012662_000.png 478 | 00012742_000.png 479 | 00012742_001.png 480 | 00012742_002.png 481 | 00012798_000.png 482 | 00012834_005.png 483 | 00012834_007.png 484 | 00012834_010.png 485 | 00012834_085.png 486 | 00012834_120.png 487 | 00012834_137.png 488 | 00012863_027.png 489 | 00012863_039.png 490 | 00013049_006.png 491 | 00013049_007.png 492 | 00013123_004.png 493 | 00013152_004.png 494 | 00013158_004.png 495 | 00013249_004.png 496 | 00013249_006.png 497 | 00013249_007.png 498 | 00013249_008.png 499 | 00013249_013.png 500 | 00013249_014.png 501 | 00013249_017.png 502 | 00013249_018.png 503 | 00013249_028.png 504 | 00013249_033.png 505 | 00013249_036.png 506 | 00013249_038.png 507 | 00013249_041.png 508 | 00013249_046.png 509 | 00013401_000.png 510 | 00013440_000.png 511 | 00013499_004.png 512 | 00013568_000.png 513 | 00013601_013.png 514 | 00013608_000.png 515 | 00013608_002.png 516 | 00013608_004.png 517 | 00013608_016.png 518 | 00013615_015.png 519 | 00013615_025.png 520 | 00013615_027.png 521 | 00013615_049.png 522 | 00013615_057.png 523 | 00013615_060.png 524 | 00013625_033.png 525 | 00013641_014.png 526 | 00013641_041.png 527 | 00013670_146.png 528 | 00013670_162.png 529 | 00013670_163.png 530 | 00013670_166.png 531 | 00013670_167.png 532 | 00013685_047.png 533 | 00013774_027.png 534 | 00013774_041.png 535 | 00013774_042.png 536 | 00013774_048.png 537 | 00013894_010.png 538 | 00013894_024.png 539 | 00013894_025.png 540 | 00013894_027.png 541 | 00013896_004.png 542 | 00013922_020.png 543 | 00013922_021.png 544 | 00013966_007.png 545 | 00013993_016.png 546 | 00013993_049.png 547 | 00013993_099.png 548 | 00014004_018.png 549 | 00014004_023.png 550 | 00014014_002.png 551 | 00014014_005.png 552 | 00014080_001.png 553 | 00014112_019.png 554 | 00014128_023.png 555 | 00014192_000.png 556 | 00014203_016.png 557 | 00014203_026.png 558 | 00014203_028.png 559 | 00014203_029.png 560 | 00014203_042.png 561 | 00014203_044.png 562 | 00014223_012.png 563 | 00014245_001.png 564 | 00014245_003.png 565 | 00014314_001.png 566 | 00014320_040.png 567 | 00014320_043.png 568 | 00014323_001.png 569 | 00014323_002.png 570 | 00014323_003.png 571 | 00014332_004.png 572 | 00014351_000.png 573 | 00014352_001.png 574 | 00014465_016.png 575 | 00014474_002.png 576 | 00014486_004.png 577 | 00014509_000.png 578 | 00014958_009.png 579 | 00014982_000.png 580 | 00015007_002.png 581 | 00015007_003.png 582 | 00015007_005.png 583 | 00015007_006.png 584 | 00015007_007.png 585 | 00015007_008.png 586 | 00015007_011.png 587 | 00015024_003.png 588 | 00015031_006.png 589 | 00015031_022.png 590 | 00015041_003.png 591 | 00015112_004.png 592 | 00015126_000.png 593 | 00015151_001.png 594 | 00015193_014.png 595 | 00015213_000.png 596 | 
00015290_000.png 597 | 00015391_001.png 598 | 00015462_001.png 599 | 00015462_002.png 600 | 00015482_000.png 601 | 00015530_071.png 602 | 00015530_142.png 603 | 00015564_011.png 604 | 00015605_038.png 605 | 00015605_051.png 606 | 00015605_053.png 607 | 00015605_055.png 608 | 00015606_013.png 609 | 00015606_050.png 610 | 00015696_001.png 611 | 00015758_000.png 612 | 00015826_019.png 613 | 00015923_000.png 614 | 00015934_000.png 615 | 00015986_000.png 616 | 00015996_001.png 617 | 00016009_046.png 618 | 00016034_003.png 619 | 00016051_003.png 620 | 00016051_004.png 621 | 00016133_000.png 622 | 00016175_003.png 623 | 00016175_006.png 624 | 00016175_008.png 625 | 00016184_027.png 626 | 00016238_006.png 627 | 00016292_000.png 628 | 00016292_001.png 629 | 00016292_002.png 630 | 00016292_003.png 631 | 00016292_004.png 632 | 00016378_001.png 633 | 00016410_006.png 634 | 00016410_008.png 635 | 00016410_055.png 636 | 00016484_001.png 637 | 00016484_005.png 638 | 00016484_009.png 639 | 00016484_011.png 640 | 00016484_026.png 641 | 00016522_019.png 642 | 00016529_000.png 643 | 00016638_003.png 644 | 00016638_004.png 645 | 00016653_000.png 646 | 00016732_035.png 647 | 00016784_002.png 648 | 00016860_001.png 649 | 00016860_005.png 650 | 00016867_003.png 651 | 00016918_005.png 652 | 00017036_023.png 653 | 00017110_012.png 654 | 00017138_032.png 655 | 00017207_002.png 656 | 00017207_003.png 657 | 00017207_008.png 658 | 00017258_022.png 659 | 00017258_023.png 660 | 00017362_009.png 661 | 00017392_000.png 662 | 00017400_000.png 663 | 00017403_007.png 664 | 00017403_010.png 665 | 00017424_034.png 666 | 00017424_035.png 667 | 00017424_036.png 668 | 00017424_038.png 669 | 00017424_041.png 670 | 00017425_002.png 671 | 00017425_006.png 672 | 00017477_000.png 673 | 00017504_024.png 674 | 00017504_068.png 675 | 00017538_001.png 676 | 00017538_002.png 677 | 00017541_025.png 678 | 00017553_000.png 679 | 00017561_001.png 680 | 00017605_014.png 681 | 00017606_020.png 682 | 00017618_013.png 683 | 00017625_000.png 684 | 00017625_004.png 685 | 00017641_004.png 686 | 00017645_013.png 687 | 00017648_000.png 688 | 00017695_000.png 689 | 00017753_026.png 690 | 00017817_001.png 691 | 00017817_002.png 692 | 00017927_001.png 693 | 00017941_005.png 694 | 00017972_006.png 695 | 00017972_014.png 696 | 00017979_000.png 697 | 00017999_000.png 698 | 00018011_015.png 699 | 00018044_020.png 700 | 00018044_036.png 701 | 00018044_040.png 702 | 00018044_043.png 703 | 00018069_000.png 704 | 00018069_001.png 705 | 00018091_012.png 706 | 00018103_002.png 707 | 00018103_007.png 708 | 00018103_009.png 709 | 00018104_004.png 710 | 00018116_000.png 711 | 00018121_000.png 712 | 00018125_009.png 713 | 00018126_024.png 714 | 00018175_002.png 715 | 00018191_000.png 716 | 00018191_001.png 717 | 00018213_001.png 718 | 00018240_000.png 719 | 00018251_001.png 720 | 00018251_002.png 721 | 00018251_003.png 722 | 00018251_004.png 723 | 00018251_005.png 724 | 00018251_006.png 725 | 00018251_007.png 726 | 00018251_008.png 727 | 00018251_009.png 728 | 00018251_010.png 729 | 00018251_011.png 730 | 00018251_012.png 731 | 00018251_013.png 732 | 00018251_014.png 733 | 00018253_089.png 734 | 00018336_000.png 735 | 00018437_001.png 736 | 00018437_002.png 737 | 00018445_002.png 738 | 00018458_000.png 739 | 00018486_000.png 740 | 00018571_000.png 741 | 00018573_000.png 742 | 00018598_004.png 743 | 00018610_002.png 744 | 00018610_004.png 745 | 00018614_001.png 746 | 00018615_001.png 747 | 00018778_001.png 748 | 00018778_002.png 749 | 00018778_005.png 750 | 
00018921_026.png 751 | 00018921_027.png 752 | 00018927_000.png 753 | 00018949_001.png 754 | 00019020_000.png 755 | 00019045_000.png 756 | 00019107_001.png 757 | 00019124_011.png 758 | 00019124_012.png 759 | 00019150_007.png 760 | 00019301_000.png 761 | 00019390_002.png 762 | 00019390_004.png 763 | 00019534_000.png 764 | 00019576_024.png 765 | 00019576_063.png 766 | 00019576_064.png 767 | 00019576_065.png 768 | 00019587_000.png 769 | 00019592_010.png 770 | 00019660_001.png 771 | 00019707_010.png 772 | 00019888_001.png 773 | 00019928_000.png 774 | 00019967_001.png 775 | 00019967_002.png 776 | 00019967_003.png 777 | 00019967_004.png 778 | 00019967_007.png 779 | 00019967_008.png 780 | 00019967_009.png 781 | 00019967_011.png 782 | 00019967_012.png 783 | 00019967_013.png 784 | 00019967_014.png 785 | 00019967_017.png 786 | 00019967_019.png 787 | 00019967_020.png 788 | 00019967_032.png 789 | 00020006_001.png 790 | 00020108_001.png 791 | 00020110_000.png 792 | 00020146_002.png 793 | 00020213_011.png 794 | 00020213_018.png 795 | 00020213_060.png 796 | 00020213_061.png 797 | 00020213_113.png 798 | 00020219_000.png 799 | 00020326_013.png 800 | 00020326_058.png 801 | 00020348_000.png 802 | 00020364_002.png 803 | 00020364_003.png 804 | 00020398_010.png 805 | 00020438_007.png 806 | 00020622_002.png 807 | 00020631_009.png 808 | 00020928_004.png 809 | 00020928_014.png 810 | 00020928_015.png 811 | 00020945_022.png 812 | 00021023_014.png 813 | 00021044_000.png 814 | 00021108_000.png 815 | 00021201_042.png 816 | 00021420_013.png 817 | 00021420_028.png 818 | 00021481_012.png 819 | 00021506_001.png 820 | 00021508_002.png 821 | 00021510_000.png 822 | 00021572_010.png 823 | 00021695_003.png 824 | 00021700_006.png 825 | 00021729_000.png 826 | 00021770_012.png 827 | 00021770_014.png 828 | 00021770_015.png 829 | 00021770_016.png 830 | 00021811_003.png 831 | 00021835_029.png 832 | 00021901_005.png 833 | 00021917_000.png 834 | 00021942_006.png 835 | 00021990_002.png 836 | 00022010_001.png 837 | 00022051_000.png 838 | 00022174_000.png 839 | 00022174_001.png 840 | 00022245_011.png 841 | 00022283_029.png 842 | 00022339_000.png 843 | 00022416_052.png 844 | 00022470_007.png 845 | 00022486_000.png 846 | 00022523_004.png 847 | 00022523_005.png 848 | 00022524_000.png 849 | 00022528_007.png 850 | 00022566_022.png 851 | 00022599_004.png 852 | 00022714_000.png 853 | 00022723_000.png 854 | 00022725_003.png 855 | 00022727_001.png 856 | 00022815_004.png 857 | 00022815_015.png 858 | 00022815_020.png 859 | 00022815_031.png 860 | 00022815_037.png 861 | 00022815_058.png 862 | 00022815_068.png 863 | 00022815_073.png 864 | 00022815_079.png 865 | 00022872_001.png 866 | 00022872_002.png 867 | 00022975_004.png 868 | 00023027_000.png 869 | 00023068_015.png 870 | 00023129_000.png 871 | 00023160_003.png 872 | 00023176_019.png 873 | 00023192_000.png 874 | 00023195_000.png 875 | 00023197_000.png 876 | 00023254_003.png 877 | 00023271_016.png 878 | 00023325_037.png 879 | 00023325_039.png 880 | 00025066_000.png 881 | 00025203_000.png 882 | 00025223_000.png 883 | 00025290_014.png 884 | 00025445_001.png 885 | 00025513_001.png 886 | 00025513_005.png 887 | 00025513_006.png 888 | 00025513_007.png 889 | 00025513_008.png 890 | 00025513_009.png 891 | 00025513_010.png 892 | 00025513_011.png 893 | 00025513_012.png 894 | 00025513_013.png 895 | 00025513_014.png 896 | 00025529_010.png 897 | 00025628_024.png 898 | 00025628_026.png 899 | 00025628_027.png 900 | 00025664_037.png 901 | 00025665_000.png 902 | 00025691_000.png 903 | 00025691_002.png 904 | 
00025697_001.png 905 | 00025704_000.png 906 | 00025796_000.png 907 | 00025809_001.png 908 | 00025839_010.png 909 | 00025839_012.png 910 | 00025932_001.png 911 | 00025958_000.png 912 | 00025958_002.png 913 | 00025958_003.png 914 | 00025958_006.png 915 | 00026068_000.png 916 | 00026068_001.png 917 | 00026092_003.png 918 | 00026098_028.png 919 | 00026099_041.png 920 | 00026114_000.png 921 | 00026115_000.png 922 | 00026159_000.png 923 | 00026167_008.png 924 | 00026194_001.png 925 | 00026194_004.png 926 | 00026194_007.png 927 | 00026194_008.png 928 | 00026194_009.png 929 | 00026194_010.png 930 | 00026194_011.png 931 | 00026194_012.png 932 | 00026194_014.png 933 | 00026194_015.png 934 | 00026194_018.png 935 | 00026232_030.png 936 | 00026262_000.png 937 | 00026346_015.png 938 | 00026349_005.png 939 | 00026382_009.png 940 | 00026431_000.png 941 | 00026474_003.png 942 | 00026506_000.png 943 | 00026538_025.png 944 | 00026621_000.png 945 | 00026634_000.png 946 | 00026666_000.png 947 | 00026701_000.png 948 | 00026758_000.png 949 | 00026801_005.png 950 | 00026818_020.png 951 | 00026867_002.png 952 | 00026867_004.png 953 | 00026911_004.png 954 | 00026925_006.png 955 | 00026925_011.png 956 | 00026925_015.png 957 | 00026963_032.png 958 | 00026971_026.png 959 | 00026993_003.png 960 | 00026993_004.png 961 | 00027072_000.png 962 | 00027196_009.png 963 | 00027196_010.png 964 | 00027213_001.png 965 | 00027213_008.png 966 | 00027213_009.png 967 | 00027213_010.png 968 | 00027213_076.png 969 | 00027213_079.png 970 | 00027299_006.png 971 | 00027299_007.png 972 | 00027415_009.png 973 | 00027415_011.png 974 | 00027415_028.png 975 | 00027415_029.png 976 | 00027415_037.png 977 | 00027415_046.png 978 | 00027415_047.png 979 | 00027415_049.png 980 | 00027415_059.png 981 | 00027415_068.png 982 | 00027415_069.png 983 | 00027415_072.png 984 | 00027415_073.png 985 | 00027415_075.png 986 | 00027415_077.png 987 | 00027441_012.png 988 | 00027441_017.png 989 | 00027441_019.png 990 | 00027441_024.png 991 | 00027442_008.png 992 | 00027464_024.png 993 | 00027465_008.png 994 | 00027524_000.png 995 | 00027618_012.png 996 | 00027623_006.png 997 | 00027639_000.png 998 | 00027639_001.png 999 | 00027639_002.png 1000 | 00027639_003.png 1001 | 00027677_000.png 1002 | 00027710_000.png 1003 | 00027725_021.png 1004 | 00027725_035.png 1005 | 00027726_016.png 1006 | 00027726_019.png 1007 | 00027726_020.png 1008 | 00027726_021.png 1009 | 00027726_050.png 1010 | 00027726_051.png 1011 | 00027765_000.png 1012 | 00027765_002.png 1013 | 00027952_004.png 1014 | 00027981_000.png 1015 | 00027981_001.png 1016 | 00027981_002.png 1017 | 00028076_000.png 1018 | 00028092_000.png 1019 | 00028201_000.png 1020 | 00028211_012.png 1021 | 00028301_002.png 1022 | 00028341_001.png 1023 | 00028341_002.png 1024 | 00028341_003.png 1025 | 00028341_004.png 1026 | 00028341_005.png 1027 | 00028341_006.png 1028 | 00028341_007.png 1029 | 00028341_008.png 1030 | 00028341_009.png 1031 | 00028341_010.png 1032 | 00028341_011.png 1033 | 00028341_012.png 1034 | 00028389_000.png 1035 | 00028450_000.png 1036 | 00028454_011.png 1037 | 00028454_013.png 1038 | 00028474_000.png 1039 | 00028657_000.png 1040 | 00028799_000.png 1041 | 00028829_002.png 1042 | 00028873_017.png 1043 | 00028873_019.png 1044 | 00028873_020.png 1045 | 00028882_004.png 1046 | 00028961_006.png 1047 | 00028961_008.png 1048 | 00028996_002.png 1049 | 00028996_003.png 1050 | 00028996_004.png 1051 | 00029174_002.png 1052 | 00029222_003.png 1053 | 00029235_001.png 1054 | 00029245_002.png 1055 | 00029276_004.png 1056 | 
00029404_000.png 1057 | 00029404_002.png 1058 | 00029404_003.png 1059 | 00029404_004.png 1060 | 00029404_005.png 1061 | 00029404_006.png 1062 | 00029404_007.png 1063 | 00029404_008.png 1064 | 00029404_010.png 1065 | 00029476_003.png 1066 | 00029596_012.png 1067 | 00029627_000.png 1068 | 00029813_029.png 1069 | 00029943_022.png 1070 | 00030079_020.png 1071 | 00030079_031.png 1072 | 00030206_011.png 1073 | 00030209_011.png 1074 | 00030213_000.png 1075 | 00030245_001.png 1076 | 00030320_004.png 1077 | 00030320_006.png 1078 | 00030323_038.png 1079 | 00030410_005.png 1080 | 00030412_002.png 1081 | 00030609_000.png 1082 | 00030609_001.png 1083 | 00030609_002.png 1084 | 00030609_003.png 1085 | 00030609_006.png 1086 | 00030609_008.png 1087 | 00030609_009.png 1088 | 00030609_010.png 1089 | 00030609_011.png 1090 | 00030609_017.png 1091 | 00030609_021.png 1092 | 00030609_023.png 1093 | 00030609_026.png 1094 | 00030786_004.png 1095 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/02_Model/060_Train_pyTorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train\n", 8 | "\n", 9 | "##### Copyright (C) Microsoft Corporation. \n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "## Data needs 2 things\n", 60 | "## TEMP (Get images)\n", 61 | "#crt_container = 'https://chestxray.blob.core.windows.net/chestxraynih'\n", 62 | "#crt_destination = '/mnt/images'\n", 63 | "#answer = !yes | azcopy \\\n", 64 | "# --source {crt_container} \\\n", 65 | "# --destination {crt_destination} \\\n", 66 | "# --recursive\n", 67 | "## TEMP (Get Labels csv)\n", 68 | "# Put to blob\n", 69 | "\n", 70 | "# Why not have a zip from blob that gets unzipped and has both images and csv?\n", 71 | "# Would make self-contained ..." 
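,
    "#\n",
    "# note (added): the cells below assume the NIH images and Data_Entry_2017.csv\n",
    "# are already present under the AMLWB shared directory; the concrete paths are\n",
    "# derived from prj_consts in the next cells."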
72 |    ]
73 |   },
74 |   {
75 |    "cell_type": "code",
76 |    "execution_count": 4,
77 |    "metadata": {},
78 |    "outputs": [],
79 |    "source": [
80 |     "import os\n",
81 |     "import sys\n",
82 |     "import numpy as np\n",
83 |     "import pandas as pd\n",
84 |     "import torch\n",
85 |     "import torchvision.models as models\n",
86 |     "import torch.nn as nn\n",
87 |     "import torch.nn.functional as F\n",
88 |     "import torch.optim as optim\n",
89 |     "import torch.nn.init as init\n",
90 |     "import time\n",
91 |     "from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
92 |     "from torch.autograd import Variable\n",
93 |     "import torchvision.transforms as transforms\n",
94 |     "from torch.utils.data import DataLoader, Dataset\n",
95 |     "from sklearn.metrics import roc_auc_score\n",
96 |     "from sklearn.model_selection import train_test_split\n",
97 |     "from PIL import Image\n",
98 |     "import multiprocessing"
99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 5,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "assert torch.cuda.is_available()"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 6,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "torch.backends.cudnn.benchmark = True  # enables cudnn's auto-tuner"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 7,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "name": "stdout",
126 |      "output_type": "stream",
127 |      "text": [
128 |       "OS: linux\n",
129 |       "Python: 3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, 17:53:06) \n",
130 |       "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n",
131 |       "PyTorch: 0.3.1\n",
132 |       "CPUs: 12\n"
133 |      ]
134 |     }
135 |    ],
136 |    "source": [
137 |     "print(\"OS: \", sys.platform)\n",
138 |     "print(\"Python: \", sys.version)\n",
139 |     "print(\"PyTorch: \", torch.__version__)\n",
140 |     "CPU_COUNT = multiprocessing.cpu_count()\n",
141 |     "print(\"CPUs: \", CPU_COUNT)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 8,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# Globals\n",
151 |     "# With a small batch it may be faster on a P100 to use just 1 GPU\n",
152 |     "MULTI_GPU = True\n",
153 |     "CLASSES = 14\n",
154 |     "WIDTH = 224\n",
155 |     "HEIGHT = 224\n",
156 |     "CHANNELS = 3\n",
157 |     "LR = 0.0001\n",
158 |     "EPOCHS = 1  # 100 for a full training run\n",
159 |     "# Batch size can be scaled to the max for inference, but for training the LR will be affected\n",
160 |     "# Prob better to increase this though on P100 since LR is not too low\n",
161 |     "# Easier to see when plotted\n",
162 |     "BATCHSIZE = 64*2\n",
163 |     "IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]\n",
164 |     "IMAGENET_RGB_SD = [0.229, 0.224, 0.225]"
165 |    ]
166 |   },
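  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note (added): the notebook keeps LR fixed while BATCHSIZE is hand-tuned. A common heuristic (an assumption here, not something this notebook actually does) is to scale the learning rate linearly with the batch size; a minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added illustration only: linear LR-scaling heuristic (hypothetical, not used below).\n",
    "# REFERENCE_BATCH is an assumed baseline batch size for which LR was tuned.\n",
    "REFERENCE_BATCH = 64\n",
    "scaled_lr = LR * (BATCHSIZE / REFERENCE_BATCH)\n",
    "print('LR scaled for BATCHSIZE={}: {}'.format(BATCHSIZE, scaled_lr))  # 0.0002 for 128\n"
   ]
  },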
"metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 205 | ] 206 | }, 207 | "execution_count": 10, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | }, 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "112120\r\n" 216 | ] 217 | }, 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'" 222 | ] 223 | }, 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | }, 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "BBox_List_2017.csv Data_Entry_2017.csv blacklist.csv\r\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# create the file path variables \n", 238 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 239 | "\n", 240 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 241 | "\n", 242 | "data_base_input_dir=os.path.join(amlWBSharedDir, \n", 243 | " os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 244 | "data_base_output_dir=os.path.join(amlWBSharedDir, \n", 245 | " os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 246 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 247 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 248 | "other_data_dir=os.path.join(data_base_input_dir, \n", 249 | " os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))\n", 250 | "label_file = os.path.join(other_data_dir,'Data_Entry_2017.csv')\n", 251 | "\n", 252 | "data_partitions_dir=os.path.join(data_base_output_dir, \n", 253 | " os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 254 | "nih_chest_xray_data_dir\n", 255 | "!find $nih_chest_xray_data_dir -type f | wc -l\n", 256 | "\n", 257 | "other_data_dir\n", 258 | "!ls $other_data_dir" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 11, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# Paths\n", 268 | "# BASE_DIR = \"/mnt\"\n", 269 | "# DATA_FOLDER = os.path.join(BASE_DIR, \"ChestXray-NIHCC\")\n", 270 | "# IMAGE_FOLDER = os.path.join(BASE_DIR, \"images\")\n", 271 | "# LABEL_FILE = os.path.join(DATA_FOLDER, \"Data_Entry_2017.csv\")\n", 272 | "# print(IMAGE_FOLDER, LABEL_FILE)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 12, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "#####################################################################################################\n", 282 | "## Data Loading\n", 283 | "#####################################################################################################" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 13, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# # todo\n", 293 | "# # This should prob be a generic function\n", 294 | "# # Split data into train/val/test\n", 295 | "\n", 296 | "# real_total_patient_number = 30805\n", 297 | "# patient_id_original = [i for i in range(real_total_patient_number + 1)]\n", 298 | "\n", 299 | "# bbox_df = pd.read_csv(os.path.join(other_data_dir, 'BBox_List_2017.csv'))\n", 300 | "\n", 301 | "# black_list_set = set()\n", 302 | "# with open(os.path.join(other_data_dir, 'blacklist.csv'), 'r') as f:\n", 303 | "# for line in f:\n", 304 | "# # delete the last char which is \\n\n", 305 | "# black_list_set.add(line[:-1])\n", 306 | "# if int(line[:-9]) >= 
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": 16,
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "class XrayData(Dataset):\n",
390 |     "    def __init__(self, img_dir, lbl_file, patient_ids, transform=None):\n",
391 |     "        \n",
392 |     "        # Read labels-csv\n",
393 |     "        df = pd.read_csv(lbl_file)\n",
394 |     "        # Filter by patient-ids\n",
395 |     "        df = df[df['Patient ID'].isin(patient_ids)]\n",
396 |     "        # Split labels\n",
397 |     "        df_label = df['Finding Labels'].str.split(\n",
398 |     "            '|', 
expand=False).str.join(sep='*').str.get_dummies(sep='*')\n", 399 | " df_label.drop(['No Finding'], axis=1, inplace=True)\n", 400 | " \n", 401 | " # List of images (full-path)\n", 402 | " self.img_locs = df['Image Index'].map(lambda im: os.path.join(img_dir, im)).values\n", 403 | " # One-hot encoded labels (float32 for BCE loss)\n", 404 | " self.labels = df_label.values\n", 405 | " # Processing\n", 406 | " self.transform = transform\n", 407 | " \n", 408 | " print(\"Loaded {} labels and {} images\".format(len(self.labels), \n", 409 | " len(self.img_locs)))\n", 410 | " \n", 411 | " def __getitem__(self, idx):\n", 412 | " \n", 413 | " im_file = self.img_locs[idx]\n", 414 | " im_rgb = Image.open(im_file).convert('RGB')\n", 415 | " label = self.labels[idx]\n", 416 | " if self.transform is not None:\n", 417 | " im_rgb = self.transform(im_rgb)\n", 418 | " return im_rgb, torch.FloatTensor(label)\n", 419 | " \n", 420 | " def __len__(self):\n", 421 | " return len(self.img_locs)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 17, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "def no_augmentation_dataset(img_dir, lbl_file, patient_ids, normalize):\n", 431 | " dataset = XrayData(img_dir, lbl_file, patient_ids,\n", 432 | " transform=transforms.Compose([\n", 433 | " transforms.Resize(WIDTH),\n", 434 | " transforms.ToTensor(), \n", 435 | " normalize]))\n", 436 | " return dataset" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 18, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "Loaded 69217 labels and 69217 images\n" 449 | ] 450 | } 451 | ], 452 | "source": [ 453 | "# Dataset for training\n", 454 | "# Normalise by imagenet mean/sd\n", 455 | "normalize = transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)\n", 456 | "# todo\n", 457 | "# Go wild here with the transforms\n", 458 | "# https://github.com/pytorch/vision/blob/master/torchvision/transforms/transforms.py\n", 459 | "#__all__ = [\"Compose\", \"ToTensor\", \"ToPILImage\", \"Normalize\", \"Resize\", \"Scale\", \"CenterCrop\", \"Pad\",\n", 460 | "# \"Lambda\", \"RandomCrop\", \"RandomHorizontalFlip\", \"RandomVerticalFlip\", \"RandomResizedCrop\",\n", 461 | "# \"RandomSizedCrop\", \"FiveCrop\", \"TenCrop\", \"LinearTransformation\", \"ColorJitter\", \"RandomRotation\",\n", 462 | "# \"Grayscale\", \"RandomGrayscale\"]\n", 463 | "train_dataset = XrayData(img_dir=nih_chest_xray_data_dir,\n", 464 | " lbl_file=label_file,\n", 465 | " patient_ids=train_set,\n", 466 | " transform=transforms.Compose([\n", 467 | " transforms.Resize(264),\n", 468 | " transforms.RandomHorizontalFlip(),\n", 469 | " transforms.RandomResizedCrop(size=WIDTH),\n", 470 | " transforms.ColorJitter(0.15, 0.15),\n", 471 | " transforms.RandomRotation(15),\n", 472 | " transforms.ToTensor(), # need to convert image to tensor!\n", 473 | " normalize]))" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 19, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Loaded 9600 labels and 9600 images\n", 486 | "Loaded 33303 labels and 33303 images\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "valid_dataset = no_augmentation_dataset(nih_chest_xray_data_dir, label_file, valid_set, normalize)\n", 492 | "test_dataset = no_augmentation_dataset(nih_chest_xray_data_dir, label_file, test_set, normalize)" 493 | ] 494 | }, 495 | { 496 | 
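"cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note (added): a quick, optional sanity check on the datasets defined above; with the Resize/Crop transforms targeting WIDTH=224, each sample should be a 3x224x224 float tensor paired with a 14-dim label vector."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check (not in the original notebook); reads one image from disk.\n",
    "sample_img, sample_lbl = train_dataset[0]\n",
    "print(sample_img.shape)  # expected: torch.Size([3, 224, 224])\n",
    "print(sample_lbl.shape)  # expected: torch.Size([14])\n"
   ]
  },
  {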
"cell_type": "code", 497 | "execution_count": 20, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "#####################################################################################################\n", 502 | "## Helper Functions\n", 503 | "#####################################################################################################" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 21, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "def get_symbol(out_features=CLASSES, multi_gpu=MULTI_GPU):\n", 513 | " model = models.densenet.densenet121(pretrained=True)\n", 514 | " # Replace classifier (FC-1000) with (FC-14)\n", 515 | " model.classifier = nn.Sequential(\n", 516 | " nn.Linear(model.classifier.in_features, out_features), \n", 517 | " nn.Sigmoid())\n", 518 | " if multi_gpu:\n", 519 | " model = nn.DataParallel(model)\n", 520 | " # CUDA\n", 521 | " model.cuda() \n", 522 | " return model" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 22, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "def init_symbol(sym, lr=LR):\n", 532 | " # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)\n", 533 | " opt = optim.Adam(sym.parameters(), lr=lr, betas=(0.9, 0.999))\n", 534 | " criterion = nn.BCELoss()\n", 535 | " scheduler = ReduceLROnPlateau(opt, factor = 0.1, patience = 5, mode = 'min')\n", 536 | " return opt, criterion, scheduler " 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 23, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "def compute_roc_auc(data_gt, data_pd, mean=True, classes=CLASSES):\n", 546 | " roc_auc = []\n", 547 | " data_gt = data_gt.cpu().numpy()\n", 548 | " data_pd = data_pd.cpu().numpy()\n", 549 | " for i in range(classes):\n", 550 | " roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))\n", 551 | " if mean:\n", 552 | " roc_auc = np.mean(roc_auc)\n", 553 | " return roc_auc" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 24, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "def train_epoch(model, dataloader, optimizer, criterion, epoch, batch=BATCHSIZE):\n", 563 | " model.train()\n", 564 | " print(\"Training epoch {}\".format(epoch+1))\n", 565 | " loss_val = 0\n", 566 | " loss_cnt = 0\n", 567 | " for data, target in dataloader:\n", 568 | " # Get samples\n", 569 | " data = Variable(torch.FloatTensor(data).cuda())\n", 570 | " target = Variable(torch.FloatTensor(target).cuda())\n", 571 | " # Init\n", 572 | " optimizer.zero_grad()\n", 573 | " # Forwards\n", 574 | " output = model(data)\n", 575 | " # Loss\n", 576 | " loss = criterion(output, target)\n", 577 | " # Back-prop\n", 578 | " loss.backward()\n", 579 | " optimizer.step() \n", 580 | " # Log the loss\n", 581 | " loss_val += loss.data[0]\n", 582 | " loss_cnt += 1\n", 583 | " print(\"Training loss: {0:.4f}\".format(loss_val/loss_cnt))" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 25, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "def valid_epoch(model, dataloader, criterion, epoch, phase='valid', batch=BATCHSIZE):\n", 593 | " model.eval()\n", 594 | " if phase == 'testing':\n", 595 | " print(\"Testing epoch {}\".format(epoch+1))\n", 596 | " else:\n", 597 | " print(\"Validating epoch {}\".format(epoch+1))\n", 598 | " out_pred = torch.FloatTensor().cuda()\n", 599 | " out_gt = torch.FloatTensor().cuda()\n", 600 | " 
    loss_val = 0\n", 601 |     "    loss_cnt = 0\n", 602 |     "    for data, target in dataloader:\n", 603 |     "        # Get samples (volatile=True is the pre-0.4 PyTorch idiom for inference without autograd history)\n", 604 |     "        data = Variable(torch.FloatTensor(data).cuda(), volatile=True)\n", 605 |     "        target = Variable(torch.FloatTensor(target).cuda(), volatile=True)\n", 606 |     "        # Forwards\n", 607 |     "        output = model(data)\n", 608 |     "        # Loss\n", 609 |     "        loss = criterion(output, target)\n", 610 |     "        # Log the loss\n", 611 |     "        loss_val += loss.data[0]\n", 612 |     "        loss_cnt += 1\n", 613 |     "        # Log for AUC\n", 614 |     "        out_pred = torch.cat((out_pred, output.data), 0)\n", 615 |     "        out_gt = torch.cat((out_gt, target.data), 0)\n", 616 |     "    loss_mean = loss_val/loss_cnt\n", 617 |     "    if phase == 'testing':\n", 618 |     "        print(\"Test-Dataset loss: {0:.4f}\".format(loss_mean))\n", 619 |     "        print(\"Test-Dataset AUC: {0:.4f}\".format(compute_roc_auc(out_gt, out_pred)))\n", 620 |     "\n", 621 |     "    else:\n", 622 |     "        print(\"Validation loss: {0:.4f}\".format(loss_mean))\n", 623 |     "        print(\"Validation AUC: {0:.4f}\".format(compute_roc_auc(out_gt, out_pred)))\n", 624 |     "    return loss_mean" 625 |    ] 626 |   }, 627 |   { 628 |    "cell_type": "code", 629 |    "execution_count": 26, 630 |    "metadata": {}, 631 |    "outputs": [], 632 |    "source": [ 633 |     "def print_learning_rate(opt):\n", 634 |     "    for param_group in opt.param_groups:\n", 635 |     "        print(\"Learning rate: \", param_group['lr'])" 636 |    ] 637 |   }, 638 |   { 639 |    "cell_type": "code", 640 |    "execution_count": 27, 641 |    "metadata": {}, 642 |    "outputs": [], 643 |    "source": [ 644 |     "# DataLoaders\n", 645 |     "# num_workers=4*CPU_COUNT keeps the GPU fed with preprocessed batches\n", 646 |     "# pin_memory=True could further speed up host-to-device copies\n", 647 |     "train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,\n", 648 |     "                          shuffle=True, num_workers=4*CPU_COUNT, pin_memory=False)\n", 649 |     "\n", 650 |     "valid_loader = DataLoader(dataset=valid_dataset, batch_size=8*BATCHSIZE,\n", 651 |     "                          shuffle=False, num_workers=0, pin_memory=False)\n", 652 |     "\n", 653 |     "test_loader = DataLoader(dataset=test_dataset, batch_size=8*BATCHSIZE,\n", 654 |     "                          shuffle=False, num_workers=4*CPU_COUNT, pin_memory=False)" 655 |    ] 656 |   }, 657 |   { 658 |    "cell_type": "code", 659 |    "execution_count": 28, 660 |    "metadata": {}, 661 |    "outputs": [], 662 |    "source": [ 663 |     "#####################################################################################################\n", 664 |     "## Train Azure Chest Xray\n", 665 |     "#####################################################################################################" 666 |    ] 667 |   }, 668 |   { 669 |    "cell_type": "code", 670 |    "execution_count": 29, 671 |    "metadata": {}, 672 |    "outputs": [ 673 |     { 674 |      "name": "stderr", 675 |      "output_type": "stream", 676 |      "text": [ 677 |       "Downloading: \"https://download.pytorch.org/models/densenet121-a639ec97.pth\" to /home/mmlspark/.torch/models/densenet121-a639ec97.pth\n", 678 |       "100%|██████████| 32342954/32342954 [00:00<00:00, 50666489.38it/s]\n" 679 |      ] 680 |     }, 681 |     { 682 |      "name": "stdout", 683 |      "output_type": "stream", 684 |      "text": [ 685 |       "CPU times: user 2.42 s, sys: 910 ms, total: 3.33 s\n", 686 |       "Wall time: 18.3 s\n" 687 |      ] 688 |     } 689 |    ], 690 |    "source": [ 691 |     "%%time\n", 692 |     "# Load symbol\n", 693 |     "azure_chest_xray_sym = get_symbol()" 694 |    ] 695 |   }, 696 |   { 697 |    "cell_type": "code", 698 |    "execution_count": 30, 699 |    "metadata": {}, 700 |    "outputs": [ 701 |     { 702 |      "name": "stdout", 703 |      "output_type": "stream", 704 |      "text": [ 705 |       "CPU times: user 2.04 ms, sys: 136 µs, total: 2.18 ms\n", 706 |       "Wall time: 2.18 ms\n" 707 |      ] 708 |     } 709 |    ], 710 |    "source": [ 711 |     "%%time\n", 712 |     "# Load 
optimiser, loss\n", 713 | "optimizer, criterion, scheduler = init_symbol(azure_chest_xray_sym)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "Wed Feb 14 08:09:54 2018 \n", 726 | "+-----------------------------------------------------------------------------+\n", 727 | "| NVIDIA-SMI 384.111 Driver Version: 384.111 |\n", 728 | "|-------------------------------+----------------------+----------------------+\n", 729 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 730 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 731 | "|===============================+======================+======================|\n", 732 | "| 0 Tesla K80 Off | 00002CD7:00:00.0 Off | Off |\n", 733 | "| N/A 38C P0 72W / 149W | 241MiB / 12205MiB | 0% Default |\n", 734 | "+-------------------------------+----------------------+----------------------+\n", 735 | "| 1 Tesla K80 Off | 000045C0:00:00.0 Off | Off |\n", 736 | "| N/A 42C P8 28W / 149W | 11MiB / 12205MiB | 0% Default |\n", 737 | "+-------------------------------+----------------------+----------------------+\n", 738 | " \n", 739 | "+-----------------------------------------------------------------------------+\n", 740 | "| Processes: GPU Memory |\n", 741 | "| GPU PID Type Process name Usage |\n", 742 | "|=============================================================================|\n", 743 | "+-----------------------------------------------------------------------------+\n", 744 | "CUDA Version 8.0.61\n", 745 | "CUDA Version 8.0.61\n" 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "!nvidia-smi\n", 751 | "!cat /usr/local/cuda-8.0/version.txt\n", 752 | "!cat /usr/local/cuda/version.txt\n" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "Validating epoch 0\n", 765 | "Validation loss: 0.6890\n", 766 | "Validation AUC: 0.4773\n" 767 | ] 768 | }, 769 | { 770 | "data": { 771 | "text/plain": [ 772 | "0.689042454957962" 773 | ] 774 | }, 775 | "execution_count": 32, 776 | "metadata": {}, 777 | "output_type": "execute_result" 778 | }, 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "Training epoch 1\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "# Original CheXNet ROC AUC = 0.841\n", 789 | "loss_min = float(\"inf\") \n", 790 | "stime = time.time()\n", 791 | "\n", 792 | "# No-training\n", 793 | "valid_epoch(azure_chest_xray_sym, valid_loader, criterion, -1)\n", 794 | "\n", 795 | "# Main train/val/test loop\n", 796 | "for j in range(EPOCHS):\n", 797 | " train_epoch(azure_chest_xray_sym, train_loader, optimizer, criterion, j)\n", 798 | " loss_val = valid_epoch(azure_chest_xray_sym, valid_loader, criterion, j)\n", 799 | " test_loss_val = valid_epoch(azure_chest_xray_sym, test_loader, criterion, j, 'testing')\n", 800 | " # LR Schedule\n", 801 | " scheduler.step(loss_val)\n", 802 | " print_learning_rate(optimizer)\n", 803 | " # todo: tensorboard hooks\n", 804 | " # Logging\n", 805 | " if loss_val < loss_min:\n", 806 | " print(\"Loss decreased. 
Saving ...\")\n", 807 | " loss_min = loss_val\n", 808 | " torch.save({'epoch': j + 1, \n", 809 | " 'state_dict': azure_chest_xray_sym.state_dict(), \n", 810 | " 'best_loss': loss_min, \n", 811 | " 'optimizer' : optimizer.state_dict()}, 'best_azure_chest_xray_model_v2.pth.tar')\n", 812 | " etime = time.time()\n", 813 | " print(\"Epoch time: {0:.0f} seconds\".format(etime-stime))\n", 814 | " print(\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "#####################################################################################################\n", 824 | "## Test azure_chest_xray\n", 825 | "#####################################################################################################" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "# Load model for testing\n", 835 | "azure_chest_xray_sym_test = get_symbol()\n", 836 | "chkpt = torch.load(\"best_azure_chest_xray_model_v2.pth.tar\")\n", 837 | "chexnet_sym_test.load_state_dict(chkpt['state_dict'])" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "valid_loss = valid_epoch(azure_chest_xray_sym_test, valid_loader, criterion, -1)\n", 847 | "test_loss = valid_epoch(azure_chest_xray_sym_test, test_loader, criterion, -1, 'testing')" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "#import torch.onnx\n", 857 | "#dummy_input = Variable(torch.randn(BATCHSIZE, CHANNELS, HEIGHT, WIDTH)).cuda()\n", 858 | "#torch.onnx.export(azure_chest_xray_sym_test, dummy_input, \"azure_chest_xray.proto\", verbose=True)" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "# jupyter nbconvert --to html .\\Code\\02_Model\\060_Train_pyTorch.ipynb" 868 | ] 869 | } 870 | ], 871 | "metadata": { 872 | "kernelspec": { 873 | "display_name": "Python 3", 874 | "language": "python", 875 | "name": "python3" 876 | }, 877 | "language_info": { 878 | "codemirror_mode": { 879 | "name": "ipython", 880 | "version": 3 881 | }, 882 | "file_extension": ".py", 883 | "mimetype": "text/x-python", 884 | "name": "python", 885 | "nbconvert_exporter": "python", 886 | "pygments_lexer": "ipython3", 887 | "version": "3.6.3" 888 | } 889 | }, 890 | "nbformat": 4, 891 | "nbformat_minor": 2 892 | } 893 | --------------------------------------------------------------------------------