├── AzureChestXRay_AMLWB
│   ├── Code
│   │   ├── src
│   │   │   ├── azure_chestxray_pytorch_utils.py
│   │   │   ├── finding_lungs
│   │   │   │   ├── non_PA_AP_view_samples.png
│   │   │   │   ├── rotated_images_samples.png
│   │   │   │   ├── blacklist_rotated_images.csv
│   │   │   │   ├── blacklist_non_PA_AP_view.csv
│   │   │   │   ├── finding_lungs_DL_approach.py
│   │   │   │   └── blacklist_other_images_with_lower_quality.csv
│   │   │   ├── azure_chestxray_keras_utils.py
│   │   │   ├── azure_chestxray_utils.py
│   │   │   ├── azure_chestxray_cam.py
│   │   │   └── score_image_and_cam.py
│   │   ├── docker
│   │   │   └── Dockerfile
│   │   ├── Deployment_Guide.md
│   │   ├── 01_DataPrep
│   │   │   └── 001_get_data.ipynb
│   │   └── 02_Model
│   │       ├── 000_preprocess.ipynb
│   │       ├── .ipynb_checkpoints
│   │       │   └── 000_preprocess-checkpoint.ipynb
│   │       ├── 010_train.ipynb
│   │       ├── 020_evaluate.ipynb
│   │       └── 060_Train_pyTorch.ipynb
│   └── aml_config
│       ├── gpucomputecontext.runconfig
│       ├── gpucomputecontext.compute
│       └── conda_dependencies_gpu.yml
├── LICENSE
├── README.md
└── .gitignore

/AzureChestXRay_AMLWB/Code/src/azure_chestxray_pytorch_utils.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/finding_lungs/non_PA_AP_view_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tikyau/AzureChestXRay/master/AzureChestXRay_AMLWB/Code/src/finding_lungs/non_PA_AP_view_samples.png
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/finding_lungs/rotated_images_samples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tikyau/AzureChestXRay/master/AzureChestXRay_AMLWB/Code/src/finding_lungs/rotated_images_samples.png
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/aml_config/gpucomputecontext.runconfig:
--------------------------------------------------------------------------------
1 | ArgumentVector:
2 |   - $file
3 | CondaDependenciesFile: aml_config/conda_dependencies_gpu.yml
4 | EnvironmentVariables: null
5 | Framework: Python
6 | PrepareEnvironment: true
7 | SparkDependenciesFile: aml_config/spark_dependencies.yml
8 | Target: gpucomputecontext
9 | TrackedRun: true
10 | UseSampling: true
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/finding_lungs/blacklist_rotated_images.csv:
--------------------------------------------------------------------------------
1 | 00001255_007.png
2 | 00001814_001.png
3 | 00002180_000.png
4 | 00002815_003.png
5 | 00003693_005.png
6 | 00005823_000.png
7 | 00007188_002.png
8 | 00008051_036.png
9 | 00008468_003.png
10 | 00009889_023.png
11 | 00009984_001.png
12 | 00011460_066.png
13 | 00013299_000.png
14 | 00013431_000.png
15 | 00017258_011.png
16 | 00017606_037.png
17 | 00019620_001.png
18 | 00026701_001.png
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/aml_config/gpucomputecontext.compute:
--------------------------------------------------------------------------------
1 | address: ghiordanxrgpuvm02.westus2.cloudapp.azure.com
2 | #baseDockerImage: microsoft/mmlspark:plus-0.9.9
3 | baseDockerImage: georgedockeraccount/utils_with_amlwb_base_gpu:azcopyenabled
4 | nvidiaDocker: true
5 | 
6 | #nativeSharedDirectory: ~/.azureml/share/
7 | nativeSharedDirectory: /datadrive01/amlwbShare/
8 | 
9 | 
password: AzureMlSecret=gpucomputecontext#loginvm0011#56bdbc19d02f4df08a50f94cfc8ec9ef 10 | sharedVolumes: true 11 | type: remotedocker 12 | username: loginvm0011 13 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # for CPU compute contexts 2 | #FROM microsoft/mmlspark:plus-0.9.9 3 | 4 | # for GPU compute contexts 5 | FROM microsoft/mmlspark:plus-gpu-0.9.9 6 | 7 | ENV PREVUSER=$USER 8 | USER root 9 | 10 | # install AzCopy on Linux 11 | # https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-linux?toc=%2fazure%2fstorage%2fblobs%2ftoc.json 12 | RUN apt-get update && apt-get install -y apt-transport-https wget rsync git 13 | RUN curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg && \ 14 | mv microsoft.gpg /etc/apt/trusted.gpg.d/microsoft.gpg &&\ 15 | sh -c 'echo "deb [arch=amd64] https://packages.microsoft.com/repos/microsoft-ubuntu-xenial-prod xenial main" > /etc/apt/sources.list.d/dotnetdev.list' && \ 16 | apt-get update && \ 17 | apt-get install -y --no-install-recommends && \ 18 | apt-get install -y dotnet-sdk-2.0.2 && \ 19 | wget -O azcopy.tar.gz https://aka.ms/downloadazcopyprlinux && \ 20 | tar -xf azcopy.tar.gz && \ 21 | ./install.sh 22 | 23 | 24 | USER $PREVUSER 25 | 26 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/azure_chestxray_keras_utils.py: -------------------------------------------------------------------------------- 1 | ### Copyright (C) Microsoft Corporation. 2 | 3 | from keras.layers import Dense 4 | from keras.models import Model 5 | from keras_contrib.applications.densenet import DenseNetImageNet121 6 | import keras_contrib 7 | 8 | def build_model(crt_densenet_function): 9 | """ 10 | 11 | Returns: a model with specified weights 12 | 13 | """ 14 | # define the model, use pre-trained weights for image_net 15 | base_model = crt_densenet_function(input_shape=(224, 224, 3), 16 | weights='imagenet', 17 | include_top=False, 18 | pooling='avg') 19 | 20 | x = base_model.output 21 | predictions = Dense(14, activation='sigmoid')(x) 22 | model = Model(inputs=base_model.input, outputs=predictions) 23 | return model 24 | 25 | if __name__=="__main__": 26 | model = build_model(DenseNetImageNet121) 27 | print(model.summary()) 28 | model = build_model(keras_contrib.applications.densenet.DenseNetImageNet201) 29 | print(model.summary()) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/finding_lungs/blacklist_non_PA_AP_view.csv: -------------------------------------------------------------------------------- 1 | 00000591_003.png 2 | 00001136_001.png 3 | 00001153_005.png 4 | 00001602_000.png 5 | 00001803_003.png 6 | 00002097_000.png 7 | 00002117_003.png 8 | 00002354_000.png 9 | 00002592_003.png 10 | 00002639_009.png 11 | 00003023_000.png 12 | 00003094_002.png 13 | 00004533_004.png 14 | 00004808_001.png 15 | 00004906_000.png 16 | 00005192_001.png 17 | 00005260_000.png 18 | 00005286_001.png 19 | 00006462_008.png 20 | 00006836_008.png 21 | 00006851_004.png 22 | 00007113_001.png 23 | 00007152_006.png 24 | 00007160_002.png 25 | 00007454_001.png 26 | 00007482_010.png 27 | 00007716_007.png 28 | 00008016_000.png 29 | 00008082_000.png 30 | 00009198_002.png 31 | 00009368_010.png 32 | 00009368_011.png 33 | 00009584_002.png 34 | 00009889_038.png 35 | 00010007_121.png 36 | 00010065_000.png 37 | 00012249_001.png 38 | 00012388_002.png 39 | 00012907_007.png 40 | 00013160_000.png 41 | 00013670_137.png 42 | 00013714_001.png 43 | 00014294_015.png 44 | 00014675_034.png 45 | 00014963_000.png 46 | 00015054_000.png 47 | 00015078_007.png 48 | 00016233_004.png 49 | 00016637_000.png 50 | 00017753_022.png 51 | 00017915_003.png 52 | 00020373_002.png 53 | 00020644_000.png 54 | 00025381_004.png 55 | 00026806_000.png 56 | 00029476_000.png 57 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/aml_config/conda_dependencies_gpu.yml: -------------------------------------------------------------------------------- 1 | # Conda environment specification. The dependencies defined in this file will 2 | # be automatically provisioned for managed runs. These include runs against 3 | # the localdocker, remotedocker, and cluster compute targets. 4 | 5 | # Note that this file is NOT used to automatically manage dependencies for the 6 | # local compute target. To provision these dependencies locally, run: 7 | # conda env update --file conda_dependencies.yml 8 | 9 | # Details about the Conda environment file format: 10 | # https://conda.io/docs/using/envs.html#create-environment-file-by-hand 11 | 12 | # For managing Spark packages and configuration, see spark_dependencies.yml. 13 | 14 | # Version of this configuration file's structure and semantics in AzureML. 15 | # This directive is stored in a comment to preserve the Conda file structure. 16 | # [AzureMlVersion] = 2 17 | 18 | name: project_environment 19 | channels: 20 | # - conda-forge 21 | - pytorch #soumith 22 | dependencies: 23 | # The python interpreter version. 24 | # Currently Azure ML Workbench only supports 3.5.2. 25 | - python=3.5.2 26 | - tqdm 27 | - opencv 28 | - h5py 29 | - scikit-learn 30 | - nomkl 31 | # - pytorch=0.3.0 32 | # - cuda80 33 | # - torchvision 34 | 35 | # Required for Jupyter Notebooks. 
36 | - ipykernel=4.6.1 37 | 38 | - pip: 39 | # Required packages for AzureML execution, history, and data preparation. 40 | - --index-url https://azuremldownloads.azureedge.net/python-repository/preview 41 | - --extra-index-url https://pypi.python.org/simple 42 | - azureml-requirements 43 | 44 | # The API for Azure Machine Learning Model Management Service. 45 | # Details: https://github.com/Azure/Machine-Learning-Operationalization 46 | - azure-ml-api-sdk==0.1.0a11 47 | - git+git://github.com/fchollet/keras.git 48 | - git+https://github.com/ahundt/keras-contrib.git 49 | - tensorflow-gpu==1.4 50 | - matplotlib 51 | - numpy==1.14.0 52 | - git+https://github.com/aleju/imgaug 53 | - http://download.pytorch.org/whl/cu80/torch-0.3.1-cp35-cp35m-linux_x86_64.whl 54 | # - http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-manylinux1_x86_64.whl 55 | - torchvision 56 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/azure_chestxray_utils.py: -------------------------------------------------------------------------------- 1 | ### Copyright (C) Microsoft Corporation. 2 | 3 | import os 4 | import numpy as np 5 | 6 | class chestxray_consts(object): 7 | DISEASE_list = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 8 | 'Pneumothorax', 9 | 'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural Thickening', 'Hernia'] 10 | 11 | PRETRAINED_DENSENET201_IMAGENET_CHESTXRAY_MODEL_FILE_NAME = 'chexnet_14_weights_multigpu_contribmodel_121layer_712split_epoch_011_val_loss_153.9783.hdf5' 12 | FULLY_PRETRAINED_MODEL_DIR_list = [ 'fully_trained_models'] 13 | 14 | 15 | CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT = 224 16 | CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH = 224 17 | 18 | BASE_INPUT_DIR_list = ['chestxray', 'data', 'ChestX-ray8'] 19 | BASE_OUTPUT_DIR_list = ['chestxray', 'output'] 20 | CREDENTIALS_DIR_list = ['code', 'notShared'] 21 | 22 | SRC_DIR_list = ['Code', 'src'] 23 | ChestXray_IMAGES_DIR_list = ['ChestXray-NIHCC'] 24 | ChestXray_OTHER_DATA_DIR_list = ['ChestXray-NIHCC_other'] 25 | PROCESSED_IMAGES_DIR_list = ['processed_npy14'] 26 | DATA_PARTITIONS_DIR_list = ['data_partitions'] 27 | MODEL_WEIGHTS_DIR_list = [ 'weights_tmpdir'] 28 | 29 | def __setattr__(self, *_): 30 | raise TypeError 31 | 32 | 33 | # os agnostic 'ls' function 34 | def get_files_in_dir(crt_dir): 35 | return( [f for f in os.listdir(crt_dir) if os.path.isfile(os.path.join(crt_dir, f))]) 36 | 37 | 38 | 39 | def normalize_nd_array(crt_array): 40 | # Normalised [0,1] 41 | crt_array = crt_array - np.min(crt_array) 42 | return(crt_array/np.ptp(crt_array)) 43 | 44 | def print_image_stats_by_channel(crt_image): 45 | print('min:') 46 | print(np.amin(crt_image[:,:,0]), 47 | np.amin(crt_image[:,:,1]), 48 | np.amin(crt_image[:,:,2])) 49 | print('max:') 50 | print(np.amax(crt_image[:,:,0]), 51 | np.amax(crt_image[:,:,1]), 52 | np.amax(crt_image[:,:,2])) 53 | 54 | 55 | 56 | if __name__=="__main__": 57 | prj_consts = chestxray_consts() 58 | print('model_expected_image_height = ', prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT) 59 | print('model_expected_image_width = ', prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH) 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | This repository contains the code for the blog post: [Using Microsoft AI to Build a Lung-Disease Prediction Model using 
Chest X-Ray Images](https://blogs.technet.microsoft.com/machinelearning/2018/03/07/using-microsoft-ai-to-build-a-lung-disease-prediction-model-using-chest-x-ray-images/), by Xiaoyong Zhu, George Iordanescu, Ilia Karmanov, data scientists from Microsoft, and Mazen Zawaideh, radiology resident from the University of Washington Medical Center.
3 | 
4 | In this repository, we provide the Keras code (the 000_preprocess, 010_train, and 020_evaluate Jupyter notebooks under AzureChestXRay_AMLWB\Code\02_Model) and the PyTorch code (AzureChestXRay_AMLWB\Code\02_Model\060_Train_pyTorch.ipynb). You should be able to run the code from scratch and reproduce the results below, either on the Azure Machine Learning platform or on your own GPU machine.
5 | 
6 | # Get Started
7 | 
8 | ## Installing additional packages
9 | 
10 | If you are using Azure Machine Learning as the training platform, all the dependencies will be installed for you. However, if you are running in your own environment, you also need to install the [keras-contrib](https://github.com/keras-team/keras-contrib) package (e.g., pip install git+https://github.com/keras-team/keras-contrib.git) to run the Keras code.
11 | 
12 | If you are trying out the lung detection algorithm, you need to install a few other additional libraries. Please refer to the README.md file under the folder AzureChestXRay\AzureChestXRay_AMLWB\Code\src\finding_lungs for more details.
13 | 
14 | ## Running the code
15 | To run the code, you need to get the NIH Chest X-ray Dataset from here: https://nihcc.app.box.com/v/ChestXray-NIHCC. You need all the image files (everything under the `images` folder in the NIH dataset), the Data_Entry_2017.csv file, as well as the bounding-box data BBox_List_2017.csv. You might also want to remove a few low-quality images (please refer to the subfolder AzureChestXRay_AMLWB\Code\src\finding_lungs for more details).
16 | 
17 | 
18 | 
19 | # Tools and Platforms
20 | - Deep Learning VMs with GPU acceleration are used as the compute environment
21 | - Azure Machine Learning is used as a managed machine learning service for project management, run history and version control, and model deployment
22 | 
23 | # Results
24 | 
25 | We obtained the following results; the average AUROC across all 14 diseases is around 0.845.
26 | 
27 | | Disease      | AUC Score | Disease            | AUC Score |
28 | |--------------|-----------|--------------------|-----------|
29 | | Atelectasis  | 0.828543  | Pneumothorax       | 0.881838  |
30 | | Cardiomegaly | 0.891449  | Consolidation      | 0.721818  |
31 | | Effusion     | 0.817697  | Edema              | 0.868002  |
32 | | Infiltration | 0.907302  | Emphysema          | 0.787202  |
33 | | Mass         | 0.895815  | Fibrosis           | 0.826822  |
34 | | Nodule       | 0.907841  | Pleural Thickening | 0.793416  |
35 | | Pneumonia    | 0.817601  | Hernia             | 0.889089  |
36 | 
37 | 
38 | # Criticisms
39 | There are several discussions in the community on the efficacy of using NLP to mine the disease labels and how it might lead to poor label quality (for example, [here](https://lukeoakdenrayner.wordpress.com/2018/01/24/chexnet-an-in-depth-review/), as well as in [this article on Medium](https://medium.com/@paras42/dear-mythical-editor-radiologist-level-pneumonia-in-chexnet-c91041223526)). However, even with noisy labels, deep learning models are sometimes still able to achieve good classification performance.
40 | 
41 | # Referenced papers
42 | - The original CheXNet work, described on the [Stanford ML Group website](https://stanfordmlgroup.github.io/projects/chexnet/) as well as in their [paper](https://arxiv.org/abs/1711.05225).
43 | - http://cs231n.stanford.edu/reports/2017/pdfs/527.pdf for pre-processing the data
44 | - https://arxiv.org/abs/1711.08760 for some other thoughts on the model architecture and the relationship between different diseases
45 | - Baseline result: https://arxiv.org/abs/1705.02315
46 | - Image localization: http://arxiv.org/abs/1512.04150
47 | 
48 | # Conclusion, acknowledgement, and thanks
49 | Some of the pre-processing code for Keras is borrowed from [the dr.b repository](https://github.com/taoddiao/dr.b).
50 | 
51 | We hope this repository will be helpful in your research projects; please let us know if you have any questions or feedback. Pull requests are also welcome!
52 | 
53 | We also would like to thank Pranav Rajpurkar and Jeremy Irvin from Stanford for answering our questions about their implementation, as well as Wee Hyong Tok, Danielle Dean, Hanna Kim, and Ivan Tarapov from Microsoft for reviewing the blog post and providing their feedback.
54 | 
55 | # Disclaimer
56 | The source code, tools, and discussion in this repository are provided to assist data scientists in understanding the potential for developing deep learning-driven intelligent applications using Azure AI services, and they are intended for research and development use only. The x-ray image pathology classification system is not intended for use in clinical diagnosis or clinical decision-making or for any other clinical use. The performance of this model for clinical use has not been established.
57 | 
58 | # Contributing
59 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
60 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
61 | the rights to use your contribution. For details, visit https://cla.microsoft.com.
62 | 
63 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
64 | a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
65 | provided by the bot. You will only need to do this once across all repos using our CLA.
66 | 
67 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
68 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
69 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
70 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | 33 | # Visual Studio Code 34 | .vscode/ 35 | # Uncomment if you have tasks that create the project's static files in wwwroot 36 | #wwwroot/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # .NET Core 52 | project.lock.json 53 | project.fragment.lock.json 54 | artifacts/ 55 | **/Properties/launchSettings.json 56 | 57 | *_i.c 58 | *_p.c 59 | *_i.h 60 | *.ilk 61 | *.meta 62 | *.obj 63 | *.pch 64 | *.pdb 65 | *.pgc 66 | *.pgd 67 | *.rsp 68 | *.sbr 69 | *.tlb 70 | *.tli 71 | *.tlh 72 | *.tmp 73 | *.tmp_proj 74 | *.log 75 | *.vspscc 76 | *.vssscc 77 | .builds 78 | *.pidb 79 | *.svclog 80 | *.scc 81 | 82 | # Chutzpah Test files 83 | _Chutzpah* 84 | 85 | # Visual C++ cache files 86 | ipch/ 87 | *.aps 88 | *.ncb 89 | *.opendb 90 | *.opensdf 91 | *.sdf 92 | *.cachefile 93 | *.VC.db 94 | *.VC.VC.opendb 95 | 96 | # Visual Studio profiler 97 | *.psess 98 | *.vsp 99 | *.vspx 100 | *.sap 101 | 102 | # TFS 2012 Local Workspace 103 | $tf/ 104 | 105 | # Guidance Automation Toolkit 106 | *.gpState 107 | 108 | # ReSharper is a .NET coding add-in 109 | _ReSharper*/ 110 | *.[Rr]e[Ss]harper 111 | *.DotSettings.user 112 | 113 | # JustCode is a .NET coding add-in 114 | .JustCode 115 | 116 | # TeamCity is a build add-in 117 | _TeamCity* 118 | 119 | # DotCover is a Code Coverage Tool 120 | *.dotCover 121 | 122 | # Visual Studio code coverage results 123 | *.coverage 124 | *.coveragexml 125 | 126 | # NCrunch 127 | _NCrunch_* 128 | .*crunch*.local.xml 129 | nCrunchTemp_* 130 | 131 | # MightyMoose 132 | *.mm.* 133 | AutoTest.Net/ 134 | 135 | # Web workbench (sass) 136 | .sass-cache/ 137 | 138 | # Installshield output folder 139 | [Ee]xpress/ 140 | 141 | # DocProject is a documentation generator add-in 142 | DocProject/buildhelp/ 143 | DocProject/Help/*.HxT 144 | DocProject/Help/*.HxC 145 | DocProject/Help/*.hhc 146 | DocProject/Help/*.hhk 147 | DocProject/Help/*.hhp 148 | DocProject/Help/Html2 149 | DocProject/Help/html 150 | 151 | # Click-Once directory 152 | publish/ 153 | 154 | # Publish Web Output 155 | *.[Pp]ublish.xml 156 | *.azurePubxml 157 | # TODO: Comment the next line if you want to checkin your web deploy settings 158 | # but database connection strings (with potential passwords) will be unencrypted 159 | *.pubxml 160 | *.publishproj 161 | 162 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 163 | # checkin your Azure Web App publish settings, but sensitive information contained 164 | # in these scripts will be unencrypted 165 | PublishScripts/ 166 | 167 | # NuGet Packages 168 | *.nupkg 169 | # The packages folder can be ignored because of Package Restore 170 | **/packages/* 171 | # except build/, which is used as an MSBuild target. 
172 | !**/packages/build/ 173 | # Uncomment if necessary however generally it will be regenerated when needed 174 | #!**/packages/repositories.config 175 | # NuGet v3's project.json files produces more ignorable files 176 | *.nuget.props 177 | *.nuget.targets 178 | 179 | # Microsoft Azure Build Output 180 | csx/ 181 | *.build.csdef 182 | 183 | # Microsoft Azure Emulator 184 | ecf/ 185 | rcf/ 186 | 187 | # Windows Store app package directories and files 188 | AppPackages/ 189 | BundleArtifacts/ 190 | Package.StoreAssociation.xml 191 | _pkginfo.txt 192 | 193 | # Visual Studio cache files 194 | # files ending in .cache can be ignored 195 | *.[Cc]ache 196 | # but keep track of directories ending in .cache 197 | !*.[Cc]ache/ 198 | 199 | # Others 200 | ClientBin/ 201 | ~$* 202 | *~ 203 | *.dbmdl 204 | *.dbproj.schemaview 205 | *.jfm 206 | *.pfx 207 | *.publishsettings 208 | orleans.codegen.cs 209 | 210 | # Since there are multiple workflows, uncomment next line to ignore bower_components 211 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 212 | #bower_components/ 213 | 214 | # RIA/Silverlight projects 215 | Generated_Code/ 216 | 217 | # Backup & report files from converting an old project file 218 | # to a newer Visual Studio version. Backup files are not needed, 219 | # because we have git ;-) 220 | _UpgradeReport_Files/ 221 | Backup*/ 222 | UpgradeLog*.XML 223 | UpgradeLog*.htm 224 | 225 | # SQL Server files 226 | *.mdf 227 | *.ldf 228 | *.ndf 229 | 230 | # Business Intelligence projects 231 | *.rdl.data 232 | *.bim.layout 233 | *.bim_*.settings 234 | 235 | # Microsoft Fakes 236 | FakesAssemblies/ 237 | 238 | # GhostDoc plugin setting file 239 | *.GhostDoc.xml 240 | 241 | # Node.js Tools for Visual Studio 242 | .ntvs_analysis.dat 243 | node_modules/ 244 | 245 | # Typescript v1 declaration files 246 | typings/ 247 | 248 | # Visual Studio 6 build log 249 | *.plg 250 | 251 | # Visual Studio 6 workspace options file 252 | *.opt 253 | 254 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
255 | *.vbw 256 | 257 | # Visual Studio LightSwitch build output 258 | **/*.HTMLClient/GeneratedArtifacts 259 | **/*.DesktopClient/GeneratedArtifacts 260 | **/*.DesktopClient/ModelManifest.xml 261 | **/*.Server/GeneratedArtifacts 262 | **/*.Server/ModelManifest.xml 263 | _Pvt_Extensions 264 | 265 | # Paket dependency manager 266 | .paket/paket.exe 267 | paket-files/ 268 | 269 | # FAKE - F# Make 270 | .fake/ 271 | 272 | # JetBrains Rider 273 | .idea/ 274 | *.sln.iml 275 | 276 | # CodeRush 277 | .cr/ 278 | 279 | # Python Tools for Visual Studio (PTVS) 280 | __pycache__/ 281 | *.pyc 282 | 283 | # Cake - Uncomment if you are using it 284 | # tools/** 285 | # !tools/packages.config 286 | 287 | # Telerik's JustMock configuration file 288 | *.jmconfig 289 | 290 | # BizTalk build output 291 | *.btp.cs 292 | *.btm.cs 293 | *.odx.cs 294 | *.xsd.cs 295 | 296 | # Jupyter Notebooks 297 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/finding_lungs/finding_lungs_DL_approach.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | import pandas as pd 6 | from keras.models import load_model 7 | from skimage import exposure, img_as_float 8 | from skimage import transform 9 | import numpy as np 10 | 11 | from skimage import measure 12 | import lungs_finder as lf 13 | import cv2 14 | 15 | # for lung detection 16 | left_edge = 0 17 | right_edge = 256 18 | top_edge = 0 19 | bottom_edge = 256 20 | margin = 12 21 | 22 | row_size = 256 23 | col_size = 256 24 | 25 | # Path to csv-file. File should contain X-ray filenames as first column, 26 | # mask filenames as second column. 27 | 28 | out_folder_matched_img = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "images_centered") 29 | out_folder_mismatched_image = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", 30 | "images_centered_mismatched_by_both") 31 | csv_path = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "Data_Entry_2017.csv") 32 | # Path to the folder with images. Images will be read from path + path_from_csv 33 | img_path = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "images") 34 | mis_detected_csv_path = os.path.join("/mnt", "MyAzureFileShare", "Data", "ChestXRay", "mis_detected.csv") 35 | df = pd.read_csv(csv_path) 36 | 37 | # Load test data 38 | im_shape = (256, 256) 39 | 40 | # Load model 41 | # plt.figure(figsize=(10, 10)) 42 | model_name = './trained_model.hdf5.bak' 43 | UNet = load_model(model_name) 44 | 45 | threshold = 0.85 46 | # list to save the mis detected images 47 | image_misdetect_list = [] 48 | 49 | 50 | def finding_lungs_non_DL_approach_and_save(image, file_name): 51 | # print(row.columns.values) 52 | # file_name = row[0] 53 | # print("line is", file_name, image.shape) 54 | # when reading from txt there is something in the end so we need to eliminate that 55 | # image = cv2.imread(os.path.join("Z:\\", "Data", "ChestXRay", "images", file_name), 0) 56 | 57 | img_height = image.shape[0] 58 | img_width = image.shape[1] 59 | # Get both lungs image. It uses HOG as main method, 60 | # but if HOG found nothing it uses HAAR or LBP. 
61 | found_lungs = lf.get_lungs(image) 62 | 63 | # this can be written in a more concise way but we just keep it a bit redundant for easy reading 64 | if found_lungs is not None and found_lungs.shape[0] > img_height / 2 and found_lungs.shape[1] > img_width / 2: 65 | # print(found_lungs.shape) 66 | found_lungs_resized = cv2.resize(found_lungs, im_shape) 67 | # cv2.imshow(file_name, found_lungs) 68 | # code = cv2.waitKey(0) 69 | cv2.imwrite(os.path.join(out_folder_matched_img, file_name), found_lungs_resized) 70 | return True 71 | else: 72 | cv2.imwrite(os.path.join(out_folder_mismatched_image, file_name), cv2.resize(image, im_shape)) 73 | return False 74 | 75 | 76 | for index, item in df.iterrows(): 77 | # X, y = loadDataGeneral(current_df, path, im_shape) 78 | raw_img = cv2.imread(os.path.join(img_path, item['Image Index'])) 79 | 80 | img = img_as_float(raw_img)[:, :, 0] 81 | img = transform.resize(img, im_shape) 82 | img = exposure.equalize_hist(img) 83 | # img = np.expand_dims(img, -1) 84 | img -= img.mean() 85 | img /= img.std() 86 | 87 | file_name = item['Image Index'] 88 | X = np.expand_dims(img, axis=0) 89 | X = np.expand_dims(X, axis=-1) 90 | n_test = X.shape[0] 91 | inp_shape = X[0].shape 92 | 93 | # img = exposure.rescale_intensity(np.squeeze(X), out_range=(0, 1)) 94 | 95 | # print("size of img is", img.shape) 96 | prediction = UNet.predict(X)[..., 0].reshape(inp_shape[:2]) 97 | 98 | thresh_img = np.where(prediction > threshold, 1.0, 0.0) # threshold the image 99 | 100 | labels = measure.label(thresh_img) # Different labels are displayed in different colors 101 | label_vals = np.unique(labels) 102 | # print(label_vals) 103 | regions = measure.regionprops(labels) 104 | good_labels = [] 105 | global_B_box = [] 106 | for prop in regions: 107 | B = prop.bbox 108 | if B[2] - B[0] > row_size / 4 and B[3] - B[1] > col_size / 6: # make sure size of lung to avoid small areas 109 | good_labels.append(prop.label) 110 | global_B_box.append(B) 111 | 112 | # print(len(good_labels)) 113 | 114 | DL_failed_detect_flag = False 115 | if len(good_labels) == 2: 116 | 117 | left_edge = np.clip(min(global_B_box[0][1] - margin, global_B_box[1][1] - margin), a_min=0, a_max=256) 118 | right_edge = np.clip(max(global_B_box[0][3] + margin, global_B_box[1][3] + margin), a_min=0, a_max=256) 119 | top_edge = np.clip(min(global_B_box[0][0] - margin, global_B_box[1][0] - margin), a_min=0, a_max=256) 120 | bottom_edge = np.clip(max(global_B_box[0][2] + margin * 3, global_B_box[1][2] + margin * 4), a_min=0, 121 | a_max=256) # leave more margins at the bottom 122 | else: 123 | # print(file_name) 124 | 125 | DL_failed_detect_flag = True 126 | 127 | if DL_failed_detect_flag: 128 | img_name = os.path.join(out_folder_mismatched_image, file_name) 129 | if not finding_lungs_non_DL_approach_and_save(raw_img, file_name): 130 | # save file name only if both methods are not detected 131 | image_misdetect_list.append(file_name) 132 | print(file_name) 133 | else: 134 | img_name = os.path.join(out_folder_matched_img, file_name) 135 | cropped = cv2.resize(raw_img, im_shape)[top_edge:bottom_edge, left_edge:right_edge] 136 | # print(cropped) 137 | resized_cropped = cv2.resize(cropped, im_shape) 138 | cv2.imwrite(img_name, resized_cropped) 139 | 140 | # if mis_detected_flag: 141 | # mis_detected_flag = False 142 | # fig, ax = plt.subplots(2, 2, figsize=[12, 12]) 143 | # ax[0, 0].set_title("Original " + file_name) 144 | # ax[0, 0].imshow(raw_img, cmap='gray') 145 | # ax[0, 0].axis('off') 146 | # ax[0, 1].set_title("Threshold " + 
file_name)
147 |     # ax[0, 1].imshow(thresh_img, cmap='gray')
148 |     # # ax[0, 1].imshow(prediction, cmap='gray')
149 |     # ax[0, 1].axis('off')
150 |     # ax[1, 0].set_title("Color Labels " + file_name)
151 |     # ax[1, 0].imshow(labels)
152 |     # ax[1, 0].axis('off')
153 |     # ax[1, 1].set_title("Apply Mask on Original " + file_name)
154 |     #
155 |     # ax[1, 1].imshow(resized_cropped, cmap='gray')
156 |     # ax[1, 1].axis('off')
157 | 
158 |     if index > 112120:  # for debugging purposes
159 |         break
160 | 
161 |     # periodically checkpoint the list of images both detectors missed;
162 |     # use a separate variable so the DataFrame being iterated is not shadowed
163 |     if index % 100 == 0:
164 |         misdetect_df = pd.DataFrame({'col': image_misdetect_list})
165 |         misdetect_df.to_csv(mis_detected_csv_path, header=False, index=False)
166 | 
167 | misdetect_df = pd.DataFrame({'col': image_misdetect_list})
168 | misdetect_df.to_csv(mis_detected_csv_path, header=False, index=False)
169 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/Deployment_Guide.md:
--------------------------------------------------------------------------------
1 | Install Azure Machine Learning Workbench (AMLWB): https://docs.microsoft.com/en-us/azure/machine-learning/preview/quickstart-installation
2 | 1. Set up your environment:
3 | Open an AMLWB CLI window and follow this [guide](https://docs.microsoft.com/en-us/azure/machine-learning/preview/tutorial-classifying-iris-part-2#execute-scripts-in-the-azure-machine-learning-cli-window):
4 | REM log in by using the aka.ms/devicelogin site
5 | az login
6 | 
7 | REM lists all Azure subscriptions you have access to (make sure the right subscription is selected; see the isDefault column)
8 | az account list -o table
9 | 
10 | REM sets the current Azure subscription to the one you want to use
11 | az account set -s <subscription id>
12 | 
13 | REM verifies that your current subscription is set correctly
14 | az account show
15 | 
16 | REM Create an experimentation account and an Azure ML workspace using the portal
17 | 
18 | REM Use the AMLWB to create a new project
19 | 
20 | REM Copy the \Code\ structure and files (.ipynb and .py files) into the new experiment folder
21 | 
22 | 
23 | 
24 | 2. Create compute contexts on remote VMs:
25 | 
26 | 2.1 Using the Azure portal:
27 | -->Deploy a Linux VM (e.g. a Linux [DSVM](https://docs.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/dsvm-ubuntu-intro)).
28 | -->For best results, use a deep learning Linux VM (https://azuremarketplace.microsoft.com/en-us/marketplace/apps/microsoft-ads.dsvm-deep-learning).
29 | -->You may need a [GPU VM](https://docs.microsoft.com/en-us/azure/virtual-machines/windows/sizes-gpu) for training, and a second CPU VM for testing operationalization.
30 | 
31 | --> Make sure the VM FQDN is set up and that the disk sizes cover the data requirements. Default data disks do not survive machine reboot events; to keep the data available between machine reboots, either make the OS disk larger or attach an external Azure VHD.
32 | 
33 | --> use https://docs.microsoft.com/en-us/azure/machine-learning/preview/known-issues-and-troubleshooting-guide#vm-disk-is-full to resize the VM disk if needed:
34 | # Deallocate the VM (stopping will not work)
35 | $ az vm deallocate --resource-group myResourceGroup --name myVM
36 | # Update the disk size
37 | $ az disk update --resource-group myResourceGroup --name myVM --size-gb 250
38 | # Start the VM
39 | $ az vm start --resource-group myResourceGroup --name myVM
40 | 
41 | --> ssh into the remote VM and create the folder structure that AMLWB will use to map host locations to directories inside the running AMLWB containers:
42 | loginuser@deeplearninggpuvm:~$ sudo mkdir -p /datadrive01/amlwbShare
43 | loginuser@deeplearninggpuvm:~$ sudo chmod ugo=rwx /datadrive01/amlwbShare/
44 | loginuser@deeplearninggpuvm2:~$ ls -l /datadrive01/
45 | total 4
46 | drwxrwxrwx 2 root root 4096 Feb 5 18:33 amlwbShare
47 | 
48 | 2.2 Get the NIH chest x-ray images
49 | Go to https://nihcc.app.box.com/v/ChestXray-NIHCC
50 | Store the images from the images dir (https://nihcc.app.box.com/v/ChestXray-NIHCC/folder/37178474737) as unarchived files in a blob storage account.
51 | They will be downloaded later by Code\01_DataPrep\001_get_data.ipynb, and will land in a dir on the remote VM created above:
52 | loginuser@deeplearninggpuvm2:~$ ls /datadrive01/amlwbShare/crt_ea/grt_work_space/crt_experiment/chestxray/data/ChestX-ray8/ChestXray-NIHCC/ | head -2
53 | 00000001_000.png
54 | 00000001_001.png
55 | 
56 | 2.3 Get the patient-image map file
57 | Manually download the NIH data file Data_Entry_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC) into this dir on the remote VM created above:
58 | loginuser@deeplearninggpuvm2:~$ ls -l /datadrive01/amlwbShare/crt_ea/grt_work_space/crt_experiment/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other
59 | total 7680
60 | -rw-rw-r-- 1 loginvm0011 loginvm0011 7861152 Feb 7 02:54 Data_Entry_2017.csv
61 | Data_Entry_2017.csv is the patient-to-image map and will be used by \Code\02_Model\000_preprocess.ipynb to create the train/validate/test partitions; a sketch of the patient-level splitting idea follows below.
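For intuition, here is a minimal sketch of that patient-level splitting idea (hypothetical illustration, not code from this repo; the actual partitioning logic lives in 000_preprocess.ipynb). It assumes Data_Entry_2017.csv has its standard 'Patient ID' and 'Image Index' columns and uses an illustrative 7:1:2 train/validate/test ratio; splitting by patient keeps all images of one patient in a single partition, which avoids leakage between the partitions:

    # patient_split_sketch.py -- illustrative only; see 000_preprocess.ipynb for the real partitioning
    import numpy as np
    import pandas as pd

    df = pd.read_csv('Data_Entry_2017.csv')
    patients = df['Patient ID'].unique()
    np.random.RandomState(0).shuffle(patients)  # reproducible in-place shuffle of patient IDs

    n = len(patients)
    train_ids = set(patients[:int(0.7 * n)])               # ~70% of patients for training
    valid_ids = set(patients[int(0.7 * n):int(0.8 * n)])   # ~10% for validation
    test_ids = set(patients[int(0.8 * n):])                # ~20% for testing

    # map the patient partitions back to image file names
    train_images = df[df['Patient ID'].isin(train_ids)]['Image Index'].tolist()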
62 | 
63 | 
64 | 2.4 Create and prepare the AMLWB compute contexts
65 | -->in the AMLWB CLI, create an AMLWB compute context:
66 | az ml computetarget attach remotedocker --name <compute context name> --address <VM FQDN> --username <username> --password <password>
67 | the command above will create \aml_config\<compute context name>.runconfig and \aml_config\<compute context name>.compute files that control the AMLWB compute contexts
68 | 
69 | -->Check the existing compute targets:
70 | az ml computetarget list
71 | 
72 | For CPU compute contexts:
73 | -->edit the <compute context name>.runconfig file:
74 | CondaDependenciesFile: aml_config/conda_dependencies_o16n.yml
75 | Framework: Python
76 | PrepareEnvironment: true
77 | -->edit the <compute context name>.compute file:
78 | baseDockerImage: georgedockeraccount/utils_with_amlwb_base_cpu:azcopyenabled
79 | nativeSharedDirectory: /data/datadrive01/amlwbShare/
80 | sharedVolumes: true
81 | 
82 | For GPU compute contexts:
83 | -->edit the <compute context name>.runconfig file:
84 | CondaDependenciesFile: aml_config/conda_dependencies_gpu.yml
85 | Framework: Python
86 | PrepareEnvironment: true
87 | -->edit the <compute context name>.compute file:
88 | baseDockerImage: georgedockeraccount/utils_with_amlwb_base_gpu:azcopyenabled
89 | nativeSharedDirectory: /data/datadrive01/amlwbShare/
90 | nvidiaDocker: true
91 | sharedVolumes: true
92 | 
93 | -->go back to the CLI:
94 | az ml experiment prepare -c <compute context name>
95 | -> while the preparation is running, you can check on the Linux host machine how docker is running:
96 | sudo docker images
97 | sudo docker ps -a
98 | 
99 | E.g.:
100 | loginuser@deeplearninggpuvm:~$ sudo docker images
101 | REPOSITORY                                      TAG             IMAGE ID       CREATED          SIZE
102 | azureml_88865f7583e9e1fd502a32a7717aa1f0        latest          a35a05a9b295   16 minutes ago   7.21GB
103 | georgedockeraccount/utils_with_amlwb_base_gpu   azcopyenabled   2e6da7a1351c   4 weeks ago      3.89GB
104 | 
105 | -> see \Code\docker\Dockerfile for details on how the docker images have been created.
106 | 
107 | 
108 | 3. Run the experiments:
109 | \Code\01_DataPrep\001_get_data.ipynb : copies the NIH chest x-ray data from blob storage to the local host
110 | \Code\02_Model\000_preprocess.ipynb : creates the train/validate/test partitions and saves all NIH chest x-ray images as numpy arrays on disk
111 | \Code\02_Model\010_train.ipynb : trains a DenseNet model (pre-trained on ImageNet) on the NIH chest x-ray data
112 | \Code\02_Model\040_cam_simple.ipynb : shows CAM visualizations
113 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/azure_chestxray_cam.py:
--------------------------------------------------------------------------------
1 | ### Copyright (C) Microsoft Corporation.
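# Class Activation Mapping (CAM) helpers (cf. http://arxiv.org/abs/1512.04150):
# get_score_and_cam_picture() weights the last convolutional feature maps by the
# final Dense layer's weights for the predicted class and sums them into a
# heatmap of the image regions that drove the prediction; process_cam_image()
# blends that heatmap over the original x-ray for display.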
2 | 3 | import keras.backend as K 4 | import sys, os, io 5 | import numpy as np 6 | import cv2 7 | 8 | import matplotlib 9 | matplotlib.use('agg') 10 | 11 | paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))] 12 | def add_path_to_sys_path(path_to_append): 13 | if not (any(path_to_append in paths for paths in sys.path)): 14 | sys.path.append(path_to_append) 15 | [add_path_to_sys_path(crt_path) for crt_path in paths_to_append] 16 | 17 | import azure_chestxray_utils 18 | 19 | 20 | def get_score_and_cam_picture(cv2_input_image, DenseNetImageNet121_model): 21 | # based on https://github.com/jacobgil/keras-cam/blob/master/cam.py 22 | width, height, _ = cv2_input_image.shape 23 | class_weights = DenseNetImageNet121_model.layers[-1].get_weights()[0] 24 | final_conv_layer = DenseNetImageNet121_model.layers[-3] 25 | get_output = K.function([DenseNetImageNet121_model.layers[0].input], 26 | [final_conv_layer.output, \ 27 | DenseNetImageNet121_model.layers[-1].output]) 28 | [conv_outputs, prediction] = get_output([cv2_input_image[None,:,:,:]]) 29 | conv_outputs = conv_outputs[0, :, :, :] 30 | prediction = prediction[0,:] 31 | 32 | #Create the class activation map. 33 | predicted_disease = np.argmax(prediction) 34 | cam = np.zeros(dtype = np.float32, shape = conv_outputs.shape[:2]) 35 | for i, w in enumerate(class_weights[:, predicted_disease]): 36 | cam += w * conv_outputs[:, :, i] 37 | 38 | return prediction, cam, predicted_disease 39 | 40 | 41 | def process_cam_image(crt_cam_image, xray_image, crt_alpha = .5): 42 | im_width, im_height, _ = xray_image.shape 43 | crt_cam_image = cv2.resize(crt_cam_image, (im_width, im_height), \ 44 | interpolation=cv2.INTER_CUBIC) 45 | 46 | # do some gamma enhancement, e is too much 47 | crt_cam_image = np.power(1.1, crt_cam_image) 48 | crt_cam_image = azure_chestxray_utils.normalize_nd_array(crt_cam_image) 49 | # crt_cam_image[np.where(crt_cam_image < 0.5)] = 0 50 | crt_cam_image = 255*crt_cam_image 51 | 52 | # make cam an rgb image 53 | empty_image_channel = np.zeros(dtype = np.float32, shape = crt_cam_image.shape[:2]) 54 | crt_cam_image = cv2.merge((crt_cam_image,empty_image_channel,empty_image_channel)) 55 | 56 | blended_image = cv2.addWeighted(xray_image.astype('uint8'),crt_alpha,\ 57 | crt_cam_image.astype('uint8'),(1-crt_alpha),0) 58 | return(blended_image) 59 | 60 | def plot_cam_results(crt_blended_image, crt_cam_image, crt_xray_image, map_caption): 61 | import matplotlib.pyplot as plt 62 | 63 | fig = plt.figure(figsize = (15,7)) 64 | 65 | ax1 = fig.add_subplot(2, 3, 1) 66 | ax1.imshow(crt_xray_image, cmap = 'gray', interpolation = 'bicubic') 67 | ax1.set_title('Orig X Ray') 68 | plt.axis('off') 69 | 70 | ax2 = fig.add_subplot(2,3, 2) 71 | cam_plot = ax2.imshow(crt_cam_image, cmap=plt.get_cmap('OrRd'), interpolation = 'bicubic') 72 | plt.colorbar(cam_plot, ax=ax2) 73 | ax2.set_title('Activation Map') 74 | plt.axis('off') 75 | 76 | ax3 = fig.add_subplot(2,3, 3) 77 | blended_plot = ax3.imshow(crt_blended_image, interpolation = 'bicubic') 78 | plt.colorbar(cam_plot, ax=ax3) 79 | ax3.set_title(map_caption) 80 | plt.axis('off') 81 | 82 | # serialize blended image plot padded in the x/y-direction 83 | image_as_BytesIO = io.BytesIO() 84 | x_direction_pad = 1.05;y_direction_pad=1.2 85 | extent = ax3.get_window_extent().transformed(fig.dpi_scale_trans.inverted()) 86 | fig.savefig(image_as_BytesIO, 87 | bbox_inches=extent.expanded(x_direction_pad, 88 | y_direction_pad), 89 | format='png') 90 | image_as_BytesIO.seek(0) 91 | 
return(image_as_BytesIO)
92 | 
93 | 
94 | 
95 | def process_xray_image(crt_xray_image, DenseNetImageNet121_model):
96 | 
97 |     # print(crt_xray_image.shape)
98 |     crt_xray_image = azure_chestxray_utils.normalize_nd_array(crt_xray_image)
99 |     crt_xray_image = 255*crt_xray_image
100 |     crt_xray_image = crt_xray_image.astype('uint8')
101 | 
102 |     crt_predictions, crt_cam_image, predicted_disease_index = \
103 |         get_score_and_cam_picture(crt_xray_image,
104 |                                   DenseNetImageNet121_model)
105 | 
106 |     prj_consts = azure_chestxray_utils.chestxray_consts()
107 |     likely_disease = prj_consts.DISEASE_list[predicted_disease_index]
108 |     likely_disease_prob = 100*crt_predictions[predicted_disease_index]
109 |     likely_disease_prob_ratio = 100*crt_predictions[predicted_disease_index]/sum(crt_predictions)
110 |     print('predictions: ', crt_predictions)
111 |     print('likely disease: ', likely_disease)
112 |     print('likely disease prob: ', likely_disease_prob)
113 |     print('likely disease prob ratio: ', likely_disease_prob_ratio)
114 | 
115 |     crt_blended_image = process_cam_image(crt_cam_image, crt_xray_image)
116 |     plot_cam_results(crt_blended_image, crt_cam_image, crt_xray_image,
117 |                      str(likely_disease) + ' ' +
118 |                      "{0:.1f}".format(likely_disease_prob) + '% (weight ' +
119 |                      "{0:.1f}".format(likely_disease_prob_ratio) + '%)')
120 | 
121 | def process_nih_data(nih_data_files, NIH_data_dir, DenseNetImageNet121_model):
122 |     for crt_image in nih_data_files:
123 |         # print(crt_image)
124 |         prj_consts = azure_chestxray_utils.chestxray_consts()
125 | 
126 |         crt_xray_image = cv2.imread(os.path.join(NIH_data_dir, crt_image))
127 |         crt_xray_image = cv2.resize(crt_xray_image,
128 |                                     (prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT,
129 |                                      prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH)) \
130 |             .astype(np.float32)
131 | 
132 |         process_xray_image(crt_xray_image, DenseNetImageNet121_model)
133 | 
134 | if __name__=="__main__":
135 |     # FIXME
136 |     # add example/test code here
137 |     # NB: `data_dir` must point at the NIH images, and `model` must be a trained
138 |     # Keras model (e.g. built via azure_chestxray_keras_utils.build_model with
139 |     # fine-tuned weights loaded) before this demo code can run.
140 |     NIH_annotated_Cardiomegaly = ['00005066_030.png']
141 |     data_dir = ''
142 |     cv2_image = cv2.imread(os.path.join(data_dir, NIH_annotated_Cardiomegaly[0]))
143 | 
144 |     azure_chestxray_utils.print_image_stats_by_channel(cv2_image)
145 |     cv2_image = azure_chestxray_utils.normalize_nd_array(cv2_image)
146 |     cv2_image = 255*cv2_image
147 |     cv2_image = cv2_image.astype('uint8')
148 |     azure_chestxray_utils.print_image_stats_by_channel(cv2_image)
149 | 
150 |     predictions, cam_image, predicted_disease_index = get_score_and_cam_picture(cv2_image, model)
151 |     print(predictions)
152 |     prj_consts = azure_chestxray_utils.chestxray_consts()
153 |     print(prj_consts.DISEASE_list[predicted_disease_index])
154 |     print('likely disease: ', prj_consts.DISEASE_list[predicted_disease_index])
155 |     print('likely disease prob ratio: ', \
156 |           predictions[predicted_disease_index]/sum(predictions))
157 |     blended_image = process_cam_image(cam_image, cv2_image)
158 |     plot_cam_results(blended_image, cam_image, cv2_image, \
159 |                      prj_consts.DISEASE_list[predicted_disease_index])
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/src/score_image_and_cam.py:
--------------------------------------------------------------------------------
1 | # This script generates the scoring and schema files.
2 | # It creates the schema, and holds the init() and run() functions needed to
3 | # operationalize the chest x-ray model.
4 | 
5 | 
6 | import os, sys, pickle, base64
7 | import keras.models
8 | import keras.layers
9 | import keras_contrib.applications.densenet
10 | import pandas as pd
11 | import numpy as np
12 | 
import azure_chestxray_utils, azure_chestxray_cam 13 | 14 | #################################### 15 | # Parameters 16 | #################################### 17 | global chest_XRay_model 18 | global as_string_b64encoded_pickled_data_column_name 19 | as_string_b64encoded_pickled_data_column_name = 'encoded_image' 20 | global densenet_weights_file_name 21 | # densenet_weights_file_name = 'weights_only_chestxray_model_14_weights_712split_epoch_029_val_loss_147.7599.hdf5' 22 | densenet_weights_file_name = 'weights_only_chestxray_model_14_weights_712split_epoch_029_val_loss_147.7599 - Copy.hdf5' 23 | 24 | # Import data collection library. Only supported for docker mode. 25 | # Functionality will be ignored when package isn't found 26 | try: 27 | from azureml.datacollector import ModelDataCollector 28 | except ImportError: 29 | print("Data collection is currently only supported in docker mode. May be disabled for local mode.") 30 | # Mocking out model data collector functionality 31 | class ModelDataCollector(object): 32 | def nop(*args, **kw): pass 33 | def __getattr__(self, _): return self.nop 34 | def __init__(self, *args, **kw): return None 35 | pass 36 | 37 | #################################### 38 | # Utils 39 | #################################### 40 | def as_string_b64encoded_pickled(input_object): 41 | #b64encode returns bytes class, make it string by calling .decode('utf-8') 42 | return (base64.b64encode(pickle.dumps(input_object))).decode('utf-8') 43 | 44 | def unpickled_b64decoded_as_bytes(input_object): 45 | if input_object.startswith('b\''): 46 | input_object = input_object[2:-1] 47 | # make string bytes 48 | input_object = input_object.encode('utf-8') 49 | #decode and the unpickle the bytes to recover original object 50 | return (pickle.loads(base64.b64decode(input_object))) 51 | 52 | def get_image_score_and_serialized_cam(crt_cv2_image, crt_chest_XRay_model): 53 | prj_consts = azure_chestxray_utils.chestxray_consts() 54 | crt_cv2_image = azure_chestxray_utils.normalize_nd_array(crt_cv2_image) 55 | crt_cv2_image = 255*crt_cv2_image 56 | crt_cv2_image=crt_cv2_image.astype('uint8') 57 | predictions, cam_image, predicted_disease_index = \ 58 | azure_chestxray_cam.get_score_and_cam_picture(crt_cv2_image, crt_chest_XRay_model) 59 | blended_image = azure_chestxray_cam.process_cam_image(cam_image, crt_cv2_image) 60 | serialized_image = azure_chestxray_cam.plot_cam_results(blended_image, cam_image, crt_cv2_image, \ 61 | prj_consts.DISEASE_list[predicted_disease_index]) 62 | return predictions, serialized_image 63 | 64 | #################################### 65 | # API functions 66 | #################################### 67 | 68 | # Prepare the web service definition by authoring 69 | # init() and run() functions. Test the functions 70 | # before deploying the web service. 
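# A sketch of the scoring contract implied by the code below (a reading of
# run(), not official service documentation): the client pickles a cv2/numpy
# image, base64-encodes it, and sends it as the 'encoded_image' column of a
# one-row pandas DataFrame; run() decodes it, scores it, and returns a JSON
# string with the disease scores ("chestXrayScore") and an encoded CAM figure
# ("chestXrayCAM").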
71 | def init(): 72 | try: 73 | print("init() method: Python version: " + str(sys.version)) 74 | print("crt Dir: " + os.getcwd()) 75 | 76 | import pip 77 | # pip.get_installed_distributions() 78 | myDistr = pip.get_installed_distributions() 79 | type(myDistr) 80 | for crtDist in myDistr: 81 | print(crtDist) 82 | 83 | # load the model file 84 | global chest_XRay_model 85 | chest_XRay_model = azure_chestxray_utils.build_DenseNetImageNet201_model() 86 | chest_XRay_model.load_weights(densenet_weights_file_name) 87 | print('Densenet model loaded') 88 | 89 | except Exception as e: 90 | print("Exception in init:") 91 | print(str(e)) 92 | 93 | def run(input_df): 94 | try: 95 | import json 96 | 97 | debugCounter = 0 98 | print("run() method: Python version: " + str(sys.version) ); print('Step '+str(debugCounter));debugCounter+=1 99 | 100 | print ('\ninput_df shape {}'.format(input_df.shape)) 101 | print(list(input_df)) 102 | print(input_df) 103 | 104 | input_df = input_df[as_string_b64encoded_pickled_data_column_name][0]; print('Step '+str(debugCounter));debugCounter+=1 105 | input_cv2_image = unpickled_b64decoded_as_bytes(input_df); print('Step '+str(debugCounter));debugCounter+=1 106 | 107 | #finally scoring 108 | predictions, serialized_cam_image = get_image_score_and_serialized_cam(input_cv2_image, chest_XRay_model) 109 | #predictions = chest_XRay_model.predict(input_cv2_image[None,:,:,:]) 110 | 111 | # prediction_dc.collect(ADScores) 112 | outDict = {"chestXrayScore": str(predictions), "chestXrayCAM":as_string_b64encoded_pickled(serialized_cam_image)} 113 | return json.dumps(outDict) 114 | except Exception as e: 115 | return(str(e)) 116 | 117 | 118 | #################################### 119 | # main function can be used for test and demo 120 | #################################### 121 | def main(): 122 | from azureml.api.schema.dataTypes import DataTypes 123 | from azureml.api.schema.sampleDefinition import SampleDefinition 124 | from azureml.api.realtime.services import generate_schema 125 | 126 | print('Entered main function:') 127 | print(os.getcwd()) 128 | 129 | amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] 130 | print(amlWBSharedDir) 131 | 132 | def get_files_in_dir(crt_dir): 133 | return( [f for f in os.listdir(crt_dir) if os.path.isfile(os.path.join(crt_dir, f))]) 134 | 135 | fully_trained_weights_dir=os.path.join( 136 | amlWBSharedDir, 137 | os.path.join(*(['chestxray', 'output', 'trained_models_weights']))) 138 | crt_models = get_files_in_dir(fully_trained_weights_dir) 139 | print(fully_trained_weights_dir) 140 | print(crt_models) 141 | 142 | test_images_dir=os.path.join( 143 | amlWBSharedDir, 144 | os.path.join(*(['chestxray', 'data', 'ChestX-ray8', 'test_images']))) 145 | test_images = get_files_in_dir(test_images_dir) 146 | print(test_images_dir) 147 | print(len(test_images)) 148 | 149 | # score in local mode (i.e. 
here in main function)
150 |     model = azure_chestxray_utils.build_DenseNetImageNet201_model()
151 |     model.load_weights(os.path.join(
152 |         fully_trained_weights_dir, densenet_weights_file_name))
153 | 
154 |     print('Model weights loaded!')
155 | 
156 |     import cv2
157 |     cv2_image = cv2.imread(os.path.join(test_images_dir, test_images[0]))
158 |     x, serialized_cam_image = get_image_score_and_serialized_cam(cv2_image, model)
159 |     file_bytes = np.asarray(bytearray(serialized_cam_image.read()), dtype=np.uint8)
160 |     recovered_image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
161 | 
162 |     # x = model.predict(cv2_image[None,:,:,:])
163 |     print(test_images[0])
164 |     print(x)
165 |     print(recovered_image.shape)
166 | 
167 |     # score in local mode (i.e. here in main function) using encoded data
168 |     encoded_image = as_string_b64encoded_pickled(cv2_image)
169 |     df_for_api = pd.DataFrame(data=[[encoded_image]], columns=[as_string_b64encoded_pickled_data_column_name])
170 |     del encoded_image
171 |     del cv2_image
172 |     del serialized_cam_image
173 | 
174 |     input_df = df_for_api[as_string_b64encoded_pickled_data_column_name][0]
175 |     input_cv2_image = unpickled_b64decoded_as_bytes(input_df)
176 |     x, serialized_cam_image = get_image_score_and_serialized_cam(input_cv2_image, model)
177 |     file_bytes = np.asarray(bytearray(serialized_cam_image.read()), dtype=np.uint8)
178 |     recovered_image = cv2.imdecode(file_bytes, cv2.IMREAD_COLOR)
179 | 
180 |     # x = model.predict(input_cv2_image[None,:,:,:])
181 |     print('After encoding and decoding:')
182 |     print(x)
183 |     print(recovered_image.shape)
184 | 
185 |     del model
186 | 
187 |     # now create the post-deployment env, i.e. score using init() and run()
188 |     crt_dir = os.getcwd()
189 |     working_dir = os.path.join(crt_dir, 'tmp_cam_deploy')
190 |     if not os.path.exists(working_dir):
191 |         os.makedirs(working_dir)
192 | 
193 |     import shutil
194 |     shutil.copyfile(
195 |         os.path.join(fully_trained_weights_dir, densenet_weights_file_name),
196 |         os.path.join(working_dir, densenet_weights_file_name))
197 | 
198 |     os.chdir(working_dir)
199 | 
200 |     # Turn on data collection debug mode to view output in stdout
201 |     os.environ["AML_MODEL_DC_DEBUG"] = 'true'
202 | 
203 |     # Test the output of the functions
204 |     init()
205 |     print("Result: " + run(df_for_api))
206 | 
207 |     # Generate the schema
208 |     data_for_schema = {"input_df": SampleDefinition(DataTypes.PANDAS, df_for_api)}
209 |     schema_file = os.path.join(fully_trained_weights_dir, 'chest_XRay_cam_service_schema.json')
210 |     generate_schema(run_func=run, inputs=data_for_schema, filepath=schema_file)
211 |     print("Schema saved in " + schema_file)
212 | 
213 | 
214 | if __name__ == "__main__":
215 |     main()
216 | 
--------------------------------------------------------------------------------
/AzureChestXRay_AMLWB/Code/01_DataPrep/001_get_data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "## Copies data from blob to local host\n",
8 |     "\n",
9 |     "##### Copyright (C) Microsoft Corporation. 
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 11, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "\n", 21 | "from IPython.core.interactiveshell import InteractiveShell\n", 22 | "InteractiveShell.ast_node_interactivity = \"all\"" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 12, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "'/azureml-share/'" 34 | ] 35 | }, 36 | "execution_count": 12, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 43 | "\n", 44 | "import os\n", 45 | "try:\n", 46 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 47 | "except:\n", 48 | " amlWBSharedDir = ''\n", 49 | " print('not using aml services?')\n", 50 | " \n", 51 | "amlWBSharedDir" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 13, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# Use the Azure Machine Learning data collector to log various metrics\n", 61 | "\n", 62 | "from azureml.logging import get_azureml_logger\n", 63 | "logger = get_azureml_logger()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 14, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# Use Azure Machine Learning history magic to control history collection\n", 73 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 74 | "# %azureml history on" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 15, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "[None]" 86 | ] 87 | }, 88 | "execution_count": 15, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "# import utlity functions\n", 95 | "\n", 96 | "import sys, os\n", 97 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 98 | "def add_path_to_sys_path(path_to_append):\n", 99 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 100 | " sys.path.append(path_to_append)\n", 101 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 102 | "\n", 103 | "import azure_chestxray_utils" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 16, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "'/azureml-run'" 115 | ] 116 | }, 117 | "execution_count": 16, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | }, 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "'/azureml-share/chestxray/data/ChestX-ray8'" 125 | ] 126 | }, 127 | "execution_count": 16, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | }, 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "'/azureml-share/code/notShared'" 135 | ] 136 | }, 137 | "execution_count": 16, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | }, 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 145 | ] 146 | }, 147 | "execution_count": 16, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "# create the file path variables \n", 154 | "# create nih_chest_xray_data_dir (in container dir mapped to a host dir for data 
persistence), \n", 155 | "# where data will be copied from blob\n", 156 | "\n", 157 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 158 | "\n", 159 | "os.getcwd()\n", 160 | "\n", 161 | "\n", 162 | "data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 163 | "credential_info_path=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.CREDENTIALS_DIR_list)))\n", 164 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 165 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 166 | "\n", 167 | "!mkdir -p {data_base_input_dir}\n", 168 | "!mkdir -p {credential_info_path} \n", 169 | "!mkdir -p {nih_chest_xray_data_dir}\n", 170 | "\n", 171 | "data_base_input_dir\n", 172 | "credential_info_path\n", 173 | "nih_chest_xray_data_dir" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 17, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Credentials file found at /azureml-share/code/notShared/get_data_access_secrets.py\n" 186 | ] 187 | }, 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "'/azureml-share/code/notShared/get_data_access_secrets.py'" 192 | ] 193 | }, 194 | "execution_count": 17, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "# Create this function in file '{credential_info_path}/get_data_access_secrets.py'\n", 201 | "# def get_blob_credentials():\n", 202 | "# dataBlob = 'https://somedatastore.blob.core.windows.net/somecontainer'\n", 203 | "# sourceKey = 'somesourceKey'\n", 204 | "# return dataBlob, sourceKey\n", 205 | "# \n", 206 | "# For example:\n", 207 | "CredentialsFileName = os.path.join(credential_info_path, 'get_data_access_secrets.py') \n", 208 | "crt_container = 'https://somedatastore.blob.core.windows.net/somecontainer' \n", 209 | "crt_source_Key = 'somesourceKey'\n", 210 | "import os.path \n", 211 | "if not os.path.isfile(CredentialsFileName): \n", 212 | " print('Credentials file not found, will be written!') \n", 213 | " with open(CredentialsFileName,'w') as myFile:\n", 214 | " myFile.write((\"def get_blob_credentials():\\n\\t\" +\n", 215 | " \"dataBlob = '\" + crt_container + \"'\\n\\t\" +\n", 216 | " \"sourceKey = '\" + crt_source_Key + \"'\\n\\t\" +\n", 217 | " \"return dataBlob, sourceKey\\n\" )) \n", 218 | " print(('Credentials file written at ' +CredentialsFileName)) \n", 219 | "else: \n", 220 | " print(('Credentials file found at ' + CredentialsFileName))\n", 221 | " \n", 222 | "# check CredentialsFileName existence and print content if needed\n", 223 | "\n", 224 | "credentials_file_name = os.path.join(*([credential_info_path, 'get_data_access_secrets.py']))\n", 225 | "credentials_file_name\n", 226 | "# !cat {credentials_file_name}" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 18, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "# read credentials info\n", 236 | "\n", 237 | "import sys\n", 238 | "def prepend_path_to_sys_path(path_to_append):\n", 239 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 240 | " sys.path.append(path_to_append)\n", 241 | "prepend_path_to_sys_path(credential_info_path) \n", 242 | "import get_data_access_secrets \n", 243 | "crt_container, crt_key = get_data_access_secrets.get_blob_credentials()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 19, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 
| "#check azcopy is available. Uncomment second line to print _response_ if needed\n", 253 | "\n", 254 | "response = !azcopy\n", 255 | "# response" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 20, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# copy data to local host dir\n", 265 | "# add { --source-key {crt_key} } if needed\n", 266 | "\n", 267 | "answer = !yes | azcopy \\\n", 268 | " --source {crt_container} \\\n", 269 | " --destination {nih_chest_xray_data_dir} \\\n", 270 | " --recursive" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 21, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "['[2018/02/06 05:19:05] Transfer summary:',\n", 282 | " '-----------------',\n", 283 | " 'Total files transferred: 112120',\n", 284 | " 'Transfer successfully: 112120',\n", 285 | " 'Transfer skipped: 0',\n", 286 | " 'Transfer failed: 0',\n", 287 | " 'Elapsed time: 00.00:09:57']" 288 | ] 289 | }, 290 | "execution_count": 21, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "answer[-7:]" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 22, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 308 | ] 309 | }, 310 | "execution_count": 22, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | }, 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "112120\r\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "nih_chest_xray_data_dir\n", 324 | "!find $nih_chest_xray_data_dir -type f | wc -l" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 23, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# run below command in amlwb cli to save current sessin as html\n", 334 | "# jupyter nbconvert --to html .\\Code\\01_DataPrep\\001_get_data.ipynb" 335 | ] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "azure_chestxray_lung_disease gpucomputecontext", 341 | "language": "python", 342 | "name": "azure_chestxray_lung_disease_gpucomputecontext" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.5.2" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 2 359 | } 360 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/02_Model/000_preprocess.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Data preprocessing\n", 8 | "\n", 9 | "##### Copyright (C) Microsoft Corporation. 
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 13, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 13, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Use the Azure Machine Learning data collector to log various metrics\n", 60 | "from azureml.logging import get_azureml_logger\n", 61 | "logger = get_azureml_logger()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 15, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Use Azure Machine Learning history magic to control history collection\n", 71 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 72 | "# %azureml history on" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 16, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[None]" 84 | ] 85 | }, 86 | "execution_count": 16, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# import utlity functions\n", 93 | "\n", 94 | "import sys, os\n", 95 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 96 | "def add_path_to_sys_path(path_to_append):\n", 97 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 98 | " sys.path.append(path_to_append)\n", 99 | "\n", 100 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 101 | "\n", 102 | "import azure_chestxray_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "#### Path variables" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "'/azureml-share/chestxray/data/ChestX-ray8'" 121 | ] 122 | }, 123 | "execution_count": 17, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "'/azureml-share/chestxray/output'" 131 | ] 132 | }, 133 | "execution_count": 17, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "# create base directories for the file path variables \n", 140 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 141 | "\n", 142 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 143 | "\n", 144 | "data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 145 | "data_base_output_dir=os.path.join(amlWBSharedDir, 
os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 146 | "\n", 147 | "data_base_input_dir\n", 148 | "data_base_output_dir\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 18, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 160 | ] 161 | }, 162 | "execution_count": 18, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | }, 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "orig images number:['112120'] \n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "# chest xray images are in nih_chest_xray_data_dir\n", 176 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 177 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 178 | "nih_chest_xray_data_dir\n", 179 | "\n", 180 | "# check if we have all 112120 images in nih_chest_xray_data_dir\n", 181 | "orig_images_no = !find $nih_chest_xray_data_dir -type f | wc -l\n", 182 | "print(\"orig images number:{} \".format(orig_images_no))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 19, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'" 194 | ] 195 | }, 196 | "execution_count": 19, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "BBox_List_2017.csv Data_Entry_2017.csv blacklist.csv\r\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "# check if we have the patients file list Data_Entry_2017.csv and BBox_List_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC)\n", 210 | "# blacklist.csv is generated by data scientists with no medical background\n", 211 | "\n", 212 | "other_data_dir=os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))\n", 213 | "other_data_dir\n", 214 | "# !mkdir -p {other_data_dir}\n", 215 | "!ls $other_data_dir\n", 216 | "\n", 217 | "# data is split into train/test/validation partitions\n", 218 | "data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 219 | "!mkdir -p {data_partitions_dir}" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 20, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "import pickle\n", 229 | "import random\n", 230 | "import re\n", 231 | "import tqdm\n", 232 | "\n", 233 | "import cv2\n", 234 | "import numpy as np\n", 235 | "import pandas as pd\n", 236 | "import sklearn.model_selection " 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "#### Train/Validation/Test Data partitioning \n", 244 | " - remove the images listed in blacklist.csv, which have low quality. \n", 245 | " - remove the NIH bounding box patients, since we will save those patients for later validation use. \n", 246 | " - divide the data into train/valid/test sets using a 7:1:2 ratio." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 21, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "len of original patient id is 30805\n", 259 | "len of cleaned patient id is 30079\n", 260 | "len of unique patient id with annotated data 726\n", 261 | "len of patient id with annotated data 984\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# remove NIH manually annotated data (ground truth with heavy pathologies, no healthy patients) \n", 267 | "# exclude what visually looks like bad images to data scientists with no medical background\n", 268 | "# todo\n", 269 | "# This should probably be a generic function\n", 270 | "\n", 271 | "\n", 272 | "total_patient_number = 30805\n", 273 | "NIH_annotated_file = 'BBox_List_2017.csv' # exclude from train pathology annotated by radiologists \n", 274 | "manually_selected_bad_images_file = 'blacklist.csv' # exclude what visually looks like bad images\n", 275 | "\n", 276 | "patient_id_original = [i for i in range(1,total_patient_number + 1)]\n", 277 | "\n", 278 | "# ignored images list is used later, since this is not a patient ID level issue\n", 279 | "ignored_images_set = set()\n", 280 | "with open(os.path.join(other_data_dir, manually_selected_bad_images_file), 'r') as f:\n", 281 | " for line in f:\n", 282 | " # delete the last char which is \\n\n", 283 | " ignored_images_set.add(line[:-1])\n", 284 | " if int(line[:-9]) >= 30805:\n", 285 | " print(line[:-1])\n", 286 | "\n", 287 | "bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))\n", 288 | "bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)\n", 289 | "\n", 290 | "bbox_patient_index_list = []\n", 291 | "for index, item in bbox_patient_index_df.iteritems():\n", 292 | " bbox_patient_index_list.append(int(item))\n", 293 | "\n", 294 | "patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))\n", 295 | "print(\"len of original patient id is\", len(patient_id_original))\n", 296 | "print(\"len of cleaned patient id is\", len(patient_id))\n", 297 | "print(\"len of unique patient id with annotated data\", \n", 298 | " len(list(set(bbox_patient_index_list))))\n", 299 | "print(\"len of patient id with annotated data\",bbox_df.shape[0])\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 22, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "first ten patient ids are [24303, 16035, 4967, 28624, 5378, 20335, 17069, 12271, 16975, 4469]\n", 312 | "train:21563 valid:3081 test:6161\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "random.seed(0)\n", 318 | "random.shuffle(patient_id)\n", 319 | "\n", 320 | "print(\"first ten patient ids are\", patient_id[:10])\n", 321 | "\n", 322 | "# training:valid:test=7:1:2\n", 323 | "patient_id_train = patient_id[:int(total_patient_number * 0.7)]\n", 324 | "patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]\n", 325 | "# get the rest of the patient_id as the test set\n", 326 | "patient_id_test = patient_id[int(total_patient_number * 0.8):]\n", 327 | "patient_id_test.extend(bbox_patient_index_list)\n", 328 | "patient_id_test = list(set(patient_id_test))\n", 329 | "\n", 330 | "\n", 331 | "print(\"train:{} valid:{} test:{}\".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))\n", 332 | "\n", 333 | "# test_set = test_set+left_out_patient_id\n", 334 | "# 
print(\"train:{} valid:{} test:{}\".format(len(train_set), len(valid_set), len(test_set)))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 23, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Add a few more project constants\n", 344 | "\n", 345 | "pathologies_name_list = prj_consts.DISEASE_list\n", 346 | "NIH_patients_and_labels_file = 'Data_Entry_2017.csv'" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "#### Finally do preprocessing\n", 354 | "Save labels and partitions" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 25, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def process_data(current_df, patient_ids):\n", 373 | " image_name_index = []\n", 374 | " image_labels = {}\n", 375 | " for individual_patient in tqdm.tqdm(patient_ids):\n", 376 | " for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():\n", 377 | " processed_image_name = row['Image Index']\n", 378 | " if processed_image_name in ignored_images_set:\n", 379 | " pass\n", 380 | " else:\n", 381 | " image_name_index.append(processed_image_name)\n", 382 | " image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)\n", 383 | " for disease_index, ele in enumerate(pathologies_name_list):\n", 384 | " if re.search(ele, row['Finding Labels'], re.IGNORECASE):\n", 385 | " image_labels[processed_image_name][disease_index] = 1\n", 386 | " else:\n", 387 | " # redundant code but just to make it more readable\n", 388 | " image_labels[processed_image_name][disease_index] = 0\n", 389 | " # print(\"processed\", row['Image Index'])\n", 390 | " return image_name_index, image_labels\n" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 26, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stderr", 400 | "output_type": "stream", 401 | "text": [ 402 | "100%|██████████| 21563/21563 [00:35<00:00, 606.57it/s]\n", 403 | "100%|██████████| 3081/3081 [00:05<00:00, 614.10it/s]\n", 404 | "100%|██████████| 6161/6161 [00:13<00:00, 449.08it/s]\n" 405 | ] 406 | }, 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "train, valid, test image number is: 68508 9495 32893\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "# # create and save train/test/validation partitions list\n", 417 | "\n", 418 | "train_data_index, train_labels = process_data(labels_df, patient_id_train)\n", 419 | "valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)\n", 420 | "test_data_index, test_labels = process_data(labels_df, patient_id_test)\n", 421 | "\n", 422 | "print(\"train, valid, test image number is:\", len(train_data_index), len(valid_data_index), len(test_data_index))\n", 423 | "\n", 424 | "# save the data\n", 425 | "labels_all = {}\n", 426 | "labels_all.update(train_labels)\n", 427 | "labels_all.update(valid_labels)\n", 428 | "labels_all.update(test_labels)\n", 429 | "\n", 430 | "partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}\n", 431 | "\n", 432 | "with open(os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle'), 'wb') as f:\n", 433 | " pickle.dump(labels_all, f)\n", 434 | "\n", 435 | "with 
open(os.path.join(data_partitions_dir,'partition14_unormalized_cleaned.pickle'), 'wb') as f:\n", 436 | " pickle.dump(partition_dict, f)\n", 437 | " \n", 438 | "# also save the patient id partitions for pytorch training \n", 439 | "with open(os.path.join(data_partitions_dir,'train_test_valid_data_partitions.pickle'), 'wb') as f:\n", 440 | " pickle.dump([patient_id_train,patient_id_valid,\n", 441 | " patient_id_test,\n", 442 | " list(set(bbox_patient_index_list))], f) \n" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 27, 448 | "metadata": {}, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "dict" 454 | ] 455 | }, 456 | "execution_count": 27, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | }, 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "{'00001256_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 464 | " '00010535_020.png': array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 465 | " '00017170_004.png': array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 466 | " '00017906_025.png': array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8),\n", 467 | " '00030353_000.png': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)}" 468 | ] 469 | }, 470 | "execution_count": 27, 471 | "metadata": {}, 472 | "output_type": "execute_result" 473 | } 474 | ], 475 | "source": [ 476 | "# sanity check, see train labels\n", 477 | "\n", 478 | "type(train_labels)\n", 479 | "{k: train_labels[k] for k in list(train_labels)[:5]}" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 28, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# jupyter nbconvert --to html .\\Code\\02_Model\\000_preprocess.ipynb" 489 | ] 490 | } 491 | ], 492 | "metadata": { 493 | "kernelspec": { 494 | "display_name": "Python 3", 495 | "language": "python", 496 | "name": "python3" 497 | }, 498 | "language_info": { 499 | "codemirror_mode": { 500 | "name": "ipython", 501 | "version": 3 502 | }, 503 | "file_extension": ".py", 504 | "mimetype": "text/x-python", 505 | "name": "python", 506 | "nbconvert_exporter": "python", 507 | "pygments_lexer": "ipython3", 508 | "version": "3.6.3" 509 | } 510 | }, 511 | "nbformat": 4, 512 | "nbformat_minor": 2 513 | } 514 | --------------------------------------------------------------------------------
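One detail of the preprocessing above worth making explicit before the training notebook: each image's 'Finding Labels' string (a '|'-separated list such as 'Cardiomegaly|Effusion') is mapped to a 14-dimensional multi-hot vector, one slot per pathology. A minimal self-contained sketch of that encoding (the pathology order below is illustrative; the repo takes the authoritative list from prj_consts.DISEASE_list):

import re
import numpy as np

pathologies = ['Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration',
               'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation',
               'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia']

def encode_finding_labels(finding_labels):
    # set vec[i] = 1 for every pathology name found in the label string
    vec = np.zeros(len(pathologies), dtype=np.uint8)
    for i, name in enumerate(pathologies):
        if re.search(name, finding_labels, re.IGNORECASE):
            vec[i] = 1
    return vec

print(encode_finding_labels('Cardiomegaly|Effusion'))
# -> [0 1 1 0 0 0 0 0 0 0 0 0 0 0] (given the illustrative order above)

'No Finding' rows match none of the pathology names and therefore encode to the all-zeros vector, which is how healthy images end up labeled in the pickles saved above.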
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 13, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 13, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 14, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Use the Azure Machine Learning data collector to log various metrics\n", 60 | "from azureml.logging import get_azureml_logger\n", 61 | "logger = get_azureml_logger()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 15, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Use Azure Machine Learning history magic to control history collection\n", 71 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 72 | "# %azureml history on" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 16, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[None]" 84 | ] 85 | }, 86 | "execution_count": 16, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# import utlity functions\n", 93 | "\n", 94 | "import sys, os\n", 95 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 96 | "def add_path_to_sys_path(path_to_append):\n", 97 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 98 | " sys.path.append(path_to_append)\n", 99 | "\n", 100 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 101 | "\n", 102 | "import azure_chestxray_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "#### Path variables" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 17, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "'/azureml-share/chestxray/data/ChestX-ray8'" 121 | ] 122 | }, 123 | "execution_count": 17, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | }, 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "'/azureml-share/chestxray/output'" 131 | ] 132 | }, 133 | "execution_count": 17, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "# create base directories for the file path variables \n", 140 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 141 | "\n", 142 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 143 | "\n", 144 | "data_base_input_dir=os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 145 | "data_base_output_dir=os.path.join(amlWBSharedDir, 
os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 146 | "\n", 147 | "data_base_input_dir\n", 148 | "data_base_output_dir\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 18, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 160 | ] 161 | }, 162 | "execution_count": 18, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | }, 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "orig images number:['112120'] \n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "# chest xray images are in nih_chest_xray_data_dir\n", 176 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 177 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 178 | "nih_chest_xray_data_dir\n", 179 | "\n", 180 | "# check if we have all 112120 images in nih_chest_xray_data_dir\n", 181 | "orig_images_no = !find $nih_chest_xray_data_dir -type f | wc -l\n", 182 | "print(\"orig images number:{} \".format(orig_images_no))" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 19, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'" 194 | ] 195 | }, 196 | "execution_count": 19, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "BBox_List_2017.csv Data_Entry_2017.csv blacklist.csv\r\n" 205 | ] 206 | } 207 | ], 208 | "source": [ 209 | "# check if we have patients file list Data_Entry_2017.csv and BBox_List_2017.csv (https://nihcc.app.box.com/v/ChestXray-NIHCC)\n", 210 | "# blacklist.csv is genrated by data scientists with no medical background\n", 211 | "\n", 212 | "other_data_dir=os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))\n", 213 | "other_data_dir\n", 214 | "# !mkdir -p {other_data_dir}\n", 215 | "!ls $other_data_dir\n", 216 | "\n", 217 | "# data is split into train/test/validation partitions\n", 218 | "data_partitions_dir=os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 219 | "!mkdir -p {data_partitions_dir}" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 20, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "import pickle\n", 229 | "import random\n", 230 | "import re\n", 231 | "import tqdm\n", 232 | "\n", 233 | "import cv2\n", 234 | "import numpy as np\n", 235 | "import pandas as pd\n", 236 | "import sklearn.model_selection " 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "#### Train/Validation/Test Data partitioning \n", 244 | " - remove the images in the blacklist.csv where the image has low quality. \n", 245 | " - remove the NIH bounding box patients since we will save those patients for later validation use. \n", 246 | " - We will also divide data into train/valid/test dataset using a 7:1:2 ratio." 
247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 21, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "len of original patient id is 30805\n", 259 | "len of cleaned patient id is 30079\n", 260 | "len of unique patient id with annotated data 726\n", 261 | "len of patient id with annotated data 984\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# remove NIH manually annotated data (groung truth with heavy pathologies, no healthy patients) \n", 267 | "# exclude what visusally looks like bad images to data scientists with no medical background\n", 268 | "# todo\n", 269 | "# This should prob be a generic function\n", 270 | "\n", 271 | "\n", 272 | "total_patient_number = 30805\n", 273 | "NIH_annotated_file = 'BBox_List_2017.csv' # exclude from train pathology annotated by radiologists \n", 274 | "manually_selected_bad_images_file = 'blacklist.csv'# exclude what viusally looks like bad images\n", 275 | "\n", 276 | "patient_id_original = [i for i in range(1,total_patient_number + 1)]\n", 277 | "\n", 278 | "# ignored images list is used later, since this is not a patient ID level issue\n", 279 | "ignored_images_set = set()\n", 280 | "with open(os.path.join(other_data_dir, manually_selected_bad_images_file), 'r') as f:\n", 281 | " for line in f:\n", 282 | " # delete the last char which is \\n\n", 283 | " ignored_images_set.add(line[:-1])\n", 284 | " if int(line[:-9]) >= 30805:\n", 285 | " print(line[:-1])\n", 286 | "\n", 287 | "bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))\n", 288 | "bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)\n", 289 | "\n", 290 | "bbox_patient_index_list = []\n", 291 | "for index, item in bbox_patient_index_df.iteritems():\n", 292 | " bbox_patient_index_list.append(int(item))\n", 293 | "\n", 294 | "patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))\n", 295 | "print(\"len of original patient id is\", len(patient_id_original))\n", 296 | "print(\"len of cleaned patient id is\", len(patient_id))\n", 297 | "print(\"len of unique patient id with annotated data\", \n", 298 | " len(list(set(bbox_patient_index_list))))\n", 299 | "print(\"len of patient id with annotated data\",bbox_df.shape[0])\n" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 22, 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "name": "stdout", 309 | "output_type": "stream", 310 | "text": [ 311 | "first ten patient ids are [24303, 16035, 4967, 28624, 5378, 20335, 17069, 12271, 16975, 4469]\n", 312 | "train:21563 valid:3081 test:6161\n" 313 | ] 314 | } 315 | ], 316 | "source": [ 317 | "random.seed(0)\n", 318 | "random.shuffle(patient_id)\n", 319 | "\n", 320 | "print(\"first ten patient ids are\", patient_id[:10])\n", 321 | "\n", 322 | "# training:valid:test=7:1:2\n", 323 | "patient_id_train = patient_id[:int(total_patient_number * 0.7)]\n", 324 | "patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]\n", 325 | "# get the rest of the patient_id as the test set\n", 326 | "patient_id_test = patient_id[int(total_patient_number * 0.8):]\n", 327 | "patient_id_test.extend(bbox_patient_index_list)\n", 328 | "patient_id_test = list(set(patient_id_test))\n", 329 | "\n", 330 | "\n", 331 | "print(\"train:{} valid:{} test:{}\".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))\n", 332 | "\n", 333 | "# test_set = test_set+left_out_patient_id\n", 334 | "# 
print(\"train:{} valid:{} test:{}\".format(len(train_set), len(valid_set), len(test_set)))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 23, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Add a few more project constants\n", 344 | "\n", 345 | "pathologies_name_list = prj_consts.DISEASE_list\n", 346 | "NIH_patients_and_labels_file = 'Data_Entry_2017.csv'" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "#### Finally do preprocessing\n", 354 | "Save labels and partitions" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 24, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 25, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "def process_data(current_df, patient_ids):\n", 373 | " image_name_index = []\n", 374 | " image_labels = {}\n", 375 | " for individual_patient in tqdm.tqdm(patient_ids):\n", 376 | " for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():\n", 377 | " processed_image_name = row['Image Index']\n", 378 | " if processed_image_name in ignored_images_set:\n", 379 | " pass\n", 380 | " else:\n", 381 | " image_name_index.append(processed_image_name)\n", 382 | " image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)\n", 383 | " for disease_index, ele in enumerate(pathologies_name_list):\n", 384 | " if re.search(ele, row['Finding Labels'], re.IGNORECASE):\n", 385 | " image_labels[processed_image_name][disease_index] = 1\n", 386 | " else:\n", 387 | " # redundant code but just to make it more readable\n", 388 | " image_labels[processed_image_name][disease_index] = 0\n", 389 | " # print(\"processed\", row['Image Index'])\n", 390 | " return image_name_index, image_labels\n" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 26, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stderr", 400 | "output_type": "stream", 401 | "text": [ 402 | "100%|██████████| 21563/21563 [00:35<00:00, 606.57it/s]\n", 403 | "100%|██████████| 3081/3081 [00:05<00:00, 614.10it/s]\n", 404 | "100%|██████████| 6161/6161 [00:13<00:00, 449.08it/s]\n" 405 | ] 406 | }, 407 | { 408 | "name": "stdout", 409 | "output_type": "stream", 410 | "text": [ 411 | "train, valid, test image number is: 68508 9495 32893\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "# # create and save train/test/validation partitions list\n", 417 | "\n", 418 | "train_data_index, train_labels = process_data(labels_df, patient_id_train)\n", 419 | "valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)\n", 420 | "test_data_index, test_labels = process_data(labels_df, patient_id_test)\n", 421 | "\n", 422 | "print(\"train, valid, test image number is:\", len(train_data_index), len(valid_data_index), len(test_data_index))\n", 423 | "\n", 424 | "# save the data\n", 425 | "labels_all = {}\n", 426 | "labels_all.update(train_labels)\n", 427 | "labels_all.update(valid_labels)\n", 428 | "labels_all.update(test_labels)\n", 429 | "\n", 430 | "partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}\n", 431 | "\n", 432 | "with open(os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle'), 'wb') as f:\n", 433 | " pickle.dump(labels_all, f)\n", 434 | "\n", 435 | "with 
/AzureChestXRay_AMLWB/Code/02_Model/010_train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train\n", 8 | "\n", 9 | "##### Copyright (C) Microsoft Corporation. \n",
\n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "# Use the Azure Machine Learning data collector to log various metrics\n", 60 | "from azureml.logging import get_azureml_logger\n", 61 | "logger = get_azureml_logger()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Use Azure Machine Learning history magic to control history collection\n", 71 | "# History is off by default, options are \"on\", \"off\", or \"show\"\n", 72 | "# %azureml history on" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "[None]" 84 | ] 85 | }, 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "# import utlity functions\n", 93 | "\n", 94 | "import sys, os\n", 95 | "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n", 96 | "def add_path_to_sys_path(path_to_append):\n", 97 | " if not (any(path_to_append in paths for paths in sys.path)):\n", 98 | " sys.path.append(path_to_append)\n", 99 | "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n", 100 | "\n", 101 | "import azure_chestxray_utils\n", 102 | "# import azure_chestxray_keras_utils" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# create the file path variables \n", 112 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 113 | "\n", 114 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 115 | "\n", 116 | "data_base_input_dir=os.path.join(amlWBSharedDir, \n", 117 | " os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 118 | "data_base_output_dir=os.path.join(amlWBSharedDir, \n", 119 | " os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 120 | "\n", 121 | "\n", 122 | "# data used for training\n", 123 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 124 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 125 | "\n", 126 | "data_partitions_dir=os.path.join(data_base_output_dir, \n", 127 | " os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 128 | "partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')\n", 129 | "label_path = 
os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "'/azureml-share/chestxray/output/weights_tmpdir'" 141 | ] 142 | }, 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | }, 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "total 0\r\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "# global variables\n", 157 | "\n", 158 | "weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list))) \n", 159 | "!mkdir -p {weights_dir}\n", 160 | "weights_dir\n", 161 | "!ls -l {weights_dir}\n", 162 | "\n", 163 | "# weights_path = os.path.join(\n", 164 | "# weights_dir, \n", 165 | "# prj_consts.PRETRAINED_DENSENET201_IMAGENET_CHESTXRAY_MODEL_FILE_NAME)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "name": "stderr", 175 | "output_type": "stream", 176 | "text": [ 177 | "Using TensorFlow backend.\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "import os\n", 183 | "\n", 184 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\" # see issue #152\n", 185 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1\"\n", 186 | "\n", 187 | "import imgaug as ia\n", 188 | "from imgaug import augmenters as iaa\n", 189 | "ia.seed(1)\n", 190 | "\n", 191 | "import cv2\n", 192 | "import keras.backend as K\n", 193 | "from keras.optimizers import Adam\n", 194 | "from keras.callbacks import ReduceLROnPlateau, Callback, ModelCheckpoint\n", 195 | "import numpy as np\n", 196 | "import pickle\n", 197 | "from keras_contrib.applications.densenet import DenseNetImageNet121\n", 198 | "from keras.layers import Dense\n", 199 | "from keras.models import Model\n", 200 | "from keras.utils import multi_gpu_model\n", 201 | "from tensorflow.python.client import device_lib\n", 202 | "import warnings\n", 203 | "from keras.utils import Sequence\n", 204 | "import tensorflow as tf" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "For testing purposes, we run just 1 epoch. One epoch takes around 25 minutes on 2 K80 GPUs, and the model usually needs around 30~50 epochs to converge." 
212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 9, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# set force_restart = False to continue a previous training session; set it to True to start from scratch\n", 221 | "force_restart = False\n", 222 | "\n", 223 | "initial_lr = 0.001\n", 224 | "resized_height = 224\n", 225 | "resized_width = 224\n", 226 | "# resized_height = prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_HEIGHT\n", 227 | "# resized_width = prj_consts.CHESTXRAY_MODEL_EXPECTED_IMAGE_WIDTH\n", 228 | "num_channel = 3\n", 229 | "num_classes = 14\n", 230 | "epochs = 1 #200" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 10, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "def get_available_gpus():\n", 240 | " \"\"\"\n", 241 | "\n", 242 | " Returns: list of the GPU device names available in the system\n", 243 | "\n", 244 | " \"\"\"\n", 245 | " local_device_protos = device_lib.list_local_devices()\n", 246 | " return [x.name for x in local_device_protos if x.device_type == 'GPU']\n", 247 | "\n", 248 | "\n", 249 | "# get number of available GPUs\n", 250 | "num_gpu = len(get_available_gpus())\n", 251 | "\n", 252 | "# keras multi_gpu_model slices the data across different GPUs. see https://keras.io/utils/#multi_gpu_model for more details.\n", 253 | "batch_size = 48 * num_gpu\n", 254 | "\n", 255 | "\n", 256 | "# we use the Keras multi-GPU model, so we need to make sure the batch_size is divisible by num_gpu." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 11, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# device_lib.list_local_devices()\n", 266 | "# !nvidia-smi" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 12, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "# we use the Keras multi-GPU model, so we need to make sure the batch_size is divisible by num_gpu.\n", 276 | "\n", 277 | "# multi GPU model checkpoint. copied from https://github.com/keras-team/keras/issues/8463\n", 278 | "class MultiGPUCheckpointCallback(Callback):\n", 279 | "\n", 280 | " def __init__(self, filepath, base_model, monitor='val_loss', verbose=0,\n", 281 | " save_best_only=False, save_weights_only=False,\n", 282 | " mode='auto', period=1):\n", 283 | " super(MultiGPUCheckpointCallback, self).__init__()\n", 284 | " self.base_model = base_model\n", 285 | " self.monitor = monitor\n", 286 | " self.verbose = verbose\n", 287 | " self.filepath = filepath\n", 288 | " self.save_best_only = save_best_only\n", 289 | " self.save_weights_only = save_weights_only\n", 290 | " self.period = period\n", 291 | " self.epochs_since_last_save = 0\n", 292 | "\n", 293 | " if mode not in ['auto', 'min', 'max']:\n", 294 | " warnings.warn('ModelCheckpoint mode %s is unknown, '\n", 295 | " 'fallback to auto mode.' 
% (mode),\n", 296 | " RuntimeWarning)\n", 297 | " mode = 'auto'\n", 298 | "\n", 299 | " if mode == 'min':\n", 300 | " self.monitor_op = np.less\n", 301 | " self.best = np.Inf\n", 302 | " elif mode == 'max':\n", 303 | " self.monitor_op = np.greater\n", 304 | " self.best = -np.Inf\n", 305 | " else:\n", 306 | " if 'acc' in self.monitor or self.monitor.startswith('fmeasure'):\n", 307 | " self.monitor_op = np.greater\n", 308 | " self.best = -np.Inf\n", 309 | " else:\n", 310 | " self.monitor_op = np.less\n", 311 | " self.best = np.Inf\n", 312 | "\n", 313 | " def on_epoch_end(self, epoch, logs=None):\n", 314 | " logs = logs or {}\n", 315 | " self.epochs_since_last_save += 1\n", 316 | " if self.epochs_since_last_save >= self.period:\n", 317 | " self.epochs_since_last_save = 0\n", 318 | " filepath = self.filepath.format(epoch=epoch + 1, **logs)\n", 319 | " if self.save_best_only:\n", 320 | " current = logs.get(self.monitor)\n", 321 | " if current is None:\n", 322 | " warnings.warn('Can save best model only with %s available, '\n", 323 | " 'skipping.' % (self.monitor), RuntimeWarning)\n", 324 | " else:\n", 325 | " if self.monitor_op(current, self.best):\n", 326 | " if self.verbose > 0:\n", 327 | " print('Epoch %05d: %s improved from %0.5f to %0.5f,'\n", 328 | " ' saving model to %s'\n", 329 | " % (epoch + 1, self.monitor, self.best,\n", 330 | " current, filepath))\n", 331 | " self.best = current\n", 332 | " if self.save_weights_only:\n", 333 | " self.base_model.save_weights(filepath, overwrite=True)\n", 334 | " else:\n", 335 | " self.base_model.save(filepath, overwrite=True)\n", 336 | " else:\n", 337 | " if self.verbose > 0:\n", 338 | " print('Epoch %05d: %s did not improve' %\n", 339 | " (epoch + 1, self.monitor))\n", 340 | " else:\n", 341 | " if self.verbose > 0:\n", 342 | " print('Epoch %05d: saving model to %s' % (epoch + 1, filepath))\n", 343 | " if self.save_weights_only:\n", 344 | " self.base_model.save_weights(filepath, overwrite=True)\n", 345 | " else:\n", 346 | " self.base_model.save(filepath, overwrite=True)\n", 347 | "\n", 348 | "\n", 349 | "seq = iaa.Sequential([\n", 350 | " iaa.Fliplr(0.5), # horizontal flips\n", 351 | " iaa.Affine(rotate=(-15, 15)), # random rotate image\n", 352 | " iaa.Affine(scale=(0.8, 1.1)), # randomly scale the image\n", 353 | "], random_order=True) # apply augmenters in random order\n", 354 | "\n", 355 | "\n", 356 | "# generator for train and validation data\n", 357 | "# use the Sequence class per issue https://github.com/keras-team/keras/issues/1638\n", 358 | "class DataGenSequence(Sequence):\n", 359 | " def __init__(self, labels, image_file_index, current_state):\n", 360 | " self.batch_size = batch_size\n", 361 | " self.labels = labels\n", 362 | " self.img_file_index = image_file_index\n", 363 | " self.current_state = current_state\n", 364 | " self.len = len(self.img_file_index) // self.batch_size\n", 365 | " print(\"for DataGenSequence\", current_state, \"total rows are:\", len(self.img_file_index), \", len is\", self.len)\n", 366 | "\n", 367 | " def __len__(self):\n", 368 | " return self.len\n", 369 | "\n", 370 | " def __getitem__(self, idx):\n", 371 | " # print(\"loading data segmentation\", idx)\n", 372 | " # make sure each batch size has the same amount of data\n", 373 | " current_batch = self.img_file_index[idx * self.batch_size: (idx + 1) * self.batch_size]\n", 374 | " X = np.empty((self.batch_size, resized_height, resized_width, num_channel))\n", 375 | " y = np.empty((self.batch_size, num_classes))\n", 376 | "\n", 377 | " for i, image_name in 
enumerate(current_batch):\n", 378 | " path = os.path.join(nih_chest_xray_data_dir, image_name)\n", 379 | " # loading data\n", 380 | "\n", 381 | " img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float32)\n", 382 | " X[i, :, :, :] = img\n", 383 | " y[i, :] = labels[image_name]\n", 384 | "\n", 385 | " # only do random flipping in training status\n", 386 | " if self.current_state == 'train':\n", 387 | " x_augmented = seq.augment_images(X)\n", 388 | " else:\n", 389 | " x_augmented = X\n", 390 | "\n", 391 | " return x_augmented, y\n", 392 | "\n", 393 | "\n", 394 | "# loss function\n", 395 | "def unweighted_binary_crossentropy(y_true, y_pred):\n", 396 | " \"\"\"\n", 397 | " Args:\n", 398 | " y_true: true labels\n", 399 | " y_pred: predicted labels\n", 400 | "\n", 401 | " Returns: the sum of binary cross entropy loss across all the classes\n", 402 | "\n", 403 | " \"\"\"\n", 404 | " return K.sum(K.binary_crossentropy(y_true, y_pred))\n", 405 | "\n", 406 | "\n", 407 | "def build_model():\n", 408 | " \"\"\"\n", 409 | "\n", 410 | " Returns: a model with specified weights\n", 411 | "\n", 412 | " \"\"\"\n", 413 | " # define the model, use pre-trained weights for image_net\n", 414 | " base_model = DenseNetImageNet121(input_shape=(224, 224, 3),\n", 415 | " weights='imagenet',\n", 416 | " include_top=False,\n", 417 | " pooling='avg')\n", 418 | "\n", 419 | " x = base_model.output\n", 420 | " predictions = Dense(14, activation='sigmoid')(x)\n", 421 | " model = Model(inputs=base_model.input, outputs=predictions)\n", 422 | " return model" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 13, 428 | "metadata": {}, 429 | "outputs": [ 430 | { 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "using 2 GPUs\n", 435 | "Downloading data from https://github.com/titu1994/DenseNet/releases/download/v3.0/DenseNet-BC-121-32-no-top.h5\n", 436 | "33202176/33199896 [==============================] - 8s 0us/step\n", 437 | "Weights for the model were loaded successfully\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "if num_gpu > 1:\n", 443 | " print(\"using\", num_gpu, \"GPUs\")\n", 444 | " # build model\n", 445 | " with tf.device('/cpu:0'):\n", 446 | " model_single_gpu = build_model()\n", 447 | " # model_single_gpu.load_weights(weights_path)\n", 448 | "\n", 449 | " # convert to multi-gpu model\n", 450 | " model_multi_gpu = multi_gpu_model(model_single_gpu, gpus=num_gpu)\n", 451 | " model_checkpoint = MultiGPUCheckpointCallback(\n", 452 | " os.path.join(weights_dir, 'azure_chest_xray_14_weights_712split_epoch_{epoch:03d}_val_loss_{val_loss:.4f}.hdf5'),\n", 453 | " model_single_gpu, monitor='val_loss', save_weights_only=False)\n", 454 | "\n", 455 | " \n", 456 | "\n", 457 | "else:\n", 458 | " print(\"using single GPU\")\n", 459 | " model_multi_gpu = build_model()\n", 460 | " model_checkpoint = ModelCheckpoint(\n", 461 | " os.path.join(weights_dir, 'azure_chest_xray_14_weights_712split_epoch_{epoch:03d}_val_loss_{val_loss:.4f}.hdf5'),\n", 462 | " monitor='val_loss', save_weights_only=False)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 14, 468 | "metadata": {}, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "for DataGenSequence train total rows are: 68508 , len is 713\n", 475 | "for DataGenSequence validation total rows are: 9495 , len is 98\n", 476 | "Epoch 1/1\n", 477 | "713/713 [==============================] - 1275s 2s/step - loss: 214.9958 - 
val_loss: 225.4705\n" 478 | ] 479 | }, 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "" 484 | ] 485 | }, 486 | "execution_count": 14, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "num_workers = 10 * num_gpu\n", 493 | "\n", 494 | "model_multi_gpu.compile(optimizer=Adam(lr=initial_lr), loss=unweighted_binary_crossentropy)\n", 495 | "\n", 496 | "reduce_lr_on_plateau = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, min_lr=1e-6)\n", 497 | "\n", 498 | "callbacks = [model_checkpoint, reduce_lr_on_plateau]\n", 499 | "\n", 500 | "with open(label_path, 'rb') as f:\n", 501 | " labels = pickle.load(f)\n", 502 | "\n", 503 | "with open(partition_path, 'rb') as f:\n", 504 | " partition = pickle.load(f)\n", 505 | "\n", 506 | "model_multi_gpu.fit_generator(generator=DataGenSequence(labels, partition['train'], current_state='train'),\n", 507 | " epochs=epochs,\n", 508 | " verbose=1,\n", 509 | " callbacks=callbacks,\n", 510 | " workers=num_workers,\n", 511 | " # max_queue_size=32,\n", 512 | " # shuffle=False,\n", 513 | " validation_data=DataGenSequence(labels, partition['valid'], current_state='validation')\n", 514 | " # validation_steps=1\n", 515 | " )" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 15, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "# jupyter nbconvert --to html .\\Code\\02_Model\\010_train.ipynb" 525 | ] 526 | } 527 | ], 528 | "metadata": { 529 | "kernelspec": { 530 | "display_name": "azure_chestxray_lung_disease gpucomputecontext", 531 | "language": "python", 532 | "name": "azure_chestxray_lung_disease_gpucomputecontext" 533 | }, 534 | "language_info": { 535 | "codemirror_mode": { 536 | "name": "ipython", 537 | "version": 3 538 | }, 539 | "file_extension": ".py", 540 | "mimetype": "text/x-python", 541 | "name": "python", 542 | "nbconvert_exporter": "python", 543 | "pygments_lexer": "ipython3", 544 | "version": "3.5.2" 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 2 549 | } 550 | --------------------------------------------------------------------------------
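The evaluation notebook that follows loads fully trained weights and scores the held-out test partition. The customary summary metric for ChestX-ray14 models is per-pathology ROC AUC; a minimal sklearn-based sketch of that computation (array shapes are assumptions, not the notebook's exact code):

import numpy as np
from sklearn.metrics import roc_auc_score

def per_class_auc(y_true, y_scores, class_names):
    # y_true: (num_images, 14) binary ground-truth labels
    # y_scores: (num_images, 14) sigmoid outputs from model.predict(...)
    return {name: roc_auc_score(y_true[:, i], y_scores[:, i])
            for i, name in enumerate(class_names)}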
Define amlWBSharedDir as \"/shared_folder_on_host/amlwb_exp_acc/amlwb_work_space/amlwb_experiment\"')\n",
45 |     "    \n",
46 |     "amlWBSharedDir"
47 |    ]
48 |   },
49 |   {
50 |    "cell_type": "code",
51 |    "execution_count": 61,
52 |    "metadata": {},
53 |    "outputs": [],
54 |    "source": [
55 |     "# # Use the Azure Machine Learning data collector to log various metrics\n",
56 |     "# from azureml.logging import get_azureml_logger\n",
57 |     "# logger = get_azureml_logger()"
58 |    ]
59 |   },
60 |   {
61 |    "cell_type": "code",
62 |    "execution_count": 62,
63 |    "metadata": {},
64 |    "outputs": [],
65 |    "source": [
66 |     "# Use Azure Machine Learning history magic to control history collection\n",
67 |     "# History is off by default, options are \"on\", \"off\", or \"show\"\n",
68 |     "# %azureml history on"
69 |    ]
70 |   },
71 |   {
72 |    "cell_type": "code",
73 |    "execution_count": 63,
74 |    "metadata": {},
75 |    "outputs": [
76 |     {
77 |      "data": {
78 |       "text/plain": [
79 |        "[None]"
80 |       ]
81 |      },
82 |      "execution_count": 63,
83 |      "metadata": {},
84 |      "output_type": "execute_result"
85 |     }
86 |    ],
87 |    "source": [
88 |     "# import utility functions\n",
89 |     "\n",
90 |     "import sys, os\n",
91 |     "paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]\n",
92 |     "def add_path_to_sys_path(path_to_append):\n",
93 |     "    if not (any(path_to_append in paths for paths in sys.path)):\n",
94 |     "        sys.path.append(path_to_append)\n",
95 |     "[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]\n",
96 |     "\n",
97 |     "import azure_chestxray_utils\n",
98 |     "import azure_chestxray_keras_utils"
99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 64,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# create the file path variables \n",
108 |     "# paths are typically container level dirs mapped to a host dir for data persistence.\n",
109 |     "\n",
110 |     "prj_consts = azure_chestxray_utils.chestxray_consts()\n",
111 |     "\n",
112 |     "data_base_input_dir=os.path.join(amlWBSharedDir, \n",
113 |     "                                 os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n",
114 |     "data_base_output_dir=os.path.join(amlWBSharedDir, \n",
115 |     "                                  os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 65,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "data": {
125 |       "text/plain": [
126 |        "'/azureml-share/chestxray/output/weights_tmpdir'"
127 |       ]
128 |      },
129 |      "execution_count": 65,
130 |      "metadata": {},
131 |      "output_type": "execute_result"
132 |     },
133 |     {
134 |      "name": "stdout",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "total 0\r\n"
138 |      ]
139 |     },
140 |     {
141 |      "data": {
142 |       "text/plain": [
143 |        "'/azureml-share/chestxray/output/fully_trained_models'"
144 |       ]
145 |      },
146 |      "execution_count": 65,
147 |      "metadata": {},
148 |      "output_type": "execute_result"
149 |     },
150 |     {
151 |      "name": "stdout",
152 |      "output_type": "stream",
153 |      "text": [
154 |       "total 86320\r\n",
155 |       "-rw-rw-r-- 1 1003 1003 30097832 Feb 14 04:37 azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\r\n",
156 |       "-rw-r--r-- 1 root root 29143128 Feb 14 04:55 weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\r\n",
157 |       "-rw-rw-r-- 1 1003 1003 29142168 Feb 7 06:16 weights_only_azure_chest_xray__14_weights_712split_epoch_029_val_loss_147.7599.hdf5\r\n"
158 |      ]
159 |     }
160 |    ],
161 |    "source": [
162 |     "# global variables\n",
163 |     "\n",
164 |     "# location of trained model weights; quality will be dependent on train 
data size\n", 165 | "# and number of epochs among other things\n", 166 | "weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list))) \n", 167 | "weights_dir\n", 168 | "!ls -l {weights_dir}\n", 169 | "\n", 170 | "# \"quality\" models, fully trained on all training data\n", 171 | "fully_trained_weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.FULLY_PRETRAINED_MODEL_DIR_list))) \n", 172 | "fully_trained_weights_dir\n", 173 | "!ls -l {fully_trained_weights_dir}\n", 174 | "\n", 175 | "\n", 176 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 177 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 178 | "\n", 179 | "data_partitions_dir=os.path.join(data_base_output_dir, \n", 180 | " os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 181 | "label_path = os.path.join(data_partitions_dir,'labels14_unormalized_cleaned.pickle')\n", 182 | "partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 66, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# # extract and save the weights from a full model\n", 192 | "\n", 193 | "# import keras_contrib\n", 194 | "# from keras.models import load_model\n", 195 | "# model_file_name = 'azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5'\n", 196 | "# model = load_model(os.path.join(fully_trained_weights_dir, model_file_name))\n", 197 | "# model.save_weights(os.path.join(fully_trained_weights_dir, 'weights_only_'+model_file_name))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 67, 203 | "metadata": {}, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "['/azureml-share/chestxray/output/fully_trained_models/azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5',\n", 209 | " '/azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5',\n", 210 | " '/azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray__14_weights_712split_epoch_029_val_loss_147.7599.hdf5']" 211 | ] 212 | }, 213 | "execution_count": 67, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "# get long (full path) model file name\n", 220 | "\n", 221 | "all_models=!ls {os.path.join(fully_trained_weights_dir, '*.hdf5')}\n", 222 | "all_models\n", 223 | "models_file_name= [os.path.join(fully_trained_weights_dir, \n", 224 | " 'weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5')]" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 68, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "import os\n", 234 | "import pickle\n", 235 | "\n", 236 | "import cv2\n", 237 | "import numpy as np\n", 238 | "import pandas as pd\n", 239 | "from keras.models import load_model\n", 240 | "from keras.utils import Sequence\n", 241 | "from sklearn import metrics\n", 242 | "\n", 243 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\" # see issue #152\n", 244 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0,1\"\n", 245 | "\n", 246 | "from tensorflow.python.client import device_lib\n", 247 | "\n", 248 | "resized_height = 224\n", 249 | "resized_width = 224\n", 250 | "num_channel = 3\n", 251 | "num_classes = 14\n", 252 | "batch_size = 512 #512" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 
257 |    "execution_count": 69,
258 |    "metadata": {},
259 |    "outputs": [
260 |     {
261 |      "data": {
262 |       "text/plain": [
263 |        "['/device:GPU:0', '/device:GPU:1']"
264 |       ]
265 |      },
266 |      "execution_count": 69,
267 |      "metadata": {},
268 |      "output_type": "execute_result"
269 |     },
270 |     {
271 |      "name": "stdout",
272 |      "output_type": "stream",
273 |      "text": [
274 |       "num of GPUs: 2\n"
275 |      ]
276 |     }
277 |    ],
278 |    "source": [
279 |     "def get_available_gpus():\n",
280 |     "    \"\"\"\n",
281 |     "    Returns: the list of GPU device names available in the system\n",
282 |     "    \"\"\"\n",
283 |     "    local_device_protos = device_lib.list_local_devices()\n",
284 |     "    return [x.name for x in local_device_protos if x.device_type == 'GPU']\n",
285 |     "\n",
286 |     "get_available_gpus()\n",
287 |     "# get number of available GPUs\n",
288 |     "print(\"num of GPUs:\", len(get_available_gpus()))"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 70,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": [
297 |     "# device_lib.list_local_devices()\n",
298 |     "# !nvidia-smi"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": 71,
304 |    "metadata": {},
305 |    "outputs": [
306 |     {
307 |      "name": "stdout",
308 |      "output_type": "stream",
309 |      "text": [
310 |       "num of GPUs: 2\n"
311 |      ]
312 |     },
313 |     {
314 |      "data": {
315 |       "text/plain": [
316 |        "['Atelectasis',\n",
317 |        " 'Cardiomegaly',\n",
318 |        " 'Effusion',\n",
319 |        " 'Infiltration',\n",
320 |        " 'Mass',\n",
321 |        " 'Nodule',\n",
322 |        " 'Pneumonia',\n",
323 |        " 'Pneumothorax',\n",
324 |        " 'Consolidation',\n",
325 |        " 'Edema',\n",
326 |        " 'Emphysema',\n",
327 |        " 'Fibrosis',\n",
328 |        " 'Pleural Thickening',\n",
329 |        " 'Hernia']"
330 |       ]
331 |      },
332 |      "execution_count": 71,
333 |      "metadata": {},
334 |      "output_type": "execute_result"
335 |     }
336 |    ],
337 |    "source": [
338 |     "num_gpu = len(get_available_gpus())\n",
339 |     "# number of available GPUs\n",
340 |     "print(\"num of GPUs:\", num_gpu)\n",
341 |     "\n",
342 |     "pathologies_name_list = prj_consts.DISEASE_list\n",
343 |     "pathologies_name_list\n",
344 |     "\n",
345 |     "stanford_result = [0.8094, 0.9248, 0.8638, 0.7345, 0.8676, 0.7802, 0.7680, 0.8887, 0.7901, 0.8878, 0.9371, 0.8047,\n",
346 |     "                   0.8062, 0.9164]\n",
347 |     "\n",
348 |     "\n",
349 |     "with open(label_path, 'rb') as f:\n",
350 |     "    labels = pickle.load(f)\n",
351 |     "\n",
352 |     "with open(partition_path, 'rb') as f:\n",
353 |     "    partition = pickle.load(f)"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 72,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "\n",
363 |     "# generator for train and validation data\n",
364 |     "# use the Sequence class per issue https://github.com/keras-team/keras/issues/1638\n",
365 |     "class DataGenSequence(Sequence):\n",
366 |     "    def __init__(self, labels, image_file_index, current_state):\n",
367 |     "        self.batch_size = batch_size\n",
368 |     "        self.labels = labels\n",
369 |     "        self.img_file_index = image_file_index\n",
370 |     "        self.current_state = current_state\n",
371 |     "        self.len = len(self.img_file_index) // self.batch_size\n",
372 |     "        print(\"for DataGenSequence\", current_state, \"total rows are:\", len(self.img_file_index), \", len is\", self.len)\n",
373 |     "\n",
374 |     "    def __len__(self):\n",
375 |     "        return self.len\n",
376 |     "\n",
377 |     "    def __getitem__(self, idx):\n",
378 |     "        # print(\"loading data segmentation\", idx)\n",
379 |     "        # make sure each batch size has the same amount of data\n",
380 |     "        current_batch = self.img_file_index[idx * 
self.batch_size: (idx + 1) * self.batch_size]\n", 381 | " X = np.empty((self.batch_size, resized_height, resized_width, num_channel))\n", 382 | " y = np.empty((self.batch_size, num_classes))\n", 383 | "\n", 384 | " for i, image_name in enumerate(current_batch):\n", 385 | " path = os.path.join(nih_chest_xray_data_dir, image_name)\n", 386 | "\n", 387 | " # loading data\n", 388 | "\n", 389 | " img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float32)\n", 390 | " X[i, :, :, :] = img\n", 391 | " y[i, :] = labels[image_name]\n", 392 | "\n", 393 | " # only do random flipping in training status\n", 394 | " if self.current_state == 'train':\n", 395 | " # this is different from the training code\n", 396 | " x_augmented = X\n", 397 | " else:\n", 398 | " x_augmented = X\n", 399 | "\n", 400 | " return x_augmented, y" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 73, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "32893" 412 | ] 413 | }, 414 | "execution_count": 73, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "len(partition['test'])" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 74, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stdout", 430 | "output_type": "stream", 431 | "text": [ 432 | "len of result is 32768\n", 433 | "/azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\n", 434 | "Weights for the model were loaded successfully\n", 435 | "evaluation for model /azureml-share/chestxray/output/fully_trained_models/weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5\n", 436 | "for DataGenSequence test total rows are: 32893 , len is 64\n", 437 | "64/64 [==============================] - 469s 7s/step\n", 438 | "result shape (32768, 14)\n", 439 | " Disease Our AUC Score Stanford AUC Score Delta\n", 440 | "0 Atelectasis 0.823191 0.8094 -0.013791\n", 441 | "1 Cardiomegaly 0.933519 0.9248 -0.008719\n", 442 | "2 Effusion 0.883184 0.8638 -0.019384\n", 443 | "3 Infiltration 0.744561 0.7345 -0.010061\n", 444 | "4 Mass 0.859510 0.8676 0.008090\n", 445 | "5 Nodule 0.783997 0.7802 -0.003797\n", 446 | "6 Pneumonia 0.801597 0.7680 -0.033597\n", 447 | "7 Pneumothorax 0.830550 0.8887 0.058150\n", 448 | "8 Consolidation 0.813993 0.7901 -0.023893\n", 449 | "9 Edema 0.896173 0.8878 -0.008373\n", 450 | "10 Emphysema 0.849184 0.9371 0.087916\n", 451 | "11 Fibrosis 0.882463 0.8047 -0.077763\n", 452 | "12 Pleural Thickening 1.000000 0.8062 -0.193800\n", 453 | "13 Hernia 0.916395 0.9164 0.000005\n" 454 | ] 455 | } 456 | ], 457 | "source": [ 458 | "import keras_contrib\n", 459 | "\n", 460 | "# load test data\n", 461 | "X_test = np.empty((len(partition['test']), 224, 224, 3), dtype=np.float32)\n", 462 | "y_test = np.empty((len(partition['test']) - len(partition['test']) % batch_size, 14), dtype=np.float32)\n", 463 | "\n", 464 | "for i, npy in enumerate(partition['test']):\n", 465 | " if (i < len(y_test)):\n", 466 | " # round to batch_size\n", 467 | " y_test[i, :] = labels[npy]\n", 468 | "\n", 469 | "print(\"len of result is\", len(y_test))\n", 470 | "y_pred_list = np.empty((len(models_file_name), len(partition['test']), 14), dtype=np.float32)\n", 471 | "\n", 472 | "# individual models\n", 473 | "for index, current_model_file in enumerate(models_file_name):\n", 474 | " print(current_model_file)\n", 475 | "# 
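note (added): the commented-out call below would deserialize the full saved model;\n",
    "# the active line instead rebuilds the DenseNet graph and loads the saved weights only:\n",
    "# 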
model = load_model(current_model_file)\n", 476 | " model = azure_chestxray_keras_utils.build_model(keras_contrib.applications.densenet.DenseNetImageNet121); model.load_weights(current_model_file)\n", 477 | "\n", 478 | " print('evaluation for model', current_model_file)\n", 479 | " # y_pred = model.predict(X_test)\n", 480 | "\n", 481 | " y_pred = model.predict_generator(generator=DataGenSequence(labels, partition['test'], current_state='test'),\n", 482 | " workers=32, verbose=1, max_queue_size=1)\n", 483 | " print(\"result shape\", y_pred.shape)\n", 484 | " \n", 485 | " # add one fake row of ones in both test and pred values to avoid:\n", 486 | " # ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.\n", 487 | " y_test = np.insert(y_test, 0, np.ones((y_test.shape[1],)), 0)\n", 488 | " y_pred = np.insert(y_pred, 0, np.ones((y_pred.shape[1],)), 0)\n", 489 | "\n", 490 | " df = pd.DataFrame(columns=['Disease', 'Our AUC Score', 'Stanford AUC Score'])\n", 491 | " for d in range(14):\n", 492 | " df.loc[d] = [pathologies_name_list[d],\n", 493 | " metrics.roc_auc_score(y_test[:, d], y_pred[:, d]),\n", 494 | " stanford_result[d]]\n", 495 | "\n", 496 | " df['Delta'] = df['Stanford AUC Score'] - df['Our AUC Score']\n", 497 | " df.to_csv(current_model_file + \".csv\", index=False)\n", 498 | " print(df)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 78, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "(1, 32893, 14)" 510 | ] 511 | }, 512 | "execution_count": 78, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | }, 516 | { 517 | "data": { 518 | "text/plain": [ 519 | "numpy.ndarray" 520 | ] 521 | }, 522 | "execution_count": 78, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | }, 526 | { 527 | "data": { 528 | "text/plain": [ 529 | "(32769, 14)" 530 | ] 531 | }, 532 | "execution_count": 78, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | }, 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],\n", 540 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],\n", 541 | " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],\n", 542 | " dtype=float32)" 543 | ] 544 | }, 545 | "execution_count": 78, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | }, 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "array([4004, 1189, 5187, 7044, 2089, 2090, 593, 2450, 1850, 808, 933,\n", 553 | " 493, 1, 79])" 554 | ] 555 | }, 556 | "execution_count": 78, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "y_pred_list.shape\n", 563 | "type(y_test[:, d])\n", 564 | "y_test.shape\n", 565 | "y_test[:3,]\n", 566 | "y_test.sum(axis=0).astype(int)\n" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 76, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "# C:\\repos\\ChestXRay\\TDSP>jupyter nbconvert --to html .\\Code\\02_Model\\020_evaluate.ipynb\n", 576 | "# [NbConvertApp] Converting notebook .\\Code\\01_DataPrep\\001_get_data.ipynb to html\n", 577 | "# [NbConvertApp] Writing 263414 bytes to .\\Code\\01_DataPrep\\001_get_data.html" 578 | ] 579 | } 580 | ], 581 | "metadata": { 582 | "kernelspec": { 583 | "display_name": "azure_chestxray_lung_disease gpucomputecontext", 584 | "language": "python", 585 | "name": "azure_chestxray_lung_disease_gpucomputecontext" 586 | }, 587 | "language_info": { 
588 | "codemirror_mode": { 589 | "name": "ipython", 590 | "version": 3 591 | }, 592 | "file_extension": ".py", 593 | "mimetype": "text/x-python", 594 | "name": "python", 595 | "nbconvert_exporter": "python", 596 | "pygments_lexer": "ipython3", 597 | "version": "3.5.2" 598 | } 599 | }, 600 | "nbformat": 4, 601 | "nbformat_minor": 2 602 | } 603 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/src/finding_lungs/blacklist_other_images_with_lower_quality.csv: -------------------------------------------------------------------------------- 1 | 00000032_013.png 2 | 00000032_023.png 3 | 00000032_024.png 4 | 00000032_055.png 5 | 00000032_058.png 6 | 00000116_007.png 7 | 00000244_000.png 8 | 00000244_002.png 9 | 00000248_013.png 10 | 00000248_018.png 11 | 00000248_019.png 12 | 00000467_013.png 13 | 00000468_060.png 14 | 00000565_000.png 15 | 00000583_005.png 16 | 00000583_007.png 17 | 00000583_009.png 18 | 00000583_019.png 19 | 00000583_024.png 20 | 00000627_030.png 21 | 00000627_036.png 22 | 00000703_000.png 23 | 00000831_008.png 24 | 00000929_000.png 25 | 00000929_001.png 26 | 00000980_004.png 27 | 00001029_003.png 28 | 00001075_016.png 29 | 00001075_020.png 30 | 00001122_016.png 31 | 00001122_017.png 32 | 00001153_006.png 33 | 00001157_002.png 34 | 00001179_000.png 35 | 00001179_001.png 36 | 00001181_000.png 37 | 00001223_000.png 38 | 00001223_001.png 39 | 00001249_004.png 40 | 00001255_012.png 41 | 00001255_035.png 42 | 00001267_001.png 43 | 00001278_009.png 44 | 00001278_011.png 45 | 00001437_038.png 46 | 00001501_002.png 47 | 00001564_000.png 48 | 00001577_003.png 49 | 00001595_000.png 50 | 00001595_001.png 51 | 00001595_002.png 52 | 00001686_000.png 53 | 00001686_001.png 54 | 00001736_005.png 55 | 00001736_007.png 56 | 00001736_010.png 57 | 00001736_014.png 58 | 00001736_018.png 59 | 00001736_021.png 60 | 00001736_025.png 61 | 00001736_026.png 62 | 00001736_027.png 63 | 00001787_002.png 64 | 00001787_010.png 65 | 00001814_004.png 66 | 00001836_014.png 67 | 00001855_000.png 68 | 00001855_004.png 69 | 00001855_009.png 70 | 00001855_010.png 71 | 00001855_011.png 72 | 00001855_012.png 73 | 00001855_014.png 74 | 00001855_016.png 75 | 00001855_018.png 76 | 00001855_020.png 77 | 00001855_021.png 78 | 00001855_022.png 79 | 00001855_023.png 80 | 00001855_024.png 81 | 00001855_025.png 82 | 00001855_026.png 83 | 00001855_027.png 84 | 00001855_028.png 85 | 00001855_029.png 86 | 00001855_030.png 87 | 00001855_032.png 88 | 00001855_033.png 89 | 00001855_034.png 90 | 00001855_035.png 91 | 00001855_037.png 92 | 00001952_000.png 93 | 00001952_001.png 94 | 00001952_002.png 95 | 00001952_007.png 96 | 00001952_008.png 97 | 00001986_010.png 98 | 00002072_003.png 99 | 00002072_004.png 100 | 00002072_009.png 101 | 00002072_010.png 102 | 00002072_011.png 103 | 00002072_014.png 104 | 00002072_015.png 105 | 00002072_018.png 106 | 00002072_019.png 107 | 00002208_001.png 108 | 00002359_018.png 109 | 00002366_001.png 110 | 00002366_002.png 111 | 00002437_036.png 112 | 00002437_037.png 113 | 00002529_007.png 114 | 00002529_023.png 115 | 00002529_025.png 116 | 00002529_030.png 117 | 00002545_001.png 118 | 00002582_007.png 119 | 00002594_001.png 120 | 00002633_023.png 121 | 00002636_000.png 122 | 00002659_003.png 123 | 00002675_005.png 124 | 00002733_000.png 125 | 00002763_023.png 126 | 00002763_024.png 127 | 00002892_004.png 128 | 00002896_000.png 129 | 00003004_000.png 130 | 00003005_005.png 131 | 00003029_018.png 132 | 00003059_000.png 133 | 00003060_000.png 
134 | 00003094_000.png 135 | 00003094_003.png 136 | 00003094_004.png 137 | 00003094_005.png 138 | 00003158_001.png 139 | 00003186_003.png 140 | 00003369_001.png 141 | 00003465_000.png 142 | 00003465_001.png 143 | 00003465_002.png 144 | 00003465_003.png 145 | 00003465_004.png 146 | 00003465_005.png 147 | 00003465_006.png 148 | 00003465_007.png 149 | 00003465_008.png 150 | 00003523_036.png 151 | 00004276_000.png 152 | 00004285_000.png 153 | 00004309_006.png 154 | 00004344_025.png 155 | 00004360_020.png 156 | 00004360_023.png 157 | 00004472_000.png 158 | 00004545_000.png 159 | 00004660_000.png 160 | 00004672_001.png 161 | 00004703_000.png 162 | 00004706_001.png 163 | 00004792_000.png 164 | 00004808_014.png 165 | 00004808_094.png 166 | 00004811_000.png 167 | 00004928_006.png 168 | 00005051_000.png 169 | 00005094_009.png 170 | 00005201_001.png 171 | 00005204_001.png 172 | 00005220_012.png 173 | 00005220_015.png 174 | 00005254_003.png 175 | 00005254_004.png 176 | 00005254_008.png 177 | 00005298_013.png 178 | 00005360_002.png 179 | 00005573_004.png 180 | 00005699_005.png 181 | 00005712_008.png 182 | 00005746_008.png 183 | 00005748_000.png 184 | 00005750_015.png 185 | 00005750_016.png 186 | 00005750_017.png 187 | 00005877_000.png 188 | 00005937_000.png 189 | 00005975_001.png 190 | 00006008_015.png 191 | 00006015_000.png 192 | 00006015_003.png 193 | 00006039_022.png 194 | 00006054_001.png 195 | 00006127_000.png 196 | 00006209_001.png 197 | 00006220_002.png 198 | 00006220_003.png 199 | 00006220_004.png 200 | 00006220_005.png 201 | 00006220_006.png 202 | 00006220_009.png 203 | 00006271_002.png 204 | 00006271_078.png 205 | 00006271_093.png 206 | 00006294_004.png 207 | 00006296_011.png 208 | 00006381_009.png 209 | 00006391_001.png 210 | 00006415_000.png 211 | 00006446_012.png 212 | 00006585_007.png 213 | 00006754_008.png 214 | 00006838_000.png 215 | 00006850_019.png 216 | 00006870_000.png 217 | 00006904_007.png 218 | 00006906_029.png 219 | 00006906_031.png 220 | 00006906_032.png 221 | 00006917_000.png 222 | 00006960_022.png 223 | 00007001_001.png 224 | 00007018_034.png 225 | 00007018_035.png 226 | 00007108_006.png 227 | 00007152_008.png 228 | 00007217_005.png 229 | 00007269_000.png 230 | 00007276_001.png 231 | 00007276_002.png 232 | 00007322_003.png 233 | 00007322_005.png 234 | 00007322_009.png 235 | 00007322_020.png 236 | 00007371_000.png 237 | 00007438_000.png 238 | 00007500_000.png 239 | 00007500_001.png 240 | 00007545_000.png 241 | 00007558_004.png 242 | 00007558_007.png 243 | 00007624_036.png 244 | 00007830_000.png 245 | 00007830_001.png 246 | 00007830_004.png 247 | 00007830_005.png 248 | 00007830_007.png 249 | 00007830_010.png 250 | 00007973_000.png 251 | 00007985_000.png 252 | 00008051_039.png 253 | 00008051_050.png 254 | 00008051_051.png 255 | 00008295_010.png 256 | 00008297_008.png 257 | 00008297_013.png 258 | 00008297_016.png 259 | 00008314_000.png 260 | 00008463_001.png 261 | 00008522_057.png 262 | 00008549_000.png 263 | 00008640_000.png 264 | 00008701_008.png 265 | 00008911_006.png 266 | 00008993_000.png 267 | 00009218_020.png 268 | 00009218_022.png 269 | 00009282_000.png 270 | 00009465_004.png 271 | 00009508_004.png 272 | 00009551_008.png 273 | 00009551_022.png 274 | 00009573_000.png 275 | 00009608_045.png 276 | 00009613_005.png 277 | 00009621_000.png 278 | 00009621_001.png 279 | 00009621_002.png 280 | 00009621_003.png 281 | 00009621_004.png 282 | 00009621_005.png 283 | 00009621_006.png 284 | 00009621_007.png 285 | 00009702_006.png 286 | 00009727_012.png 287 | 00009727_013.png 288 | 
00009727_014.png 289 | 00009727_018.png 290 | 00009727_019.png 291 | 00009727_020.png 292 | 00009727_022.png 293 | 00009727_023.png 294 | 00009727_027.png 295 | 00009727_028.png 296 | 00009876_002.png 297 | 00009886_000.png 298 | 00009892_007.png 299 | 00009892_046.png 300 | 00009911_004.png 301 | 00009953_016.png 302 | 00010007_053.png 303 | 00010007_060.png 304 | 00010007_071.png 305 | 00010007_074.png 306 | 00010007_082.png 307 | 00010007_103.png 308 | 00010012_018.png 309 | 00010012_026.png 310 | 00010092_007.png 311 | 00010092_043.png 312 | 00010124_000.png 313 | 00010294_007.png 314 | 00010352_054.png 315 | 00010352_074.png 316 | 00010360_004.png 317 | 00010384_005.png 318 | 00010405_000.png 319 | 00010405_001.png 320 | 00010415_000.png 321 | 00010435_002.png 322 | 00010544_016.png 323 | 00010544_027.png 324 | 00010544_030.png 325 | 00010693_027.png 326 | 00010698_001.png 327 | 00010698_013.png 328 | 00010761_000.png 329 | 00010773_014.png 330 | 00010773_025.png 331 | 00010790_039.png 332 | 00010790_043.png 333 | 00010790_045.png 334 | 00010792_004.png 335 | 00010805_002.png 336 | 00010805_003.png 337 | 00010805_004.png 338 | 00010805_005.png 339 | 00010805_006.png 340 | 00010805_008.png 341 | 00010805_009.png 342 | 00010805_010.png 343 | 00010805_011.png 344 | 00010805_013.png 345 | 00010805_015.png 346 | 00010805_016.png 347 | 00010805_017.png 348 | 00010805_018.png 349 | 00010805_019.png 350 | 00010805_020.png 351 | 00010805_023.png 352 | 00010805_025.png 353 | 00010805_037.png 354 | 00010805_038.png 355 | 00010805_040.png 356 | 00010805_043.png 357 | 00010805_045.png 358 | 00010805_046.png 359 | 00010805_047.png 360 | 00010805_048.png 361 | 00010805_050.png 362 | 00010828_017.png 363 | 00010843_000.png 364 | 00010887_027.png 365 | 00010960_001.png 366 | 00010960_002.png 367 | 00010995_006.png 368 | 00010995_008.png 369 | 00011007_000.png 370 | 00011021_012.png 371 | 00011064_000.png 372 | 00011164_007.png 373 | 00011237_095.png 374 | 00011237_108.png 375 | 00011379_002.png 376 | 00011379_003.png 377 | 00011379_004.png 378 | 00011379_005.png 379 | 00011379_006.png 380 | 00011379_013.png 381 | 00011379_018.png 382 | 00011379_019.png 383 | 00011379_022.png 384 | 00011379_039.png 385 | 00011379_041.png 386 | 00011379_043.png 387 | 00011379_045.png 388 | 00011379_046.png 389 | 00011379_047.png 390 | 00011386_000.png 391 | 00011391_016.png 392 | 00011391_031.png 393 | 00011391_032.png 394 | 00011391_039.png 395 | 00011391_041.png 396 | 00011391_043.png 397 | 00011391_047.png 398 | 00011391_055.png 399 | 00011436_009.png 400 | 00011461_002.png 401 | 00011553_002.png 402 | 00011553_003.png 403 | 00011553_004.png 404 | 00011553_005.png 405 | 00011553_006.png 406 | 00011553_007.png 407 | 00011553_009.png 408 | 00011553_010.png 409 | 00011553_011.png 410 | 00011553_012.png 411 | 00011553_013.png 412 | 00011553_014.png 413 | 00011553_015.png 414 | 00011553_016.png 415 | 00011553_017.png 416 | 00011553_018.png 417 | 00011553_019.png 418 | 00011553_020.png 419 | 00011553_022.png 420 | 00011553_023.png 421 | 00011553_024.png 422 | 00011553_025.png 423 | 00011553_026.png 424 | 00011553_027.png 425 | 00011553_028.png 426 | 00011553_029.png 427 | 00011553_030.png 428 | 00011553_031.png 429 | 00011553_032.png 430 | 00011553_033.png 431 | 00011553_034.png 432 | 00011553_035.png 433 | 00011553_036.png 434 | 00011553_037.png 435 | 00011553_038.png 436 | 00011553_040.png 437 | 00011553_041.png 438 | 00011553_046.png 439 | 00011553_047.png 440 | 00011673_000.png 441 | 00011677_001.png 442 | 
00011677_002.png 443 | 00011702_024.png 444 | 00011702_062.png 445 | 00011731_003.png 446 | 00011769_000.png 447 | 00011925_047.png 448 | 00011925_049.png 449 | 00011925_051.png 450 | 00011925_053.png 451 | 00011925_055.png 452 | 00011925_068.png 453 | 00011925_071.png 454 | 00011925_078.png 455 | 00011947_000.png 456 | 00011985_008.png 457 | 00012141_013.png 458 | 00012159_002.png 459 | 00012162_001.png 460 | 00012276_007.png 461 | 00012276_009.png 462 | 00012276_010.png 463 | 00012276_013.png 464 | 00012276_017.png 465 | 00012276_018.png 466 | 00012368_002.png 467 | 00012470_011.png 468 | 00012470_012.png 469 | 00012515_002.png 470 | 00012591_000.png 471 | 00012605_000.png 472 | 00012605_001.png 473 | 00012628_017.png 474 | 00012628_060.png 475 | 00012648_001.png 476 | 00012654_001.png 477 | 00012662_000.png 478 | 00012742_000.png 479 | 00012742_001.png 480 | 00012742_002.png 481 | 00012798_000.png 482 | 00012834_005.png 483 | 00012834_007.png 484 | 00012834_010.png 485 | 00012834_085.png 486 | 00012834_120.png 487 | 00012834_137.png 488 | 00012863_027.png 489 | 00012863_039.png 490 | 00013049_006.png 491 | 00013049_007.png 492 | 00013123_004.png 493 | 00013152_004.png 494 | 00013158_004.png 495 | 00013249_004.png 496 | 00013249_006.png 497 | 00013249_007.png 498 | 00013249_008.png 499 | 00013249_013.png 500 | 00013249_014.png 501 | 00013249_017.png 502 | 00013249_018.png 503 | 00013249_028.png 504 | 00013249_033.png 505 | 00013249_036.png 506 | 00013249_038.png 507 | 00013249_041.png 508 | 00013249_046.png 509 | 00013401_000.png 510 | 00013440_000.png 511 | 00013499_004.png 512 | 00013568_000.png 513 | 00013601_013.png 514 | 00013608_000.png 515 | 00013608_002.png 516 | 00013608_004.png 517 | 00013608_016.png 518 | 00013615_015.png 519 | 00013615_025.png 520 | 00013615_027.png 521 | 00013615_049.png 522 | 00013615_057.png 523 | 00013615_060.png 524 | 00013625_033.png 525 | 00013641_014.png 526 | 00013641_041.png 527 | 00013670_146.png 528 | 00013670_162.png 529 | 00013670_163.png 530 | 00013670_166.png 531 | 00013670_167.png 532 | 00013685_047.png 533 | 00013774_027.png 534 | 00013774_041.png 535 | 00013774_042.png 536 | 00013774_048.png 537 | 00013894_010.png 538 | 00013894_024.png 539 | 00013894_025.png 540 | 00013894_027.png 541 | 00013896_004.png 542 | 00013922_020.png 543 | 00013922_021.png 544 | 00013966_007.png 545 | 00013993_016.png 546 | 00013993_049.png 547 | 00013993_099.png 548 | 00014004_018.png 549 | 00014004_023.png 550 | 00014014_002.png 551 | 00014014_005.png 552 | 00014080_001.png 553 | 00014112_019.png 554 | 00014128_023.png 555 | 00014192_000.png 556 | 00014203_016.png 557 | 00014203_026.png 558 | 00014203_028.png 559 | 00014203_029.png 560 | 00014203_042.png 561 | 00014203_044.png 562 | 00014223_012.png 563 | 00014245_001.png 564 | 00014245_003.png 565 | 00014314_001.png 566 | 00014320_040.png 567 | 00014320_043.png 568 | 00014323_001.png 569 | 00014323_002.png 570 | 00014323_003.png 571 | 00014332_004.png 572 | 00014351_000.png 573 | 00014352_001.png 574 | 00014465_016.png 575 | 00014474_002.png 576 | 00014486_004.png 577 | 00014509_000.png 578 | 00014958_009.png 579 | 00014982_000.png 580 | 00015007_002.png 581 | 00015007_003.png 582 | 00015007_005.png 583 | 00015007_006.png 584 | 00015007_007.png 585 | 00015007_008.png 586 | 00015007_011.png 587 | 00015024_003.png 588 | 00015031_006.png 589 | 00015031_022.png 590 | 00015041_003.png 591 | 00015112_004.png 592 | 00015126_000.png 593 | 00015151_001.png 594 | 00015193_014.png 595 | 00015213_000.png 596 | 
00015290_000.png 597 | 00015391_001.png 598 | 00015462_001.png 599 | 00015462_002.png 600 | 00015482_000.png 601 | 00015530_071.png 602 | 00015530_142.png 603 | 00015564_011.png 604 | 00015605_038.png 605 | 00015605_051.png 606 | 00015605_053.png 607 | 00015605_055.png 608 | 00015606_013.png 609 | 00015606_050.png 610 | 00015696_001.png 611 | 00015758_000.png 612 | 00015826_019.png 613 | 00015923_000.png 614 | 00015934_000.png 615 | 00015986_000.png 616 | 00015996_001.png 617 | 00016009_046.png 618 | 00016034_003.png 619 | 00016051_003.png 620 | 00016051_004.png 621 | 00016133_000.png 622 | 00016175_003.png 623 | 00016175_006.png 624 | 00016175_008.png 625 | 00016184_027.png 626 | 00016238_006.png 627 | 00016292_000.png 628 | 00016292_001.png 629 | 00016292_002.png 630 | 00016292_003.png 631 | 00016292_004.png 632 | 00016378_001.png 633 | 00016410_006.png 634 | 00016410_008.png 635 | 00016410_055.png 636 | 00016484_001.png 637 | 00016484_005.png 638 | 00016484_009.png 639 | 00016484_011.png 640 | 00016484_026.png 641 | 00016522_019.png 642 | 00016529_000.png 643 | 00016638_003.png 644 | 00016638_004.png 645 | 00016653_000.png 646 | 00016732_035.png 647 | 00016784_002.png 648 | 00016860_001.png 649 | 00016860_005.png 650 | 00016867_003.png 651 | 00016918_005.png 652 | 00017036_023.png 653 | 00017110_012.png 654 | 00017138_032.png 655 | 00017207_002.png 656 | 00017207_003.png 657 | 00017207_008.png 658 | 00017258_022.png 659 | 00017258_023.png 660 | 00017362_009.png 661 | 00017392_000.png 662 | 00017400_000.png 663 | 00017403_007.png 664 | 00017403_010.png 665 | 00017424_034.png 666 | 00017424_035.png 667 | 00017424_036.png 668 | 00017424_038.png 669 | 00017424_041.png 670 | 00017425_002.png 671 | 00017425_006.png 672 | 00017477_000.png 673 | 00017504_024.png 674 | 00017504_068.png 675 | 00017538_001.png 676 | 00017538_002.png 677 | 00017541_025.png 678 | 00017553_000.png 679 | 00017561_001.png 680 | 00017605_014.png 681 | 00017606_020.png 682 | 00017618_013.png 683 | 00017625_000.png 684 | 00017625_004.png 685 | 00017641_004.png 686 | 00017645_013.png 687 | 00017648_000.png 688 | 00017695_000.png 689 | 00017753_026.png 690 | 00017817_001.png 691 | 00017817_002.png 692 | 00017927_001.png 693 | 00017941_005.png 694 | 00017972_006.png 695 | 00017972_014.png 696 | 00017979_000.png 697 | 00017999_000.png 698 | 00018011_015.png 699 | 00018044_020.png 700 | 00018044_036.png 701 | 00018044_040.png 702 | 00018044_043.png 703 | 00018069_000.png 704 | 00018069_001.png 705 | 00018091_012.png 706 | 00018103_002.png 707 | 00018103_007.png 708 | 00018103_009.png 709 | 00018104_004.png 710 | 00018116_000.png 711 | 00018121_000.png 712 | 00018125_009.png 713 | 00018126_024.png 714 | 00018175_002.png 715 | 00018191_000.png 716 | 00018191_001.png 717 | 00018213_001.png 718 | 00018240_000.png 719 | 00018251_001.png 720 | 00018251_002.png 721 | 00018251_003.png 722 | 00018251_004.png 723 | 00018251_005.png 724 | 00018251_006.png 725 | 00018251_007.png 726 | 00018251_008.png 727 | 00018251_009.png 728 | 00018251_010.png 729 | 00018251_011.png 730 | 00018251_012.png 731 | 00018251_013.png 732 | 00018251_014.png 733 | 00018253_089.png 734 | 00018336_000.png 735 | 00018437_001.png 736 | 00018437_002.png 737 | 00018445_002.png 738 | 00018458_000.png 739 | 00018486_000.png 740 | 00018571_000.png 741 | 00018573_000.png 742 | 00018598_004.png 743 | 00018610_002.png 744 | 00018610_004.png 745 | 00018614_001.png 746 | 00018615_001.png 747 | 00018778_001.png 748 | 00018778_002.png 749 | 00018778_005.png 750 | 
00018921_026.png 751 | 00018921_027.png 752 | 00018927_000.png 753 | 00018949_001.png 754 | 00019020_000.png 755 | 00019045_000.png 756 | 00019107_001.png 757 | 00019124_011.png 758 | 00019124_012.png 759 | 00019150_007.png 760 | 00019301_000.png 761 | 00019390_002.png 762 | 00019390_004.png 763 | 00019534_000.png 764 | 00019576_024.png 765 | 00019576_063.png 766 | 00019576_064.png 767 | 00019576_065.png 768 | 00019587_000.png 769 | 00019592_010.png 770 | 00019660_001.png 771 | 00019707_010.png 772 | 00019888_001.png 773 | 00019928_000.png 774 | 00019967_001.png 775 | 00019967_002.png 776 | 00019967_003.png 777 | 00019967_004.png 778 | 00019967_007.png 779 | 00019967_008.png 780 | 00019967_009.png 781 | 00019967_011.png 782 | 00019967_012.png 783 | 00019967_013.png 784 | 00019967_014.png 785 | 00019967_017.png 786 | 00019967_019.png 787 | 00019967_020.png 788 | 00019967_032.png 789 | 00020006_001.png 790 | 00020108_001.png 791 | 00020110_000.png 792 | 00020146_002.png 793 | 00020213_011.png 794 | 00020213_018.png 795 | 00020213_060.png 796 | 00020213_061.png 797 | 00020213_113.png 798 | 00020219_000.png 799 | 00020326_013.png 800 | 00020326_058.png 801 | 00020348_000.png 802 | 00020364_002.png 803 | 00020364_003.png 804 | 00020398_010.png 805 | 00020438_007.png 806 | 00020622_002.png 807 | 00020631_009.png 808 | 00020928_004.png 809 | 00020928_014.png 810 | 00020928_015.png 811 | 00020945_022.png 812 | 00021023_014.png 813 | 00021044_000.png 814 | 00021108_000.png 815 | 00021201_042.png 816 | 00021420_013.png 817 | 00021420_028.png 818 | 00021481_012.png 819 | 00021506_001.png 820 | 00021508_002.png 821 | 00021510_000.png 822 | 00021572_010.png 823 | 00021695_003.png 824 | 00021700_006.png 825 | 00021729_000.png 826 | 00021770_012.png 827 | 00021770_014.png 828 | 00021770_015.png 829 | 00021770_016.png 830 | 00021811_003.png 831 | 00021835_029.png 832 | 00021901_005.png 833 | 00021917_000.png 834 | 00021942_006.png 835 | 00021990_002.png 836 | 00022010_001.png 837 | 00022051_000.png 838 | 00022174_000.png 839 | 00022174_001.png 840 | 00022245_011.png 841 | 00022283_029.png 842 | 00022339_000.png 843 | 00022416_052.png 844 | 00022470_007.png 845 | 00022486_000.png 846 | 00022523_004.png 847 | 00022523_005.png 848 | 00022524_000.png 849 | 00022528_007.png 850 | 00022566_022.png 851 | 00022599_004.png 852 | 00022714_000.png 853 | 00022723_000.png 854 | 00022725_003.png 855 | 00022727_001.png 856 | 00022815_004.png 857 | 00022815_015.png 858 | 00022815_020.png 859 | 00022815_031.png 860 | 00022815_037.png 861 | 00022815_058.png 862 | 00022815_068.png 863 | 00022815_073.png 864 | 00022815_079.png 865 | 00022872_001.png 866 | 00022872_002.png 867 | 00022975_004.png 868 | 00023027_000.png 869 | 00023068_015.png 870 | 00023129_000.png 871 | 00023160_003.png 872 | 00023176_019.png 873 | 00023192_000.png 874 | 00023195_000.png 875 | 00023197_000.png 876 | 00023254_003.png 877 | 00023271_016.png 878 | 00023325_037.png 879 | 00023325_039.png 880 | 00025066_000.png 881 | 00025203_000.png 882 | 00025223_000.png 883 | 00025290_014.png 884 | 00025445_001.png 885 | 00025513_001.png 886 | 00025513_005.png 887 | 00025513_006.png 888 | 00025513_007.png 889 | 00025513_008.png 890 | 00025513_009.png 891 | 00025513_010.png 892 | 00025513_011.png 893 | 00025513_012.png 894 | 00025513_013.png 895 | 00025513_014.png 896 | 00025529_010.png 897 | 00025628_024.png 898 | 00025628_026.png 899 | 00025628_027.png 900 | 00025664_037.png 901 | 00025665_000.png 902 | 00025691_000.png 903 | 00025691_002.png 904 | 
00025697_001.png 905 | 00025704_000.png 906 | 00025796_000.png 907 | 00025809_001.png 908 | 00025839_010.png 909 | 00025839_012.png 910 | 00025932_001.png 911 | 00025958_000.png 912 | 00025958_002.png 913 | 00025958_003.png 914 | 00025958_006.png 915 | 00026068_000.png 916 | 00026068_001.png 917 | 00026092_003.png 918 | 00026098_028.png 919 | 00026099_041.png 920 | 00026114_000.png 921 | 00026115_000.png 922 | 00026159_000.png 923 | 00026167_008.png 924 | 00026194_001.png 925 | 00026194_004.png 926 | 00026194_007.png 927 | 00026194_008.png 928 | 00026194_009.png 929 | 00026194_010.png 930 | 00026194_011.png 931 | 00026194_012.png 932 | 00026194_014.png 933 | 00026194_015.png 934 | 00026194_018.png 935 | 00026232_030.png 936 | 00026262_000.png 937 | 00026346_015.png 938 | 00026349_005.png 939 | 00026382_009.png 940 | 00026431_000.png 941 | 00026474_003.png 942 | 00026506_000.png 943 | 00026538_025.png 944 | 00026621_000.png 945 | 00026634_000.png 946 | 00026666_000.png 947 | 00026701_000.png 948 | 00026758_000.png 949 | 00026801_005.png 950 | 00026818_020.png 951 | 00026867_002.png 952 | 00026867_004.png 953 | 00026911_004.png 954 | 00026925_006.png 955 | 00026925_011.png 956 | 00026925_015.png 957 | 00026963_032.png 958 | 00026971_026.png 959 | 00026993_003.png 960 | 00026993_004.png 961 | 00027072_000.png 962 | 00027196_009.png 963 | 00027196_010.png 964 | 00027213_001.png 965 | 00027213_008.png 966 | 00027213_009.png 967 | 00027213_010.png 968 | 00027213_076.png 969 | 00027213_079.png 970 | 00027299_006.png 971 | 00027299_007.png 972 | 00027415_009.png 973 | 00027415_011.png 974 | 00027415_028.png 975 | 00027415_029.png 976 | 00027415_037.png 977 | 00027415_046.png 978 | 00027415_047.png 979 | 00027415_049.png 980 | 00027415_059.png 981 | 00027415_068.png 982 | 00027415_069.png 983 | 00027415_072.png 984 | 00027415_073.png 985 | 00027415_075.png 986 | 00027415_077.png 987 | 00027441_012.png 988 | 00027441_017.png 989 | 00027441_019.png 990 | 00027441_024.png 991 | 00027442_008.png 992 | 00027464_024.png 993 | 00027465_008.png 994 | 00027524_000.png 995 | 00027618_012.png 996 | 00027623_006.png 997 | 00027639_000.png 998 | 00027639_001.png 999 | 00027639_002.png 1000 | 00027639_003.png 1001 | 00027677_000.png 1002 | 00027710_000.png 1003 | 00027725_021.png 1004 | 00027725_035.png 1005 | 00027726_016.png 1006 | 00027726_019.png 1007 | 00027726_020.png 1008 | 00027726_021.png 1009 | 00027726_050.png 1010 | 00027726_051.png 1011 | 00027765_000.png 1012 | 00027765_002.png 1013 | 00027952_004.png 1014 | 00027981_000.png 1015 | 00027981_001.png 1016 | 00027981_002.png 1017 | 00028076_000.png 1018 | 00028092_000.png 1019 | 00028201_000.png 1020 | 00028211_012.png 1021 | 00028301_002.png 1022 | 00028341_001.png 1023 | 00028341_002.png 1024 | 00028341_003.png 1025 | 00028341_004.png 1026 | 00028341_005.png 1027 | 00028341_006.png 1028 | 00028341_007.png 1029 | 00028341_008.png 1030 | 00028341_009.png 1031 | 00028341_010.png 1032 | 00028341_011.png 1033 | 00028341_012.png 1034 | 00028389_000.png 1035 | 00028450_000.png 1036 | 00028454_011.png 1037 | 00028454_013.png 1038 | 00028474_000.png 1039 | 00028657_000.png 1040 | 00028799_000.png 1041 | 00028829_002.png 1042 | 00028873_017.png 1043 | 00028873_019.png 1044 | 00028873_020.png 1045 | 00028882_004.png 1046 | 00028961_006.png 1047 | 00028961_008.png 1048 | 00028996_002.png 1049 | 00028996_003.png 1050 | 00028996_004.png 1051 | 00029174_002.png 1052 | 00029222_003.png 1053 | 00029235_001.png 1054 | 00029245_002.png 1055 | 00029276_004.png 1056 | 
00029404_000.png 1057 | 00029404_002.png 1058 | 00029404_003.png 1059 | 00029404_004.png 1060 | 00029404_005.png 1061 | 00029404_006.png 1062 | 00029404_007.png 1063 | 00029404_008.png 1064 | 00029404_010.png 1065 | 00029476_003.png 1066 | 00029596_012.png 1067 | 00029627_000.png 1068 | 00029813_029.png 1069 | 00029943_022.png 1070 | 00030079_020.png 1071 | 00030079_031.png 1072 | 00030206_011.png 1073 | 00030209_011.png 1074 | 00030213_000.png 1075 | 00030245_001.png 1076 | 00030320_004.png 1077 | 00030320_006.png 1078 | 00030323_038.png 1079 | 00030410_005.png 1080 | 00030412_002.png 1081 | 00030609_000.png 1082 | 00030609_001.png 1083 | 00030609_002.png 1084 | 00030609_003.png 1085 | 00030609_006.png 1086 | 00030609_008.png 1087 | 00030609_009.png 1088 | 00030609_010.png 1089 | 00030609_011.png 1090 | 00030609_017.png 1091 | 00030609_021.png 1092 | 00030609_023.png 1093 | 00030609_026.png 1094 | 00030786_004.png 1095 | -------------------------------------------------------------------------------- /AzureChestXRay_AMLWB/Code/02_Model/060_Train_pyTorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Train\n", 8 | "\n", 9 | "##### Copyright (C) Microsoft Corporation. \n", 10 | "see license file for details " 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "# Allow multiple displays per cell\n", 20 | "from IPython.core.interactiveshell import InteractiveShell\n", 21 | "InteractiveShell.ast_node_interactivity = \"all\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/plain": [ 32 | "'/azureml-share/'" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "# AZUREML_NATIVE_SHARE_DIRECTORY mapping to host dir is set by _nativeSharedDirectory_ in .compute file \n", 42 | "\n", 43 | "import os\n", 44 | "try:\n", 45 | " amlWBSharedDir = os.environ['AZUREML_NATIVE_SHARE_DIRECTORY'] \n", 46 | "except:\n", 47 | " amlWBSharedDir = ''\n", 48 | " print('not using aml services?')\n", 49 | " \n", 50 | "amlWBSharedDir" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 3, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "## Data needs 2 things\n", 60 | "## TEMP (Get images)\n", 61 | "#crt_container = 'https://chestxray.blob.core.windows.net/chestxraynih'\n", 62 | "#crt_destination = '/mnt/images'\n", 63 | "#answer = !yes | azcopy \\\n", 64 | "# --source {crt_container} \\\n", 65 | "# --destination {crt_destination} \\\n", 66 | "# --recursive\n", 67 | "## TEMP (Get Labels csv)\n", 68 | "# Put to blob\n", 69 | "\n", 70 | "# Why not have a zip from blob that gets unzipped and has both images and csv?\n", 71 | "# Would make self-contained ..." 
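,
    "#\n",
    "# note (added): the cells below assume the NIH images and Data_Entry_2017.csv\n",
    "# are already present under the AMLWB shared directory; the concrete paths are\n",
    "# derived from prj_consts in the next cells."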
72 |    ]
73 |   },
74 |   {
75 |    "cell_type": "code",
76 |    "execution_count": 4,
77 |    "metadata": {},
78 |    "outputs": [],
79 |    "source": [
80 |     "import os\n",
81 |     "import sys\n",
82 |     "import numpy as np\n",
83 |     "import pandas as pd\n",
84 |     "import torch\n",
85 |     "import torchvision.models as models\n",
86 |     "import torch.nn as nn\n",
87 |     "import torch.nn.functional as F\n",
88 |     "import torch.optim as optim\n",
89 |     "import torch.nn.init as init\n",
90 |     "import time\n",
91 |     "from torch.optim.lr_scheduler import ReduceLROnPlateau\n",
92 |     "from torch.autograd import Variable\n",
93 |     "import torchvision.transforms as transforms\n",
94 |     "from torch.utils.data import DataLoader, Dataset\n",
95 |     "from sklearn.metrics import roc_auc_score\n",
96 |     "from sklearn.model_selection import train_test_split\n",
97 |     "from PIL import Image\n",
98 |     "import multiprocessing"
99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 5,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "assert torch.cuda.is_available()"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 6,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "torch.backends.cudnn.benchmark = True  # enables cudnn's auto-tuner"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 7,
122 |    "metadata": {},
123 |    "outputs": [
124 |     {
125 |      "name": "stdout",
126 |      "output_type": "stream",
127 |      "text": [
128 |       "OS: linux\n",
129 |       "Python: 3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, 17:53:06) \n",
130 |       "[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]\n",
131 |       "PyTorch: 0.3.1\n",
132 |       "CPUs: 12\n"
133 |      ]
134 |     }
135 |    ],
136 |    "source": [
137 |     "print(\"OS: \", sys.platform)\n",
138 |     "print(\"Python: \", sys.version)\n",
139 |     "print(\"PyTorch: \", torch.__version__)\n",
140 |     "CPU_COUNT = multiprocessing.cpu_count()\n",
141 |     "print(\"CPUs: \", CPU_COUNT)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 8,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# Globals\n",
151 |     "# With a small batch it may be faster on a P100 to use just 1 GPU\n",
152 |     "MULTI_GPU = True\n",
153 |     "CLASSES = 14\n",
154 |     "WIDTH = 224\n",
155 |     "HEIGHT = 224\n",
156 |     "CHANNELS = 3\n",
157 |     "LR = 0.0001\n",
158 |     "EPOCHS = 1  # 100 for a full training run\n",
159 |     "# Batch size can be scaled to the max for inference, but for training the LR will be affected\n",
160 |     "# Prob better to increase this though on P100 since LR is not too low\n",
161 |     "# Easier to see when plotted\n",
162 |     "BATCHSIZE = 64*2\n",
163 |     "IMAGENET_RGB_MEAN = [0.485, 0.456, 0.406]\n",
164 |     "IMAGENET_RGB_SD = [0.229, 0.224, 0.225]"
165 |    ]
166 |   },
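  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note (added): the notebook keeps LR fixed while BATCHSIZE is hand-tuned. A common heuristic (an assumption here, not something this notebook actually does) is to scale the learning rate linearly with the batch size; a minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added illustration only: linear LR-scaling heuristic (hypothetical, not used below).\n",
    "# REFERENCE_BATCH is an assumed baseline batch size for which LR was tuned.\n",
    "REFERENCE_BATCH = 64\n",
    "scaled_lr = LR * (BATCHSIZE / REFERENCE_BATCH)\n",
    "print('LR scaled for BATCHSIZE={}: {}'.format(BATCHSIZE, scaled_lr))  # 0.0002 for 128\n"
   ]
  },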
"metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC'" 205 | ] 206 | }, 207 | "execution_count": 10, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | }, 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "112120\r\n" 216 | ] 217 | }, 218 | { 219 | "data": { 220 | "text/plain": [ 221 | "'/azureml-share/chestxray/data/ChestX-ray8/ChestXray-NIHCC_other'" 222 | ] 223 | }, 224 | "execution_count": 10, 225 | "metadata": {}, 226 | "output_type": "execute_result" 227 | }, 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "BBox_List_2017.csv Data_Entry_2017.csv blacklist.csv\r\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# create the file path variables \n", 238 | "# paths are tipically container level dirs mapped to a host dir for data persistence.\n", 239 | "\n", 240 | "prj_consts = azure_chestxray_utils.chestxray_consts()\n", 241 | "\n", 242 | "data_base_input_dir=os.path.join(amlWBSharedDir, \n", 243 | " os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))\n", 244 | "data_base_output_dir=os.path.join(amlWBSharedDir, \n", 245 | " os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list))) \n", 246 | "nih_chest_xray_data_dir=os.path.join(data_base_input_dir, \n", 247 | " os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))\n", 248 | "other_data_dir=os.path.join(data_base_input_dir, \n", 249 | " os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))\n", 250 | "label_file = os.path.join(other_data_dir,'Data_Entry_2017.csv')\n", 251 | "\n", 252 | "data_partitions_dir=os.path.join(data_base_output_dir, \n", 253 | " os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list))) \n", 254 | "nih_chest_xray_data_dir\n", 255 | "!find $nih_chest_xray_data_dir -type f | wc -l\n", 256 | "\n", 257 | "other_data_dir\n", 258 | "!ls $other_data_dir" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 11, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "# Paths\n", 268 | "# BASE_DIR = \"/mnt\"\n", 269 | "# DATA_FOLDER = os.path.join(BASE_DIR, \"ChestXray-NIHCC\")\n", 270 | "# IMAGE_FOLDER = os.path.join(BASE_DIR, \"images\")\n", 271 | "# LABEL_FILE = os.path.join(DATA_FOLDER, \"Data_Entry_2017.csv\")\n", 272 | "# print(IMAGE_FOLDER, LABEL_FILE)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 12, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "#####################################################################################################\n", 282 | "## Data Loading\n", 283 | "#####################################################################################################" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 13, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# # todo\n", 293 | "# # This should prob be a generic function\n", 294 | "# # Split data into train/val/test\n", 295 | "\n", 296 | "# real_total_patient_number = 30805\n", 297 | "# patient_id_original = [i for i in range(real_total_patient_number + 1)]\n", 298 | "\n", 299 | "# bbox_df = pd.read_csv(os.path.join(other_data_dir, 'BBox_List_2017.csv'))\n", 300 | "\n", 301 | "# black_list_set = set()\n", 302 | "# with open(os.path.join(other_data_dir, 'blacklist.csv'), 'r') as f:\n", 303 | "# for line in f:\n", 304 | "# # delete the last char which is \\n\n", 305 | "# black_list_set.add(line[:-1])\n", 306 | "# if int(line[:-9]) >= 
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": 16,
386 |    "metadata": {},
387 |    "outputs": [],
388 |    "source": [
389 |     "class XrayData(Dataset):\n",
390 |     "    def __init__(self, img_dir, lbl_file, patient_ids, transform=None):\n",
391 |     "        \n",
392 |     "        # Read labels-csv\n",
393 |     "        df = pd.read_csv(lbl_file)\n",
394 |     "        # Filter by patient-ids\n",
395 |     "        df = df[df['Patient ID'].isin(patient_ids)]\n",
396 |     "        # Split labels\n",
397 |     "        df_label = df['Finding Labels'].str.split(\n",
398 |     "            '|', 
expand=False).str.join(sep='*').str.get_dummies(sep='*')\n", 399 | " df_label.drop(['No Finding'], axis=1, inplace=True)\n", 400 | " \n", 401 | " # List of images (full-path)\n", 402 | " self.img_locs = df['Image Index'].map(lambda im: os.path.join(img_dir, im)).values\n", 403 | " # One-hot encoded labels (float32 for BCE loss)\n", 404 | " self.labels = df_label.values\n", 405 | " # Processing\n", 406 | " self.transform = transform\n", 407 | " \n", 408 | " print(\"Loaded {} labels and {} images\".format(len(self.labels), \n", 409 | " len(self.img_locs)))\n", 410 | " \n", 411 | " def __getitem__(self, idx):\n", 412 | " \n", 413 | " im_file = self.img_locs[idx]\n", 414 | " im_rgb = Image.open(im_file).convert('RGB')\n", 415 | " label = self.labels[idx]\n", 416 | " if self.transform is not None:\n", 417 | " im_rgb = self.transform(im_rgb)\n", 418 | " return im_rgb, torch.FloatTensor(label)\n", 419 | " \n", 420 | " def __len__(self):\n", 421 | " return len(self.img_locs)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 17, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "def no_augmentation_dataset(img_dir, lbl_file, patient_ids, normalize):\n", 431 | " dataset = XrayData(img_dir, lbl_file, patient_ids,\n", 432 | " transform=transforms.Compose([\n", 433 | " transforms.Resize(WIDTH),\n", 434 | " transforms.ToTensor(), \n", 435 | " normalize]))\n", 436 | " return dataset" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 18, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "Loaded 69217 labels and 69217 images\n" 449 | ] 450 | } 451 | ], 452 | "source": [ 453 | "# Dataset for training\n", 454 | "# Normalise by imagenet mean/sd\n", 455 | "normalize = transforms.Normalize(IMAGENET_RGB_MEAN, IMAGENET_RGB_SD)\n", 456 | "# todo\n", 457 | "# Go wild here with the transforms\n", 458 | "# https://github.com/pytorch/vision/blob/master/torchvision/transforms/transforms.py\n", 459 | "#__all__ = [\"Compose\", \"ToTensor\", \"ToPILImage\", \"Normalize\", \"Resize\", \"Scale\", \"CenterCrop\", \"Pad\",\n", 460 | "# \"Lambda\", \"RandomCrop\", \"RandomHorizontalFlip\", \"RandomVerticalFlip\", \"RandomResizedCrop\",\n", 461 | "# \"RandomSizedCrop\", \"FiveCrop\", \"TenCrop\", \"LinearTransformation\", \"ColorJitter\", \"RandomRotation\",\n", 462 | "# \"Grayscale\", \"RandomGrayscale\"]\n", 463 | "train_dataset = XrayData(img_dir=nih_chest_xray_data_dir,\n", 464 | " lbl_file=label_file,\n", 465 | " patient_ids=train_set,\n", 466 | " transform=transforms.Compose([\n", 467 | " transforms.Resize(264),\n", 468 | " transforms.RandomHorizontalFlip(),\n", 469 | " transforms.RandomResizedCrop(size=WIDTH),\n", 470 | " transforms.ColorJitter(0.15, 0.15),\n", 471 | " transforms.RandomRotation(15),\n", 472 | " transforms.ToTensor(), # need to convert image to tensor!\n", 473 | " normalize]))" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 19, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Loaded 9600 labels and 9600 images\n", 486 | "Loaded 33303 labels and 33303 images\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "valid_dataset = no_augmentation_dataset(nih_chest_xray_data_dir, label_file, valid_set, normalize)\n", 492 | "test_dataset = no_augmentation_dataset(nih_chest_xray_data_dir, label_file, test_set, normalize)" 493 | ] 494 | }, 495 | { 496 | 
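"cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note (added): a quick, optional sanity check on the datasets defined above; with the Resize/Crop transforms targeting WIDTH=224, each sample should be a 3x224x224 float tensor paired with a 14-dim label vector."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added sanity check (not in the original notebook); reads one image from disk.\n",
    "sample_img, sample_lbl = train_dataset[0]\n",
    "print(sample_img.shape)  # expected: torch.Size([3, 224, 224])\n",
    "print(sample_lbl.shape)  # expected: torch.Size([14])\n"
   ]
  },
  {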
"cell_type": "code", 497 | "execution_count": 20, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "#####################################################################################################\n", 502 | "## Helper Functions\n", 503 | "#####################################################################################################" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 21, 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "def get_symbol(out_features=CLASSES, multi_gpu=MULTI_GPU):\n", 513 | " model = models.densenet.densenet121(pretrained=True)\n", 514 | " # Replace classifier (FC-1000) with (FC-14)\n", 515 | " model.classifier = nn.Sequential(\n", 516 | " nn.Linear(model.classifier.in_features, out_features), \n", 517 | " nn.Sigmoid())\n", 518 | " if multi_gpu:\n", 519 | " model = nn.DataParallel(model)\n", 520 | " # CUDA\n", 521 | " model.cuda() \n", 522 | " return model" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 22, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "def init_symbol(sym, lr=LR):\n", 532 | " # torch.optim.Adam(params, lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)\n", 533 | " opt = optim.Adam(sym.parameters(), lr=lr, betas=(0.9, 0.999))\n", 534 | " criterion = nn.BCELoss()\n", 535 | " scheduler = ReduceLROnPlateau(opt, factor = 0.1, patience = 5, mode = 'min')\n", 536 | " return opt, criterion, scheduler " 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 23, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "def compute_roc_auc(data_gt, data_pd, mean=True, classes=CLASSES):\n", 546 | " roc_auc = []\n", 547 | " data_gt = data_gt.cpu().numpy()\n", 548 | " data_pd = data_pd.cpu().numpy()\n", 549 | " for i in range(classes):\n", 550 | " roc_auc.append(roc_auc_score(data_gt[:, i], data_pd[:, i]))\n", 551 | " if mean:\n", 552 | " roc_auc = np.mean(roc_auc)\n", 553 | " return roc_auc" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 24, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "def train_epoch(model, dataloader, optimizer, criterion, epoch, batch=BATCHSIZE):\n", 563 | " model.train()\n", 564 | " print(\"Training epoch {}\".format(epoch+1))\n", 565 | " loss_val = 0\n", 566 | " loss_cnt = 0\n", 567 | " for data, target in dataloader:\n", 568 | " # Get samples\n", 569 | " data = Variable(torch.FloatTensor(data).cuda())\n", 570 | " target = Variable(torch.FloatTensor(target).cuda())\n", 571 | " # Init\n", 572 | " optimizer.zero_grad()\n", 573 | " # Forwards\n", 574 | " output = model(data)\n", 575 | " # Loss\n", 576 | " loss = criterion(output, target)\n", 577 | " # Back-prop\n", 578 | " loss.backward()\n", 579 | " optimizer.step() \n", 580 | " # Log the loss\n", 581 | " loss_val += loss.data[0]\n", 582 | " loss_cnt += 1\n", 583 | " print(\"Training loss: {0:.4f}\".format(loss_val/loss_cnt))" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 25, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [ 592 | "def valid_epoch(model, dataloader, criterion, epoch, phase='valid', batch=BATCHSIZE):\n", 593 | " model.eval()\n", 594 | " if phase == 'testing':\n", 595 | " print(\"Testing epoch {}\".format(epoch+1))\n", 596 | " else:\n", 597 | " print(\"Validating epoch {}\".format(epoch+1))\n", 598 | " out_pred = torch.FloatTensor().cuda()\n", 599 | " out_gt = torch.FloatTensor().cuda()\n", 600 | " 
    loss_val = 0\n", 601 |     "    loss_cnt = 0\n", 602 |     "    for data, target in dataloader:\n", 603 |     "        # Get samples (volatile=True is the pre-0.4 PyTorch idiom for inference without autograd history)\n", 604 |     "        data = Variable(torch.FloatTensor(data).cuda(), volatile=True)\n", 605 |     "        target = Variable(torch.FloatTensor(target).cuda(), volatile=True)\n", 606 |     "        # Forwards\n", 607 |     "        output = model(data)\n", 608 |     "        # Loss\n", 609 |     "        loss = criterion(output, target)\n", 610 |     "        # Log the loss\n", 611 |     "        loss_val += loss.data[0]\n", 612 |     "        loss_cnt += 1\n", 613 |     "        # Log for AUC\n", 614 |     "        out_pred = torch.cat((out_pred, output.data), 0)\n", 615 |     "        out_gt = torch.cat((out_gt, target.data), 0)\n", 616 |     "    loss_mean = loss_val/loss_cnt\n", 617 |     "    if phase == 'testing':\n", 618 |     "        print(\"Test-Dataset loss: {0:.4f}\".format(loss_mean))\n", 619 |     "        print(\"Test-Dataset AUC: {0:.4f}\".format(compute_roc_auc(out_gt, out_pred)))\n", 620 |     "\n", 621 |     "    else:\n", 622 |     "        print(\"Validation loss: {0:.4f}\".format(loss_mean))\n", 623 |     "        print(\"Validation AUC: {0:.4f}\".format(compute_roc_auc(out_gt, out_pred)))\n", 624 |     "    return loss_mean" 625 |    ] 626 |   }, 627 |   { 628 |    "cell_type": "code", 629 |    "execution_count": 26, 630 |    "metadata": {}, 631 |    "outputs": [], 632 |    "source": [ 633 |     "def print_learning_rate(opt):\n", 634 |     "    for param_group in opt.param_groups:\n", 635 |     "        print(\"Learning rate: \", param_group['lr'])" 636 |    ] 637 |   }, 638 |   { 639 |    "cell_type": "code", 640 |    "execution_count": 27, 641 |    "metadata": {}, 642 |    "outputs": [], 643 |    "source": [ 644 |     "# DataLoaders\n", 645 |     "# num_workers=4*CPU_COUNT keeps the GPU fed with preprocessed batches\n", 646 |     "# pin_memory=True could further speed up host-to-device copies\n", 647 |     "train_loader = DataLoader(dataset=train_dataset, batch_size=BATCHSIZE,\n", 648 |     "                          shuffle=True, num_workers=4*CPU_COUNT, pin_memory=False)\n", 649 |     "\n", 650 |     "valid_loader = DataLoader(dataset=valid_dataset, batch_size=8*BATCHSIZE,\n", 651 |     "                          shuffle=False, num_workers=0, pin_memory=False)\n", 652 |     "\n", 653 |     "test_loader = DataLoader(dataset=test_dataset, batch_size=8*BATCHSIZE,\n", 654 |     "                          shuffle=False, num_workers=4*CPU_COUNT, pin_memory=False)" 655 |    ] 656 |   }, 657 |   { 658 |    "cell_type": "code", 659 |    "execution_count": 28, 660 |    "metadata": {}, 661 |    "outputs": [], 662 |    "source": [ 663 |     "#####################################################################################################\n", 664 |     "## Train Azure Chest Xray\n", 665 |     "#####################################################################################################" 666 |    ] 667 |   }, 668 |   { 669 |    "cell_type": "code", 670 |    "execution_count": 29, 671 |    "metadata": {}, 672 |    "outputs": [ 673 |     { 674 |      "name": "stderr", 675 |      "output_type": "stream", 676 |      "text": [ 677 |       "Downloading: \"https://download.pytorch.org/models/densenet121-a639ec97.pth\" to /home/mmlspark/.torch/models/densenet121-a639ec97.pth\n", 678 |       "100%|██████████| 32342954/32342954 [00:00<00:00, 50666489.38it/s]\n" 679 |      ] 680 |     }, 681 |     { 682 |      "name": "stdout", 683 |      "output_type": "stream", 684 |      "text": [ 685 |       "CPU times: user 2.42 s, sys: 910 ms, total: 3.33 s\n", 686 |       "Wall time: 18.3 s\n" 687 |      ] 688 |     } 689 |    ], 690 |    "source": [ 691 |     "%%time\n", 692 |     "# Load symbol\n", 693 |     "azure_chest_xray_sym = get_symbol()" 694 |    ] 695 |   }, 696 |   { 697 |    "cell_type": "code", 698 |    "execution_count": 30, 699 |    "metadata": {}, 700 |    "outputs": [ 701 |     { 702 |      "name": "stdout", 703 |      "output_type": "stream", 704 |      "text": [ 705 |       "CPU times: user 2.04 ms, sys: 136 µs, total: 2.18 ms\n", 706 |       "Wall time: 2.18 ms\n" 707 |      ] 708 |     } 709 |    ], 710 |    "source": [ 711 |     "%%time\n", 712 |     "# Load 
optimiser, loss\n", 713 | "optimizer, criterion, scheduler = init_symbol(azure_chest_xray_sym)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "Wed Feb 14 08:09:54 2018 \n", 726 | "+-----------------------------------------------------------------------------+\n", 727 | "| NVIDIA-SMI 384.111 Driver Version: 384.111 |\n", 728 | "|-------------------------------+----------------------+----------------------+\n", 729 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 730 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 731 | "|===============================+======================+======================|\n", 732 | "| 0 Tesla K80 Off | 00002CD7:00:00.0 Off | Off |\n", 733 | "| N/A 38C P0 72W / 149W | 241MiB / 12205MiB | 0% Default |\n", 734 | "+-------------------------------+----------------------+----------------------+\n", 735 | "| 1 Tesla K80 Off | 000045C0:00:00.0 Off | Off |\n", 736 | "| N/A 42C P8 28W / 149W | 11MiB / 12205MiB | 0% Default |\n", 737 | "+-------------------------------+----------------------+----------------------+\n", 738 | " \n", 739 | "+-----------------------------------------------------------------------------+\n", 740 | "| Processes: GPU Memory |\n", 741 | "| GPU PID Type Process name Usage |\n", 742 | "|=============================================================================|\n", 743 | "+-----------------------------------------------------------------------------+\n", 744 | "CUDA Version 8.0.61\n", 745 | "CUDA Version 8.0.61\n" 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "!nvidia-smi\n", 751 | "!cat /usr/local/cuda-8.0/version.txt\n", 752 | "!cat /usr/local/cuda/version.txt\n" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "name": "stdout", 762 | "output_type": "stream", 763 | "text": [ 764 | "Validating epoch 0\n", 765 | "Validation loss: 0.6890\n", 766 | "Validation AUC: 0.4773\n" 767 | ] 768 | }, 769 | { 770 | "data": { 771 | "text/plain": [ 772 | "0.689042454957962" 773 | ] 774 | }, 775 | "execution_count": 32, 776 | "metadata": {}, 777 | "output_type": "execute_result" 778 | }, 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "Training epoch 1\n" 784 | ] 785 | } 786 | ], 787 | "source": [ 788 | "# Original CheXNet ROC AUC = 0.841\n", 789 | "loss_min = float(\"inf\") \n", 790 | "stime = time.time()\n", 791 | "\n", 792 | "# No-training\n", 793 | "valid_epoch(azure_chest_xray_sym, valid_loader, criterion, -1)\n", 794 | "\n", 795 | "# Main train/val/test loop\n", 796 | "for j in range(EPOCHS):\n", 797 | " train_epoch(azure_chest_xray_sym, train_loader, optimizer, criterion, j)\n", 798 | " loss_val = valid_epoch(azure_chest_xray_sym, valid_loader, criterion, j)\n", 799 | " test_loss_val = valid_epoch(azure_chest_xray_sym, test_loader, criterion, j, 'testing')\n", 800 | " # LR Schedule\n", 801 | " scheduler.step(loss_val)\n", 802 | " print_learning_rate(optimizer)\n", 803 | " # todo: tensorboard hooks\n", 804 | " # Logging\n", 805 | " if loss_val < loss_min:\n", 806 | " print(\"Loss decreased. 
Saving ...\")\n", 807 | " loss_min = loss_val\n", 808 | " torch.save({'epoch': j + 1, \n", 809 | " 'state_dict': azure_chest_xray_sym.state_dict(), \n", 810 | " 'best_loss': loss_min, \n", 811 | " 'optimizer' : optimizer.state_dict()}, 'best_azure_chest_xray_model_v2.pth.tar')\n", 812 | " etime = time.time()\n", 813 | " print(\"Epoch time: {0:.0f} seconds\".format(etime-stime))\n", 814 | " print(\"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\")" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "#####################################################################################################\n", 824 | "## Test azure_chest_xray\n", 825 | "#####################################################################################################" 826 | ] 827 | }, 828 | { 829 | "cell_type": "code", 830 | "execution_count": null, 831 | "metadata": {}, 832 | "outputs": [], 833 | "source": [ 834 | "# Load model for testing\n", 835 | "azure_chest_xray_sym_test = get_symbol()\n", 836 | "chkpt = torch.load(\"best_azure_chest_xray_model_v2.pth.tar\")\n", 837 | "chexnet_sym_test.load_state_dict(chkpt['state_dict'])" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "valid_loss = valid_epoch(azure_chest_xray_sym_test, valid_loader, criterion, -1)\n", 847 | "test_loss = valid_epoch(azure_chest_xray_sym_test, test_loader, criterion, -1, 'testing')" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [ 856 | "#import torch.onnx\n", 857 | "#dummy_input = Variable(torch.randn(BATCHSIZE, CHANNELS, HEIGHT, WIDTH)).cuda()\n", 858 | "#torch.onnx.export(azure_chest_xray_sym_test, dummy_input, \"azure_chest_xray.proto\", verbose=True)" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "# jupyter nbconvert --to html .\\Code\\02_Model\\060_Train_pyTorch.ipynb" 868 | ] 869 | } 870 | ], 871 | "metadata": { 872 | "kernelspec": { 873 | "display_name": "Python 3", 874 | "language": "python", 875 | "name": "python3" 876 | }, 877 | "language_info": { 878 | "codemirror_mode": { 879 | "name": "ipython", 880 | "version": 3 881 | }, 882 | "file_extension": ".py", 883 | "mimetype": "text/x-python", 884 | "name": "python", 885 | "nbconvert_exporter": "python", 886 | "pygments_lexer": "ipython3", 887 | "version": "3.6.3" 888 | } 889 | }, 890 | "nbformat": 4, 891 | "nbformat_minor": 2 892 | } 893 | --------------------------------------------------------------------------------