├── .gitignore ├── README.md ├── chexpert_supervised ├── .gitignore ├── README.md ├── chexpert-model │ ├── args │ │ ├── __init__.py │ │ ├── base_arg_parser.py │ │ ├── test_arg_parser.py │ │ └── train_arg_parser.py │ ├── bash_scripts │ │ ├── finetune_normal.sh │ │ ├── finetune_normal2.sh │ │ ├── train_chexpert_models.sh │ │ ├── train_intermountain_models.sh │ │ ├── train_synthetic.sh │ │ └── valid_ignore.sh │ ├── calibrate.py │ ├── calibration_params.json │ ├── cams │ │ ├── __init__.py │ │ ├── base_cam.py │ │ ├── ensemble_cam.py │ │ ├── grad_cam.py │ │ ├── guided_backprop.py │ │ └── model_cam_configs.json │ ├── confidence_interval.py │ ├── confidence_interval_diff.py │ ├── constants │ │ ├── __init__.py │ │ └── constants.py │ ├── data │ │ ├── __init__.py │ │ ├── base_dataset.py │ │ ├── chexpert_dataset.py │ │ ├── custom_dataset.py │ │ ├── loader.py │ │ ├── pad_collate.py │ │ └── task_sequences.json │ ├── dataset │ │ ├── __init__.py │ │ ├── base_dataset.py │ │ ├── ckpts │ │ │ └── debugging │ │ │ │ └── args.json │ │ ├── concat_dataset.py │ │ ├── constants.py │ │ ├── get_loader.py │ │ ├── label_mapper.py │ │ ├── nih_dataset.py │ │ ├── pad_collate.py │ │ ├── predict_config.json │ │ ├── su_dataset.py │ │ ├── task_sequences.json │ │ └── transforms │ │ │ ├── __init__.py │ │ │ └── clahe.py │ ├── eval │ │ ├── __init__.py │ │ ├── average_meter.py │ │ ├── below_curve_counter.py │ │ ├── evaluator.py │ │ └── loss.py │ ├── logger │ │ ├── __init__.py │ │ └── logger.py │ ├── models │ │ ├── __init__.py │ │ ├── calibrate.py │ │ └── models.py │ ├── optim │ │ ├── __init__.py │ │ └── optimizer.py │ ├── predict │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── final.json │ │ │ └── toy.json │ │ ├── ensemble_predict.py │ │ └── predict.py │ ├── saver │ │ ├── __init__.py │ │ └── model_saver.py │ ├── sbatch │ │ ├── gen_sbatch.py │ │ └── job_management.py │ ├── scripts │ │ ├── get_cams.py │ │ ├── get_model_size.py │ │ └── map_uncertain.py │ ├── select_ensemble.py │ ├── test.py │ ├── test_images.py │ ├── test_one.py │ ├── timeout_test.py │ ├── train.py │ └── util │ │ ├── __init__.py │ │ ├── cuda_util.py │ │ ├── image_util.py │ │ ├── io_util.py │ │ ├── label_util.py │ │ └── model_util.py └── environment.yml ├── image_source ├── contrastive_learning.PNG ├── cx_all_full_ci.PNG ├── cx_all_last_ci.PNG └── moco_flowchart_new.PNG └── moco_pretraining ├── moco ├── LICENSE ├── aihc_utils │ ├── __init__.py │ ├── image_transform.py │ └── storage_util.py ├── detection │ ├── README.md │ ├── configs │ │ ├── Base-RCNN-C4-BN.yaml │ │ ├── coco_R_50_C4_2x.yaml │ │ ├── coco_R_50_C4_2x_moco.yaml │ │ ├── pascal_voc_R_50_C4_24k.yaml │ │ └── pascal_voc_R_50_C4_24k_moco.yaml │ ├── convert-pretrain-to-detectron2.py │ └── train_net.py ├── main_lincls.py ├── main_moco.py ├── moco │ ├── __init__.py │ ├── builder.py │ └── loader.py └── training_tools │ ├── __init__.py │ ├── combiner.py │ ├── evaluator.py │ └── meters.py └── scripts ├── convert_to_chexpert.py ├── generate_moco_training_scripts.py ├── parse_log.py ├── reorganize_files.py ├── resize.sh ├── shenzhen_mutiple_split.py ├── split_into_train_val.py └── training_scripts ├── r8w1n416.sh ├── sbatch_lincls_template.sh ├── sbatch_moco_lincls.sh ├── sbatch_moco_train.sh └── sbatch_moco_train_local.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # AIHC specialized ignores 2 | __pycache__/ 3 | *.pyc 4 | *~ 5 | .DS_Store 6 | ._* 7 | *.jpg 8 | *.pth.tar 9 | *.ipynb_checkpoints 10 | logs/ 11 | 12 | ## Copied from 
https://github.com/github/gitignore/edit/master/Python.gitignore 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | cover/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | db.sqlite3 75 | db.sqlite3-journal 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | .pybuilder/ 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | # For a library or package, you might want to ignore these files since the code is 100 | # intended to run in multiple environments; otherwise, check them in: 101 | # .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MoCo-CXR: MoCo Pretraining Improves Representations and Transferability of Chest X-Ray Models

This repository contains
* A modified version of [the MoCo implementation](https://github.com/facebookresearch/moco), adapted to accommodate the CheXpert dataset
* A modified version of the original implementation of [the CheXpert paper](https://arxiv.org/pdf/1901.07031.pdf)

A preprint of this work is available on [arXiv](https://arxiv.org/pdf/2010.05352.pdf)
```
@article{sowrirajanmoco,
  title={MoCo-CXR: MoCo Pretraining Improves Representation and Transferability of Chest X-ray Models},
  author={Sowrirajan, Hari and Yang, Jingbo and Ng, Andrew Y and Rajpurkar, Pranav}
}
```

This work has been presented at
* the ACM Conference on Health, Inference and Learning (CHIL 2020) workshop
* Medical Imaging with Deep Learning (MIDL 2021)

## Abstract
```
Contrastive learning is a form of self-supervision that can leverage unlabeled data to produce pretrained models. While contrastive learning has demonstrated promising results on natural image classification tasks, its application to medical imaging tasks like chest X-ray interpretation has been limited. In this work, we propose MoCo-CXR, an adaptation of the contrastive learning method Momentum Contrast (MoCo), to produce models with better representations and initializations for the detection of pathologies in chest X-rays. In detecting pleural effusion, we find that linear models trained on MoCo-CXR-pretrained representations outperform those without MoCo-CXR-pretrained representations, indicating that MoCo-CXR-pretrained representations are of higher quality. End-to-end fine-tuning experiments reveal that a model initialized via MoCo-CXR-pretraining outperforms its non-MoCo-CXR-pretrained counterpart. We find that MoCo-CXR-pretraining provides the most benefit with limited labeled training data. Finally, we demonstrate similar results on a target Tuberculosis dataset unseen during pretraining, indicating that MoCo-CXR-pretraining endows models with representations and transferability that can be applied across chest X-ray datasets and tasks.
```

## Methods

MoCo-CXR uses momentum contrast as an unsupervised training method. This method maximizes agreement between augmentations of the same image while increasing the distance to the momentum-weighted negative embeddings.

drawing

MoCo-CXR starts from ResNet-initialized weights, which are then trained in an unsupervised manner. Supervised learning is then performed on different label fractions for the CheXpert dataset and the Shenzhen dataset.
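For intuition, the sketch below condenses the momentum-contrast training step that `moco/builder.py` implements (the queue update, shuffled batch norm, and distributed details are omitted); the names here are illustrative, not this repository's API.

```python
import torch
import torch.nn.functional as F

def moco_step(encoder_q, encoder_k, queue, x_q, x_k, m=0.999, t=0.07):
    """One simplified momentum-contrast step; queue is a C x K buffer of negatives."""
    q = F.normalize(encoder_q(x_q), dim=1)                # queries: N x C
    with torch.no_grad():
        # Momentum update: the key encoder slowly trails the query encoder.
        for p_q, p_k in zip(encoder_q.parameters(), encoder_k.parameters()):
            p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)
        k = F.normalize(encoder_k(x_k), dim=1)            # keys: N x C
    l_pos = torch.einsum('nc,nc->n', q, k).unsqueeze(-1)  # agreement with the positive key
    l_neg = torch.einsum('nc,ck->nk', q, queue)           # similarity to queued negatives
    logits = torch.cat([l_pos, l_neg], dim=1) / t         # temperature-scaled logits
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=q.device)
    return F.cross_entropy(logits, labels)                # InfoNCE: positive is class 0
```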
drawing


## Evaluation

Comparison of MoCo-CXR performance against the ResNet-initialized baseline when only the linear layers are fine-tuned:

drawing

Comparison of MoCo-CXR performance against the ResNet-initialized baseline when all layers are allowed to be fine-tuned:

drawing

## Checkpoints
* https://storage.googleapis.com/moco-cxr/mnn-00001.pth.tar
* https://storage.googleapis.com/moco-cxr/r8w-00001-v2.pth.tar (slightly different, but produces results similar to v1)
* https://storage.googleapis.com/moco-cxr/r8w-00001.pth.tar
* https://storage.googleapis.com/moco-cxr/r8w-0001.pth.tar
* https://storage.googleapis.com/moco-cxr/r8w-001.pth.tar
* https://storage.googleapis.com/moco-cxr/r5w-00001.pth.tar
* https://storage.googleapis.com/moco-cxr/d1w-00001.pth.tar

Note that these checkpoints follow MoCo's implementation. To re-use them in an ImageNet-like training process, you will have to "hack" the checkpoint weights in ways similar to our [model saver](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/chexpert_supervised/chexpert-model/saver/model_saver.py); a minimal sketch is given at the end of this section.

## Running the experiments

### Pre-Training
Note that the checkpoint names above encode the learning rate with the decimal point dropped; that is, 00001 means 0.0001 = 1e-4.
Our experiments were conducted on Stanford's SLURM cluster. For reference, the training script used is [here](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/training_scripts/r8w1n416.sh). Alternatively, if you are running it on a "vanilla" machine, you can reference [this script](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/training_scripts/sbatch_moco_train_local.sh). You could also reference [a generation script](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/generate_moco_training_scripts.py) if you would like to generate commands for different learning rates and/or backbone models.

### MoCo-CXR Training with CheXpert

We used splitting scripts like [this one](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/reorganize_files.py) to split the data into training and validation sets. These scripts also generate the various draws used to produce confidence intervals for the evaluation of our semi-supervised approach.

For the Shenzhen dataset, we used [this script](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/convert_to_chexpert.py) to convert the unpacked Shenzhen files into CheXpert's default format for easier experiment setup. Note that the actual CheXpert pipeline is a three-step process: training, model picking (selecting the best checkpoint), and evaluation. Each independent "draw" went through this process.
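As noted in the Checkpoints section, the MoCo-formatted weights must be renamed before a plain torchvision backbone can load them. The sketch below is illustrative only: it assumes, following the reference MoCo implementation (see `main_lincls.py` under `moco_pretraining/moco/`), that the query encoder is stored under `module.encoder_q.*`, and it assumes a ResNet-18 backbone for the `r8w` checkpoints.

```python
import torch
import torchvision.models as models

ckpt = torch.load('r8w-00001.pth.tar', map_location='cpu')
state_dict = ckpt['state_dict']
for key in list(state_dict.keys()):
    # Keep only the query encoder's backbone weights, renamed to plain
    # torchvision names; drop the momentum encoder, the queue, and the head.
    if key.startswith('module.encoder_q.') and not key.startswith('module.encoder_q.fc'):
        state_dict[key[len('module.encoder_q.'):]] = state_dict[key]
    del state_dict[key]

model = models.resnet18()
msg = model.load_state_dict(state_dict, strict=False)
print(msg.missing_keys)  # expect only the classifier head (fc.weight, fc.bias)
```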

## Additional Information

* [Shenzhen dataset](https://qims.amegroups.com/article/view/5132/6030)
* [CheXpert leaderboard](https://stanfordmlgroup.github.io/competitions/chexpert/)
* [CheXtransfer](https://www.chilconference.org/proceeding_P11.html)
* [CheXternal](https://www.chilconference.org/proceeding_P12.html)
* [VisualCheXbert](https://www.chilconference.org/proceeding_P10.html)
--------------------------------------------------------------------------------
/chexpert_supervised/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.pyc
*~
.DS_Store
._*
*.jpg
*.pth.tar
*.ipynb_checkpoints
logs/
--------------------------------------------------------------------------------
/chexpert_supervised/README.md:
--------------------------------------------------------------------------------
# aihc-winter19-robustness
Repo for a project on the robustness of medical-image models.
Development branch for conaug-2020.

## Activate environment
The default environment to activate is defined in `CONAUG_ENV` (line 23) of chexpert-model/sbatch/conaug/sbatch_commands/envs.py.

Point it to your own virtual environment if needed, e.g.
```
source /deep/u/canliu/envs/aihc_chexpert/bin/activate
```

## chexpert-model
This directory is a fork of the original chexpert-model in our organization.

### Usage
#### Automation
##### See relevant code in chexpert-model/sbatch/conaug.

##### Generate sbatch scripts en masse:
1. Set up the finetuning config by modifying *chexpert-model/sbatch/conaug/configs/finetune.json*.
2. Specify the experiments to finetune by entering their names in *CKPT_LIST* in *chexpert-model/sbatch/conaug/script_generation.py*.
3. Generate the scripts:
```
python chexpert-model/sbatch/conaug/script_generation.py
```
with optional arguments:
```
--user_id: owner of the pretrained checkpoints. Default: the account running the script-generation code.
--epochs: number of epochs. Default: calculated automatically based on label fractions.
--cpu: CPUs per task. Default: 4.
--mem: CPU memory to request. Default: 32000.
--log_path: directory in which to log job status. Default: /sailhome//experiments.
```
Note that each group of finetuning experiments is associated with a unique timestamp.

##### Launch sbatch jobs:
1. Specify the jobs to run in the *CONFIG* dictionary in *chexpert-model/sbatch/conaug/job_management.py*.
2. Launch the jobs:
```
python chexpert-model/sbatch/conaug/job_management.py
```
with optional arguments:
```
--refresh: how frequently to refresh the screen (to print out current job status).
```

---------------
The following sections are informative but may be outdated.

#### Training
Single model (default train and val set):
```
python train.py --dataset chexpert --save_dir <save_dir> --experiment_name <experiment_name>
```
Ensemble model: training the ensemble consists of individually training 15 models. It may be convenient to use sbatch to train these models separately.
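The individually trained models are ensembled at prediction time through a JSON config passed to `--config_path` (see Testing below). The real schema lives in `predict/configs/final.json` (not shown in this excerpt); the following is a hypothetical minimal example inferred from the `CFG_*` keys in `constants/constants.py` (`task2models`, `aggregation_method`, `ckpt_path`, `is_3class`), with made-up paths:

```json
{
  "aggregation_method": "mean",
  "task2models": {
    "Pleural Effusion": [
      {"ckpt_path": "/path/to/run1/best.pth.tar", "is_3class": false},
      {"ckpt_path": "/path/to/run2/best.pth.tar", "is_3class": true}
    ]
  }
}
```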
Single model (custom train and val set): please use full paths to the images in the csvs if custom_dataset=True
```
python train.py --dataset custom --train_custom_csv <train_csv> --val_custom_csv <val_csv> --save_dir <save_dir> --experiment_name <experiment_name>
```
(please look at train_arg_parser for other flags such as gpu, number of epochs, etc.)


#### Testing
Single model (default test set):
```
python test.py --dataset chexpert --ckpt_path <ckpt_path> --phase {valid, test} --save_dir <save_dir>
```
Ensemble (default test set):
```
python test.py --dataset chexpert --config_path <config_path> --phase {valid, test} --save_dir <save_dir>
```
Single model (custom test set, separate test gt/paths): please use full paths to the images in the csvs if custom_dataset=True
```
python test.py --dataset custom --ckpt_path <ckpt_path> --phase {valid, test} --save_dir <save_dir> --test_groundtruth <gt_csv> --test_image_paths <paths_csv>
```
Single model (custom test set, test csv): please use full paths to the images in the csvs if custom_dataset=True
```
python test.py --dataset custom --ckpt_path <ckpt_path> --phase {valid, test} --save_dir <save_dir> --together True --test_csv <test_csv>
```

(please look at test_arg_parser for other flags such as save_cams)

A note on CAM generation:
`--save_cams True`: generate CAMs
`--only_competition_cams True`: only generate CAMs for the competition classes
CAMs will only be generated for classes whose groundtruth is 1.

### Reproduce CheXpert test results
`python test.py --dataset chexpert --config_path predict/configs/final.json --phase test --save_dir <save_dir>`

### Evaluating a pre-trained model
Some pre-trained models are available in `/deep/group/CheXpert/final_ckpts/`. We can try a 3-class model. Make a temporary folder `[temp]`, and do:
```
cp /deep/group/CheXpert/final_ckpts/CheXpert-3-class/best.pth.tar [temp]
cp /deep/group/CheXpert/final_ckpts/CheXpert-3-class/args.json [temp]
cd [repo]/chexpert-model/
python test.py --dataset chexpert --ckpt_path [temp]/best.pth.tar --phase {valid, test} --model_uncertainty True --save_dir <save_dir>
```
Regarding the structure of the `[temp]` folder, let `[phase]` be the phase selected previously. Then `[temp]/results/[phase]/scores.txt` contains a variety of metrics tabulated by the `Evaluator`. On the branch `mark_model_analysis`, `test.py` also saves `groundtruth.csv` and `predictions.csv` to `[temp]/results/[phase]/`.
--------------------------------------------------------------------------------
/chexpert_supervised/chexpert-model/args/__init__.py:
--------------------------------------------------------------------------------
from .test_arg_parser import TestArgParser
from .train_arg_parser import TrainArgParser
--------------------------------------------------------------------------------
/chexpert_supervised/chexpert-model/args/test_arg_parser.py:
--------------------------------------------------------------------------------
"""Define class for processing testing command-line arguments."""
import util

from .base_arg_parser import BaseArgParser


class TestArgParser(BaseArgParser):
    """Argument parser for args used only in test mode."""
    def __init__(self):
        super(TestArgParser, self).__init__()
        self.is_training = False

        self.parser.add_argument('--inference_only',
                                 action='store_true',
                                 help=('If set, then only do inference. 
Useful'+ 16 | ' when the csv has uncertainty label')) 17 | # Data args 18 | self.parser.add_argument('--phase', 19 | dest='data_args.phase', 20 | type=str, default='valid', 21 | choices=('train', 'valid', 'test')) 22 | self.parser.add_argument('--test_groundtruth', 23 | dest='data_args.gt_csv', 24 | type=str, default=None, 25 | help=('csv file if custom dataset')) 26 | self.parser.add_argument('--test_image_paths', 27 | dest='data_args.paths_csv', 28 | type=str, default=None, 29 | help=('csv file if custom dataset')) 30 | self.parser.add_argument('--together', 31 | dest='data_args.together', 32 | type=str, default=True, 33 | help=('whether we have integrated test csv')) 34 | self.parser.add_argument('--test_csv', 35 | dest='data_args.test_csv', 36 | type=str, default=None, 37 | help=('csv file for integrated test set')) 38 | # Logger args 39 | self.parser.add_argument('--save_cams', 40 | dest='logger_args.save_cams', 41 | type=util.str_to_bool, default=False, 42 | help=('If true, will save cams to ' + 43 | 'experiment_folder/cams')) 44 | self.parser.add_argument('--only_evaluation_cams', 45 | dest='logger_args.only_evaluation_cams', 46 | type=util.str_to_bool, default=True, 47 | help=('If true, will only generate cams ' + 48 | 'on evaluation labels. Only ' + 49 | 'relevant if --save_cams is True')) 50 | self.parser.add_argument('--only_competition_cams', 51 | dest='logger_args.only_competition_cams', 52 | type=util.str_to_bool, default=False, 53 | help='Whether to only output cams for' + 54 | 'competition categories.') 55 | 56 | # Model args 57 | self.parser.add_argument('--config_path', 58 | dest='model_args.config_path', 59 | type=str, default=None) 60 | self.parser.add_argument('--calibrate', 61 | dest='model_args.calibrate', 62 | type=util.str_to_bool, default=False, 63 | help='Compute calibrated probabilities.') 64 | 65 | # TODO: Somehow need this line 66 | self.parser.add_argument('--moco', dest='model_args.moco', 67 | type=util.str_to_bool, default=True, 68 | help='Using moco') -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/finetune_normal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=deep --qos=normal 3 | #SBATCH --time=06:00:00 4 | #SBATCH --nodes=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=32G 7 | 8 | # only use the following on partition with GPUs 9 | #SBATCH --gres=gpu:1 10 | 11 | #SBATCH --job-name="finetune" 12 | #SBATCH --output=finetune-%j.out 13 | 14 | echo "Running finetune on uignore" 15 | 16 | python ../train.py --ckpt_path /deep/group/chexperturbed/runs/2019-04-18-22.17.36.095031__minhphu/DenseNet121_320_1e-04_uncertainty_ignored_top10/best.pth.tar \ 17 | --dataset custom \ 18 | --train_custom_csv /deep/group/chexperturbed/data/natural/Nokiadev10K_and_NokiaNORMALS507_noflux.csv \ 19 | --val_custom_csv /deep/group/chexperturbed/data/CheXpert-original/prosp500_all.csv \ 20 | --save_dir /deep/group/minhphu/dump \ 21 | --experiment_name finetune_uignore \ 22 | --batch_size 48 \ 23 | --iters_per_print 48 \ 24 | --iters_per_visual 48000 \ 25 | --iters_per_eval=4800 \ 26 | --iters_per_save=4800 \ 27 | --gpu_ids 0 \ 28 | --num_epochs=3 \ 29 | --metric_name chexpert-competition-AUROC \ 30 | --maximize_metric True \ 31 | --scale 320 \ 32 | --max_ckpts 10 \ 33 | --keep_topk True 34 | 35 | echo "Done!" 
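A note on `util.str_to_bool`, used above as the argparse `type` converter for flags such as `--save_cams`: its definition lives in the `util` package, which is outside this excerpt. A minimal sketch of the usual pattern such a converter follows:

```python
import argparse

def str_to_bool(arg):
    """Convert a command-line string to a bool (argparse `type` converter)."""
    if str(arg).lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if str(arg).lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
```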
36 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/finetune_normal2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=deep --qos=normal 3 | #SBATCH --time=60:00:00 4 | #SBATCH --nodes=4 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=64G 7 | 8 | # only use the following on partition with GPUs 9 | #SBATCH --gres=gpu:4 10 | 11 | #SBATCH --job-name="finetune" 12 | #SBATCH --output=finetune-%j.out 13 | 14 | echo "Running finetune on uignore" 15 | 16 | SAVE_DIR="/deep/group/chexperturbed/runs/2019-04-23-21.26.43.341224__minhphu" 17 | 18 | python ../train.py --ckpt_path /deep/group/chexperturbed/runs/2019-04-18-22.17.36.095031__minhphu/DenseNet121_320_1e-04_uncertainty_ignored_top10/best.pth.tar \ 19 | --dataset custom \ 20 | --train_custom_csv /deep/group/chexperturbed/data/natural/Nokiadev10K_and_NokiaNORMALS507_noflux.csv \ 21 | --val_custom_csv /deep/group/chexperturbed/data/CheXpert-original/prosp500_all.csv \ 22 | --save_dir $SAVE_DIR \ 23 | --experiment_name finetune_uignore \ 24 | --batch_size 48 \ 25 | --iters_per_print 48 \ 26 | --iters_per_visual 48000 \ 27 | --iters_per_eval=4800 \ 28 | --iters_per_save=4800 \ 29 | --gpu_ids 0 \ 30 | --num_epochs=3 \ 31 | --metric_name chexpert-competition-AUROC \ 32 | --maximize_metric True \ 33 | --scale 320 \ 34 | --max_ckpts 10 \ 35 | --keep_topk True 36 | 37 | echo "Done!" 38 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/train_chexpert_models.sh: -------------------------------------------------------------------------------- 1 | # 3-class 2 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_3-class_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --model_uncertainty=True && \ 3 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_3-class_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --model_uncertainty=True && \ 4 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_3-class_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --model_uncertainty=True 5 | 6 | # Ignore 7 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True && \ 8 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 
--iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True && \ 9 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True 10 | 11 | # Self-Train 12 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_self-train_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_self-train.csv && \ 13 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_self-train_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_self-train.csv && \ 14 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_self-train_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_self-train.csv 15 | 16 | # Ones 17 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ones_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_ones.csv && \ 18 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ones_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_ones.csv && \ 19 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_ones_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_ones.csv 20 | 21 | # Zeros 22 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 
--iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_zeros_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_zeros.csv && \ 23 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_zeros_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_zeros.csv && \ 24 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_zeros_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_zeros.csv -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/train_intermountain_models.sh: -------------------------------------------------------------------------------- 1 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10 --num_epochs=3 --metric_name chexpert-competition-avg-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/train_synthetic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=deep --qos=normal 3 | #SBATCH --time=60:00:00 4 | #SBATCH --nodes=4 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=64G 7 | 8 | # only use the following on partition with GPUs 9 | #SBATCH --gres=gpu:4 10 | 11 | #SBATCH --job-name="train_synthetic" 12 | #SBATCH --output=train_synthetic-%j.out 13 | 14 | SAVE_DIR='/deep/group/chexperturbed/runs/2019-04-25-00.37.28.808433__minhphu' 15 | TRAIN_CSV='/deep/group/chexperturbed/data/CheXpert/synthetic_final/random/level_5/train_with_normal.csv' 16 | VALID_CSV='/deep/group/chexperturbed/data/CheXpert-original/prosp500_all.csv' 17 | # Ignore 18 | echo "Running Uignore..." 
19 | IGNORE_NAME='Uone' 20 | python ../train.py --dataset custom --train_custom_csv $TRAIN_CSV --val_custom_csv $VALID_CSV --save_dir $SAVE_DIR --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name ${IGNORE_NAME}_1 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --max_ckpts 10 --keep_topk True && \ 21 | python ../train.py --dataset custom --train_custom_csv $TRAIN_CSV --val_custom_csv $VALID_CSV --save_dir $SAVE_DIR --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name ${IGNORE_NAME}_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --max_ckpts 10 --keep_topk True && \ 22 | python ../train.py --dataset custom --train_custom_csv $TRAIN_CSV --val_custom_csv $VALID_CSV --save_dir $SAVE_DIR --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name ${IGNORE_NAME}_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --max_ckpts 10 --keep_topk True 23 | 24 | 25 | # Uone 26 | # TODO 27 | 28 | # Uzero 29 | # TODO 30 | 31 | # Self-train 32 | # TODO 33 | 34 | # 3class 35 | # TODO 36 | 37 | echo "Done!" 38 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/valid_ignore.sh: -------------------------------------------------------------------------------- 1 | 2 | USER='minhphu' 3 | ROOT=/deep/group/${USER} 4 | TEMP=${ROOT}/dump 5 | 6 | cp /deep/group/CheXpert/final_ckpts/CheXpert-Ignore/best.pth.tar $TEMP 7 | cp /deep/group/CheXpert/final_ckpts/CheXpert-Ignore/args.json $TEMP 8 | cd ${ROOT}/aihc-winter19-robustness/chexpert-model/ 9 | python test.py --inference_only True \ 10 | --dataset custom \ 11 | --together True \ 12 | --test_csv /deep/group/chexperturbed/data/toy_of_CheXpert/train.csv \ 13 | --ckpt_path ${TEMP}/best.pth.tar \ 14 | --phase test \ 15 | --save_dir $TEMP \ 16 | 17 | 18 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/calibrate.py: -------------------------------------------------------------------------------- 1 | """Entry-point script to train models.""" 2 | import torch 3 | 4 | from args import TestArgParser 5 | from logger import Logger 6 | from predict import Predictor, EnsemblePredictor 7 | from saver import ModelSaver 8 | from data import get_loader 9 | from eval import Evaluator 10 | from constants import * 11 | 12 | 13 | def calibrate(args): 14 | """Run model testing.""" 15 | model_args = args.model_args 16 | data_args = args.data_args 17 | logger_args = args.logger_args 18 | 19 | # Get logger. 20 | logger = Logger(logger_args.log_path, 21 | logger_args.save_dir, 22 | logger_args.results_dir) 23 | 24 | # Get image paths corresponding to predictions for logging 25 | paths = None 26 | 27 | if model_args.config_path is not None: 28 | # Instantiate the EnsemblePredictor class for obtaining 29 | # model predictions. 30 | predictor = EnsemblePredictor(config_path=model_args.config_path, 31 | model_args=model_args, 32 | data_args=data_args, 33 | gpu_ids=args.gpu_ids, 34 | device=args.device, 35 | logger=logger) 36 | # Obtain ensemble predictions. 37 | # Caches both individual and ensemble predictions. 
38 | # We always turn off caching to ensure that we write the Path column. 39 | predictions, groundtruth, paths = predictor.predict(cache=False, 40 | return_paths=True, 41 | all_gt_tasks=True) 42 | else: 43 | # Load the model at ckpt_path. 44 | ckpt_path = model_args.ckpt_path 45 | ckpt_save_dir = Path(ckpt_path).parent 46 | model_uncertainty = model_args.model_uncertainty 47 | # Get model args from checkpoint and add them to 48 | # command-line specified model args. 49 | model_args, transform_args\ 50 | = ModelSaver.get_args(cl_model_args=model_args, 51 | dataset=data_args.dataset, 52 | ckpt_save_dir=ckpt_save_dir, 53 | model_uncertainty=model_uncertainty) 54 | model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path, 55 | gpu_ids=args.gpu_ids, 56 | model_args=model_args, 57 | is_training=False) 58 | # Instantiate the Predictor class for obtaining model predictions. 59 | predictor = Predictor(model=model, device=args.device) 60 | # Get phase loader object. 61 | return_info_dict = True 62 | loader = get_loader(phase=data_args.phase, 63 | data_args=data_args, 64 | transform_args=transform_args, 65 | is_training=False, 66 | return_info_dict=return_info_dict, 67 | logger=logger) 68 | # Obtain model predictions 69 | if return_info_dict: 70 | predictions, groundtruth, paths = predictor.predict(loader) 71 | else: 72 | predictions, groundtruth = predictor.predict(loader) 73 | #print(groundtruth) 74 | # custom function 75 | from sklearn.linear_model import LogisticRegression as LR 76 | params = [] 77 | for column in predictions: 78 | #print(predictions[column].values) 79 | #print(groundtruth[column].values) 80 | #drop corresponding rows where gt is -1 and 81 | lr = LR(C=15) 82 | to_drop = groundtruth.index[groundtruth[column] == -1].tolist() 83 | lr.fit(predictions[column].drop(to_drop).values.reshape(-1,1),groundtruth[column].drop(to_drop).values) # LR needs X to be 2-dimensional 84 | print("num_rows_used",predictions[column].drop(to_drop).values.size) 85 | #print(groundtruth[column].drop(to_drop).values.size) 86 | #print(predictions[column].values) 87 | print("coeffs", lr.coef_, lr.intercept_) 88 | p_calibrated=lr.predict_proba(predictions[column].values.reshape(-1,1)) 89 | params.append((lr.coef_, lr.intercept_)) 90 | import json 91 | with open('calibration_params.json', 'w') as f: 92 | import pandas as pd 93 | pd.Series(params).to_json(f, orient='values') 94 | 95 | #return lr 96 | 97 | if __name__ == "__main__": 98 | torch.multiprocessing.set_sharing_strategy('file_system') 99 | parser = TestArgParser() 100 | calibrate(parser.parse_args()) 101 | 102 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/calibration_params.json: -------------------------------------------------------------------------------- 1 | [[[[6.7817460615]],[-4.1531589153]],[[[1.6547258878]],[0.0203352903]],[[[9.6160614807]],[-1.5795825742]],[[[0.5744558881]],[-5.0300181584]],[[[9.7480215396]],[-3.7431941999]],[[[7.7600541565]],[-3.6481033461]],[[[15.1510516812]],[-2.8724927246]],[[[3.1205136268]],[-3.2489406983]],[[[12.4407352569]],[-2.3623841489]],[[[6.0697003679]],[-4.6369053622]],[[[6.4079661737]],[-3.1433334158]],[[[0.3846525627]],[-5.0128584793]],[[[0.1457574294]],[-4.4623561016]],[[[7.3639532052]],[-3.4352856906]]] -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/__init__.py: -------------------------------------------------------------------------------- 1 | from .grad_cam 
import GradCAM 2 | from .base_cam import BaseCAM 3 | from .ensemble_cam import EnsembleCAM 4 | from .guided_backprop import GuidedBackPropagation 5 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/base_cam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import util 4 | 5 | 6 | class BaseCAM(object): 7 | """Base class for generating CAMs. 8 | Adapted from: https://github.com/kazuto1011/grad-cam-pytorch 9 | """ 10 | def __init__(self, model, device): 11 | super(BaseCAM, self).__init__() 12 | pred_type = '_3class' if model.module.model_uncertainty else 'binary' 13 | self.device = device 14 | self.pred_type = pred_type 15 | self.model = model 16 | self.model.eval() 17 | self.inputs = None 18 | 19 | def _encode_one_hot(self, idx): 20 | one_hot = torch.zeros([1, self.preds.size()[-1]], 21 | dtype=torch.float32, device=self.device, requires_grad=True) 22 | 23 | if self.pred_type == '_3class': 24 | ind = 2 + idx * 3 # Get the index of positive class of the pathology. 25 | one_hot[0][ind] = 1.0 26 | else: 27 | one_hot[0][idx] = 1.0 28 | 29 | return one_hot 30 | 31 | def forward(self, x): 32 | self.inputs = x.to(self.device) 33 | self.model.zero_grad() 34 | self.preds = self.model(self.inputs) 35 | 36 | if self.pred_type == 'binary': 37 | self.probs = torch.sigmoid(self.preds)[0] 38 | elif self.pred_type == '_3class': 39 | self.probs = util.uncertain_logits_to_probs(self.preds)[0] 40 | else: 41 | self.probs = F.softmax(self.preds, dim=1)[0] 42 | return self.probs.detach().to('cpu').numpy() 43 | 44 | def backward(self, idx): 45 | one_hot = self._encode_one_hot(idx) 46 | self.preds.backward(gradient=one_hot, retain_graph=True) 47 | 48 | def get_cam(self, x, task_id, task=None): 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/ensemble_cam.py: -------------------------------------------------------------------------------- 1 | from .grad_cam import GradCAM 2 | 3 | import torch 4 | import numpy as np 5 | 6 | class EnsembleCAM(object): 7 | """Class for generating CAMs using an ensemble.""" 8 | def __init__(self, model, device): 9 | 10 | super(EnsembleCAM, self).__init__() 11 | 12 | self.device = device 13 | self.model = model 14 | 15 | def get_cam(self, x, task_id, task): 16 | 17 | ensemble_probs = [] 18 | cams = [] 19 | 20 | loaded_model_iterator = self.model.loaded_model_iterator(task) 21 | for loaded_model in loaded_model_iterator: 22 | grad_cam = GradCAM(loaded_model, self.device) 23 | probs = grad_cam.forward(x) 24 | 25 | grad_cam.backward(idx=task_id) 26 | 27 | cam = grad_cam.extract_cam()[0] 28 | 29 | ensemble_probs.append(probs) 30 | cams.append(cam) 31 | 32 | probs = self.model.aggregation_fn(ensemble_probs, axis=0) 33 | sorted_probs = np.sort(probs, axis=0)[::-1] 34 | idx = np.argsort(probs, axis=0)[::-1] 35 | 36 | cam = self.model.aggregation_fn(cams, axis=0) 37 | 38 | return sorted_probs, idx, cam -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/grad_cam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from collections import OrderedDict 7 | from .base_cam import BaseCAM 8 | 9 | 10 | # Load the dictionary of model 
configs 11 | # that for each model has the name of 12 | # the last layer before the GAP 13 | with open('cams/model_cam_configs.json') as f: 14 | MODEL_CONFIGS = json.load(f) 15 | 16 | 17 | class GradCAM(BaseCAM): 18 | """Class for generating grad CAMs. 19 | Adapted from: https://github.com/kazuto1011/grad-cam-pytorch 20 | """ 21 | def __init__(self, model, device): 22 | 23 | super(GradCAM, self).__init__(model, device) 24 | self.fmaps = OrderedDict() 25 | self.grads = OrderedDict() 26 | self.target_layer = MODEL_CONFIGS[model.module.__class__.__name__]['target_layer'] 27 | 28 | def save_fmap(m, _, output): 29 | self.fmaps[id(m)] = output.to('cpu') 30 | 31 | def save_grad(m, _, grad_out): 32 | self.grads[id(m)] = grad_out[0].to('cpu') 33 | 34 | for name, module in self.model.named_modules(): 35 | # Only put hooks on the target layer 36 | if name == self.target_layer: 37 | self.target_module_id = id(module) 38 | module.register_forward_hook(save_fmap) 39 | module.register_backward_hook(save_grad) 40 | 41 | def _find(self, outputs): 42 | 43 | # Since we've only put hooks on one layer 44 | # the target layer, we can return the value 45 | # right away 46 | return outputs[self.target_module_id] 47 | 48 | @staticmethod 49 | def _normalize(grads): 50 | return grads / (torch.norm(grads).item() + 1e-5) 51 | 52 | def _compute_grad_weights(self, grads): 53 | grads = self._normalize(grads) 54 | weights = F.adaptive_avg_pool2d(grads, 1) 55 | return weights 56 | 57 | def extract_cam(self): 58 | """ 59 | c: number of filters in final conv layer 60 | f: filter size 61 | shape of fmaps and grads : num_images x c x f x f 62 | shape of weights: num_images x c x 1 x 1 63 | shape of gcam: num_images x f x f 64 | """ 65 | 66 | fmaps = self._find(self.fmaps) 67 | grads = self._find(self.grads) 68 | weights = self._compute_grad_weights(grads) 69 | 70 | assert len(fmaps.size()) == 4 and fmaps.size()[0] == 1 71 | 72 | 73 | assert len(weights.size()) == 4 and weights.size()[0] == 1 74 | 75 | # Sum up along the filter dimension 76 | gcam = (fmaps * weights).sum(dim=1) 77 | 78 | gcam = torch.clamp(gcam, min=0, max=float('inf')) 79 | 80 | gcam -= gcam.min() 81 | gcam /= (gcam.max() + 1e-7) 82 | 83 | return gcam.detach().to('cpu').numpy() 84 | 85 | 86 | def get_cam(self, x, task_id, task=None): 87 | 88 | probs = self.forward(x) 89 | sorted_probs = np.sort(probs, axis=0)[::-1] 90 | idx = np.argsort(probs, axis=0)[::-1] 91 | self.backward(idx=task_id) 92 | cam = self.extract_cam()[0] 93 | 94 | return sorted_probs, idx, cam 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/guided_backprop.py: -------------------------------------------------------------------------------- 1 | from cams import BaseCAM 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class GuidedBackPropagation(BaseCAM): 7 | 8 | def __init__(self, model, device, is_binary, is_3d): 9 | super(GuidedBackPropagation, self).__init__(model, device, is_binary, is_3d) 10 | self.input_grad = [] 11 | def func_b(module, grad_in, grad_out): 12 | # Cut off negative gradients 13 | if isinstance(module, nn.ReLU): 14 | return (torch.clamp(grad_in[0], min=0.0),) 15 | 16 | for module in self.model.named_modules(): 17 | module[1].register_backward_hook(func_b) 18 | 19 | 20 | def generate(self): 21 | output = self.input_grad.to('cpu').numpy()[0] 22 | return output 23 | 24 | def forward(self, x): 25 | self.inputs = x.to(self.device) 26 | 27 | def 
save_grad(grad): 28 | self.input_grad = grad.to('cpu') 29 | 30 | self.inputs.register_hook(save_grad) 31 | self.model.zero_grad() 32 | self.preds = self.model(self.inputs) 33 | 34 | if self.is_binary: 35 | self.probs = torch.sigmoid(self.preds)[0] 36 | else: 37 | self.probs = F.softmax(self.preds, dim=1)[0] 38 | self.prob, self.idx = self.probs.sort(0, True) 39 | 40 | return self.prob, self.idx 41 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/model_cam_configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "DenseNet121": { 3 | "target_layer": "module.model.features" 4 | }, 5 | "ResNet152": { 6 | "target_layer": "module.model.layer4.2.conv3" 7 | }, 8 | "Inceptionv4": { 9 | "target_layer": "module.model.features.21.branch3.1.conv" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/confidence_interval.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | import numpy as np 4 | import pandas as pd 5 | import pathlib 6 | import sklearn.metrics 7 | import sys 8 | 9 | import argparse 10 | 11 | from constants import NamedTasks 12 | 13 | class ConfidenceGenerator(): 14 | # Confidence level is 0.95, then we do 1 - confidence level to get 0.05 15 | def __init__(self, confidence_level): 16 | self.records = [] 17 | self.confidence_level = 1 - confidence_level 18 | 19 | @staticmethod 20 | def compute_cis(series, confidence_level): 21 | sorted_perfs = series.sort_values() 22 | lower_index = int(confidence_level/2 * len(sorted_perfs)) - 1 23 | upper_index = int((1 - confidence_level/2) * len(sorted_perfs)) - 1 24 | lower = sorted_perfs.iloc[lower_index].round(3) 25 | upper = sorted_perfs.iloc[upper_index].round(3) 26 | mean = sorted_perfs.mean().round(3) 27 | return lower, mean, upper 28 | 29 | def create_ci_record(self, perfs, name): 30 | lower, mean, upper = ConfidenceGenerator.compute_cis( 31 | perfs, self.confidence_level) 32 | record = {"name": name, 33 | "lower": lower, 34 | "mean": mean, 35 | "upper": upper, 36 | } 37 | self.records.append(record) 38 | 39 | def generate_cis(self, df): 40 | for diseases in df.columns: 41 | self.create_ci_record(df[diseases], diseases) 42 | 43 | df = pd.DataFrame.from_records(self.records) 44 | return df 45 | 46 | 47 | def confidence(bootstraps, output_path, confidence_level=0.95): 48 | cb = ConfidenceGenerator(confidence_level=confidence_level) 49 | df = cb.generate_cis(bootstraps) 50 | 51 | df.to_csv(output_path, index=False) 52 | 53 | def single_replicate_performances(gt, pred, diseases, metric, num_replicates): 54 | sample_ids = np.random.choice(len(gt), size=len(gt), replace=True) 55 | replicate_performances = {} 56 | gt_replicate = gt.iloc[sample_ids] 57 | pred_replicate = pred.iloc[sample_ids] 58 | 59 | for col in diseases: 60 | performance = metric(gt_replicate[col], pred_replicate[col]) 61 | replicate_performances[col] = performance 62 | return replicate_performances 63 | 64 | def multi_replicate_performances(gt, all_preds, diseases, metric, num_replicates): 65 | sample_ids = np.random.choice(len(gt), size=len(gt), replace=True) 66 | replicate_performances = {d: [None for i in range(len(all_preds))] for d in diseases} 67 | gt_replicate = gt.iloc[sample_ids] 68 | 69 | for i, pred in enumerate(all_preds): 70 | pred_replicate = pred.iloc[sample_ids] 71 | 72 | for col in diseases: 
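            # Score model i's predictions for this disease on the resampled replicate.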
73 | performance = metric(gt_replicate[col], pred_replicate[col]) 74 | replicate_performances[col][i] = performance 75 | 76 | averaged_rep_perf = {d: np.mean(replicate_performances[d]) for d in diseases} 77 | return averaged_rep_perf 78 | 79 | 80 | def bootstrap_metric(gt, pred, all_preds, diseases, metric, num_replicates): 81 | 82 | all_performances = [] 83 | all_multi_performances = [] 84 | for _ in range(num_replicates): 85 | single_rep_performances = single_replicate_performances(gt, pred, diseases, metric, num_replicates) 86 | multi_rep_performances = multi_replicate_performances(gt, all_preds, diseases, metric, num_replicates) 87 | 88 | all_performances.append(copy.deepcopy(single_rep_performances)) 89 | all_multi_performances.append(copy.deepcopy(multi_rep_performances)) 90 | 91 | single_performances = pd.DataFrame.from_records(all_performances) 92 | multi_performances = pd.DataFrame.from_records(all_multi_performances) 93 | 94 | return single_performances, multi_performances 95 | 96 | 97 | def compute_bootstrap_confidence_interval(gt, pred, all_preds, 98 | diseases, metric, 99 | num_replicates, confidence_level, 100 | output_path): 101 | single_bootstrap, multi_bootstrap = bootstrap_metric(gt, pred, all_preds, 102 | diseases, metric, 103 | num_replicates) 104 | 105 | confidence(single_bootstrap, 106 | output_path, 107 | confidence_level=0.95) 108 | confidence(multi_bootstrap, 109 | output_path.replace('.csv', '_multi.csv'), 110 | confidence_level=0.95) 111 | 112 | 113 | 114 | if __name__ == '__main__': 115 | 116 | parser = argparse.ArgumentParser(description="Arguments for confidence_interval.py") 117 | parser.add_argument("--tasks", nargs='+', type=str) 118 | parser.add_argument("--custom_tasks", type=str) 119 | parser.add_argument("--metric", type=str, required=True) 120 | parser.add_argument("--num_replicates", type=int, required=True) 121 | parser.add_argument("--confidence_level", type=float, required=True) 122 | parser.add_argument("--groundtruth", type=str, required=True) 123 | parser.add_argument("--prediction", type=str, required=True) 124 | parser.add_argument("--split", type=int, required=True) 125 | parser.add_argument("--num_splits", type=int, required=True) 126 | parser.add_argument("--output", type=str, required=True) 127 | args = parser.parse_args() 128 | 129 | # A redundant renaming of the arguments -- to avoid breaking the rest of the code. 
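    # Example invocation (hypothetical paths; flags as defined by the parser above):
    #   python confidence_interval.py --custom_tasks chexpert-competition \
    #       --metric AUROC --num_replicates 1000 --confidence_level 0.95 \
    #       --groundtruth gt.csv --prediction pred_it0.csv \
    #       --split 0 --num_splits 10 --output cis.csv
    # Predictions for the remaining splits are located by swapping the
    # 'it{split}' token in --prediction (see the loop over num_splits below).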
130 | if args.custom_tasks is not None: 131 | disease_names = ','.join(NamedTasks[args.custom_tasks]) 132 | else: 133 | disease_names = args.tasks 134 | metric_name = args.metric 135 | num_replicates = args.num_replicates 136 | confidence_level = args.confidence_level 137 | gt_path = args.groundtruth 138 | pred_path = args.prediction 139 | cur_iter = args.split 140 | num_iters = args.num_splits 141 | output_path = args.output 142 | 143 | print("Start confidence_interval...") 144 | # TODO JBY: Support more metrics 145 | assert metric_name == 'AUROC', 'Only AUROC is supported at the moment' 146 | 147 | diseases = disease_names.split(', ') 148 | diseases = [d.strip() for d in diseases] 149 | 150 | gt = pd.read_csv(gt_path) 151 | # gt = np.array(gt[disease_name].values.tolist()) 152 | # gt = gt[disease_name] 153 | 154 | pred = pd.read_csv(pred_path) 155 | # pred = np.array(pred[disease_name].values.tolist()) 156 | # pred = pred[disease_name] 157 | 158 | all_preds = [] 159 | for i in range(num_iters): 160 | new_pred_path = pred_path.replace(f'it{cur_iter}', f'it{i}') 161 | all_preds.append(pd.read_csv(new_pred_path)) 162 | 163 | # TODO, support more metrics 164 | 165 | print('Parsed arguments') 166 | 167 | compute_bootstrap_confidence_interval( 168 | gt, pred, all_preds, diseases, 169 | sklearn.metrics.roc_auc_score, 170 | num_replicates, confidence_level, 171 | output_path) 172 | 173 | print('Confidence interval generated') 174 | 175 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/confidence_interval_diff.py: -------------------------------------------------------------------------------- 1 | from confidence_interval import * 2 | 3 | 4 | def diff_replicate_performances(gt, all_preds1, all_preds2, diseases, metric, num_replicates): 5 | sample_ids = np.random.choice(len(gt), size=len(gt), replace=True) 6 | replicate_performances = {d: [None for i in range(len(all_preds1))] for d in diseases} 7 | gt_replicate = gt.iloc[sample_ids] 8 | 9 | #import pdb; pdb.set_trace() 10 | 11 | pred1_performances = {d: [None for i in range(len(all_preds1))] for d in diseases} 12 | for i, pred in enumerate(all_preds1): 13 | pred1_replicate = pred.iloc[sample_ids] 14 | for col in diseases: 15 | performance = metric(gt_replicate[col], pred1_replicate[col]) 16 | pred1_performances[col][i] = performance 17 | #print(f'Pred 1[{i}] => {performance}') 18 | 19 | pred2_performances = {d: [None for i in range(len(all_preds1))] for d in diseases} 20 | for i, pred in enumerate(all_preds2): 21 | pred2_replicate = pred.iloc[sample_ids] 22 | for col in diseases: 23 | performance = metric(gt_replicate[col], pred2_replicate[col]) 24 | pred2_performances[col][i] = performance 25 | #print(f'Pred 2[{i}] => {performance}') 26 | 27 | 28 | diff_rep_perf = {} 29 | for d in diseases: 30 | a1 = np.array(pred1_performances[d]) 31 | a2 = np.array(pred2_performances[d]) 32 | 33 | diff_rep_perf[d] = np.mean(a1 - a2) 34 | 35 | #import pdb; pdb.set_trace() 36 | # diff_rep_perf = {d: pred1_performances[d] - pred2_performances[d] for d in diseases} 37 | return diff_rep_perf 38 | 39 | 40 | def bootstrap_diff_metric(gt, all_preds1, all_preds2, diseases, metric, num_replicates): 41 | 42 | all_multi_performances = [] 43 | for _ in range(num_replicates): 44 | multi_rep_performances = diff_replicate_performances( 45 | gt, all_preds1, all_preds2, diseases, metric, num_replicates) 46 | 47 | all_multi_performances.append(copy.deepcopy(multi_rep_performances)) 48 | 49 | multi_performances 
= pd.DataFrame.from_records(all_multi_performances) 50 | 51 | return multi_performances 52 | 53 | 54 | def compute_bootstrap_diff_confidence_interval(gt, all_preds1, all_preds2, 55 | diseases, metric, 56 | num_replicates, confidence_level, 57 | output_path): 58 | multi_bootstrap = bootstrap_diff_metric( 59 | gt, all_preds1, all_preds2, diseases, metric, num_replicates) 60 | 61 | confidence(multi_bootstrap, 62 | output_path.replace('.csv', '_diff.csv'), 63 | confidence_level=0.95) 64 | 65 | 66 | if __name__ == '__main__': 67 | # TODO: JBY: Big hack, no proper argparser used here! 68 | # Usage: 69 | # python confidence_interval.py 70 | # [DISEASE_NAME] [METRIC_NAME] 71 | # [NUM_REPLICATES] [CONFIDENCE_LEVEL] 72 | # [GT_CSV_PATH] 73 | # [PRED1_CSV_PATH] 74 | # [PRED2_CSV_PATH] 75 | # [CUR_ITER] [NUM_ITERS] 76 | # [OUTPUT_PATH] 77 | 78 | assert len(sys.argv) == 11 79 | 80 | disease_names = sys.argv[1] 81 | metric_name = sys.argv[2] 82 | num_replicates = int(sys.argv[3]) 83 | confidence_level = float(sys.argv[4]) 84 | gt_path = sys.argv[5] 85 | pred1_path = sys.argv[6] 86 | pred2_path = sys.argv[7] 87 | cur_iter = int(sys.argv[8]) 88 | num_iters = int(sys.argv[9]) 89 | output_path = sys.argv[10] 90 | 91 | # TODO JBY: Support more metrics 92 | assert metric_name == 'AUROC', 'Only AUROC is supported at the moment' 93 | 94 | diseases = disease_names.split(', ') 95 | diseases = [d.strip() for d in diseases] 96 | 97 | gt = pd.read_csv(gt_path) 98 | # gt = np.array(gt[disease_name].values.tolist()) 99 | # gt = gt[disease_name] 100 | 101 | pred1 = pd.read_csv(pred1_path) 102 | pred2 = pd.read_csv(pred2_path) 103 | # pred = np.array(pred[disease_name].values.tolist()) 104 | # pred = pred[disease_name] 105 | 106 | all_preds1 = [] 107 | for i in range(num_iters): 108 | new_pred_path = pred1_path.replace(f'it{cur_iter}', f'it{i}') 109 | all_preds1.append(pd.read_csv(new_pred_path)) 110 | 111 | all_preds2 = [] 112 | for i in range(num_iters): 113 | new_pred_path = pred2_path.replace(f'it{cur_iter}', f'it{i}') 114 | all_preds2.append(pd.read_csv(new_pred_path)) 115 | 116 | # TODO, support more metrics 117 | 118 | print('Parsed arguments') 119 | 120 | compute_bootstrap_diff_confidence_interval( 121 | gt, all_preds1, all_preds2, diseases, 122 | sklearn.metrics.roc_auc_score, 123 | num_replicates, confidence_level, 124 | output_path) 125 | 126 | print('Confidence interval generated') 127 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/constants/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/constants/constants.py: -------------------------------------------------------------------------------- 1 | """Define constants to be used throughout the repository.""" 2 | from pathlib import Path 3 | 4 | # Main directories 5 | PROJECT_DIR = Path(__file__).parent.parent 6 | DATA_DIR = Path("/deep/group") 7 | 8 | # Datasets 9 | CHEXPERT = "chexpert" 10 | CUSTOM = "custom" 11 | CHEXPERT_SINGLE = "chexpert_single_special" 12 | CXR14 = "cxr14" 13 | SHENZHEN = "shenzhen_special" 14 | 15 | # Predict config constants 16 | CFG_TASK2MODELS = "task2models" 17 | CFG_AGG_METHOD = "aggregation_method" 18 | CFG_CKPT_PATH = "ckpt_path" 19 | CFG_IS_3CLASS = "is_3class" 20 | 21 | # Dataset constants 22 | IMAGENET_MEAN = [0.485, 0.456, 0.406] 23 | IMAGENET_STD = [0.229, 0.224, 
0.225] 24 | COL_PATH = "Path" 25 | COL_STUDY = "Study" 26 | COL_TASK = "Tasks" 27 | COL_METRIC = "Metrics" 28 | COL_VALUE = "Values" 29 | TASKS = "tasks" 30 | UNCERTAIN = -1 31 | MISSING = -2 32 | 33 | # CheXpert specific constants 34 | CHEXPERT_DATASET_NAME = "CheXpert-v1.0" 35 | CHEXPERT_PARENT_DATA_DIR = DATA_DIR / "CheXpert" 36 | CHEXPERT_SAVE_DIR = CHEXPERT_PARENT_DATA_DIR / "models/" 37 | CHEXPERT_DATA_DIR = CHEXPERT_PARENT_DATA_DIR / CHEXPERT_DATASET_NAME 38 | CHEXPERT_TEST_DIR = CHEXPERT_PARENT_DATA_DIR / "CodaLab" 39 | CHEXPERT_UNCERTAIN_DIR = CHEXPERT_PARENT_DATA_DIR / "Uncertainty" 40 | CHEXPERT_RAD_PATH = CHEXPERT_PARENT_DATA_DIR / "rad_perf_test.csv" 41 | CHEXPERT_MEAN = [.5020, .5020, .5020] 42 | CHEXPERT_STD = [.085585, .085585, .085585] 43 | CHEXPERT_TASKS = ["No Finding", 44 | "Enlarged Cardiomediastinum", 45 | "Cardiomegaly", 46 | "Lung Lesion", 47 | "Airspace Opacity", 48 | "Edema", 49 | "Consolidation", 50 | "Pneumonia", 51 | "Atelectasis", 52 | "Pneumothorax", 53 | "Pleural Effusion", 54 | "Pleural Other", 55 | "Fracture", 56 | "Support Devices" 57 | ] 58 | CHEXPERT_SINGLE_TASKS = ["No Finding", 59 | "Pleural Effusion", 60 | ] 61 | 62 | CHEXPERT_COMPETITION_TASKS = ["Atelectasis", 63 | "Cardiomegaly", 64 | "Consolidation", 65 | "Edema", 66 | "Pleural Effusion" 67 | ] 68 | CHEXPERT_COMPETITION_SINGLE_TASKS = CHEXPERT_COMPETITION_TASKS 69 | # CHEXPERT_COMPETITION_SINGLE_TASKS = ["Pleural Effusion"] 70 | 71 | SHENZHEN_TASKS = ['Tuberculosis'] 72 | 73 | # CXR14 specific constants 74 | CXR14_DATA_DIR = DATA_DIR / CXR14 75 | CXR14_TASKS = ["Cardiomegaly", 76 | "Emphysema", 77 | "Pleural Effusion", 78 | "Hernia", 79 | "Infiltration", 80 | "Mass", 81 | "Nodule", 82 | "Atelectasis", 83 | "Pneumothorax", 84 | "Pleural Thickening", 85 | "Pneumonia", 86 | "Fibrosis", 87 | "Edema", 88 | "Consolidation"] 89 | CALIBRATION_FILE = "calibration_params.json" 90 | 91 | DATASET2TASKS = {CHEXPERT: CHEXPERT_TASKS, 92 | CUSTOM: CHEXPERT_TASKS, 93 | CHEXPERT_SINGLE: CHEXPERT_TASKS, 94 | CXR14: CXR14_TASKS, 95 | SHENZHEN: SHENZHEN_TASKS} 96 | 97 | EVAL_METRIC2TASKS = {'chexpert-log_loss': CHEXPERT_TASKS, 98 | 'cxr14-log_loss': CXR14_TASKS, 99 | 'shenzhen-AUROC': SHENZHEN_TASKS, 100 | 'chexpert-competition-log_loss': CHEXPERT_COMPETITION_TASKS, 101 | 'chexpert-competition-AUROC': CHEXPERT_COMPETITION_TASKS, 102 | 'chexpert-competition-single-AUROC': CHEXPERT_COMPETITION_TASKS} 103 | 104 | NamedTasks = {'chexpert': CHEXPERT_TASKS, 105 | 'chexpert-competition': CHEXPERT_COMPETITION_TASKS, 106 | 'pleural-effusion': CHEXPERT_TASKS 107 | } 108 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import get_loader 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/base_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torchvision.transforms as t 4 | from torch.utils.data import Dataset 5 | from PIL import ImageEnhance 6 | 7 | from constants import * 8 | 9 | 10 | class BaseDataset(Dataset): 11 | def __init__(self, csv_name, is_training, transform_args): 12 | self.transform_args = transform_args 13 | self.csv_name = f"{csv_name}.csv" if not csv_name.endswith(".csv") else csv_name 14 | self.is_training = is_training 15 | 16 | def get_enhance_transform(self, f, enhance_min, 
enhance_max): 17 | def do_enhancement(img): 18 | factor = np.random.uniform(enhance_min, enhance_max) 19 | enhancer = f(img) 20 | return enhancer.enhance(factor) 21 | return do_enhancement 22 | 23 | 24 | def transform(self, img): 25 | """Set the transforms to be applied when loading.""" 26 | 27 | transform_args = self.transform_args 28 | # Shorter side scaled to transform_args.scale 29 | if transform_args.maintain_ratio: 30 | transforms_list = [t.Resize(transform_args.scale)] 31 | else: 32 | transforms_list = [t.Resize((transform_args.scale, transform_args.scale))] 33 | 34 | # Data augmentation 35 | if self.is_training: 36 | if np.random.rand() < transform_args.rotate_prob: 37 | transforms_list += [t.RandomRotation((transform_args.rotate_min, 38 | transform_args.rotate_max))] 39 | 40 | if np.random.rand() < transform_args.contrast_prob: 41 | transforms_list += [self.get_enhance_transform(ImageEnhance.Contrast, 42 | transform_args.contrast_min, 43 | transform_args.contrast_max)] 44 | 45 | if np.random.rand() < transform_args.brightness_prob: 46 | transforms_list += [self.get_enhance_transform(ImageEnhance.Brightness, 47 | transform_args.brightness_min, 48 | transform_args.brightness_max)] 49 | 50 | if np.random.rand() < transform_args.sharpness_prob: 51 | transforms_list += [self.get_enhance_transform(ImageEnhance.Sharpness, 52 | transform_args.sharpness_min, 53 | transform_args.sharpness_max)] 54 | 55 | if np.random.rand() < transform_args.horizontal_flip_prob: 56 | transforms_list += [t.RandomHorizontalFlip()] 57 | 58 | if transform_args.crop != 0: 59 | transforms_list += [t.RandomCrop((transform_args.crop, transform_args.crop))] 60 | 61 | else: 62 | transforms_list += [t.CenterCrop((transform_args.crop, 63 | transform_args.crop)) 64 | if transform_args.crop else None] 65 | 66 | if transform_args.normalization == 'imagenet': 67 | normalize = t.Normalize(mean=IMAGENET_MEAN, 68 | std=IMAGENET_STD) 69 | elif transform_args.normalization == 'chexpert_norm': 70 | normalize = t.Normalize(mean=CHEXPERT_MEAN, 71 | std=CHEXPERT_STD) 72 | transforms_list += [t.ToTensor(), normalize] 73 | 74 | return t.Compose([transform for transform in transforms_list if transform])(img) 75 | 76 | def __len__(self): 77 | return len(self.labels) 78 | 79 | def __getitem__(self, index): 80 | raise NotImplementedError 81 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/chexpert_dataset.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import pandas as pd 4 | from PIL import Image 5 | 6 | import util 7 | from .base_dataset import BaseDataset 8 | from constants import * 9 | 10 | 11 | class CheXpertDataset(BaseDataset): 12 | def __init__(self, csv_name, is_training, study_level, 13 | transform_args, toy, return_info_dict, logger=None, data_args=None): 14 | # Pass in parent of data_dir because test set is in a different 15 | # directory due to dataset release, and uncertain maps are in a 16 | # different directory as well (both are under the parent directory).
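# (Concretely, with the constants defined in constants/constants.py above,
# CHEXPERT_PARENT_DATA_DIR is /deep/group/CheXpert, which holds the
# CheXpert-v1.0/ train/valid data, the CodaLab/ test set and the
# Uncertainty/ maps side by side.)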
17 | super().__init__(csv_name, is_training, transform_args) 18 | self.study_level = study_level 19 | self.toy = toy 20 | self.return_info_dict = return_info_dict 21 | self.logger = logger 22 | self.data_args = data_args 23 | 24 | self.is_train_dataset = self.csv_name == "train.csv" 25 | self.is_test_dataset = self.csv_name == "test.csv" 26 | self.is_val_dataset = self.csv_name == "valid.csv" 27 | self.is_uncertain_dataset = "uncertainty" in self.csv_name 28 | 29 | if self.is_test_dataset: 30 | self.csv_path = CHEXPERT_TEST_DIR / f"{csv_name}_image_paths.csv" 31 | elif self.is_uncertain_dataset: 32 | self.csv_path = CHEXPERT_UNCERTAIN_DIR / self.csv_name 33 | else: 34 | self.csv_path = CHEXPERT_DATA_DIR / self.csv_name 35 | 36 | if self.is_val_dataset: 37 | print("valid", self.csv_path) 38 | 39 | df = self.load_df() 40 | 41 | self.studies = df[COL_STUDY].drop_duplicates() 42 | 43 | if self.toy and self.csv_name == 'train.csv': 44 | self.studies = self.studies.sample(n=10) 45 | df = df[df[COL_STUDY].isin(self.studies)] 46 | df = df.reset_index(drop=True) 47 | 48 | # Set Study folder as index. 49 | if self.study_level: 50 | self.set_study_as_index(df) 51 | 52 | self.labels = self.get_labels(df) 53 | self.img_paths = self.get_paths(df) 54 | 55 | def load_df(self): 56 | df = pd.read_csv(Path(self.csv_path)) 57 | 58 | # Prepend the data dir to get the full path. 59 | df[COL_PATH] = df[COL_PATH].apply(lambda x: CHEXPERT_PARENT_DATA_DIR / x) 60 | if self.is_test_dataset: # adjust for the fact that images are in CodaLab 61 | df[COL_PATH] = df[COL_PATH].apply(lambda p: 62 | Path(str(p).replace(str(CHEXPERT_DATA_DIR), 63 | str(CHEXPERT_TEST_DIR)))) 64 | df[COL_STUDY] = df[COL_PATH].apply(lambda p: Path(p).parent) 65 | if self.is_test_dataset: 66 | gt_df = pd.read_csv(CHEXPERT_TEST_DIR / "test_groundtruth.csv") 67 | gt_df[COL_STUDY] = gt_df[COL_STUDY].apply(lambda s: CHEXPERT_PARENT_DATA_DIR / s) 68 | gt_df[COL_STUDY] = gt_df[COL_STUDY].apply(lambda s: Path(str(s).replace(str(CHEXPERT_DATA_DIR), 69 | str(CHEXPERT_TEST_DIR)))) 70 | df = pd.merge(df, gt_df, on=COL_STUDY, how='outer') 71 | df = df.dropna(subset=['Path']) 72 | 73 | df = df.rename(columns={"Lung Opacity": "Airspace Opacity"}).sort_values(COL_STUDY) 74 | 75 | df[CHEXPERT_TASKS] = df[CHEXPERT_TASKS].fillna(value=0) 76 | 77 | return df 78 | 79 | def set_study_as_index(self, df): 80 | df.index = df[COL_STUDY] 81 | 82 | def get_paths(self, df): 83 | return df[COL_PATH] 84 | 85 | def get_labels(self, df): 86 | # Get the labels 87 | if self.study_level: 88 | study_df = df.drop_duplicates(subset=COL_STUDY) 89 | labels = study_df[CHEXPERT_TASKS] 90 | else: 91 | labels = df[CHEXPERT_TASKS] 92 | 93 | return labels 94 | 95 | def get_study(self, index): 96 | 97 | # Get study folder path 98 | study_path = self.studies.iloc[index] 99 | 100 | # Get and transform the label 101 | label = self.labels.loc[study_path].values 102 | label = torch.FloatTensor(label) 103 | 104 | # Get and transform the images 105 | # corresponding to the study at hand 106 | img_paths = pd.Series(self.img_paths.loc[study_path]).tolist() 107 | imgs = [Image.open(path).convert('RGB') for path in img_paths] 108 | # Downscale full resolution image to 1024 in the same way as 109 | # performed in previous preprocessing, then convert back to PIL.
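# (For reference, a minimal sketch of what such a downscaling helper could
# look like; util.resize_img is this repo's own helper, so the exact
# signature shown here is an assumption:
#     def resize_img(img, target=1024):
#         h, w = img.shape[:2]  # 2-D grayscale array from cv2.imread(path, 0)
#         scale = target / max(h, w)
#         return cv2.resize(img, (int(w * scale), int(h * scale)))
# )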
110 | # imgs = [util.resize_img(cv2.imread(str(path), 0), 1024) for path in img_paths] 111 | # imgs = [Image.fromarray(img).convert('RGB') for img in imgs] 112 | 113 | imgs = [self.transform(img) for img in imgs] 114 | imgs = torch.stack(imgs) 115 | 116 | if self.return_info_dict: 117 | 118 | info_dict = {'paths': study_path} 119 | 120 | return imgs, label, info_dict 121 | 122 | return imgs, label 123 | 124 | def get_image(self, index): 125 | 126 | # Get and transform the label 127 | label = self.labels.iloc[index].values 128 | label = torch.FloatTensor(label) 129 | 130 | # Get and transform the image 131 | img_path = self.img_paths.iloc[index] 132 | img = Image.open(img_path).convert('RGB') 133 | img = self.transform(img) 134 | 135 | if self.return_info_dict: 136 | info_dict = {'paths': str(img_path)} 137 | return img, label, info_dict 138 | 139 | return img, label 140 | 141 | def __getitem__(self, index): 142 | if self.study_level: 143 | return self.get_study(index) 144 | else: 145 | return self.get_image(index) 146 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/loader.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | 3 | from .chexpert_dataset import CheXpertDataset 4 | from .custom_dataset import CustomDataset 5 | from .pad_collate import PadCollate 6 | from constants import * 7 | 8 | 9 | def get_loader(phase, data_args, transform_args, 10 | is_training, return_info_dict, 11 | logger=None): 12 | """Get PyTorch data loader. 13 | 14 | Args: 15 | phase: string name of training phase {train, valid, test}. 16 | data_args: Namespace of data arguments. 17 | transform_args: Namespace of transform arguments. 18 | is_training: Bool indicating whether in training mode. 19 | return_info_dict: Bool indicating whether to return extra info 20 | in batches. 21 | logger: Optional Logger object for printing data to stdout and file. 22 | 23 | Return: 24 | loader: PyTorch DataLoader object 25 | """ 26 | 27 | study_level = not is_training 28 | shuffle = is_training 29 | 30 | # TODO: Make this more general 31 | if data_args.dataset == "chexpert": 32 | Dataset = CheXpertDataset 33 | elif 'special' in data_args.dataset: 34 | Dataset = CustomDataset 35 | elif data_args.dataset == "custom": 36 | Dataset = CustomDataset 37 | else: 38 | raise ValueError(f"Dataset {data_args.dataset} not supported.") 39 | 40 | # Get name of csv to load data from. 41 | # uncertain_map_path will replace this name. 42 | # need to make this more general!!! 43 | #csv_name = data_args.uncertain_map_path\ 44 | # if data_args.uncertain_map_path is not None else phase 45 | 46 | if data_args.uncertain_map_path is not None and phase == 'train': 47 | csv_name = data_args.uncertain_map_path 48 | else: 49 | csv_name = phase 50 | # Instantiate the Dataset class. 
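# (Example: phase='train' together with a hypothetical
# data_args.uncertain_map_path='uncertainty_ones' resolves csv_name to
# 'uncertainty_ones'; BaseDataset then appends '.csv' and CheXpertDataset
# loads it from CHEXPERT_UNCERTAIN_DIR.)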
51 | dataset = Dataset(csv_name, is_training, study_level, transform_args, 52 | data_args.toy, return_info_dict, logger, data_args) 53 | if study_level: 54 | # Pick collate function 55 | collate_fn = PadCollate(dim=0) 56 | loader = data.DataLoader(dataset, 57 | batch_size=data_args.batch_size, 58 | shuffle=shuffle, 59 | num_workers=data_args.num_workers, 60 | collate_fn=collate_fn) 61 | else: 62 | loader = data.DataLoader(dataset, 63 | batch_size=data_args.batch_size, 64 | shuffle=shuffle, 65 | num_workers=data_args.num_workers) 66 | 67 | return loader 68 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/pad_collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pad_tensor(vec, pad, dim): 5 | """ 6 | args: 7 | vec - tensor to pad 8 | pad - the size to pad to 9 | dim - dimension to pad 10 | 11 | return: 12 | a new tensor padded to 'pad' in dimension 'dim' 13 | """ 14 | pad_size = list(vec.shape) 15 | pad_size[dim] = pad - vec.size(dim) 16 | return torch.cat([vec, torch.zeros(*pad_size)], dim=dim) 17 | 18 | 19 | class PadCollate: 20 | """ 21 | a variant of collate_fn that pads according to the longest sequence in 22 | a batch of sequences 23 | """ 24 | 25 | def __init__(self, dim=0): 26 | """ 27 | args: 28 | dim - the dimension to be padded (dimension of time in sequences) 29 | """ 30 | self.dim = dim 31 | 32 | def pad_collate(self, batch): 33 | """ 34 | args: 35 | batch - list of (tensor, label) 36 | 37 | return: 38 | a tuple containing each component of the examples in 'batch', 39 | merged (and padded) across the batch, followed by 40 | mask - a mask with 0s in positions that should be ignored 41 | """ 42 | # find longest sequence 43 | study_lens = list(map(lambda x: x[0].shape[self.dim], batch)) 44 | max_len = max(study_lens) 45 | 46 | # Pad each example's first component up to max_len 47 | num_components = max(len(x) for x in batch) 48 | batch = [(pad_tensor(x[0], pad=max_len, dim=self.dim),) + tuple(x[1:]) for x in batch] 49 | 50 | # Stack padded items and merge each component of the batch 51 | batch = tuple(self._merge(batch, component_idx=i) for i in range(num_components)) 52 | masks = [[1] * sl + [0] * (max_len - sl) for sl in study_lens] 53 | masks = torch.tensor(masks, dtype=torch.float32) 54 | 55 | return batch + (masks,) 56 | 57 | def __call__(self, batch): 58 | return self.pad_collate(batch) 59 | 60 | @staticmethod 61 | def _merge(batch, component_idx): 62 | """Merge components of a batch into a single tensor or list. 63 | 64 | Args: 65 | batch: Batch to merge. 66 | component_idx: Index of component in each example that will be merged.
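(For instance, with a batch of (imgs, label) tuples, component_idx 0 merges the padded image tensors via torch.stack and component_idx 1 merges the labels.)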
67 | 68 | Returns: 69 | Merged components 70 | """ 71 | # Group all components into list 72 | components = [x[component_idx] for x in batch] 73 | assert len(components) > 0, 'Error in pad_collate: Cannot merge a batch of size 0' 74 | first_component = components[0] 75 | 76 | # Merge based on data type of components 77 | if isinstance(first_component, dict): 78 | merged_components = {k: [d[k] for d in components] for k in first_component} 79 | elif isinstance(first_component, torch.Tensor): 80 | merged_components = torch.stack(components, dim=0) 81 | else: 82 | raise ValueError('Unexpected type in PadCollate._merge: {}'.format(type(components[0]))) 83 | 84 | return merged_components 85 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/task_sequences.json: -------------------------------------------------------------------------------- 1 | { 2 | "competition": { 3 | "Atelectasis": 0, 4 | "Cardiomegaly": 1, 5 | "Consolidation": 2, 6 | "Edema": 3, 7 | "Pleural Effusion": 4 8 | }, 9 | "stanford": { 10 | "No Finding": 0, 11 | "Enlarged Cardiomediastinum": 1, 12 | "Cardiomegaly": 2, 13 | "Lung Lesion": 3, 14 | "Airspace Opacity": 4, 15 | "Edema": 5, 16 | "Consolidation": 6, 17 | "Pneumonia": 7, 18 | "Atelectasis": 8, 19 | "Pneumothorax": 9, 20 | "Pleural Effusion": 10, 21 | "Pleural Other": 11, 22 | "Fracture": 12, 23 | "Support Devices": 13 24 | }, 25 | "stanford_exclude_NF": { 26 | "Enlarged Cardiomediastinum": 0, 27 | "Cardiomegaly": 1, 28 | "Lung Lesion": 2, 29 | "Airspace Opacity": 3, 30 | "Edema": 4, 31 | "Consolidation": 5, 32 | "Pneumonia": 6, 33 | "Atelectasis": 7, 34 | "Pneumothorax": 8, 35 | "Pleural Effusion": 9, 36 | "Pleural Other": 10, 37 | "Fracture": 11, 38 | "Support Devices": 12 39 | }, 40 | "nih": { 41 | "Cardiomegaly": 0, 42 | "Emphysema": 1, 43 | "Pleural Effusion": 2, 44 | "Hernia": 3, 45 | "Infiltration": 4, 46 | "Mass": 5, 47 | "Nodule": 6, 48 | "Atelectasis": 7, 49 | "Pneumothorax": 8, 50 | "Pleural Thickening": 9, 51 | "Pneumonia": 10, 52 | "Fibrosis": 11, 53 | "Edema": 12, 54 | "Consolidation": 13 55 | }, 56 | 57 | "nih_su_union": { 58 | "Pleural Effusion": 0, 59 | "Pleural Other": 1, 60 | "Infiltration": 2, 61 | "Consolidation": 3, 62 | "Mass": 4, 63 | "Support Devices": 5, 64 | "Airspace Opacity": 6, 65 | "Lung Lesion": 7, 66 | "No Finding": 8, 67 | "Atelectasis": 9, 68 | "Nodule": 10, 69 | "Pneumothorax": 11, 70 | "Enlarged Cardiomediastinum": 12, 71 | "Fracture": 13, 72 | "Edema": 14, 73 | "Emphysema": 15, 74 | "Pleural Thickening": 16, 75 | "Hernia": 17, 76 | "Pneumonia": 18, 77 | "Fibrosis": 19, 78 | "Cardiomegaly": 20 79 | }, 80 | "su_using_nih_labeller": { 81 | "Cardiomegaly": 0, 82 | "Edema": 1, 83 | "Consolidation": 2, 84 | "Pneumonia": 3, 85 | "Atelectasis": 4, 86 | "Pneumothorax": 5, 87 | "Pleural Effusion": 6 88 | }, 89 | "single_atelectasis": { 90 | "Atelectasis": 0 91 | }, 92 | "single_cardiomegaly": { 93 | "Cardiomegaly": 0 94 | }, 95 | "single_consolidation": { 96 | "Consolidation": 0 97 | }, 98 | "single_edema": { 99 | "Edema": 0 100 | }, 101 | "single_pleural_effusion": { 102 | "Pleural Effusion": 0 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .su_dataset import SUDataset 2 | from .nih_dataset import NIHDataset 3 | from .concat_dataset import ConcatDataset 4 | from .constants 
import * 5 | from .label_mapper import LabelMapper 6 | from .label_mapper import TASK_SEQUENCES 7 | from .get_loader import get_loader, get_eval_loaders 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/base_dataset.py: -------------------------------------------------------------------------------- 1 | import torchvision.transforms as transforms 2 | 3 | from dataset.constants import CXR_MEAN, CXR_STD, IMAGENET_MEAN, IMAGENET_STD 4 | from pathlib import Path 5 | from torch.utils.data import Dataset 6 | from .transforms import CLAHE 7 | from .label_mapper import TASK_SEQUENCES, LabelMapper 8 | 9 | 10 | class BaseDataset(Dataset): 11 | 12 | def __init__(self, data_dir, transform_args, split, is_training, dataset_name, tasks_to, dataset_task_sequence=None): 13 | """ Base class for CXR Dataset. 14 | Args: 15 | data_dir (string): Name of the root data directory. 16 | transform_args (Namespace): Args for data transforms 17 | split (string): Name of the dataset split to load (train, valid) 18 | dataset_name (string): Name of the dataset. Used to fetch the task sequence used for this dataset 19 | (i.e. the task sequence used when loading the csv). 20 | tasks_to (dict): The sequence of tasks 21 | we want to map all our labels to. 22 | """ 23 | 24 | assert isinstance(data_dir, str) 25 | assert isinstance(split, str) 26 | assert isinstance(dataset_name, str) 27 | assert isinstance(tasks_to, dict) 28 | 29 | self.dataset_name = dataset_name 30 | self.data_dir = Path(data_dir) 31 | self.split = split 32 | self.is_training = is_training 33 | 34 | # Create a label mapper 35 | # Get the two label sequences as two dicts: 36 | # e.g. {pathology1: 0, pathology2: 1, ...} 37 | if dataset_task_sequence is not None: 38 | self.original_tasks = TASK_SEQUENCES[dataset_task_sequence] 39 | else: 40 | self.original_tasks = TASK_SEQUENCES[dataset_name] 41 | self.target_tasks = tasks_to 42 | 43 | self.label_mapper = None 44 | 45 | if self.original_tasks != self.target_tasks: 46 | self.label_mapper = LabelMapper( 47 | self.original_tasks, 48 | self.target_tasks) 49 | 50 | self._set_transforms(transform_args) 51 | 52 | def _set_class_weights(self, labels): 53 | """Set class weights for weighted loss. 54 | 55 | Each task gets its own set of class weights. 56 | 57 | Weights are calculated by taking 1 - the relative 58 | frequency of the class (positive vs negative). 59 | 60 | Args: 61 | labels: Dataframe or numpy array containing 62 | a list of the labels. Shape should be 63 | (num_examples, num_labels) 64 | 65 | 66 | Example: 67 | 100 examples with two tasks, cardiomegaly and consolidation. 68 | 10 positive cases of cardiomegaly. 69 | 20 positive cases of consolidation. 70 | 71 | We will then have: 72 | Class weights for cardiomegaly: 73 | [1-0.9, 1-0.1] = [0.1, 0.9] 74 | Class weights for consolidation: 75 | [1-0.8, 1-0.2] = [0.2, 0.8] 76 | 77 | The first element in each list is the weight for the 78 | negative examples.
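(Note: as implemented below, class_weights stores the raw frequencies [n_count/total, p_count/total]; the 1 - frequency values in this example are their element-wise complements.)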
79 | """ 80 | 81 | # Set weights for positive vs negative examples 82 | self.p_count = (labels == 1).sum(axis=0) 83 | self.n_count = (labels == 0).sum(axis=0) 84 | 85 | if self.label_mapper is not None: 86 | self.p_count = self.label_mapper.map(self.p_count) 87 | self.n_count = self.label_mapper.map(self.n_count) 88 | 89 | self.total = self.p_count + self.n_count 90 | 91 | self.class_weights = [self.n_count / self.total, 92 | self.p_count / self.total] 93 | 94 | def _set_transforms(self, t_args): 95 | """Set the transforms 96 | 97 | Example: 98 | Image of size 1024x840. 99 | Scale to 312x256. 100 | Normalization and augmentation 101 | Random crop (or center crop) to 224x224. 102 | 103 | Note: Crop will be k * 224 and 104 | scale will be k*256. 105 | """ 106 | 107 | # Shorter side scaled to t_args.scale 108 | if t_args.maintain_ratio: 109 | transforms_list = [transforms.Resize(t_args.scale)] 110 | else: 111 | transforms_list = [transforms.Resize((t_args.scale, t_args.scale))] 112 | 113 | # Data augmentation 114 | if self.is_training: 115 | transforms_list += [transforms.RandomHorizontalFlip() if t_args.horizontal_flip else None, 116 | transforms.RandomRotation(t_args.rotate) if t_args.rotate else None, 117 | transforms.RandomCrop((t_args.crop, t_args.crop)) if t_args.crop != 0 else None] 118 | else: 119 | transforms_list += [transforms.CenterCrop((t_args.crop, t_args.crop)) if t_args.crop else None] 120 | # Normalization 121 | if t_args.clahe: 122 | transforms_list += [CLAHE(clip_limit=2.0, tile_grid_size=(8, 8))] 123 | 124 | if t_args.normalization == 'imagenet': 125 | normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) 126 | elif t_args.normalization == 'cxr_norm': 127 | normalize = transforms.Normalize(mean=CXR_MEAN, std=CXR_STD) 128 | transforms_list += [transforms.ToTensor(), normalize] 129 | 130 | self.transform = transforms.Compose([t for t in transforms_list if t]) 131 | 132 | 133 | def __len__(self): 134 | return len(self.labels) 135 | 136 | def __getitem__(self, index): 137 | raise NotImplementedError 138 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/ckpts/debugging/args.json: -------------------------------------------------------------------------------- 1 | { 2 | "NIH_data_dir": "/deep/group/rad_data", 3 | "SU_data_dir": "/deep/group/xray4all", 4 | "adam_beta_1": 0.9, 5 | "adam_beta_2": 0.999, 6 | "batch_size": 6, 7 | "best_ckpt_metric": "val_loss", 8 | "ckpt_path": "", 9 | "classes": "all", 10 | "epochs_per_eval": 1, 11 | "epochs_per_save": 3, 12 | "eval_on_nih": false, 13 | "eval_on_su": true, 14 | "gpu_ids": "0,1,2", 15 | "horizontal_flip": false, 16 | "init_method": "kaiming", 17 | "iters_per_print": 6, 18 | "iters_per_visual": 120, 19 | "label_seq": "competition", 20 | "loss_fn": "", 21 | "lr": 0.001, 22 | "lr_decay_gamma": 0.1, 23 | "lr_decay_step": 100, 24 | "lr_milestones": "50,125,250", 25 | "lr_patience": 10, 26 | "lr_scheduler": "step", 27 | "max_ckpts": 2, 28 | "max_eval": -1, 29 | "metric_name": "val_loss", 30 | "model": "DenseNet121", 31 | "model_depth": 50, 32 | "name": "debugging", 33 | "nih_train_frac": 0, 34 | "num_channels": 3, 35 | "num_classes": 14, 36 | "num_epochs": 15, 37 | "num_visuals": 4, 38 | "num_workers": 8, 39 | "optimizer": "sgd", 40 | "pretrained": true, 41 | "rotate": 0, 42 | "save_dir": "ckpts/", 43 | "scale": 256, 44 | "sgd_dampening": 0.9, 45 | "sgd_momentum": 0.9, 46 | "su_train_frac": 1, 47 | "toy": false, 48 | "weight_decay": 0.0001 49 | } 
50 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import warnings 3 | from torch.utils.data import Dataset 4 | 5 | class ConcatDataset(Dataset): 6 | """https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#ConcatDataset 7 | 8 | Dataset to concatenate multiple datasets. 9 | Purpose: useful to assemble different existing datasets, possibly 10 | large-scale datasets as the concatenation operation is done in an 11 | on-the-fly manner. 12 | 13 | Arguments: 14 | datasets (sequence): List of datasets to be concatenated 15 | """ 16 | 17 | @staticmethod 18 | def cumsum(sequence): 19 | r, s = [], 0 20 | for e in sequence: 21 | l = len(e) 22 | r.append(l + s) 23 | s += l 24 | return r 25 | 26 | @staticmethod 27 | def get_class_weights(datasets): 28 | 29 | p_count = 0 30 | n_count = 0 31 | 32 | for dataset in datasets: 33 | p_count = p_count + dataset.p_count 34 | n_count = n_count + dataset.n_count 35 | 36 | total_count = p_count + n_count 37 | class_weights = [n_count / total_count, 38 | p_count / total_count] 39 | 40 | return class_weights 41 | 42 | 43 | def __init__(self, datasets): 44 | super(ConcatDataset, self).__init__() 45 | assert len(datasets) > 0, 'datasets should not be an empty iterable' 46 | self.datasets = list(datasets) 47 | self.cumulative_sizes = self.cumsum(self.datasets) 48 | self.class_weights = self.get_class_weights(self.datasets) 49 | 50 | def __len__(self): 51 | return self.cumulative_sizes[-1] 52 | 53 | def __getitem__(self, idx): 54 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 55 | if dataset_idx == 0: 56 | sample_idx = idx 57 | else: 58 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 59 | return self.datasets[dataset_idx][sample_idx] 60 | 61 | @property 62 | def cummulative_sizes(self): 63 | warnings.warn("cummulative_sizes attribute is renamed to " 64 | "cumulative_sizes", DeprecationWarning, stacklevel=2) 65 | return self.cumulative_sizes 66 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/constants.py: -------------------------------------------------------------------------------- 1 | CXR_MEAN = [.5020, .5020, .5020] 2 | CXR_STD = [.085585, .085585, .085585] 3 | 4 | IMAGENET_MEAN = [0.485, 0.456, 0.406] 5 | IMAGENET_STD = [0.229, 0.224, 0.225] 6 | 7 | COL_PATH = 'Path' 8 | COL_STUDY = 'Study' 9 | COL_SPLIT = 'DataSplit' 10 | COL_PATIENT = 'Patient' 11 | 12 | CFG_TASK2MODELS = 'task2models' 13 | CFG_AGG_METHOD = 'aggregation_method' 14 | CFG_CKPT_PATH = 'ckpt_path' 15 | CFG_IS_3CLASS = 'is_3class' 16 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/get_loader.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | 3 | from .concat_dataset import ConcatDataset 4 | from .su_dataset import SUDataset 5 | from .nih_dataset import NIHDataset 6 | from .label_mapper import TASK_SEQUENCES 7 | from .pad_collate import PadCollate 8 | 9 | def get_loader(data_args, 10 | transform_args, 11 | split, 12 | task_sequence, 13 | su_frac, 14 | nih_frac, 15 | batch_size, 16 | is_training=False, 17 | shuffle=False, 18 | study_level=False, 19 | frontal_lateral=False, 20 | return_info_dict=False): 21 | 22 | """Returns a dataset loader 23 | If
both su_frac and nih_frac are nonzero, the loader 24 | will sample both NIH and Stanford data. 25 | 26 | Args: 27 | su_frac: Float specifying what fraction of the Stanford (SU) data to load. 28 | nih_frac: Float specifying what fraction of the NIH data to load. 29 | split: String determining if this is the train, valid, test, or sample split. 30 | shuffle: If true, the loader will shuffle the data. 31 | study_level: If true, creates a loader that loads the image on the study level. 32 | Only applicable for the SU dataset. 33 | frontal_lateral: If true, loads frontal/lateral labels. 34 | Only applicable for the SU dataset. 35 | return_info_dict: If true, return a dict of info with each image. 36 | 37 | Return: 38 | loader: A DataLoader for the requested split. 39 | """ 40 | 41 | if is_training: 42 | study_level = data_args.train_on_studies 43 | 44 | datasets = [] 45 | if su_frac != 0: 46 | datasets.append( 47 | SUDataset( 48 | data_args.su_data_dir, 49 | transform_args, split=split, 50 | is_training=is_training, 51 | tasks_to=task_sequence, 52 | frac=su_frac, 53 | study_level=study_level, 54 | frontal_lateral=frontal_lateral, 55 | toy=data_args.toy, 56 | return_info_dict=return_info_dict 57 | ) 58 | ) 59 | 60 | if nih_frac != 0: 61 | datasets.append( 62 | NIHDataset( 63 | data_args.nih_data_dir, 64 | transform_args, split=split, 65 | is_training=is_training, 66 | tasks_to=task_sequence, 67 | frac=nih_frac, 68 | toy=data_args.toy 69 | ) 70 | ) 71 | 72 | if len(datasets) == 2: 73 | assert study_level is False, "Currently, you can't create concatenated datasets when training on studies" 74 | dataset = ConcatDataset(datasets) 75 | else: 76 | dataset = datasets[0] 77 | 78 | # Pick collate function 79 | if study_level: 80 | collate_fn = PadCollate(dim=0) 81 | loader = data.DataLoader(dataset, 82 | batch_size=batch_size, 83 | shuffle=shuffle, 84 | num_workers=8, 85 | collate_fn=collate_fn) 86 | else: 87 | loader = data.DataLoader(dataset, 88 | batch_size=batch_size, 89 | shuffle=shuffle, 90 | num_workers=8) 91 | 92 | return loader 93 | 94 | 95 | def get_eval_loaders(data_args, transform_args, task_sequence, batch_size, frontal_lateral, return_info_dict=False): 96 | """Returns the evaluation loaders for the SU and/or NIH 97 | validation data, depending on data_args.eval_su 98 | and data_args.eval_nih. 99 | 100 | Args: 101 | data_args: Namespace of data arguments (incl. the eval_su / eval_nih flags). 102 | transform_args: Namespace of transform arguments. 103 | task_sequence, batch_size, frontal_lateral: passed through to get_loader. 104 | return_info_dict: If true, return a dict of info with each image.
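(E.g. with data_args.eval_su=True and data_args.eval_nih=False, a single SU 'valid' loader is returned.)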
105 | 106 | Return: 107 | eval_loaders: A list of loaders 108 | 109 | """ 110 | 111 | eval_loaders = [] 112 | 113 | if data_args.eval_su: 114 | eval_loaders += [get_loader(data_args, 115 | transform_args, 116 | 'valid', 117 | task_sequence, 118 | su_frac=1, 119 | nih_frac=0, 120 | batch_size=batch_size, 121 | is_training=False, 122 | shuffle=False, 123 | study_level=not frontal_lateral, 124 | frontal_lateral=frontal_lateral, 125 | return_info_dict=return_info_dict)] 126 | 127 | if data_args.eval_nih: 128 | eval_loaders += [get_loader(data_args, 129 | transform_args, 130 | 'train', 131 | task_sequence, 132 | su_frac=0, 133 | nih_frac=1, 134 | batch_size=batch_size, 135 | is_training=False, 136 | shuffle=False, 137 | study_level=True, 138 | return_info_dict=return_info_dict), 139 | get_loader(data_args, 140 | transform_args, 141 | 'valid', 142 | task_sequence, 143 | su_frac=0, 144 | nih_frac=1, 145 | batch_size=batch_size, 146 | is_training=False, 147 | shuffle=False, 148 | study_level=True, 149 | return_info_dict=return_info_dict)] 150 | 151 | return eval_loaders 152 | 153 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/label_mapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import sys 4 | import os 5 | from pathlib import Path 6 | from collections import OrderedDict 7 | 8 | # Load the dictionary of label sequences 9 | with open(Path(__file__).parent / 'task_sequences.json') as f: 10 | TASK_SEQUENCES = {k: OrderedDict(sorted(v.items(), key=lambda x: x[1])) for k, v in json.load(f).items()} 11 | 12 | class LabelMapper: 13 | # special cases of label values 14 | UNCERTAIN = -1 15 | MISSING = -2 16 | 17 | def __init__(self, from_seq, to_seq): 18 | """Class that converts one task sequence 19 | to another task sequence (e.g. nih to stanford). 20 | 21 | The key equation is x_new = Ax + b, where 22 | A is the mapping_matrix, which puts the 1s in x into the 23 | right place in x_new, and b, below known as missing_bias, 24 | makes sure that the tasks in to_seq that don't exist 25 | in from_seq are all marked as missing (MISSING = -2). 26 | 27 | Args: 28 | from_seq: An ordered dict of the tasks (task: index) 29 | you want to map from. 30 | 31 | to_seq: An ordered dict of the tasks (task: index) 32 | you want to map to. 33 | """ 34 | # There can't be any duplicates within a task sequence. 35 | assert len(set(from_seq)) == len(from_seq) 36 | assert len(set(to_seq)) == len(to_seq) 37 | 38 | # The values 0 ..
num_pathologies need to be unique 39 | assert len(set(to_seq.values())) == len(to_seq.values()) 40 | assert len(set(from_seq.values())) == len(from_seq.values()) 41 | 42 | # store the from and to task sequences 43 | self.from_seq = from_seq 44 | self.to_seq = to_seq 45 | 46 | # create the mapping matrix 47 | self.mapping_matrix = self._get_map_matrix(from_seq, to_seq) 48 | 49 | # Each row in the mapping matrix that is all zero 50 | # corresponds to a task that does not exist in the from_seq. 51 | # We want those values to have value -2, 52 | # so they can easily be masked at a later stage. 53 | 54 | missing_tasks_indices = np.where(np.sum(self.mapping_matrix, axis=1) == 0) 55 | 56 | self.missing_bias = np.zeros(len(to_seq)) 57 | self.missing_bias[missing_tasks_indices] = LabelMapper.MISSING 58 | 59 | def map(self, label): 60 | """Maps label from self.from_seq to self.to_seq. 61 | 62 | The missing_bias makes sure that tasks that are missing 63 | in the from_seq are put as -2 in new_label. 64 | 65 | Args: 66 | label: A numpy array (a vector) with binary values, 67 | each corresponding to a binary task. Usually this task is 68 | to determine whether a specific pathology is present. 69 | 70 | Return: 71 | new_label: A numpy array with the labels whose indices correspond 72 | to the label sequence stored in self.to_seq. 73 | """ 74 | 75 | new_label = np.dot(self.mapping_matrix, label) + self.missing_bias 76 | 77 | return new_label 78 | 79 | def _get_map_matrix(self, from_seq, to_seq): 80 | """ Creates a mapping matrix between two 81 | labeling sequences. 82 | 83 | The matrix shape is (num_to_tasks, num_from_tasks). 84 | That means that if a row ends up fully empty, that class 85 | does not exist in the from_seq. If a column ends up fully 86 | empty it means that the class does not exist in the target. 87 | """ 88 | num_from_tasks = len(from_seq) 89 | num_to_tasks = len(to_seq) 90 | map_matrix = np.zeros((num_to_tasks, num_from_tasks)) 91 | 92 | for target_pathology in to_seq: 93 | to_id = to_seq[target_pathology] 94 | if target_pathology in from_seq: 95 | from_id = from_seq[target_pathology] 96 | map_matrix[to_id, from_id] = 1 97 | 98 | return map_matrix 99 | 100 | def label_overlap(self): 101 | """Utility method to check overlap 102 | between the two label_sequences""" 103 | 104 | overlap = set(self.from_seq).intersection(set(self.to_seq)) 105 | 106 | return list(overlap) 107 | 108 | @staticmethod 109 | def display(sequence, array): 110 | """Prints in easy to read format the binary array 111 | and label sequence. 112 | 113 | Put this in this class mainly for namespacing purposes. 114 | """ 115 | 116 | tasks = list(sequence) 117 | array = array.tolist() 118 | assert len(tasks) == len(array) 119 | 120 | path_label_dict = dict(zip(tasks, array)) 121 | 122 | print(json.dumps(path_label_dict, indent=4)) 123 | 124 | return dict(zip(tasks, array)) 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/nih_dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from PIL import Image 6 | import torch 7 | 8 | from .base_dataset import BaseDataset 9 | 10 | class NIHDataset(BaseDataset): 11 | 12 | def __init__(self, data_dir, 13 | transform_args, split, is_training, tasks_to, frac, toy=False): 14 | """ NIH Dataset 15 | Args: 16 | data_dir (string): Name of the root data directory.
17 | transform_args (Namespace): Namespace object containing all the transform arguments. 18 | split (string): Name of the dataset split to load (train, valid or test). 19 | tasks_to (dict): The sequence of tasks. 20 | """ 21 | 22 | super().__init__(data_dir, transform_args, 23 | split, is_training, 'nih', tasks_to) 24 | 25 | self.study_level = False 26 | 27 | # Load data from csv 28 | df = self._load_df(self.data_dir, split) 29 | if toy and split == 'train': 30 | df = df.sample(n=20) 31 | df = df.reset_index(drop=True) 32 | 33 | if frac != 1 and is_training: 34 | df = df.sample(frac=frac) 35 | df = df.reset_index(drop=True) 36 | 37 | # Get labels and studies 38 | self.labels = self._get_labels(df) 39 | 40 | # Get image paths 41 | self.img_paths = self._get_paths(df) 42 | 43 | # Set transforms and class weights 44 | self._set_class_weights(self.labels) 45 | 46 | @staticmethod 47 | def _load_df(data_dir, split): 48 | 49 | if split == 'test': 50 | csv_path = data_dir / 'test420.csv' 51 | else: 52 | csv_path = data_dir / (split + '_medium.csv') 53 | 54 | df = pd.read_csv(csv_path) 55 | img_dir = data_dir / 'images' 56 | df['Path'] = df['Path'].apply(lambda x: img_dir / x) 57 | df = df.reset_index(drop=True) 58 | 59 | return df 60 | 61 | @staticmethod 62 | def _get_paths(df): 63 | """Get list of paths to images""" 64 | 65 | # Skip the first row, to stay aligned with _get_labels below 66 | return df['Path'].tolist()[1:] 67 | 68 | def _get_studies(self, df): 69 | """The NIH dataset does not have study level data""" 70 | return None 71 | def _get_labels(self, df): 72 | """Return all the labels. 73 | 74 | In the NIH dataset all labels are in one column. The 75 | different pathologies are separated with pipes, 76 | e.g.: 0|0|1|1|0|1|1|0|0|0|0|1|0|0 77 | """ 78 | 79 | labels = np.array([np.fromstring(row['Label'], sep='|', dtype=int) for i, row in df.iterrows() if i]) 80 | return labels 81 | 82 | def __getitem__(self, index): 83 | 84 | # Get and transform the label 85 | label = self.labels[index, :] 86 | if self.label_mapper is not None: 87 | label = self.label_mapper.map(label) 88 | label = torch.FloatTensor(label) 89 | 90 | # Get and transform the image 91 | img = Image.open(self.img_paths[index]).convert('RGB') 92 | img = self.transform(img) 93 | 94 | return img, label 95 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/pad_collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pad_tensor(vec, pad, dim): 5 | """ 6 | args: 7 | vec - tensor to pad 8 | pad - the size to pad to 9 | dim - dimension to pad 10 | 11 | return: 12 | a new tensor padded to 'pad' in dimension 'dim' 13 | """ 14 | pad_size = list(vec.shape) 15 | pad_size[dim] = pad - vec.size(dim) 16 | return torch.cat([vec, torch.zeros(*pad_size)], dim=dim) 17 | 18 | 19 | class PadCollate: 20 | """ 21 | a variant of collate_fn that pads according to the longest sequence in 22 | a batch of sequences 23 | """ 24 | 25 | def __init__(self, dim=0): 26 | """ 27 | args: 28 | dim - the dimension to be padded (dimension of time in sequences) 29 | """ 30 | self.dim = dim 31 | 32 | def pad_collate(self, batch): 33 | """ 34 | args: 35 | batch - list of (tensor, label) 36 | 37 | return: 38 | a tuple containing each component of the examples in 'batch', 39 | merged (and padded) across the batch, followed by 40 | mask - a mask with 0s in positions that should be ignored 41 | """ 42 | # find longest sequence 43 | study_lens = list(map(lambda x: x[0].shape[self.dim], batch)) 44 | max_len =
max(study_lens) 45 | 46 | # Pad each example's first component up to max_len 47 | num_components = max(len(x) for x in batch) 48 | batch = [(pad_tensor(x[0], pad=max_len, dim=self.dim),) + tuple(x[1:]) for x in batch] 49 | 50 | # Stack padded items and merge each component of the batch 51 | batch = tuple(self._merge(batch, component_idx=i) for i in range(num_components)) 52 | masks = [[1] * sl + [0] * (max_len - sl) for sl in study_lens] 53 | masks = torch.tensor(masks, dtype=torch.float32) 54 | 55 | return batch + (masks,) 56 | 57 | def __call__(self, batch): 58 | return self.pad_collate(batch) 59 | 60 | @staticmethod 61 | def _merge(batch, component_idx): 62 | """Merge components of a batch into a single tensor or list. 63 | 64 | Args: 65 | batch: Batch to merge. 66 | component_idx: Index of component in each example that will be merged. 67 | 68 | Returns: 69 | Merged components 70 | """ 71 | # Group all components into list 72 | components = [x[component_idx] for x in batch] 73 | assert len(components) > 0, 'Error in pad_collate: Cannot merge a batch of size 0' 74 | first_component = components[0] 75 | 76 | # Merge based on data type of components 77 | if isinstance(first_component, dict): 78 | merged_components = {k: [d[k] for d in components] for k in first_component} 79 | elif isinstance(first_component, torch.Tensor): 80 | merged_components = torch.stack(components, dim=0) 81 | else: 82 | raise ValueError('Unexpected type in PadCollate._merge: {}'.format(type(components[0]))) 83 | 84 | return merged_components 85 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/predict_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "aggregation_method": "mean", 3 | "task2models": { 4 | "No Finding": [ 5 | { 6 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 7 | "is_3class": true 8 | } 9 | ], 10 | "Enlarged Cardiomediastinum": [ 11 | { 12 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 13 | "is_3class": true 14 | } 15 | ], 16 | "Cardiomegaly": [ 17 | { 18 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 19 | "is_3class": true 20 | } 21 | ], 22 | "Lung Lesion": [ 23 | { 24 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 25 | "is_3class": true 26 | } 27 | ], 28 | "Airspace Opacity": [ 29 | { 30 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 31 | "is_3class": true 32 | } 33 | ], 34 | "Edema": [ 35 | { 36 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_ones/best.pth.tar", 37 | "is_3class": false 38 | } 39 | ], 40 | "Consolidation": [ 41 | { 42 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_ignore/best.pth.tar", 43 | "is_3class": false 44 | } 45 | ], 46 | "Pneumonia": [ 47 | { 48 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 49 | "is_3class": true 50 | } 51 | ], 52 | "Atelectasis": [ 53 | { 54 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_ones/best.pth.tar", 55 | "is_3class": false 56 | } 57 | ], 58 | "Pneumothorax": [ 59 | { 60 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 61 | "is_3class": true 62 | } 63 | ], 64 | "Pleural Effusion": [ 65 | { 66 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 67 | "is_3class": true 68 | } 69 | ], 70 | "Pleural Other": [ 71 | { 72 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 73 | "is_3class": true 74 | } 75 | ], 76 | "Fracture": [
77 | { 78 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 79 | "is_3class": true 80 | } 81 | ], 82 | "Support Devices": [ 83 | { 84 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 85 | "is_3class": true 86 | } 87 | ] 88 | } 89 | } -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/task_sequences.json: -------------------------------------------------------------------------------- 1 | { 2 | "competition": { 3 | "Atelectasis": 0, 4 | "Cardiomegaly": 1, 5 | "Consolidation": 2, 6 | "Edema": 3, 7 | "Pleural Effusion": 4 8 | }, 9 | "stanford": { 10 | "No Finding": 0, 11 | "Enlarged Cardiomediastinum": 1, 12 | "Cardiomegaly": 2, 13 | "Lung Lesion": 3, 14 | "Airspace Opacity": 4, 15 | "Edema": 5, 16 | "Consolidation": 6, 17 | "Pneumonia": 7, 18 | "Atelectasis": 8, 19 | "Pneumothorax": 9, 20 | "Pleural Effusion": 10, 21 | "Pleural Other": 11, 22 | "Fracture": 12, 23 | "Support Devices": 13 24 | }, 25 | "stanford_exclude_NF": { 26 | "Enlarged Cardiomediastinum": 0, 27 | "Cardiomegaly": 1, 28 | "Lung Lesion": 2, 29 | "Airspace Opacity": 3, 30 | "Edema": 4, 31 | "Consolidation": 5, 32 | "Pneumonia": 6, 33 | "Atelectasis": 7, 34 | "Pneumothorax": 8, 35 | "Pleural Effusion": 9, 36 | "Pleural Other": 10, 37 | "Fracture": 11, 38 | "Support Devices": 12 39 | }, 40 | "nih": { 41 | "Cardiomegaly": 0, 42 | "Emphysema": 1, 43 | "Pleural Effusion": 2, 44 | "Hernia": 3, 45 | "Infiltration": 4, 46 | "Mass": 5, 47 | "Nodule": 6, 48 | "Atelectasis": 7, 49 | "Pneumothorax": 8, 50 | "Pleural Thickening": 9, 51 | "Pneumonia": 10, 52 | "Fibrosis": 11, 53 | "Edema": 12, 54 | "Consolidation": 13 55 | }, 56 | 57 | "nih_su_union": { 58 | "Pleural Effusion": 0, 59 | "Pleural Other": 1, 60 | "Infiltration": 2, 61 | "Consolidation": 3, 62 | "Mass": 4, 63 | "Support Devices": 5, 64 | "Airspace Opacity": 6, 65 | "Lung Lesion": 7, 66 | "No Finding": 8, 67 | "Atelectasis": 9, 68 | "Nodule": 10, 69 | "Pneumothorax": 11, 70 | "Enlarged Cardiomediastinum": 12, 71 | "Fracture": 13, 72 | "Edema": 14, 73 | "Emphysema": 15, 74 | "Pleural Thickening": 16, 75 | "Hernia": 17, 76 | "Pneumonia": 18, 77 | "Fibrosis": 19, 78 | "Cardiomegaly": 20 79 | }, 80 | "su_using_nih_labeller": { 81 | "Cardiomegaly": 0, 82 | "Edema": 1, 83 | "Consolidation": 2, 84 | "Pneumonia": 3, 85 | "Atelectasis": 4, 86 | "Pneumothorax": 5, 87 | "Pleural Effusion": 6 88 | }, 89 | "single_atelectasis": { 90 | "Atelectasis": 0 91 | }, 92 | "single_cardiomegaly": { 93 | "Cardiomegaly": 0 94 | }, 95 | "single_consolidation": { 96 | "Consolidation": 0 97 | }, 98 | "single_edema": { 99 | "Edema": 0 100 | }, 101 | "single_pleural_effusion": { 102 | "Pleural Effusion": 0 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .clahe import CLAHE 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/transforms/clahe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from PIL import Image 4 | 5 | class CLAHE(object): 6 | """ Apply CLAHE on a single image""" 7 | 8 | def __init__(self, clip_limit=2.0, tile_grid_size=(8,8)): 9 | self.clip_limit = clip_limit 10 | self.tile_grid_size = tile_grid_size 11 | 12 | def
__call__(self, PIL_img, save=False): 13 | im_np = np.asarray(PIL_img) 14 | im_np = cv2.cvtColor(im_np, cv2.COLOR_BGR2GRAY) 15 | 16 | # create a CLAHE object (Arguments are optional) 17 | clahe = cv2.createCLAHE(self.clip_limit, self.tile_grid_size) 18 | cl1 = clahe.apply(im_np) 19 | imaged = cv2.cvtColor(cl1, cv2.COLOR_GRAY2RGB) 20 | img = Image.fromarray(imaged) 21 | 22 | if save: 23 | # Save the original and the CLAHEd image for display 24 | PIL_img.save("original.png") 25 | img.save("CLAHEd.png") 26 | return img 27 | 28 | # Not sure if the following is necessary 29 | def __repr__(self): 30 | return self.__class__.__name__ + '(clip_limit={0}, tile_grid_size={1})'.format(self.clip_limit, self.tile_grid_size) 31 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .average_meter import AverageMeter 2 | from .evaluator import Evaluator 3 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/average_meter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value. 3 | 4 | Adapted from: 5 | https://github.com/pytorch/examples/blob/master/imagenet/main.py 6 | """ 7 | def __init__(self): 8 | self.avg = 0 9 | self.val = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def reset(self): 14 | self.__init__() 15 | 16 | def update(self, val, n=1): 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/below_curve_counter.py: -------------------------------------------------------------------------------- 1 | """Define below curve counter class.""" 2 | import sklearn.metrics as sk_metrics 3 | 4 | 5 | class BelowCurveCounter(object): 6 | def __init__(self, rad_perf, task_name): 7 | self.rad_perf = rad_perf 8 | self.task_name = task_name 9 | 10 | def ROC(self, ground_truth, predictions): 11 | 12 | self.rad_perf.index = self.rad_perf['Score'] 13 | num_below_roc = 0 14 | 15 | fpr, tpr, threshold = sk_metrics.roc_curve(ground_truth, predictions) 16 | for rad_name in ['Rad1', 'Rad2', 'Rad3']: 17 | rad_sensitivity =\ 18 | self.rad_perf.loc[f'{self.task_name} Sensitivity', 19 | rad_name] 20 | rad_specificity =\ 21 | self.rad_perf.loc[f'{self.task_name} Specificity', 22 | rad_name] 23 | 24 | rad_vertical_projection, rad_horizontal_projection =\ 25 | self._project(fpr, tpr, 1 - rad_specificity, rad_sensitivity) 26 | 27 | if (rad_vertical_projection >= rad_sensitivity): 28 | num_below_roc += 1 29 | 30 | return num_below_roc 31 | 32 | def PR(self, ground_truth, predictions): 33 | self.rad_perf.index = self.rad_perf['Score'] 34 | 35 | num_below_pr = 0 36 | precision, recall, threshold =\ 37 | sk_metrics.precision_recall_curve(ground_truth, predictions) 38 | 39 | for rad_name in ['Rad1', 'Rad2', 'Rad3']: 40 | rad_sensitivity =\ 41 | self.rad_perf.loc[f'{self.task_name} Sensitivity', 42 | rad_name] 43 | rad_precision =\ 44 | self.rad_perf.loc[f'{self.task_name} Precision', 45 | rad_name] 46 | 47 | rad_vertical_projection, rad_horizontal_projection =\ 48 | self._project(recall, precision, 49 | rad_sensitivity, rad_precision) 50 | 51 | if (rad_vertical_projection >= rad_precision): 52 | num_below_pr += 1 53 | 54 |
return num_below_pr 55 | 56 | @staticmethod 57 | def _project(X, Y, rad_x, rad_y): 58 | """Find the closest points on the curve to the point in 59 | X and Y directions.""" 60 | x = 0 61 | y = 0 62 | 63 | while ((x + 2 < len(X)) and ((X[x] > rad_x and X[x + 1] > rad_x) 64 | or (X[x] < rad_x and X[x + 1] < rad_x))): 65 | x += 1 66 | while ((y + 2 < len(Y)) and ((Y[y] > rad_y and Y[y + 1] > rad_y) 67 | or (Y[y] < rad_y and Y[y + 1] < rad_y))): 68 | y += 1 69 | 70 | rad_vertical_projection =\ 71 | (Y[x + 1] - Y[x]) * (rad_x - X[x]) + Y[x] 72 | rad_horizontal_projection =\ 73 | (X[y + 1] - X[y]) * (rad_y - Y[y]) + X[y] 74 | 75 | return rad_vertical_projection, rad_horizontal_projection 76 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sklearn.metrics as sk_metrics 4 | import torch.nn as nn 5 | 6 | from .below_curve_counter import BelowCurveCounter 7 | from .loss import CrossEntropyLossWithUncertainty, MaskedLossWrapper 8 | 9 | 10 | class Evaluator(object): 11 | """Evaluator class for evaluating predictions against 12 | binary groundtruth.""" 13 | def __init__(self, logger=None, **kwargs): 14 | self.logger = logger 15 | self.kwargs = kwargs 16 | 17 | if "operating_points_path" in kwargs: 18 | self.rad_perf = pd.read_csv(kwargs["operating_points_path"]) 19 | else: 20 | self.rad_perf = None 21 | 22 | self.set_eval_functions() 23 | 24 | def evaluate(self, groundtruth, predictions, metric, threshold=0.5): 25 | """Evaluate a single metric on groundtruth and predictions.""" 26 | print("Evaluating metric: {}".format(metric)) 27 | if metric in self.summary_metrics: 28 | metric_fn = self.summary_metrics[metric] 29 | value = metric_fn(groundtruth, predictions) 30 | elif metric in self.curve_metrics: 31 | metric_fn = self.curve_metrics[metric] 32 | value = metric_fn(groundtruth, predictions) 33 | elif metric in self.point_metrics: 34 | metric_fn = self.point_metrics[metric] 35 | value = metric_fn(groundtruth, predictions > threshold) 36 | # if metric == 'precision' or metric == 'recall': 37 | # if value < 0.01: 38 | # raise ValueError(f"Metric {metric} should not have score less than 0.01") 39 | else: 40 | raise ValueError(f"Metric {metric} not supported.") 41 | 42 | return value 43 | 44 | def evaluate_tasks(self, groundtruth, predictions, threshold=0.5): 45 | """Compute evaluation metrics and curves on multiple tasks.""" 46 | metrics = {} 47 | curves = {} 48 | for task in list(predictions): 49 | print("Evaluating task: {}".format(task)) 50 | 51 | task_groundtruth = groundtruth[task] 52 | task_predictions = predictions[task] 53 | # filter out those with -1 in groundtruth 54 | non_label = task_groundtruth.index[task_groundtruth == -1.0] 55 | task_predictions = task_predictions.drop(non_label) 56 | task_groundtruth = task_groundtruth.drop(non_label) 57 | 58 | metrics.update({f"{task}:{metric}": 59 | self.evaluate(task_groundtruth, 60 | task_predictions, 61 | metric=metric) 62 | for metric in self.summary_metrics}) 63 | 64 | metrics.update({f"{task}:{metric}@thresh={threshold}": 65 | self.evaluate(task_groundtruth, 66 | task_predictions, 67 | metric=metric, 68 | threshold=threshold) 69 | for metric in self.point_metrics}) 70 | """ 71 | if self.rad_perf is not None: 72 | 73 | below_curve_counter = BelowCurveCounter(self.rad_perf, 74 | task) 75 | metrics.update({ 76 |
f'{task}:rads_below_ROC': 77 | below_curve_counter.ROC(task_groundtruth, 78 | task_predictions), 79 | f'{task}:rads_below_PR': 80 | below_curve_counter.PR(task_groundtruth, 81 | task_predictions) 82 | }) 83 | """ 84 | curves.update({f"{task}:{metric}": 85 | self.evaluate(task_groundtruth, 86 | task_predictions, 87 | metric=metric, 88 | threshold=threshold) 89 | for metric in self.curve_metrics}) 90 | 91 | return metrics, curves 92 | 93 | def evaluate_average_metric(self, metrics, evaluate_tasks, 94 | average_metric_name): 95 | """Evaluate an average metric over classes.""" 96 | 97 | # All provided names must be of the form "...-{metric_name}" 98 | metric_name = average_metric_name.split("-")[-1] 99 | 100 | average_metric = np.mean([metrics[f"{task}:{metric_name}"] 101 | for task in evaluate_tasks]) 102 | 103 | return average_metric 104 | 105 | def set_eval_functions(self): 106 | """Set the evaluation functions.""" 107 | def undefined_catcher(func, x, y): 108 | try: 109 | return func(x, y) 110 | except Exception: 111 | return np.nan 112 | 113 | # Functions that take probs as input 114 | self.summary_metrics = { 115 | 'AUPRC': lambda x, y: undefined_catcher(sk_metrics.average_precision_score, x, y), 116 | 'AUROC': lambda x, y: undefined_catcher(sk_metrics.roc_auc_score, x, y), 117 | 'log_loss': lambda x, y: undefined_catcher(sk_metrics.log_loss, x, y), 118 | } 119 | 120 | # Functions that take binary values as input 121 | self.point_metrics = { 122 | 'accuracy': lambda x, y: undefined_catcher(sk_metrics.accuracy_score, x, y), 123 | 'precision': lambda x, y: undefined_catcher(sk_metrics.precision_score, x, y), 124 | 'recall': lambda x, y: undefined_catcher(sk_metrics.recall_score, x, y), 125 | } 126 | 127 | self.curve_metrics = { 128 | 'PRC': lambda x, y: undefined_catcher(sk_metrics.precision_recall_curve, x, y), 129 | 'ROC': lambda x, y: undefined_catcher(sk_metrics.roc_curve, x, y), 130 | } 131 | 132 | def get_loss_fn(self, loss_fn_name, model_uncertainty, 133 | mask_uncertain, device): 134 | """Get the loss function used for training. 135 | 136 | Args: 137 | loss_fn_name: Name of loss function to use. 138 | model_uncertainty: Bool indicating whether to predict 139 | UNCERTAIN directly. 140 | mask_uncertain: Bool indicating whether to mask 141 | UNCERTAIN labels. 142 | device: device to compute loss on (gpu or cpu). 143 | """ 144 | print("evaluator: loss function name: {}".format(loss_fn_name)) 145 | if model_uncertainty: 146 | loss_fn = CrossEntropyLossWithUncertainty() 147 | elif loss_fn_name == 'cross_entropy': 148 | loss_fn = nn.BCEWithLogitsLoss(reduction="none" 149 | if mask_uncertain else "mean") 150 | 151 | # Apply a wrapper that masks uncertain labels. 
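# (MaskedLossWrapper, defined in eval/loss.py below, zeroes the per-element
# loss wherever the target is UNCERTAIN (-1) or MISSING (-2) and then
# renormalizes by the number of unmasked entries.)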
152 | if mask_uncertain: 153 | loss_fn = MaskedLossWrapper(loss_fn, device) 154 | 155 | else: 156 | raise ValueError("No loss function for supplied arguments.") 157 | 158 | return loss_fn 159 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/loss.py: -------------------------------------------------------------------------------- 1 | """Define uncertainty cross entropy class.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from constants import * 6 | 7 | 8 | class CrossEntropyLossWithUncertainty(nn.Module): 9 | """Cross-entropy loss modified to also include uncertainty outputs.""" 10 | def __init__(self, size_average=True, reduce=True): 11 | super(CrossEntropyLossWithUncertainty, self).__init__() 12 | self.ce_loss = nn.CrossEntropyLoss(reduce=False) 13 | self.size_average = size_average 14 | self.reduce = reduce 15 | 16 | def forward(self, logits, labels): 17 | """ 18 | Args: 19 | logits: Un-normalized outputs of shape (batch_size, num_tasks, 3) 20 | labels: Labels of shape (batch_size, num_tasks) 21 | where -1 is uncertain, 0 is negative, 1 is positive. 22 | """ 23 | batch_size, last_dim = logits.size() 24 | if last_dim % 3: 25 | raise ValueError('Last dim should be divisible by 3, ' + 26 | f'got last dim of {last_dim}') 27 | num_tasks = last_dim // 3 28 | 29 | # Fuse batch and task dimensions 30 | logits = logits.view(batch_size * num_tasks, 3) 31 | # Shift labels into range [0, 2] 32 | labels = (labels + 1).type(torch.int64) 33 | # Flatten 34 | labels = labels.view(-1) 35 | 36 | # Output shape (batch_size * num_tasks,) 37 | loss = self.ce_loss(logits, labels) 38 | # Reshape and take average over batch dim 39 | loss = loss.view(batch_size, num_tasks) 40 | 41 | if self.size_average: 42 | loss = loss.mean(1) 43 | if self.reduce: 44 | loss = loss.mean(0) 45 | 46 | return loss 47 | 48 | 49 | class MaskedLossWrapper(nn.Module): 50 | 51 | def __init__(self, loss_fn, device): 52 | 53 | super().__init__() 54 | self.loss_fn = loss_fn 55 | self.device = device 56 | 57 | def _get_mask(self, targets): 58 | """Returns a mask to mask uncertain 59 | and missing labels. 
60 | 61 | The function takes advantage of the following label encoding: 62 | Negative/Positive: 0/1 63 | Uncertain: -1 64 | Missing: -2 """ 65 | 66 | mask = torch.ones(targets.shape) 67 | mask[targets == UNCERTAIN] = 0 68 | mask[targets == MISSING] = 0 69 | 70 | mask = mask.to(self.device) 71 | 72 | return mask 73 | 74 | def forward(self, logits, targets): 75 | 76 | # Apply loss function 77 | loss = self.loss_fn(logits, targets) 78 | 79 | # Apply mask to skip missing labels 80 | # and handle uncertain labels 81 | mask = self._get_mask(targets) 82 | loss = loss * mask 83 | 84 | # Average the loss 85 | loss = loss.sum() 86 | loss = loss * (1 / (mask.sum())) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import Logger -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/logger/logger.py: -------------------------------------------------------------------------------- 1 | """Define Logger class for logging information to stdout and disk.""" 2 | import pandas as pd 3 | import sys 4 | from tensorboardX import SummaryWriter 5 | 6 | from constants import COL_PATH, COL_TASK, COL_METRIC, COL_VALUE 7 | 8 | 9 | class Logger(object): 10 | """Class for logging output.""" 11 | def __init__(self, log_path, save_dir, results_dir=None): 12 | self.log_path = log_path 13 | self.log_file = log_path.open('w') 14 | 15 | self.tb_log_dir = save_dir / "tb" 16 | self.summary_writer = SummaryWriter(log_dir=str(self.tb_log_dir)) 17 | 18 | self.results_dir = results_dir 19 | if results_dir is not None: 20 | self.metrics_path = results_dir / "scores.txt" 21 | self.metrics_csv_path = results_dir / "scores.csv" 22 | self.metrics_file = self.metrics_path.open('w') 23 | self.predictions_path = results_dir / "predictions.csv" 24 | self.groundtruth_path = results_dir / "groundtruth.csv" 25 | 26 | def log(self, *args): 27 | self.log_stdout(*args) 28 | print(*args, file=self.log_file) 29 | self.log_file.flush() 30 | 31 | def log_metrics(self, metrics, save_csv=False): 32 | for metric, value in metrics.items(): 33 | msg = f'{metric}:\t{value}' 34 | if self.results_dir is not None: 35 | self.log_stdout(msg) 36 | print(msg, file=self.metrics_file) 37 | self.metrics_file.flush() 38 | else: 39 | self.log(f"[{msg}]") 40 | 41 | if save_csv: 42 | col_tasks = [] 43 | col_metrics = [] 44 | col_values = [] 45 | for task_metric, value in metrics.items(): 46 | # Extract task and metric from dict key 47 | tokens = task_metric.split(":") 48 | assert len(tokens) == 2, "Failed to split key on ':'!"
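            # Keys look like "<task>:<metric>", e.g. "Cardiomegaly:AUROC"
            # (point metrics carry an "@thresh=..." suffix on the metric name).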
49 | task, metric = tokens 50 | col_tasks.append(task) 51 | col_metrics.append(metric) 52 | col_values.append(value) 53 | 54 | # Assemble a DataFrame and save as CSV 55 | metrics_df = pd.DataFrame({COL_TASK: col_tasks, 56 | COL_METRIC: col_metrics, 57 | COL_VALUE: col_values}) 58 | metrics_df.to_csv(self.metrics_csv_path, index=False) 59 | 60 | def log_stdout(self, *args): 61 | print(*args, file=sys.stdout) 62 | sys.stdout.flush() 63 | 64 | def close(self): 65 | self.log_file.close() 66 | 67 | def log_scalars(self, scalar_dict, iterations, print_to_stdout=True): 68 | """Log all values in a dict as scalars to TensorBoard.""" 69 | for k, v in scalar_dict.items(): 70 | if print_to_stdout: 71 | self.log_stdout(f'[{k}: {v:.3g}]') 72 | k = k.replace(':', '/') # Group in TensorBoard by phase 73 | self.summary_writer.add_scalar(k, v, iterations) 74 | 75 | # def log_scalars2(self, scalar_dict, iterations, print_to_stdout=True): 76 | # """Log AUROC and accuracy in a dict as scalars to TensorBoard.""" 77 | # for k, v in scalar_dict.items(): 78 | # # Only prints AUROC and accuracy 79 | # if ('AUROC' in k) or ('accuracy' in k): 80 | # k = k.replace(':', '/') # Group in TensorBoard by phase 81 | # self.summary_writer.add_scalar(k, v, iterations) 82 | 83 | def log_predictions_groundtruth(self, predictions, groundtruth, 84 | paths=None): 85 | if paths is not None: 86 | predictions.insert(0, COL_PATH, paths) 87 | groundtruth.insert(0, COL_PATH, paths) 88 | 89 | predictions.to_csv(self.predictions_path, index=False) 90 | groundtruth.to_csv(self.groundtruth_path, index=False) 91 | 92 | if paths is not None: 93 | del predictions[COL_PATH] 94 | del groundtruth[COL_PATH] 95 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import * 2 | from .calibrate import Calibrator 3 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/models/calibrate.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | 4 | from pathlib import Path 5 | from sklearn.isotonic import IsotonicRegression 6 | from sklearn.calibration import _SigmoidCalibration 7 | 8 | 9 | class Calibrator(object): 10 | """Class for performing post-processing calibration techniques.""" 11 | def __init__(self, calibrator_type, calibrator_dir, task_name, eval=True): 12 | # Where to save or load calibration model 13 | self.calibrator_type = calibrator_type 14 | self.path = calibrator_dir / (f"{calibrator_type}_{task_name}.pkl") 15 | self.eval = eval 16 | 17 | if self.eval: 18 | # If in eval mode, load the calibration model 19 | self.load() 20 | 21 | def predict(self, y_prob): 22 | # Run the loaded calibration model 23 | return self.calibrator.predict(y_prob) 24 | 25 | def train(self, y_true, y_prob): 26 | if self.calibrator_type == 'isotonic': 27 | self.calibrator = IsotonicRegression(out_of_bounds='clip') 28 | elif self.calibrator_type == 'platt': 29 | self.calibrator = _SigmoidCalibration() 30 | 31 | self.calibrator.fit(y_prob, y_true) 32 | 33 | self.save() 34 | 35 | def load(self): 36 | print(f"Loading calibration model from {self.path}") 37 | with self.path.open('rb') as f: 38 | self.calibrator = pickle.load(f) 39 | 40 | def save(self): 41 | print(f"Saving calibration model to {self.path}") 42 | if not self.path.parent.exists(): 43 | 
self.path.parent.mkdir(parents=True) 44 | with self.path.open('wb') as f: 45 | pickle.dump(self.calibrator, f) 46 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer import Optimizer 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/predict/__init__.py: -------------------------------------------------------------------------------- 1 | from .predict import Predictor 2 | from .ensemble_predict import EnsemblePredictor 3 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/predict/configs/toy.json: -------------------------------------------------------------------------------- 1 | { 2 | "aggregation_method": "mean", 3 | "task2models": { 4 | "Atelectasis": [ 5 | { 6 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_ones_top10/iter_336000.pth.tar", 7 | "is_3class": false 8 | }, 9 | { 10 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_ones_top10/iter_384000.pth.tar", 11 | "is_3class": false 12 | } 13 | ], 14 | "Cardiomegaly": [ 15 | { 16 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_3-class_top10/iter_403200.pth.tar", 17 | "is_3class": true 18 | }, 19 | { 20 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_3-class_top10/iter_350400.pth.tar", 21 | "is_3class": true 22 | } 23 | ] 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/predict/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | import util 7 | 8 | NEG_INF = -1e9 9 | 10 | 11 | class Predictor(object): 12 | """Predictor class for a single model.""" 13 | def __init__(self, model, device): 14 | 15 | self.model = model 16 | self.device = device 17 | 18 | def predict(self, loader): 19 | self.model.eval() 20 | probs = [] 21 | gt = [] 22 | all_embeddings = [] 23 | if loader.dataset.return_info_dict: 24 | paths = [] 25 | with tqdm(total=len(loader.dataset)) as progress_bar: 26 | for data in loader: 27 | with torch.no_grad(): 28 | if loader.dataset.study_level: 29 | if loader.dataset.return_info_dict: 30 | inputs, targets, info_dict, mask = data 31 | else: 32 | inputs, targets, mask = data 33 | 34 | 35 | # Fuse batch size `b` and study length `s` 36 | b, s, c, h, w = inputs.size() 37 | inputs = inputs.view(-1, c, h, w) 38 | 39 | # Predict 40 | logits, embeddings = self.model(inputs.to(self.device)) 41 | all_embeddings.append(embeddings.detach().cpu().numpy()) 42 | logits = logits.view(b, s, -1) 43 | 44 | # Mask padding to negative infinity 45 | ignore_where = (mask == 0).unsqueeze(-1) 46 | ignore_where = ignore_where.repeat(1, 1, 47 | logits.size(-1)) 48 | ignore_where = ignore_where.to(self.device) 49 | logits = torch.where(ignore_where, 50 | torch.full_like(logits, NEG_INF), 51 | logits) 52 | batch_logits, _ = torch.max(logits, 1) 53 | 54 | else: 55 | if loader.dataset.return_info_dict: 56 | inputs, targets, info_dict = data 57 | else: 58 | inputs, targets = data 59 | 60 | batch_logits = self.model(inputs.to(self.device)) 61 | 62 | if 
self.model.module.model_uncertainty: 63 | batch_probs =\ 64 | util.uncertain_logits_to_probs(batch_logits) 65 | else: 66 | batch_probs = torch.sigmoid(batch_logits) 67 | 68 | probs.append(batch_probs.cpu()) 69 | gt.append(targets) 70 | if loader.dataset.return_info_dict: 71 | paths.extend(info_dict['paths']) 72 | progress_bar.update(targets.size(0)) 73 | 74 | concat = np.concatenate(all_embeddings) 75 | all_embeddings = concat.reshape(len(concat), -1) 76 | probs_concat = np.concatenate(probs) 77 | gt_concat = np.concatenate(gt) 78 | 79 | with open('cx_res18.npy', 'wb') as f: 80 | np.save(f, all_embeddings) 81 | np.save(f, gt_concat) 82 | 83 | print(probs_concat.shape) 84 | print(gt_concat.shape) 85 | tasks = self.model.module.tasks # Tasks decided at self.model.module.tasks. 86 | print(tasks) 87 | probs_df = pd.DataFrame({task: probs_concat[:, i] 88 | for i, task in enumerate(tasks)}) 89 | gt_df = pd.DataFrame({task: gt_concat[:, i] # Check how gt_df looks like. 90 | for i, task in enumerate(tasks)}) 91 | 92 | self.model.train() 93 | 94 | if loader.dataset.return_info_dict: 95 | return probs_df, gt_df, paths 96 | 97 | return probs_df, gt_df 98 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/saver/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_saver import ModelSaver 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/scripts/get_cams.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | sys.path.append(str(Path(__file__).absolute().parent.parent)) 4 | 5 | import os 6 | 7 | import pandas as pd 8 | import cv2 9 | import torch 10 | import numpy as np 11 | from imageio import imsave 12 | 13 | import util 14 | from dataset import TASK_SEQUENCES 15 | from cams import GradCAM, EnsembleCAM 16 | from cams import GuidedBackPropagation 17 | from saver import ModelSaver 18 | from args import TestArgParser 19 | from dataset import get_loader, get_eval_loaders 20 | from dataset.constants import IMAGENET_MEAN, IMAGENET_STD 21 | 22 | def save_grad_cams(args, loader, model, output_dir, only_competition=False, only_top_task=False): 23 | """Save grad cams for all examples in a loader.""" 24 | 25 | # 'study_level' determined if the loader is returning 26 | # studies or individual images 27 | study_level = loader.dataset.study_level 28 | 29 | # NOTE: some model does not have task_sequence 30 | if hasattr(model.module, 'task_sequence'): 31 | task_sequence = model.module.task_sequence 32 | # NOTE: Right now hard code to "stanford" task_sequence, 33 | # to match the number of predictions CheXpert makes. 34 | else: 35 | # task_sequence = TASK_SEQUENCES[data_args.task_sequence] 36 | task_sequence = TASK_SEQUENCES["stanford"] 37 | print(f'WARNING: assuming that the models task sequence is \n {task_sequence}') 38 | 39 | if hasattr(model, "task2model_dicts"): 40 | grad_cam = EnsembleCAM(model, args.device) 41 | else: 42 | grad_cam = GradCAM(model, args.device) 43 | 44 | # By keeping track of the example id 45 | # we can name each folder using the example_id. 
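    # (counter advances once per study below -- or once per image in the
    # non-study-level branch -- so each example gets its own sub-directory.)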
46 | counter = 0 47 | 48 | if study_level: 49 | # for inputs_batch, labels_batch, masks_batch in loader: 50 | for inputs_batch, labels_batch, info_batch, masks_batch in loader: 51 | for i, (input_study, label_study, mask_study) in enumerate(zip(inputs_batch, labels_batch, masks_batch)): 52 | 53 | directory = f'{output_dir}/{counter}' 54 | # Loop over the views in a study 55 | view_id = 0 56 | for input_, mask_val in zip(input_study, mask_study): 57 | # Skip this image if it is just a 'padded' image 58 | if mask_val == 0: 59 | continue 60 | 61 | write_grad_cams(input_, label_study, grad_cam, directory, 62 | task_sequence, 63 | only_competition=only_competition, 64 | view_id=view_id) 65 | view_id = view_id + 1 66 | 67 | # Write label to txt and save to same folder 68 | # to make inspecting the cams easier 69 | label = np.reshape(label_study.numpy(), (1, -1)) 70 | label_df = pd.DataFrame(label, columns=list(task_sequence)) 71 | label_df["Path"] = info_batch['paths'][i] 72 | label_df["Counter"] = counter 73 | label_df.to_csv(f'{directory}/groundtruth.txt', index=False) 74 | 75 | counter = counter + 1 76 | 77 | else: 78 | for inputs, labels in loader: 79 | for input_, label in zip(inputs, labels): 80 | directory = f'{output_dir}/{counter}' 81 | write_grad_cams(input_, label, grad_cam, directory, task_sequence) 82 | 83 | counter = counter + 1 84 | 85 | def write_grad_cams(input_, label, grad_cam, 86 | directory, task_sequence, only_competition=False, only_top_task=False, view_id=None): 87 | 88 | """Creates a CAM for each image. 89 | 90 | Args: 91 | input_: Image tensor with shape (3 x h x w) 92 | grad_cam: EnsembleCAM object wrapped around GradCAM objects, which are wrapped around models. 93 | directory: The output folder for this set of CAMs. 94 | task_sequence: Ordered collection of task names, one per model output. 95 | """ 96 | if only_competition: 97 | COMPETITION_TASKS = TASK_SEQUENCES['competition'] 98 | 99 | # Get the original image by 100 | # unnormalizing (img pixels will be between 0 and 1) 101 | # img shape: c, h, w 102 | img = util.un_normalize(input_, IMAGENET_MEAN, IMAGENET_STD) 103 | 104 | # move RGB channel to last 105 | img = np.moveaxis(img, 0, 2) 106 | 107 | # Add the batch dimension 108 | # as the model requires it.
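    # (Shape goes from (c, h, w) to (1, c, h, w).)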
109 | input_ = input_.unsqueeze(0) 110 | _, channels, height, width = input_.shape 111 | num_tasks = len(task_sequence) 112 | 113 | # Create the directory for cams for this specific example 114 | if not os.path.exists(directory): 115 | os.makedirs(directory) 116 | 117 | #assert (inputs.shape[0] == 1), 'batch size must be equal to 1' 118 | with torch.set_grad_enabled(True): 119 | 120 | for task_id in range(num_tasks): 121 | task_name = list(task_sequence)[task_id] 122 | if only_competition: 123 | if task_name not in COMPETITION_TASKS: 124 | continue 125 | 126 | task = task_name.lower() 127 | task = task.replace(' ', '_') 128 | task_label = int(label[task_id].item()) 129 | if any([((task in f) and (f'v-{view_id}' in f)) for f in os.listdir(directory)]) or task_label != 1: 130 | continue 131 | 132 | probs, idx, cam = grad_cam.get_cam(input_, task_id, task_name) 133 | 134 | # Resize cam and overlay on image 135 | resized_cam = cv2.resize(cam, (height, width)) 136 | # We don't normalize since the GradCAM class has already taken care of that 137 | img_with_cam = util.add_heat_map(img, resized_cam, normalize=False) 138 | 139 | # Save a cam for this task and image 140 | # using task, prob and groundtruth in file name 141 | prob = probs[idx==task_id].item() 142 | if view_id is None: 143 | filename = f'{task}-p{prob:.3f}-gt{task_label}.png' 144 | else: 145 | filename = f'{task}-p{prob:.3f}-gt{task_label}-v-{view_id}.png' 146 | output_path = os.path.join(directory, filename) 147 | imsave(output_path, img_with_cam) 148 | 149 | 150 | # Save the original image in the same folder 151 | output_path = os.path.join(directory, f'original_image-v-{view_id}.png') 152 | img = np.uint8(img * 255) 153 | imsave(output_path, img) 154 | 155 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/scripts/get_model_size.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from prettytable import PrettyTable 4 | from torchvision import models 5 | 6 | 7 | def count_parameters(model): 8 | table = PrettyTable(["Modules", "Parameters"]) 9 | total_params = 0 10 | for name, parameter in model.named_parameters(): 11 | if not parameter.requires_grad: continue 12 | param = parameter.numel() 13 | table.add_row([name, param]) 14 | total_params+=param 15 | print(table) 16 | print(f"Total Trainable Params: {total_params}") 17 | return total_params 18 | 19 | 20 | if __name__ == '__main__' : 21 | 22 | net = models.__dict__[sys.argv[1]]() 23 | count_parameters(net) -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/test.py: -------------------------------------------------------------------------------- 1 | """Entry-point script to test models.""" 2 | import torch 3 | 4 | from args import TestArgParser 5 | from logger import Logger 6 | from predict import Predictor, EnsemblePredictor 7 | from saver import ModelSaver 8 | from data import get_loader 9 | from eval import Evaluator 10 | from constants import * 11 | from scripts.get_cams import save_grad_cams 12 | from dataset import TASK_SEQUENCES 13 | 14 | 15 | def test(args): 16 | """Run model testing.""" 17 | 18 | model_args = args.model_args 19 | data_args = args.data_args 20 | logger_args = args.logger_args 21 | 22 | # import pdb; pdb.set_trace() 23 | 24 | # Get logger.
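    # (The Logger writes a plain-text log to log_path, TensorBoard summaries
    # under save_dir/tb, and scores/predictions CSVs under results_dir;
    # see logger/logger.py.)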
25 | logger = Logger(logger_args.log_path, 26 | logger_args.save_dir, 27 | logger_args.results_dir) 28 | 29 | # Get image paths corresponding to predictions for logging 30 | paths = None 31 | 32 | if model_args.config_path is not None: 33 | # Instantiate the EnsemblePredictor class for obtaining 34 | # model predictions. 35 | predictor = EnsemblePredictor(config_path=model_args.config_path, 36 | model_args=model_args, 37 | data_args=data_args, 38 | gpu_ids=args.gpu_ids, 39 | device=args.device, 40 | logger=logger) 41 | # Obtain ensemble predictions. 42 | # Caches both individual and ensemble predictions. 43 | # We always turn off caching to ensure that we write the Path column. 44 | predictions, groundtruth, paths = predictor.predict(cache=False, 45 | return_paths=True, 46 | all_gt_tasks=True) 47 | else: 48 | # Load the model at ckpt_path. 49 | ckpt_path = model_args.ckpt_path 50 | ckpt_save_dir = Path(ckpt_path).parent 51 | model_uncertainty = model_args.model_uncertainty 52 | # Get model args from checkpoint and add them to 53 | # command-line specified model args. 54 | model_args, transform_args\ 55 | = ModelSaver.get_args(cl_model_args=model_args, 56 | dataset=data_args.dataset, 57 | ckpt_save_dir=ckpt_save_dir, 58 | model_uncertainty=model_uncertainty) 59 | 60 | # TODO JBY: in test moco should never be true. 61 | model_args.moco = args.model_args.moco 62 | model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path, 63 | gpu_ids=args.gpu_ids, 64 | model_args=model_args, 65 | is_training=False) 66 | 67 | # Instantiate the Predictor class for obtaining model predictions. 68 | predictor = Predictor(model=model, device=args.device) 69 | # Get phase loader object. 70 | return_info_dict = True 71 | loader = get_loader(phase=data_args.phase, 72 | data_args=data_args, 73 | transform_args=transform_args, 74 | is_training=False, 75 | return_info_dict=return_info_dict, 76 | logger=logger) 77 | # Obtain model predictions. 78 | if return_info_dict: 79 | predictions, groundtruth, paths = predictor.predict(loader) 80 | else: 81 | predictions, groundtruth = predictor.predict(loader) 82 | # print(predictions[CHEXPERT_COMPETITION_TASKS]) 83 | if model_args.calibrate: 84 | #open the json file which has the saved parameters 85 | import json 86 | with open(CALIBRATION_FILE) as f: 87 | data = json.load(f) 88 | i = 0 89 | #print(predictions) 90 | import math 91 | def sigmoid(x): 92 | return 1 / (1 + math.exp(-x)) 93 | 94 | for column in predictions: 95 | predictions[column] = predictions[column].apply \ 96 | (lambda x: sigmoid(x * data[i][0][0][0] \ 97 | + data[i][1][0])) 98 | i += 1 99 | 100 | # print(predictions[CHEXPERT_COMPETITION_TASKS]) 101 | #run forward on all the predictions in each row of predictions 102 | 103 | # Log predictions and groundtruth to file in CSV format. 104 | logger.log_predictions_groundtruth(predictions, groundtruth, paths) 105 | 106 | if not args.inference_only: 107 | # Instantiate the evaluator class for evaluating models. 108 | evaluator = Evaluator(logger, 109 | operating_points_path=CHEXPERT_RAD_PATH) 110 | # Get model metrics and curves on the phase dataset. 111 | metrics, curves = evaluator.evaluate_tasks(groundtruth, predictions) 112 | # Log metrics to stdout and file. 
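            # (With save_csv=True, log_metrics also writes scores.csv
            # alongside scores.txt in the results directory.)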
113 | logger.log_stdout(f"Writing metrics to {logger.metrics_path}.") 114 | logger.log_metrics(metrics, save_csv=True) 115 | 116 | # TODO: make this work with ensemble 117 | # TODO: investigate if the eval_loader can just be the normal loader here 118 | if logger_args.save_cams: 119 | cams_dir = logger_args.save_dir / 'cams' 120 | print(f'Save cams to {cams_dir}') 121 | save_grad_cams(args, loader, model, 122 | cams_dir, 123 | only_competition=logger_args.only_competition_cams, 124 | only_top_task=False) 125 | 126 | logger.log("=== Testing Complete ===") 127 | # Produce other visuals 128 | # TODO: This causes "unexpected error to scripts" 129 | # raise NotImplementedError() 130 | 131 | 132 | if __name__ == "__main__": 133 | torch.multiprocessing.set_sharing_strategy('file_system') 134 | parser = TestArgParser() 135 | print("Start test...") 136 | test(parser.parse_args()) 137 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/test_images.py: -------------------------------------------------------------------------------- 1 | # Create dummy csv and dummy image folders before running test 2 | import subprocess 3 | import shutil 4 | import os 5 | import glob 6 | import csv 7 | from constants import * 8 | from argparse import ArgumentParser 9 | 10 | 11 | def parse_script_args(): 12 | """Parse command line arguments. 13 | 14 | Returns: 15 | args (Namespace): Parsed command line arguments 16 | 17 | """ 18 | parser = ArgumentParser() 19 | 20 | parser.add_argument('--save_dir', 21 | type=str, default=str(CHEXPERT_SAVE_DIR), 22 | help='Directory to save model data.') 23 | 24 | parser.add_argument('--img_folder', type=str, 25 | default=None, required=True, 26 | help='Path to folder of all the images') 27 | 28 | parser.add_argument('--batch_size', 29 | type=int, default=16, 30 | help='Batch size for training / evaluation.') 31 | 32 | parser.add_argument('--ckpt_path', 33 | type=str, default=None, 34 | help=('Checkpoint path for eval.')) 35 | 36 | parser.add_argument('--config_path', 37 | type=str, default=None, 38 | help=('Path to ensemble.')) 39 | 40 | args = parser.parse_args() 41 | return args 42 | 43 | def folders_csv(folder): 44 | """Create csv and put images in folder 45 | 46 | Args: 47 | folder (str): path to all the images 48 | """ 49 | images = glob.glob(folder + "/*.jpg") 50 | rows = [] 51 | for image in images: 52 | img_path = Path(image) 53 | img_name = img_path.name 54 | new_dir = img_path.parent / img_name.rstrip('.jpg') 55 | new_dir.mkdir(exist_ok=True, parents=True) 56 | new_path = new_dir / img_name 57 | rows.append([str(new_path.absolute())] + [None] * 4 + [0] * 14) 58 | img_path.rename(new_path) 59 | with open(folder + '/dummy.csv', 'w') as csv_file: 60 | row = ["Path", "Sex", "Age", "Frontal/Lateral", "AP/PA"] \ 61 | + CHEXPERT_TASKS 62 | writer = csv.writer(csv_file) 63 | writer.writerow(row) 64 | for row in rows: 65 | writer.writerow(row) 66 | 67 | def run_test(args): 68 | """Run test on dummy csv 69 | 70 | Args: 71 | args (Namespace): Parsed command line arguments 72 | """ 73 | if args.config_path is not None: 74 | path = "--config_path" 75 | path_name = args.config_path 76 | else: 77 | path = "--ckpt_path" 78 | path_name = args.ckpt_path 79 | subprocess.run(['python', 'test.py', '--dataset', 'custom', path, path_name, 80 | '--phase', 'test', '--together', 'True', '--test_csv', 81 | str(args.img_folder + '/dummy.csv'), '--save_dir', args.save_dir]) 82 | os.remove(args.img_folder + '/dummy.csv') #remove if you 
want to keep csv 83 | os.remove(args.save_dir + '/results/test/groundtruth.csv') 84 | 85 | 86 | if __name__ == "__main__": 87 | args = parse_script_args() 88 | csv = folders_csv(args.img_folder) 89 | run_test(args) -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/test_one.py: -------------------------------------------------------------------------------- 1 | """Evaluate a ckpt or config on a test CSV. 2 | 3 | Usage: 4 | python test_one.py --model_path 5 | --csv_path 6 | --name 7 | 8 | """ 9 | import os 10 | import pandas as pd 11 | import sys 12 | 13 | from argparse import ArgumentParser 14 | from datetime import datetime 15 | from getpass import getuser 16 | from pathlib import Path 17 | from shutil import copy 18 | from subprocess import run 19 | 20 | 21 | FILE_ENDINGS = set(['.pth', '.tar', '.json']) 22 | ROOT_DIR = Path('/deep/group/chexperturbed/runs') 23 | USER_DIR = ROOT_DIR / getuser() 24 | TASKS = ['Cardiomegaly', 25 | 'Edema', 26 | 'Consolidation', 27 | 'Atelectasis', 28 | 'Pleural Effusion', 29 | 'Normal'] 30 | METRIC = 'AUROC' 31 | 32 | 33 | def parse_script_args(): 34 | """Parse command line arguments. 35 | 36 | Returns: 37 | args (Namespace): parsed command line arguments 38 | 39 | """ 40 | parser = ArgumentParser() 41 | 42 | parser.add_argument('--name', type=str, required=True, 43 | help='Name of the run') 44 | 45 | parser.add_argument('--model_path', type=str, required=True, 46 | help='Path of ckpt or config file') 47 | 48 | parser.add_argument('--csv_path', type=str, required=True, 49 | help='Path to test_csv') 50 | 51 | parser.add_argument('--is_3class', action='store_true', 52 | help='Whether this is a 3-class model') 53 | 54 | parser.add_argument('--save_cams', action='store_true', 55 | help='Whether to also generate CAMs') 56 | 57 | parser.add_argument('--gpu_ids', type=str, required=True, 58 | help='Devices to use') 59 | 60 | parser.add_argument('--inference_only', action='store_true', 61 | help='Whether to only run inference') 62 | 63 | args = parser.parse_args() 64 | args.model_path = Path(args.model_path) 65 | assert args.model_path.exists() 66 | args.csv_path = Path(args.csv_path) 67 | assert args.csv_path.exists() 68 | if args.model_path.suffix not in FILE_ENDINGS: 69 | print('Error: unrecognized file ending! Exiting.') 70 | exit() 71 | return args 72 | 73 | 74 | if __name__ == '__main__': 75 | args = parse_script_args() 76 | exp_dir = USER_DIR / args.name 77 | print('Saving run results in %s...' % str(exp_dir)) 78 | USER_DIR.mkdir(exist_ok=True, parents=True) 79 | 80 | # Don't allow experiment to proceed if already exists, to avoid clobbering 81 | try: 82 | exp_dir.mkdir(parents=True) 83 | except FileExistsError as e: 84 | print('Error: directory already exists! Exiting.') 85 | exit() 86 | 87 | # Save command for reproducibility 88 | cmd_path = exp_dir / 'cmd.txt' 89 | print('Saving command to %s...' 
% str(cmd_path)) 90 | cmd = ' '.join(['python'] + sys.argv) 91 | with open(cmd_path, 'w+') as f: 92 | f.write('%s\n' % cmd) 93 | 94 | # Testing ensemble 95 | model_path = None 96 | if args.model_path.suffix == '.json': 97 | config_dst_path = exp_dir / args.model_path.name 98 | copy(args.model_path, config_dst_path) 99 | model_path = ('--config_path', str(config_dst_path)) 100 | # Testing single model 101 | else: 102 | ckpt_dst_path = exp_dir / args.model_path.name 103 | copy(args.model_path, ckpt_dst_path) 104 | model_path = ('--ckpt_path', str(ckpt_dst_path)) 105 | args_dst_path = exp_dir / 'args.json' 106 | copy(args.model_path.parent / 'args.json', args_dst_path) 107 | 108 | test_args = ['python', 'test.py', 109 | '--dataset', 'custom', 110 | '--together', 'True', 111 | '--test_csv', args.csv_path, 112 | model_path[0], model_path[1], 113 | '--phase', 'test', 114 | '--save_dir', str(exp_dir), 115 | '--gpu_ids', args.gpu_ids] 116 | 117 | if args.is_3class: 118 | test_args += ['--model_uncertainty', 'True'] 119 | 120 | if args.save_cams: 121 | test_args += ['--save_cams', 'True'] 122 | test_args += ['--only_competition_cams', 'True'] 123 | 124 | if args.inference_only: 125 | test_args += ['--inference_only'] 126 | 127 | # Run the model, but suppress output 128 | print('Running model...') 129 | with open(os.devnull, 'w') as devnull: 130 | run(test_args, stdout=devnull) 131 | 132 | # Delete the checkpoint when done to save space 133 | if model_path[0] == '--ckpt_path': 134 | print('Deleting checkpoint...') 135 | Path(model_path[1]).unlink() 136 | 137 | # Quit if we're only doing inference 138 | if args.inference_only: 139 | exit() 140 | 141 | # Print out relevant metrics 142 | scores_path = exp_dir 143 | if model_path[0] == '--config_path': 144 | scores_path /= args.model_path.stem 145 | scores_path = scores_path / 'results' / 'test' / 'scores.csv' 146 | df = pd.read_csv(scores_path) 147 | print('Selected results:') 148 | values = [] 149 | for task in TASKS: 150 | value = float(df[(df['Metrics'] == METRIC) & 151 | (df['Tasks'] == task)]['Values']) 152 | values.append(value) 153 | print('%s (%s): %f' % (METRIC, task, value)) 154 | 155 | # Build row for spreadsheet 156 | ss_date = datetime.now().strftime('%m/%d/%Y') 157 | ss_path = str(args.model_path) 158 | ss_test_data = args.name.split('__')[-1] 159 | ss_values = [str(value) for value in values] 160 | ss_results_dir = str(exp_dir) 161 | ss_cmd = cmd 162 | ss_row = [ss_date, ss_path, ss_test_data] 163 | ss_row += ss_values + [ss_results_dir, ss_cmd] 164 | ss_row = ','.join(ss_row) 165 | print('Generated row for spreadsheet: %s' % ss_row) 166 | with open(exp_dir / 'row.txt', 'w+') as f: 167 | f.write('%s\n' % ss_row) 168 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/timeout_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | from time import sleep 4 | 5 | import _thread as thread 6 | 7 | 8 | def quit_function(fn_name): 9 | # print to stderr, unbuffered in Python 2. 10 | print('{0} took too long'.format(fn_name), file=sys.stderr) 11 | sys.stderr.flush() # Python 3 stderr is likely buffered. 
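    # The KeyboardInterrupt is raised in the main thread, so a caller of the
    # decorated function can catch it (see the try/except in __main__ below).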
12 | thread.interrupt_main() # raises KeyboardInterrupt 13 | # raise TimeoutError 14 | 15 | 16 | def exit_after(s): 17 | ''' 18 | use as decorator to exit process if 19 | function takes longer than s seconds 20 | ''' 21 | def outer(fn): 22 | def inner(*args, **kwargs): 23 | timer = threading.Timer(s, quit_function, args=[fn.__name__]) 24 | timer.start() 25 | try: 26 | result = fn(*args, **kwargs) 27 | finally: 28 | timer.cancel() 29 | return result 30 | return inner 31 | return outer 32 | 33 | 34 | @exit_after(5) 35 | def countdown(n): 36 | print('countdown started', flush=True) 37 | for i in range(n, -1, -1): 38 | print(i, end=', ', flush=True) 39 | sleep(1) 40 | print('countdown finished') 41 | 42 | 43 | if __name__ == '__main__': 44 | 45 | countdown(2) 46 | 47 | try: 48 | countdown(5) 49 | except: 50 | print('here') 51 | 52 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/__init__.py: -------------------------------------------------------------------------------- 1 | from util.cuda_util import * 2 | from util.io_util import * 3 | from util.image_util import * 4 | from util.model_util import * 5 | from util.label_util import * -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/cuda_util.py: -------------------------------------------------------------------------------- 1 | """Utility file for CUDA and GPU-specific functions.""" 2 | 3 | 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | 7 | 8 | def setup_gpus(gpu_ids): 9 | """Set up the GPUs and return the device to be used. 10 | 11 | Args: 12 | gpu_ids (list): list of GPU IDs 13 | 14 | Returns: 15 | device (str): the device, either 'cuda' or 'cpu' 16 | 17 | """ 18 | device = None 19 | if len(gpu_ids) > 0 and torch.cuda.is_available(): 20 | torch.cuda.set_device(gpu_ids[0]) 21 | cudnn.benchmark = True 22 | device = 'cuda' 23 | else: 24 | device = 'cpu' 25 | 26 | return device 27 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/io_util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sys import stderr 3 | 4 | 5 | def args_to_list(csv, allow_empty, arg_type=int, allow_negative=True): 6 | """Convert comma-separated arguments to a list. 7 | 8 | Args: 9 | csv: Comma-separated list of arguments as a string. 10 | allow_empty: If True, allow the list to be empty. Otherwise return None instead of empty list. 11 | arg_type: Argument type in the list. 12 | allow_negative: If True, allow negative inputs. 13 | 14 | Returns: 15 | List of arguments, converted to `arg_type`. 16 | """ 17 | arg_vals = [arg_type(d) for d in str(csv).split(',')] 18 | if not allow_negative: 19 | arg_vals = [v for v in arg_vals if v >= 0] 20 | if not allow_empty and len(arg_vals) == 0: 21 | return None 22 | return arg_vals 23 | 24 | # TODO: Move to logger 25 | def print_err(*args, **kwargs): 26 | """Print a message to stderr.""" 27 | print(*args, file=stderr, **kwargs) 28 | 29 | 30 | def str_to_bool(arg): 31 | """Convert an argument string into its boolean value. 32 | 33 | Args: 34 | arg: String representing a bool. 35 | 36 | Returns: 37 | Boolean value for the string. 
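    Example: str_to_bool('yes') -> True; str_to_bool('0') -> False.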
38 | """ 39 | if arg.lower() in ('yes', 'true', 't', 'y', '1'): 40 | return True 41 | elif arg.lower() in ('no', 'false', 'f', 'n', '0'): 42 | return False 43 | else: 44 | raise argparse.ArgumentTypeError('Boolean value expected.') 45 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/label_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | PATH_TO_STUDY_RE = re.compile(r'(valid|train|test)/patient(\d+)/study(\d+)') 5 | 6 | 7 | def get_study_id(path): 8 | """Get a unique study ID from a (study or image) path. 9 | 10 | For example: 11 | /deep/group/xray4all/images/valid/patient64542/study1 -> valid/patient64542/study1 12 | 13 | Args: 14 | path (str): Path to convert to study_id. 15 | """ 16 | path = str(path) 17 | match = PATH_TO_STUDY_RE.search(path) 18 | return match.group(0) if match else None 19 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/model_util.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | 3 | 4 | def uncertain_logits_to_probs(logits): 5 | """Convert explicit uncertainty modeling logits to probabilities P(is_abnormal). 6 | 7 | Args: 8 | logits: Input of shape (batch_size, num_tasks * 3). 9 | 10 | Returns: 11 | probs: Output of shape (batch_size, num_tasks). 12 | Position (i, j) interpreted as P(example i has pathology j). 13 | """ 14 | b, n_times_d = logits.size() 15 | d = 3 16 | if n_times_d % d: 17 | raise ValueError('Expected logits dimension to be divisible by ' + 18 | f'{d}, got size {n_times_d}.') 19 | n = n_times_d // d 20 | 21 | logits = logits.view(b, n, d) 22 | probs = F.softmax(logits[:, :, 1:], dim=-1) 23 | probs = probs[:, :, 1] 24 | 25 | return probs 26 | -------------------------------------------------------------------------------- /chexpert_supervised/environment.yml: -------------------------------------------------------------------------------- 1 | name: chexpert-baseline 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - blas=1.0=mkl 9 | - bzip2=1.0.8=h516909a_2 10 | - ca-certificates=2020.1.1=0 11 | - cairo=1.16.0=h18b612c_1001 12 | - certifi=2020.4.5.1=py37_0 13 | - cudatoolkit=10.1.243=h6bb024c_0 14 | - cycler=0.10.0=py_2 15 | - dbus=1.13.6=he372182_0 16 | - expat=2.2.9=he1b5a44_2 17 | - ffmpeg=4.0=hcdf2ecd_0 18 | - fontconfig=2.13.1=he4413a7_1000 19 | - freeglut=3.0.0=hf484d3e_1005 20 | - freetype=2.9.1=h8a8886c_1 21 | - glib=2.63.1=h3eb4bd4_1 22 | - graphite2=1.3.13=he1b5a44_1001 23 | - gst-plugins-base=1.14.0=hbbd80ab_1 24 | - gstreamer=1.14.0=hb31296c_0 25 | - harfbuzz=1.8.8=hffaf4a1_0 26 | - hdf5=1.10.2=hc401514_3 27 | - icu=58.2=hf484d3e_1000 28 | - intel-openmp=2020.1=217 29 | - jasper=2.0.14=h07fcdf6_1 30 | - joblib=0.15.1=py_0 31 | - jpeg=9b=h024ee3a_2 32 | - kiwisolver=1.2.0=py37h99015e2_0 33 | - ld_impl_linux-64=2.33.1=h53a641e_7 34 | - libedit=3.1.20181209=hc058e9b_0 35 | - libffi=3.3=he6710b0_1 36 | - libgcc-ng=9.1.0=hdf63c60_0 37 | - libgfortran=3.0.0=1 38 | - libgfortran-ng=7.3.0=hdf63c60_0 39 | - libglu=9.0.0=he1b5a44_1001 40 | - libopencv=3.4.2=hb342d67_1 41 | - libopus=1.3.1=h7b6447c_0 42 | - libpng=1.6.37=hbc83047_0 43 | - libstdcxx-ng=9.1.0=hdf63c60_0 44 | - libtiff=4.1.0=h2733197_1 45 | - libuuid=2.32.1=h14c3975_1000 46 | - libvpx=1.7.0=h439df22_0 47 | - libxcb=1.13=h14c3975_1002 48 
| - libxml2=2.9.10=he19cac6_1 49 | - lz4-c=1.9.2=he6710b0_0 50 | - matplotlib=3.1.3=py37_0 51 | - matplotlib-base=3.1.3=py37hef1b27d_0 52 | - mkl=2020.1=217 53 | - mkl-service=2.3.0=py37he904b0f_0 54 | - mkl_fft=1.0.15=py37ha843d7b_0 55 | - mkl_random=1.1.1=py37h0573a6f_0 56 | - ncurses=6.2=he6710b0_1 57 | - ninja=1.9.0=py37hfd86e86_0 58 | - numpy=1.18.1=py37h4f9e942_0 59 | - numpy-base=1.18.1=py37hde5b4d6_1 60 | - olefile=0.46=py37_0 61 | - opencv=3.4.2=py37h6fd60c2_1 62 | - openssl=1.1.1g=h7b6447c_0 63 | - pandas=1.0.3=py37h0573a6f_0 64 | - pcre=8.44=he1b5a44_0 65 | - pillow=7.1.2=py37hb39fc2d_0 66 | - pip=20.0.2=py37_3 67 | - pixman=0.38.0=h516909a_1003 68 | - pthread-stubs=0.4=h14c3975_1001 69 | - py-opencv=3.4.2=py37hb342d67_1 70 | - pyparsing=2.4.7=pyh9f0ad1d_0 71 | - pyqt=5.9.2=py37hcca6a23_4 72 | - python=3.7.7=hcff3b4d_5 73 | - python-dateutil=2.8.1=py_0 74 | - python_abi=3.7=1_cp37m 75 | - pytorch=1.4.0=py3.7_cuda10.1.243_cudnn7.6.3_0 76 | - pytz=2020.1=py_0 77 | - qt=5.9.7=h5867ecd_1 78 | - readline=8.0=h7b6447c_0 79 | - scikit-learn=0.22.1=py37hd81dba3_0 80 | - scipy=1.4.1=py37h0b6359f_0 81 | - setuptools=47.1.1=py37_0 82 | - sip=4.19.8=py37hf484d3e_0 83 | - six=1.15.0=py_0 84 | - sqlite=3.31.1=h62c20be_1 85 | - tk=8.6.8=hbc83047_0 86 | - torchvision=0.5.0=py37_cu101 87 | - tornado=6.0.4=py37h8f50634_1 88 | - wheel=0.34.2=py37_0 89 | - xorg-fixesproto=5.0=h14c3975_1002 90 | - xorg-inputproto=2.3.2=h14c3975_1002 91 | - xorg-kbproto=1.0.7=h14c3975_1002 92 | - xorg-libice=1.0.10=h516909a_0 93 | - xorg-libsm=1.2.3=h84519dc_1000 94 | - xorg-libx11=1.6.9=h516909a_0 95 | - xorg-libxau=1.0.9=h14c3975_0 96 | - xorg-libxdmcp=1.1.3=h516909a_0 97 | - xorg-libxext=1.3.4=h516909a_0 98 | - xorg-libxfixes=5.0.3=h516909a_1004 99 | - xorg-libxi=1.7.10=h516909a_0 100 | - xorg-libxrender=0.9.10=h516909a_1002 101 | - xorg-renderproto=0.11.1=h14c3975_1002 102 | - xorg-xextproto=7.3.0=h14c3975_1002 103 | - xorg-xproto=7.0.31=h14c3975_1007 104 | - xz=5.2.5=h7b6447c_0 105 | - zlib=1.2.11=h7b6447c_3 106 | - zstd=1.4.4=h0b5b093_3 107 | - pip: 108 | - absl-py==0.9.0 109 | - cachetools==4.1.0 110 | - chardet==3.0.4 111 | - future==0.18.2 112 | - google-auth==1.17.2 113 | - google-auth-oauthlib==0.4.1 114 | - grpcio==1.29.0 115 | - idna==2.9 116 | - importlib-metadata==1.6.1 117 | - markdown==3.2.2 118 | - munch==2.5.0 119 | - oauthlib==3.1.0 120 | - pretrainedmodels==0.7.4 121 | - protobuf==3.12.2 122 | - pyasn1==0.4.8 123 | - pyasn1-modules==0.2.8 124 | - pytorch-lightning==0.7.6 125 | - pyyaml==5.3.1 126 | - requests==2.23.0 127 | - requests-oauthlib==1.3.0 128 | - rsa==4.6 129 | - tensorboard==2.2.2 130 | - tensorboard-plugin-wit==1.6.0.post3 131 | - tqdm==4.46.1 132 | - urllib3==1.25.9 133 | - werkzeug==1.0.1 134 | - zipp==3.1.0 135 | 136 | -------------------------------------------------------------------------------- /image_source/contrastive_learning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/contrastive_learning.PNG -------------------------------------------------------------------------------- /image_source/cx_all_full_ci.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/cx_all_full_ci.PNG -------------------------------------------------------------------------------- 
/image_source/cx_all_last_ci.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/cx_all_last_ci.PNG -------------------------------------------------------------------------------- /image_source/moco_flowchart_new.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/moco_flowchart_new.PNG -------------------------------------------------------------------------------- /moco_pretraining/moco/aihc_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/moco_pretraining/moco/aihc_utils/__init__.py -------------------------------------------------------------------------------- /moco_pretraining/moco/aihc_utils/image_transform.py: -------------------------------------------------------------------------------- 1 | import torchvision.transforms as transforms 2 | 3 | CXR_MEAN = [.5020, .5020, .5020] 4 | CXR_STD = [.085585, .085585, .085585] 5 | 6 | 7 | def get_transform(args, training): 8 | # Shorter side scaled to args.img_size 9 | if args.maintain_ratio: 10 | transforms_list = [transforms.Resize(args.img_size)] 11 | else: 12 | transforms_list = [transforms.Resize((args.img_size, args.img_size))] 13 | 14 | # Data augmentation 15 | if training: 16 | transforms_list += [transforms.RandomHorizontalFlip(), 17 | transforms.RandomRotation(args.rotate), 18 | transforms.RandomCrop((args.crop, args.crop)) if args.crop != 0 else None] 19 | else: 20 | transforms_list += [transforms.CenterCrop((args.crop, args.crop)) if args.crop else None] 21 | 22 | # Normalization 23 | # Seems like the arguments do not contain clahe anyways 24 | # if t_args.clahe: 25 | # transforms_list += [CLAHE(clip_limit=2.0, tile_grid_size=(8, 8))] 26 | 27 | normalize = transforms.Normalize(mean=CXR_MEAN, std=CXR_STD) 28 | transforms_list += [transforms.ToTensor(), normalize] 29 | 30 | # transform = transforms.Compose([t for t in transforms_list if t]) 31 | transform = [t for t in transforms_list if t] 32 | return transform -------------------------------------------------------------------------------- /moco_pretraining/moco/aihc_utils/storage_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | from pathlib import Path 5 | import getpass 6 | 7 | import getpass 8 | 9 | if str(getpass.getuser()) == 'jby': 10 | STORAGE_ROOT = Path('/home/jby/chexpert_experiments') 11 | else: 12 | STORAGE_ROOT = Path('/deep/group/aihc-bootcamp-spring2020/cxr_fewer_samples/experiments') 13 | 14 | 15 | def get_storage_folder(exp_name, exp_type): 16 | 17 | try: 18 | jobid = os.environ["SLURM_JOB_ID"] 19 | except: 20 | jobid = None 21 | 22 | datestr = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') 23 | username = str(getpass.getuser()) 24 | 25 | fname = f'{exp_name}_{exp_type}_{datestr}_SLURM{jobid}' if jobid is not None else f'{exp_name}_{exp_type}_{datestr}' 26 | 27 | path_name = STORAGE_ROOT / username / fname 28 | os.makedirs(path_name) 29 | 30 | print(f'Experiment storage is at {fname}') 31 | return path_name -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | ## MoCo: Transferring to Detection 3 | 4 | The `train_net.py` script reproduces the object detection experiments on Pascal VOC and COCO. 5 | 6 | ### Instruction 7 | 8 | 1. Install [detectron2](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). 9 | 10 | 1. Convert a pre-trained MoCo model to detectron2's format: 11 | ``` 12 | python3 convert-pretrain-to-detectron2.py input.pth.tar output.pkl 13 | ``` 14 | 15 | 1. Put the dataset under the "./datasets" directory, 16 | following the [directory structure](https://github.com/facebookresearch/detectron2/tree/master/datasets) 17 | required by detectron2. 18 | 19 | 1. Run training: 20 | ``` 21 | python train_net.py --config-file configs/pascal_voc_R_50_C4_24k_moco.yaml \ 22 | --num-gpus 8 MODEL.WEIGHTS ./output.pkl 23 | ``` 24 | 25 | ### Results 26 | 27 | Below are the results on Pascal VOC 2007 test, fine-tuned on 2007+2012 trainval for 24k iterations using Faster R-CNN with a R50-C4 backbone: 28 |
| pretrain | AP50 | AP | AP75 |
| :--- | :---: | :---: | :---: |
| ImageNet-1M, supervised | 81.3 | 53.5 | 58.8 |
| ImageNet-1M, MoCo v1, 200ep | 81.5 | 55.9 | 62.6 |
| ImageNet-1M, MoCo v2, 200ep | 82.4 | 57.0 | 63.6 |
| ImageNet-1M, MoCo v2, 800ep | 82.5 | 57.4 | 64.0 |
60 | 61 | ***Note:*** These results are means of 5 trials. Variation on Pascal VOC is large: the std of AP50, AP, AP75 is expected to be 0.2, 0.2, 0.4 in most cases. We recommend to run 5 trials and compute means. 62 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/Base-RCNN-C4-BN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | ROI_HEADS: 7 | NAME: "Res5ROIHeadsExtraNorm" 8 | BACKBONE: 9 | FREEZE_AT: 0 10 | RESNETS: 11 | NORM: "SyncBN" 12 | TEST: 13 | PRECISE_BN: 14 | ENABLED: True 15 | SOLVER: 16 | IMS_PER_BATCH: 16 17 | BASE_LR: 0.02 18 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/coco_R_50_C4_2x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-RCNN-C4-BN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | INPUT: 6 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 7 | MIN_SIZE_TEST: 800 8 | DATASETS: 9 | TRAIN: ("coco_2017_train",) 10 | TEST: ("coco_2017_val",) 11 | SOLVER: 12 | STEPS: (120000, 160000) 13 | MAX_ITER: 180000 14 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/coco_R_50_C4_2x_moco.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "coco_R_50_C4_2x.yaml" 2 | MODEL: 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "See Instructions" 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | INPUT: 9 | FORMAT: "RGB" 10 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/pascal_voc_R_50_C4_24k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-RCNN-C4-BN.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | ROI_HEADS: 6 | NUM_CLASSES: 20 7 | INPUT: 8 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 9 | MIN_SIZE_TEST: 800 10 | DATASETS: 11 | TRAIN: ('voc_2007_trainval', 'voc_2012_trainval') 12 | TEST: ('voc_2007_test',) 13 | SOLVER: 14 | STEPS: (18000, 22000) 15 | MAX_ITER: 24000 16 | WARMUP_ITERS: 100 17 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "pascal_voc_R_50_C4_24k.yaml" 2 | MODEL: 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "See Instructions" 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | INPUT: 9 | FORMAT: "RGB" 10 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/convert-pretrain-to-detectron2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | import torch 7 | 8 | if __name__ == "__main__": 9 | input = sys.argv[1] 10 | 11 | obj = torch.load(input, map_location="cpu") 12 | obj = obj["state_dict"] 13 | 14 | newmodel = {} 15 | for k, v in obj.items(): 16 | if not k.startswith("module.encoder_q."): 17 | continue 18 | old_k = k 19 | k = k.replace("module.encoder_q.", "") 20 | if "layer" not in k: 21 | k = "stem." + k 22 | for t in [1, 2, 3, 4]: 23 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 24 | for t in [1, 2, 3]: 25 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 26 | k = k.replace("downsample.0", "shortcut") 27 | k = k.replace("downsample.1", "shortcut.norm") 28 | print(old_k, "->", k) 29 | newmodel[k] = v.numpy() 30 | 31 | res = {"model": newmodel, "__author__": "MOCO", "matching_heuristics": True} 32 | 33 | with open(sys.argv[2], "wb") as f: 34 | pkl.dump(res, f) 35 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/train_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import os 5 | 6 | from detectron2.checkpoint import DetectionCheckpointer 7 | from detectron2.config import get_cfg 8 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch 9 | from detectron2.evaluation import COCOEvaluator, PascalVOCDetectionEvaluator 10 | from detectron2.layers import get_norm 11 | from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads 12 | 13 | 14 | @ROI_HEADS_REGISTRY.register() 15 | class Res5ROIHeadsExtraNorm(Res5ROIHeads): 16 | """ 17 | As described in the MOCO paper, there is an extra BN layer 18 | following the res5 stage. 
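    The norm type is read from cfg.MODEL.RESNETS.NORM ("SyncBN" in the
    configs above) and appended to the res5 block built by the parent class.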
19 | """ 20 | def _build_res5_block(self, cfg): 21 | seq, out_channels = super()._build_res5_block(cfg) 22 | norm = cfg.MODEL.RESNETS.NORM 23 | norm = get_norm(norm, out_channels) 24 | seq.add_module("norm", norm) 25 | return seq, out_channels 26 | 27 | 28 | class Trainer(DefaultTrainer): 29 | @classmethod 30 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 31 | if output_folder is None: 32 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 33 | if "coco" in dataset_name: 34 | return COCOEvaluator(dataset_name, cfg, True, output_folder) 35 | else: 36 | assert "voc" in dataset_name 37 | return PascalVOCDetectionEvaluator(dataset_name) 38 | 39 | 40 | def setup(args): 41 | cfg = get_cfg() 42 | cfg.merge_from_file(args.config_file) 43 | cfg.merge_from_list(args.opts) 44 | cfg.freeze() 45 | default_setup(cfg, args) 46 | return cfg 47 | 48 | 49 | def main(args): 50 | cfg = setup(args) 51 | 52 | if args.eval_only: 53 | model = Trainer.build_model(cfg) 54 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 55 | cfg.MODEL.WEIGHTS, resume=args.resume 56 | ) 57 | res = Trainer.test(cfg, model) 58 | return res 59 | 60 | trainer = Trainer(cfg) 61 | trainer.resume_or_load(resume=args.resume) 62 | return trainer.train() 63 | 64 | 65 | if __name__ == "__main__": 66 | args = default_argument_parser().parse_args() 67 | print("Command Line Args:", args) 68 | launch( 69 | main, 70 | args.num_gpus, 71 | num_machines=args.num_machines, 72 | machine_rank=args.machine_rank, 73 | dist_url=args.dist_url, 74 | args=(args,), 75 | ) 76 | -------------------------------------------------------------------------------- /moco_pretraining/moco/moco/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /moco_pretraining/moco/moco/loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | from PIL import ImageFilter 3 | import random 4 | 5 | 6 | class TwoCropsTransform: 7 | """Take two random crops of one image as the query and key.""" 8 | 9 | def __init__(self, base_transform): 10 | self.base_transform = base_transform 11 | 12 | def __call__(self, x): 13 | q = self.base_transform(x) 14 | k = self.base_transform(x) 15 | return [q, k] 16 | 17 | 18 | class GaussianBlur(object): 19 | """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" 20 | 21 | def __init__(self, sigma=[.1, 2.]): 22 | self.sigma = sigma 23 | 24 | def __call__(self, x): 25 | sigma = random.uniform(self.sigma[0], self.sigma[1]) 26 | x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) 27 | return x 28 | -------------------------------------------------------------------------------- /moco_pretraining/moco/training_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/moco_pretraining/moco/training_tools/__init__.py -------------------------------------------------------------------------------- /moco_pretraining/moco/training_tools/combiner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | def detach_tensor(tensor): 6 | if type(tensor) != np.ndarray: 7 | if type(tensor) == list: 8 | return np.ndarray(tensor) 9 | else: 10 | return tensor.cpu().detach().numpy() 11 | return tensor 12 | 13 | def recursive_append(target_dict, source_dict): 14 | for e in source_dict: 15 | if type(source_dict[e]) == dict: 16 | if e not in target_dict: 17 | target_dict[e] = defaultdict(list) 18 | target_dict[e] = recursive_append(target_dict[e], source_dict[e]) 19 | elif source_dict[e] is not None: 20 | if type(source_dict[e]) == list: 21 | target_dict[e].append(source_dict[e]) 22 | else: 23 | target_dict[e].append(source_dict[e].cpu()) 24 | 25 | return target_dict 26 | 27 | def recursive_concat(source_dict): 28 | for e in source_dict: 29 | if type(source_dict[e]) == dict or type(source_dict[e]) == defaultdict: 30 | source_dict[e] = recursive_concat(source_dict[e]) 31 | elif source_dict[e] is not None: 32 | source_dict[e] = np.concatenate(source_dict[e]) 33 | 34 | return source_dict -------------------------------------------------------------------------------- /moco_pretraining/moco/training_tools/evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import time 5 | import warnings 6 | import sys 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.parallel 12 | import torch.backends.cudnn as cudnn 13 | import torch.distributed as dist 14 | import torch.optim 15 | from sklearn.metrics import roc_auc_score 16 | from scipy.special import softmax 17 | 18 | from .meters import AverageMeter 19 | from .meters import ProgressMeter 20 | from .combiner import detach_tensor 21 | 22 | ''' 23 | def pred_accuracy(output, target, k): 24 | """Computes the accuracy over the k top predictions for the specified values of k""" 25 | 26 | output = detach_tensor(output) 27 | target = detach_tensor(target) 28 | 29 | batch_size = target.size(0) 30 | 31 | argsorted_out = np.argsort(output)[:,-k:] 32 | return np.asarray(np.any(argsorted_y.T == target, axis=0).mean(dtype='f')), 33 | 34 | 35 | _, pred = output.topk(maxk, 1, True, 
--------------------------------------------------------------------------------
/moco_pretraining/moco/training_tools/evaluator.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import random
  4 | import time
  5 | import warnings
  6 | import sys
  7 | 
  8 | import numpy as np
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.parallel
 12 | import torch.backends.cudnn as cudnn
 13 | import torch.distributed as dist
 14 | import torch.optim
 15 | from sklearn.metrics import roc_auc_score
 16 | from scipy.special import softmax
 17 | 
 18 | from .meters import AverageMeter
 19 | from .meters import ProgressMeter
 20 | from .combiner import detach_tensor
 21 | 
 22 | '''
 23 | def pred_accuracy(output, target, k):
 24 |     """Computes the accuracy over the k top predictions for the specified values of k"""
 25 | 
 26 |     output = detach_tensor(output)
 27 |     target = detach_tensor(target)
 28 | 
 29 |     batch_size = target.size(0)
 30 | 
 31 |     argsorted_out = np.argsort(output)[:,-k:]
 32 |     return np.asarray(np.any(argsorted_out.T == target, axis=0).mean(dtype='f'))
 33 | 
 34 | 
 35 |     _, pred = output.topk(maxk, 1, True, True)
 36 |     pred = pred.t()
 37 |     correct = pred.eq(target.view(1, -1).expand_as(pred))
 38 | 
 39 |     res = []
 40 |     correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
 41 |     res.append(correct_k.mul_(100.0 / batch_size))
 42 |     return res[0]  # Seems like we only want the 1st
 43 | '''
 44 | 
 45 | 
 46 | def decorator_detach_tensor(function):
 47 |     def wrapper(*args, **kwargs):
 48 |         # TODO Find a simple way to handle this business ...
 49 |         # If is eval, or if fast debug, or
 50 |         # is train and not heavy, or is train and heavy
 51 |         output = detach_tensor(args[0])
 52 |         target = detach_tensor(args[1])
 53 |         args = args[2:]
 54 | 
 55 |         result = function(output, target, *args, **kwargs)
 56 |         return result
 57 |     return wrapper
 58 | 
 59 | @decorator_detach_tensor
 60 | def topk_acc(output, target, k):
 61 |     """Computes the accuracy over the k top predictions for the specified values of k"""
 62 |     argsorted_out = np.argsort(output)[:,-k:]
 63 |     matching = np.asarray(np.any(argsorted_out.T == target, axis=0))
 64 |     return matching.mean(dtype='f')
 65 | 
 66 | 
 67 | @decorator_detach_tensor
 68 | def compute_auc_binary(output, target):
 69 |     # assuming output and target are all vectors for binary case
 70 |     try:
 71 |         o = softmax(output, axis=1)
 72 |         auc = roc_auc_score(target, o[:,1])
 73 |     except Exception:  # e.g. only one class present in target
 74 |         return -1
 75 |     return auc
 76 | 
 77 | 
 78 | class Evaluator:
 79 | 
 80 |     def __init__(self, model, loss_func, metrics, loaders, args):
 81 | 
 82 |         self.model = model
 83 |         self.loss_func = loss_func
 84 |         self.metrics = metrics
 85 |         self.loaders = loaders
 86 |         self.args = args
 87 | 
 88 |         self.metric_best_vals = {metric: 0 for metric in self.metrics}
 89 | 
 90 | 
 91 |     def evaluate(self, eval_type, epoch):
 92 | 
 93 |         print(f'==> Evaluation for {eval_type}, epoch {epoch}')
 94 | 
 95 |         loader = self.loaders[eval_type]
 96 | 
 97 |         batch_time = AverageMeter('Time', ':6.3f')
 98 |         losses = AverageMeter('Loss', ':.4e')
 99 | 
100 |         metric_meters = {metric: AverageMeter(metric, self.metrics[metric]['format']) \
101 |                          for metric in self.metrics}
102 |         list_meters = [metric_meters[m] for m in metric_meters]
103 | 
104 |         progress = ProgressMeter(
105 |             len(loader),
106 |             [batch_time, losses, *list_meters],
107 |             prefix=f'{eval_type}@Epoch {epoch}: ')
108 | 
109 |         # switch to evaluate mode
110 |         self.model.eval()
111 |         all_output = []
112 |         all_gt = []
113 | 
114 |         with torch.no_grad():
115 |             end = time.time()
116 |             for i, (images, target) in enumerate(loader):
117 |                 if self.args.gpu is not None:
118 |                     images = images.cuda(self.args.gpu, non_blocking=True)
119 |                 target = target.cuda(self.args.gpu, non_blocking=True)
120 |                 all_gt.append(target.cpu())
121 | 
122 |                 # compute output
123 |                 output = self.model(images)
124 |                 all_output.append(output.cpu())
125 | 
126 |                 loss = self.loss_func(output, target)
127 | 
128 |                 # JBY: For simplicity do losses first
129 |                 losses.update(loss.item(), images.size(0))
130 | 
131 |                 for metric in self.metrics:
132 |                     args = [output, target, *self.metrics[metric]['args']]
133 |                     metric_func = globals()[self.metrics[metric]['func']]
134 |                     result = metric_func(*args)
135 | 
136 |                     metric_meters[metric].update(result, images.size(0))
137 | 
138 |                 # measure elapsed time
139 |                 batch_time.update(time.time() - end)
140 |                 end = time.time()
141 | 
142 |                 if i % self.args.print_freq == 0:
143 |                     progress.display(i)
144 | 
145 |             # TODO: this should also be done with the ProgressMeter
146 |             # print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
147 |             #       .format(top1=top1, top5=top5))
148 |             progress.display(i + 1)
149 | 
150 |         all_output = np.concatenate(all_output)
151 |         all_gt = np.concatenate(all_gt)
152 | 
153 |         for metric in self.metrics:
154 |             args = [all_output, all_gt, *self.metrics[metric]['args']]
155 |             metric_func = globals()[self.metrics[metric]['func']]
156 |             result = metric_func(*args)
157 | 
158 |             metric_meters[metric].update(result, images.size(0))
159 | 
160 |             self.metric_best_vals[metric] = max(metric_meters[metric].avg,
161 |                                                 self.metric_best_vals[metric])
162 | 
163 |         progress.display(i + 1, summary=True)
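
Evaluator resolves each metric function by name through globals(), so `metrics` is a dict keyed by display name. A minimal config that satisfies the keys the class reads ('func', 'args', 'format'); the display names and format strings here are illustrative, the function names are the ones defined above:

metrics = {
    'Acc@1': {'func': 'topk_acc', 'args': [1], 'format': ':6.2f'},
    'Acc@5': {'func': 'topk_acc', 'args': [5], 'format': ':6.2f'},
    'AUC':   {'func': 'compute_auc_binary', 'args': [], 'format': ':6.3f'},
}
# evaluator = Evaluator(model, torch.nn.CrossEntropyLoss(), metrics,
#                       {'valid': valid_loader}, args)
# evaluator.evaluate('valid', epoch=0)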
--------------------------------------------------------------------------------
/moco_pretraining/moco/training_tools/meters.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class AverageMeter(object):
 4 |     """Computes and stores the average and current value"""
 5 |     def __init__(self, name, fmt=':f'):
 6 |         self.name = name
 7 |         self.fmt = fmt
 8 |         self.reset()
 9 | 
10 |     def reset(self):
11 |         self.val = 0
12 |         self.avg = 0
13 |         self.sum = 0
14 |         self.count = 0
15 | 
16 |     def update(self, val, n=1):
17 |         if type(val) == torch.Tensor:
18 |             val = val.item()
19 | 
20 |         self.val = val
21 |         self.sum += val * n
22 |         self.count += n
23 |         self.avg = self.sum / self.count
24 | 
25 |     def str_val(self):
26 |         if self.name == 'Loss':
27 |             fmtstr = '{name} {val' + self.fmt + '}\n'
28 |         else:
29 |             fmtstr = '{name} {val' + self.fmt + '}'
30 |         return fmtstr.format(**self.__dict__)
31 | 
32 |     def __str__(self):
33 |         if self.name == 'Loss':
34 |             fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})\n'
35 |         else:
36 |             fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
37 |         return fmtstr.format(**self.__dict__)
38 | 
39 | 
40 | class ProgressMeter(object):
41 |     def __init__(self, num_batches, meters, prefix=""):
42 |         self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
43 |         self.meters = meters
44 |         self.prefix = prefix
45 | 
46 |     def display(self, batch, summary=False):
47 |         entries = [self.prefix + self.batch_fmtstr.format(batch)]
48 |         if not summary:
49 |             entries += [str(meter) for meter in self.meters]
50 |             print('\t'.join(entries))
51 |         else:
52 |             entries += [meter.str_val() for meter in self.meters]
53 |             print('Summary: ' + '\t'.join(entries))
54 | 
55 |     def _get_batch_fmtstr(self, num_batches):
56 |         num_digits = len(str(num_batches))
57 |         fmt = '{:' + str(num_digits) + 'd}'
58 |         return '[' + fmt + '/' + fmt.format(num_batches) + ']'
59 | 
60 | 
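
Typical wiring, mirroring how evaluator.py uses these meters (`loader` and `step` are placeholders for a real data loader and forward pass):

losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
progress = ProgressMeter(len(loader), [losses, top1], prefix='Valid: ')
for i, (images, target) in enumerate(loader):
    loss, acc = step(images, target)           # placeholder forward pass
    losses.update(loss, images.size(0))        # running sum weighted by batch size
    top1.update(acc, images.size(0))
    if i % 10 == 0:
        progress.display(i)                    # e.g. "Valid: [ 10/196]  Loss ..."
progress.display(len(loader), summary=True)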
--------------------------------------------------------------------------------
/moco_pretraining/scripts/convert_to_chexpert.py:
--------------------------------------------------------------------------------
  1 | '''File created to reorganize the Montgomery and Shenzhen datasets to fit
  2 | the torchvision.ImageFolder class
  3 | '''
  4 | 
  5 | from collections import defaultdict
  6 | import copy
  7 | import os
  8 | import pprint as pp
  9 | import random
 10 | import re
 11 | import shutil
 12 | import sys
 13 | 
 14 | import pandas as pd
 15 | from pathlib import Path
 16 | from tqdm import tqdm
 17 | 
 18 | # semi-supervised ratios from 2^-7 to 2^-1
 19 | ALL_SEMI_RATIO = [0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5]
 20 | TEST_RATIO = 0.15
 21 | VAL_RATIO = 0.1
 22 | 
 23 | 
 24 | def print_summary(df, name):
 25 |     total_len = len(df)
 26 |     no_finding = len(df[df['No Finding'] == 1])
 27 |     tb = len(df[df['Tuberculosis'] == 1])
 28 | 
 29 |     print(f'CSV: {name}, total: {total_len}, No Finding: {no_finding}, Tuberculosis: {tb}')
 30 | 
 31 | 
 32 | def convert_shenzhen(root_folder):
 33 | 
 34 |     RE_SEX_AGE = re.compile(r'(?P<sex>.*al)[e]?[\s|,]*(?P<age>[0-9]+)[yr]?[s]?')
 35 |     RE_FNAME = re.compile(r'CHNCXR\_(?P<idx>[0-9]+)\_(?P<lbl>[0|1])\.txt')
 36 | 
 37 |     root_path = Path(root_folder)
 38 | 
 39 |     key_words = ['upper', 'lower', 'left', 'right', 'bilateral', 'atb', 'ptb', 'stb']
 40 | 
 41 |     # readings = {'healthy': [], 'disease': []}
 42 |     parsed = []
 43 |     for i, f in tqdm(enumerate(os.listdir(root_path / 'ClinicalReadings'))):
 44 | 
 45 |         f_result = RE_FNAME.search(f)
 46 |         pid = f_result.groupdict()['idx']
 47 |         lbl = f_result.groupdict()['lbl']
 48 | 
 49 |         data = {
 50 |             'Study': None,
 51 |             'Age': None,
 52 |             'Sex': None,
 53 |             'No Finding': None,
 54 |             'Tuberculosis': None,
 55 |             'Path': None
 56 |         }
 57 | 
 58 |         disease = None
 59 |         with open(root_path / 'ClinicalReadings' / f, 'r') as txt:
 60 |             lines = txt.readlines()
 61 | 
 62 |             # if len(lines) > 3:
 63 |             #     import pdb; pdb.set_trace()
 64 | 
 65 |             for l in lines:
 66 |                 result = RE_SEX_AGE.search(l)
 67 | 
 68 |                 if result:
 69 |                     age = int(result.groupdict()['age'])
 70 |                     sex = result.groupdict()['sex'].lower()
 71 | 
 72 |                     data['Age'] = age
 73 |                     data['Sex'] = sex
 74 |                 else:
 75 |                     l = l.strip().lower()
 76 | 
 77 |                     if len(l) > 0:
 78 |                         if 'normal' in l:
 79 |                             assert lbl == '0'
 80 |                             disease = False
 81 |                         else:
 82 |                             if lbl != '1':
 83 |                                 import pdb; pdb.set_trace()
 84 | 
 85 |                             disease = False
 86 |                             for k in key_words:
 87 |                                 if k in l:
 88 |                                     disease = True
 89 | 
 90 |                             if 'pleuritis' in l:
 91 |                                 disease = True
 92 | 
 93 |         assert disease is not None
 94 | 
 95 |         if disease:
 96 |             data['No Finding'] = 0
 97 |             data['Tuberculosis'] = 1
 98 |         else:
 99 |             data['No Finding'] = 1
100 |             data['Tuberculosis'] = 0
101 | 
102 | 
103 |         fname = root_path / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1' / 'view1_frontal.jpg'
104 |         study = Path('shenzhen') / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1'
105 |         data['Study'] = study
106 |         data['Path'] = fname
107 | 
108 |         parsed.append(data)
109 | 
110 |     val_rows = []
111 |     test_rows = []
112 | 
113 |     ratios = ALL_SEMI_RATIO + [1]
114 |     fine_tune_splitted_rows = {s: [] for s in ratios}
115 |     for stuff in tqdm(parsed):
116 |         rnd = random.random()
117 | 
118 |         if rnd < VAL_RATIO:
119 |             val_rows.append(stuff)
120 |         elif rnd < VAL_RATIO + TEST_RATIO:
121 |             test_rows.append(stuff)
122 |         else:
123 |             rnd = random.random()
124 | 
125 |             for s in ratios:
126 |                 if rnd < s:
127 |                     fine_tune_splitted_rows[s].append(stuff)
128 | 
129 |     df = pd.DataFrame(val_rows)
130 |     df.to_csv(root_path / f'chexpert_like_val.csv')
131 |     print_summary(df, 'validation')
132 | 
133 |     df = pd.DataFrame(test_rows)
134 |     df.to_csv(root_path / f'chexpert_like_test.csv')
135 |     print_summary(df, 'test')
136 | 
137 |     for s in ratios:
138 |         df = pd.DataFrame(fine_tune_splitted_rows[s])
139 |         df.to_csv(root_path / f'chexpert_like_{s}.csv')
140 |         print_summary(df, f'semi_{s}')
141 | 
142 | if __name__ == '__main__':
143 |     # Usage:
144 |     #     python convert_to_chexpert.py moco/shenzhen 23
145 |     # Try 17, 28, 20
146 | 
147 |     random.seed(sys.argv[2])
148 | 
149 |     convert_shenzhen(sys.argv[1])
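
The two patterns above drive all of the parsing, so it helps to see them against sample inputs (the strings below are illustrative, shaped like Shenzhen metadata):

m = RE_FNAME.search('CHNCXR_0042_1.txt')
# m.groupdict() -> {'idx': '0042', 'lbl': '1'}   (lbl 1 = abnormal, 0 = normal)
m = RE_SEX_AGE.search('male 34yrs')
# m.groupdict() -> {'sex': 'mal', 'age': '34'}   (the trailing "e" is matched outside the group)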
want email notification
 19 | ####SBATCH --mail-user=youremailaddress
 20 | ####SBATCH --mail-type=ALL
 21 | 
 22 | # list out some useful information
 23 | echo "SLURM_JOBID="$SLURM_JOBID
 24 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
 25 | echo "SLURM_NNODES"=$SLURM_NNODES
 26 | echo "SLURMTMPDIR="$SLURMTMPDIR
 27 | echo "working directory = "$SLURM_SUBMIT_DIR
 28 | 
 29 | # sample job
 30 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
 31 | echo NPROCS=$NPROCS
 32 | 
 33 | cd ../moco; python main_moco.py -a SB_MODEL \\
 34 |     --lr SB_LR --batch-size SB_BATCH_SIZE \\
 35 |     --epochs SB_EPOCHS \\
 36 |     --world-size 1 --rank 0 \\
 37 |     --mlp --moco-t 0.2 SB_FROM_IMAGENET \\
 38 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \\
 39 |     --aug-setting chexpert --rotate SB_ROTATION --maintain-ratio \\
 40 |     --train_data /deep/group/data/moco/chexpert-proper-test/data/full_train \\
 41 |     --exp-name SB_EXPNAME
 42 | 
 43 | # done
 44 | echo "Done"
 45 | '''
 46 | 
 47 | BASH_SCRIPT = \
 48 | '''cd /home/jby/aihc-spring20-fewer/moco; python main_moco.py -a SB_MODEL \\
 49 |     --lr SB_LR --batch-size SB_BATCH_SIZE \\
 50 |     --world-size 1 --rank 0 \\
 51 |     --mlp --moco-t 0.2 SB_FROM_IMAGENET \\
 52 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \\
 53 |     --aug-setting chexpert --rotate SB_ROTATION --maintain-ratio \\
 54 |     --train_data /home/jby/CheXpert/full_train \\
 55 |     --exp-name SB_EXPNAME 2>&1 | tee /home/jby/chexpert_experiments/jby/SB_EXPNAME_log.txt
 56 | '''
 57 | 
 58 | 
 59 | LR_SHORT = {
 60 |     1e-7: '1n7',
 61 |     1e-6: '1n6',
 62 |     5e-5: '5n5',
 63 |     3e-5: '3n5',
 64 |     2e-5: '2n5',
 65 |     1e-5: '1n5',
 66 |     1e-4: '1n4',
 67 |     1e-3: '1n3',
 68 |     1e-2: '1n2',
 69 |     5e-2: '5n2',
 70 |     5e-4: '5n4'
 71 | }
 72 | 
 73 | MODEL_SHORT_NAME_MAP = {'resnet18': 'r8',
 74 |                         'resnet50': 'r5',
 75 |                         'densenet121': 'd1'}
 76 | 
 77 | def gen_script(model, lr, batch_size, imagenet, epoch, gcp):
 78 | 
 79 |     today = datetime.datetime.now()
 80 |     strtoday = today.strftime('%Y%m%dh%H')
 81 | 
 82 |     sb_model = model
 83 |     sb_lr = str(lr)
 84 |     sb_epoch = str(epoch)
 85 |     sb_batch_size = str(batch_size)
 86 |     sb_from_imagenet = '--from-imagenet' if imagenet else ''
 87 |     sb_rotation = str(10)
 88 |     sb_jobname = f'{MODEL_SHORT_NAME_MAP[sb_model]}{"w" if imagenet else "o"}{LR_SHORT[lr]}{batch_size}'
 89 |     sb_expname = f'{sb_jobname}_{strtoday}'
 90 | 
 91 |     if not gcp:
 92 |         script = SBATCH_SCRIPT
 93 |     else:
 94 |         script = BASH_SCRIPT
 95 | 
 96 |     script = script.replace('SB_JOBNAME', sb_jobname)
 97 |     script = script.replace('SB_MODEL', sb_model)
 98 |     script = script.replace('SB_LR', sb_lr)
 99 |     script = script.replace('SB_EPOCHS', sb_epoch)  # must match the full placeholder, or a stray "S" is left behind
100 |     script = script.replace('SB_BATCH_SIZE', sb_batch_size)
101 |     script = script.replace('SB_FROM_IMAGENET', sb_from_imagenet)
102 |     script = script.replace('SB_ROTATION', sb_rotation)
103 |     script = script.replace('SB_EXPNAME', sb_expname)
104 | 
105 |     fname = f'{sb_jobname}{"_local" if gcp else ""}.sh'
106 |     with open(f'training_scripts/{fname}', 'w') as f:
107 |         f.write(script)
108 | 
109 | if __name__ == '__main__':
110 | 
111 |     GCP = False
112 | 
113 |     os.makedirs('training_scripts', exist_ok=True)
114 |     # densenet121: 32
115 |     # resnet50: 32
116 |     # resnet18: 128
117 | 
118 |     BATCH_SIZE_MAP = {
119 |         'resnet18': 24,
120 |         'resnet50': 24,
121 |         'densenet121': 24,
122 |     }
123 | 
124 |     LR_EPOCH_MAP = {
125 |         1e-5: 20,
126 |         1e-4: 20,
127 |         1e-2: 35
128 |     }
129 | 
130 |     # for model in ['densenet121', 'resnet18', 'resnet50']:
131 |     for model in ['resnet18']:
132 |         for imagenet in [True, False]:
133 |             for lr in [1e-5, 1e-4, 1e-2]:
134 |                 if not imagenet:
135 |                     actual_lr = lr * 5
136 |                 else:
137 |                     actual_lr = lr
138 | 
139 |                 gen_script(model, actual_lr, BATCH_SIZE_MAP[model], imagenet, LR_EPOCH_MAP[lr], gcp=GCP)
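
For orientation, the job and experiment names concatenate the model tag, an ImageNet flag, the short learning-rate code, and the batch size. For example (values chosen to match the checked-in r8w1n416.sh further below):

# gen_script('resnet18', 1e-4, 16, imagenet=True, epoch=20, gcp=False)
#   -> sb_jobname = 'r8' + 'w' + '1n4' + '16' = 'r8w1n416'
#   -> writes training_scripts/r8w1n416.sh with an exp-name like 'r8w1n416_20200911h13'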
--------------------------------------------------------------------------------
/moco_pretraining/scripts/parse_log.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | LOG_RE = re.compile(r'Epoch: \[([0-9]+)\](\[[0-9]+\\[0-9]+\])(\s(\w+)(\[0-9]+\.[0-9]+)\s\(([0-9]+\.[0-9]+)\))')
 4 | 
 5 | # Epoch:\s+\[([0-9]+)\]\[([0-9]+)\/([0-9]+)\](\s+([a-zA-z@]+)\s+(-?[\d.]+(?:e-?\d+)?)\s+\([\s]*(-?[\d.]+(?:e-?\d+)?)\))+
 6 | # \s+([a-zA-z@]+)\s+(-?[\d.]+(?:e-?\d+)?)\s+\([\s]*(-?[\d.]+(?:e-?\d+)?)\)
 7 | # Epoch: [125][1420/1569] Time  0.836 ( 1.043)    Data  0.000 ( 0.276)    Loss 5.7912e+00 (5.8516e+00)    Acc@1 100.00 ( 92.73)   Acc@5 100.00 ( 97.18)
 8 | 
 9 | def analyze_log():
10 |     pass  # TODO: parsing loop not implemented; see the sketch below
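
A minimal sketch of what analyze_log could look like, matching log lines shaped like the sample comment above (the per-metric pattern and the return shape are assumptions, not from the source):

import re

EPOCH_RE = re.compile(r'Epoch:\s+\[(\d+)\]\[(\d+)/(\d+)\]')
METRIC_RE = re.compile(r'([A-Za-z@15]+)\s+(-?[\d.]+(?:e[+-]?\d+)?)\s+\(\s*(-?[\d.]+(?:e[+-]?\d+)?)\)')

def analyze_log(path):
    rows = []
    with open(path) as f:
        for line in f:
            head = EPOCH_RE.search(line)
            if not head:
                continue
            # keep the running averages (the parenthesized numbers)
            metrics = {name: float(avg) for name, _val, avg in METRIC_RE.findall(line)}
            rows.append((int(head.group(1)), metrics))
    return rows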
--------------------------------------------------------------------------------
/moco_pretraining/scripts/resize.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Convert all images in $1 to $2 (500x500; the ! flag ignores aspect ratio)
 3 | 
 4 | mkdir -p "$2"
 5 | 
 6 | for filename in "$1"/*; do
 7 |     # echo $filename
 8 |     convert "$filename" -resize 500x500! "$2/$(basename "$filename")"
 9 | done
--------------------------------------------------------------------------------
/moco_pretraining/scripts/shenzhen_mutiple_split.py:
--------------------------------------------------------------------------------
  1 | '''File created to reorganize the Montgomery and Shenzhen datasets to fit
  2 | the torchvision.ImageFolder class
  3 | '''
  4 | 
  5 | from collections import defaultdict
  6 | import copy
  7 | import os
  8 | import pprint as pp
  9 | import random
 10 | import re
 11 | import shutil
 12 | import sys
 13 | 
 14 | import pandas as pd
 15 | from pathlib import Path
 16 | from tqdm import tqdm
 17 | 
 18 | # semi-supervised ratios from 2^-7 to 2^-1
 19 | ALL_SEMI_RATIO = [0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5]
 20 | 
 21 | SEMI_ITERATIONS = { 0.0078125: 12,
 22 |                     0.015625: 10,
 23 |                     0.03125: 8,
 24 |                     0.0625: 8,
 25 |                     0.125: 4,
 26 |                     0.25: 4,
 27 |                     0.5: 4,
 28 |                     1: 1
 29 |                   }
 30 | 
 31 | TEST_RATIO = 0.20
 32 | VAL_RATIO = 0.15
 33 | TOTAL_TRAIN_RATIO = 1 - TEST_RATIO - VAL_RATIO
 34 | TOTAL = 662
 35 | 
 36 | 
 37 | def verify_one_split(df, name, ratio=None):
 38 |     if df is None:
 39 |         return False
 40 | 
 41 |     total_len = len(df)
 42 | 
 43 |     if ratio is not None:
 44 |         if not (TOTAL * ratio > total_len * 0.95 and TOTAL * ratio < total_len * 1.05):
 45 |             print(f'Split {name} has incorrect number of items {total_len}')
 46 |             return False
 47 | 
 48 |     if 'Tuberculosis' not in df:
 49 |         return False
 50 | 
 51 |     # no_finding = len(df[df['No Finding'] == 0])
 52 |     tb = len(df[df['Tuberculosis'] == 1])
 53 |     no_tb = len(df[df['Tuberculosis'] == 0])
 54 | 
 55 |     if tb == 0 or no_tb == 0:
 56 |         print(f'Split {name} has a ratio of infinity, which is BAD')
 57 |         return False
 58 | 
 59 |     ratio = no_tb / tb
 60 | 
 61 |     if ratio > 0.9 and ratio < 1.2:
 62 |         return True
 63 |     else:
 64 |         print(f'Split {name} has a ratio of {ratio}, which is BAD')
 65 |         return False
 66 | 
 67 | def print_summary(df, name):
 68 |     total_len = len(df)
 69 |     no_finding = len(df[df['No Finding'] == 1])
 70 |     tb = len(df[df['Tuberculosis'] == 1])
 71 | 
 72 |     print(f'CSV: {name}, total: {total_len}, No Finding: {no_finding}, Tuberculosis: {tb}')
 73 | 
 74 | 
 75 | def perform_split(root_path, parsed):
 76 | 
 77 |     okay = False
 78 |     while not okay:
 79 |         val_rows = []
 80 |         test_rows = []
 81 |         train_rows = []
 82 | 
 83 |         try:
 84 |             for stuff in tqdm(parsed):
 85 |                 rnd = random.random()
 86 | 
 87 |                 if rnd < VAL_RATIO:
 88 |                     val_rows.append(stuff)
 89 |                 elif rnd < VAL_RATIO + TEST_RATIO:
 90 |                     test_rows.append(stuff)
 91 |                 else:
 92 |                     train_rows.append(stuff)
 93 | 
 94 |             val_df = pd.DataFrame(val_rows)
 95 |             assert verify_one_split(val_df, 'val', ratio=VAL_RATIO)
 96 |             val_df.to_csv(root_path / f'chexpert_like_val.csv')
 97 |             print_summary(val_df, 'validation')
 98 | 
 99 |             test_df = pd.DataFrame(test_rows)
100 |             assert verify_one_split(test_df, 'test', ratio=TEST_RATIO)
101 |             test_df.to_csv(root_path / f'chexpert_like_test.csv')
102 |             print_summary(test_df, 'test')
103 | 
104 |             okay = True
105 |         except AssertionError:
106 |             pass
107 | 
108 |     ratios = ALL_SEMI_RATIO + [1]
109 |     for s in ratios:
110 |         for it in range(SEMI_ITERATIONS[s]):
111 | 
112 |             df = None
113 |             name = f'{s}_{it}'
114 |             while not verify_one_split(df, name):
115 |                 items = []
116 |                 for item in train_rows:
117 |                     rnd = random.random()
118 |                     if rnd < s:
119 |                         items.append(item)
120 | 
121 |                 df = pd.DataFrame(items)
122 |                 verify_one_split(df, name, s * TOTAL_TRAIN_RATIO)
123 | 
124 |             df.to_csv(root_path / f'chexpert_like_{name}.csv')
125 |             print_summary(df, name)
126 | 
127 | 
128 | def convert_shenzhen(root_folder):
129 | 
130 |     RE_SEX_AGE = re.compile(r'(?P<sex>.*al)[e]?[\s|,]*(?P<age>[0-9]+)[yr]?[s]?')
131 |     RE_FNAME = re.compile(r'CHNCXR\_(?P<idx>[0-9]+)\_(?P<lbl>[0|1])\.txt')
132 | 
133 |     root_path = Path(root_folder)
134 | 
135 |     key_words = ['upper', 'lower', 'left', 'right', 'bilateral', 'atb', 'ptb', 'stb']
136 | 
137 |     # readings = {'healthy': [], 'disease': []}
138 |     parsed = []
139 |     for i, f in tqdm(enumerate(os.listdir(root_path / 'ClinicalReadings'))):
140 | 
141 |         f_result = RE_FNAME.search(f)
142 |         pid = f_result.groupdict()['idx']
143 |         lbl = f_result.groupdict()['lbl']
144 | 
145 |         data = {
146 |             'Study': None,
147 |             'Age': None,
148 |             'Sex': None,
149 |             'No Finding': None,
150 |             'Tuberculosis': None,
151 |             'Path': None
152 |         }
153 | 
154 |         disease = None
155 |         with open(root_path / 'ClinicalReadings' / f, 'r') as txt:
156 |             lines = txt.readlines()
157 | 
158 |             # if len(lines) > 3:
159 |             #     import pdb; pdb.set_trace()
160 | 
161 |             for l in lines:
162 |                 result = RE_SEX_AGE.search(l)
163 | 
164 |                 if result:
165 |                     age = int(result.groupdict()['age'])
166 |                     sex = result.groupdict()['sex'].lower()
167 | 
168 |                     data['Age'] = age
169 |                     data['Sex'] = sex
170 |                 else:
171 |                     l = l.strip().lower()
172 | 
173 |                     if len(l) > 0:
174 |                         if 'normal' in l:
175 |                             assert lbl == '0'
176 |                             disease = False
177 |                         else:
178 |                             if lbl != '1':
179 |                                 import pdb; pdb.set_trace()
180 | 
181 |                             disease = False
182 |                             for k in key_words:
183 |                                 if k in l:
184 |                                     disease = True
185 | 
186 |                             if 'pleuritis' in l:
187 |                                 disease = True
188 | 
189 |         assert disease is not None
190 | 
191 |         if disease:
192 |             data['No Finding'] = 0
193 |             data['Tuberculosis'] = 1
194 |         else:
195 |             data['No Finding'] = 1
196 |             data['Tuberculosis'] = 0
197 | 
198 | 
199 |         fname = root_path / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1' / 'view1_frontal.jpg'
200 |         study = Path('shenzhen') / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1'
201 |         data['Study'] = study
202 |         data['Path'] = fname
203 | 
204 |         parsed.append(data)
205 | 
206 |     perform_split(root_path, parsed)
207 | 
208 | if __name__ == '__main__':
209 |     # Usage:
210 |     #     python shenzhen_mutiple_split.py moco/shenzhen
211 |     # Try 17, 28, 20
212 | 
213 |     convert_shenzhen(sys.argv[1])
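
Given SEMI_ITERATIONS, the script writes one training CSV per (ratio, draw) pair on top of the val/test splits; a quick way to list the expected file names:

for s, n_draws in SEMI_ITERATIONS.items():
    for it in range(n_draws):
        print(f'chexpert_like_{s}_{it}.csv')
# e.g. chexpert_like_0.0078125_0.csv ... chexpert_like_0.0078125_11.csv,
#      chexpert_like_0.5_0.csv ... chexpert_like_0.5_3.csv, chexpert_like_1_0.csv,
# plus chexpert_like_val.csv and chexpert_like_test.csv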
--------------------------------------------------------------------------------
/moco_pretraining/scripts/split_into_train_val.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | import random
 5 | import pandas as pd
 6 | 
 7 | random.seed(2020)
 8 | 
 9 | TRAIN_RATIO = 0.7
10 | 
11 | 
12 | def split_folder(source_folder, target_train, target_val):
13 | 
14 |     os.makedirs(target_train, exist_ok=True)
15 |     os.makedirs(target_val, exist_ok=True)
16 | 
17 |     for label in os.listdir(source_folder):
18 |         os.makedirs(os.path.join(target_train, label), exist_ok=True)
19 |         os.makedirs(os.path.join(target_val, label), exist_ok=True)
20 | 
21 |     allocation = []
22 |     for label in os.listdir(source_folder):
23 |         if os.path.isfile(os.path.join(source_folder, label)):
24 |             continue
25 |         for fname in os.listdir(os.path.join(source_folder, label)):
26 | 
27 |             source = os.path.join(source_folder, label, fname)
28 |             train_path = os.path.join(target_train, label, fname)
29 |             val_path = os.path.join(target_val, label, fname)
30 | 
31 |             if random.random() < TRAIN_RATIO:
32 |                 shutil.copy(source, train_path)
33 |                 # all_train[label].append(train_path)
34 |                 allocation.append({'orig_path': source, 'new_path': train_path,
35 |                                    'train': 1, 'val': 0})
36 |             else:
37 |                 shutil.copy(source, val_path)
38 |                 # all_val[label].append(val_path)
39 |                 allocation.append({'orig_path': source, 'new_path': val_path,
40 |                                    'train': 0, 'val': 1})
41 | 
42 |     df = pd.DataFrame(allocation)
43 |     df.to_csv(os.path.join(source_folder, 'assignment.csv'))
44 | 
45 | 
46 | if __name__ == '__main__':
47 | 
48 |     source = sys.argv[1]
49 |     train = sys.argv[2]
50 |     val = sys.argv[3]
51 | 
52 |     split_folder(source, train, val)
53 | 
54 | 
55 | 
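
The script assumes a torchvision.ImageFolder layout, one subfolder per class; an illustrative invocation and the layout it produces (paths are placeholders):

# python split_into_train_val.py shenzhen/all shenzhen/train shenzhen/val
#
# shenzhen/all/healthy/a.jpg   -> shenzhen/train/healthy/a.jpg  (p = 0.7)
# shenzhen/all/disease/b.jpg   -> shenzhen/val/disease/b.jpg    (p = 0.3)
# shenzhen/all/assignment.csv  <- records every file's destination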
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/r8w1n416.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=32000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:1
 9 | 
10 | #SBATCH --job-name="r8w1n416"
11 | #SBATCH --output=exp_logs/r8w1n416-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_moco.py -a resnet18 \
29 |     --lr 0.0001 --batch-size 16 \
30 |     --epochs 20 \
31 |     --world-size 1 --rank 0 \
32 |     --mlp --moco-t 0.2 --from-imagenet \
33 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
34 |     --aug-setting chexpert --rotate 10 --maintain-ratio \
35 |     --train_data /deep/group/data/moco/chexpert-proper-test/data/full_train \
36 |     --exp-name r8w1n416_20200911h13
37 | 
38 | # done
39 | echo "Done"
40 | 
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_lincls_template.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=64000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:2
 9 | 
10 | #SBATCH --job-name="REPLACE_JOB_NAME"
11 | #SBATCH --output=REPLACE_OUTPUT_PATH-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_lincls.py -a resnet18 --lr REPLACE_LR \
29 |     --batch-size 48 \
30 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
31 |     --pretrained REPLACE_CHECKPOINT \
32 |     --world-size 1 --rank 0 REPLACE_COS \
33 |     --train_data REPLACE_TRAIN \
34 |     --val_data REPLACE_VALID \
35 |     --test_data REPLACE_TEST \
36 |     --from-imagenet REPLACE_SEMI \
37 |     --binary \
38 |     --aug-setting chexpert --rotate --maintain-ratio \
39 |     --exp-name REPLACE_EXP_NAME
40 | 
41 | echo "Done"
42 | 
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_moco_lincls.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=8
 5 | #SBATCH --mem=120000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:4
 9 | 
10 | #SBATCH --job-name="moco-v1-lincls"
11 | #SBATCH --output=exp_logs/v1-lincls-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_lincls.py -a resnet50 --lr 30.0 --batch-size 256 \
29 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
30 |     --world-size 1 --rank 0 \
31 |     --train_data chexpert-v10-small-as-imagenet/data/actual_train \
32 |     --val_data chexpert-v10-small-as-imagenet/data/actual_valid \
33 |     --test_data chexpert-v10-small-as-imagenet/data/valid \
34 |     --from-imagenet \
35 |     --exp-name moco_v1_lincls
36 | # done
37 | echo "Done"
38 | 
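
For context on --pretrained: the upstream MoCo linear-evaluation script strips the momentum branch and the projection head from the saved checkpoint before loading, roughly as below (following the reference MoCo main_lincls.py; the checkpoint name is a placeholder, and this fork's version may differ in details):

import torch

checkpoint = torch.load('checkpoint_0019.pth.tar', map_location='cpu')
state_dict = checkpoint['state_dict']
for k in list(state_dict.keys()):
    # keep only the query encoder's backbone weights, renamed to plain keys
    if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
        state_dict[k[len('module.encoder_q.'):]] = state_dict[k]
    del state_dict[k]
# model.load_state_dict(state_dict, strict=False)  # only the new fc layer should be missing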
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_moco_train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=32000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:1
 9 | 
10 | #SBATCH --job-name="dense121"
11 | #SBATCH --output=exp_logs/dense121-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_moco.py -a densenet121 \
29 |     --lr 1e-4 --batch-size 16 \
30 |     --world-size 1 --rank 0 \
31 |     --mlp --moco-t 0.2 \
32 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
33 |     --from-imagenet \
34 |     --aug-setting chexpert --rotate --maintain-ratio \
35 |     --train_data data/full_train \
36 |     --exp-name dense121
37 | 
38 | # done
39 | echo "Done"
40 | 
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_moco_train_local.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=32000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:1
 9 | 
10 | #SBATCH --job-name="densenet121"
11 | #SBATCH --output=exp_logs/densenet121-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_moco.py -a densenet121 \
29 |     --lr 1e-4 --batch-size 16 \
30 |     --world-size 1 --rank 0 \
31 |     --mlp --moco-t 0.2 \
32 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
33 |     --aug-setting chexpert --rotate --maintain-ratio \
34 |     --train_data data/full_train \
35 |     --exp-name densenet121
36 | 
37 | # done
38 | echo "Done"
39 | 
--------------------------------------------------------------------------------