├── .gitignore ├── README.md ├── chexpert_supervised ├── .gitignore ├── README.md ├── chexpert-model │ ├── args │ │ ├── __init__.py │ │ ├── base_arg_parser.py │ │ ├── test_arg_parser.py │ │ └── train_arg_parser.py │ ├── bash_scripts │ │ ├── finetune_normal.sh │ │ ├── finetune_normal2.sh │ │ ├── train_chexpert_models.sh │ │ ├── train_intermountain_models.sh │ │ ├── train_synthetic.sh │ │ └── valid_ignore.sh │ ├── calibrate.py │ ├── calibration_params.json │ ├── cams │ │ ├── __init__.py │ │ ├── base_cam.py │ │ ├── ensemble_cam.py │ │ ├── grad_cam.py │ │ ├── guided_backprop.py │ │ └── model_cam_configs.json │ ├── confidence_interval.py │ ├── confidence_interval_diff.py │ ├── constants │ │ ├── __init__.py │ │ └── constants.py │ ├── data │ │ ├── __init__.py │ │ ├── base_dataset.py │ │ ├── chexpert_dataset.py │ │ ├── custom_dataset.py │ │ ├── loader.py │ │ ├── pad_collate.py │ │ └── task_sequences.json │ ├── dataset │ │ ├── __init__.py │ │ ├── base_dataset.py │ │ ├── ckpts │ │ │ └── debugging │ │ │ │ └── args.json │ │ ├── concat_dataset.py │ │ ├── constants.py │ │ ├── get_loader.py │ │ ├── label_mapper.py │ │ ├── nih_dataset.py │ │ ├── pad_collate.py │ │ ├── predict_config.json │ │ ├── su_dataset.py │ │ ├── task_sequences.json │ │ └── transforms │ │ │ ├── __init__.py │ │ │ └── clahe.py │ ├── eval │ │ ├── __init__.py │ │ ├── average_meter.py │ │ ├── below_curve_counter.py │ │ ├── evaluator.py │ │ └── loss.py │ ├── logger │ │ ├── __init__.py │ │ └── logger.py │ ├── models │ │ ├── __init__.py │ │ ├── calibrate.py │ │ └── models.py │ ├── optim │ │ ├── __init__.py │ │ └── optimizer.py │ ├── predict │ │ ├── __init__.py │ │ ├── configs │ │ │ ├── final.json │ │ │ └── toy.json │ │ ├── ensemble_predict.py │ │ └── predict.py │ ├── saver │ │ ├── __init__.py │ │ └── model_saver.py │ ├── sbatch │ │ ├── gen_sbatch.py │ │ └── job_management.py │ ├── scripts │ │ ├── get_cams.py │ │ ├── get_model_size.py │ │ └── map_uncertain.py │ ├── select_ensemble.py │ ├── test.py │ ├── test_images.py │ ├── test_one.py │ ├── timeout_test.py │ ├── train.py │ └── util │ │ ├── __init__.py │ │ ├── cuda_util.py │ │ ├── image_util.py │ │ ├── io_util.py │ │ ├── label_util.py │ │ └── model_util.py └── environment.yml ├── image_source ├── contrastive_learning.PNG ├── cx_all_full_ci.PNG ├── cx_all_last_ci.PNG └── moco_flowchart_new.PNG └── moco_pretraining ├── moco ├── LICENSE ├── aihc_utils │ ├── __init__.py │ ├── image_transform.py │ └── storage_util.py ├── detection │ ├── README.md │ ├── configs │ │ ├── Base-RCNN-C4-BN.yaml │ │ ├── coco_R_50_C4_2x.yaml │ │ ├── coco_R_50_C4_2x_moco.yaml │ │ ├── pascal_voc_R_50_C4_24k.yaml │ │ └── pascal_voc_R_50_C4_24k_moco.yaml │ ├── convert-pretrain-to-detectron2.py │ └── train_net.py ├── main_lincls.py ├── main_moco.py ├── moco │ ├── __init__.py │ ├── builder.py │ └── loader.py └── training_tools │ ├── __init__.py │ ├── combiner.py │ ├── evaluator.py │ └── meters.py └── scripts ├── convert_to_chexpert.py ├── generate_moco_training_scripts.py ├── parse_log.py ├── reorganize_files.py ├── resize.sh ├── shenzhen_mutiple_split.py ├── split_into_train_val.py └── training_scripts ├── r8w1n416.sh ├── sbatch_lincls_template.sh ├── sbatch_moco_lincls.sh ├── sbatch_moco_train.sh └── sbatch_moco_train_local.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # AIHC specialized ignores 2 | __pycache__/ 3 | *.pyc 4 | *~ 5 | .DS_Store 6 | ._* 7 | *.jpg 8 | *.pth.tar 9 | *.ipynb_checkpoints 10 | logs/ 11 | 12 | ## Copied from 
https://github.com/github/gitignore/edit/master/Python.gitignore 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | cover/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | db.sqlite3 75 | db.sqlite3-journal 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | .pybuilder/ 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # IPython 95 | profile_default/ 96 | ipython_config.py 97 | 98 | # pyenv 99 | # For a library or package, you might want to ignore these files since the code is 100 | # intended to run in multiple environments; otherwise, check them in: 101 | # .python-version 102 | 103 | # pipenv 104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 107 | # install all needed dependencies. 108 | #Pipfile.lock 109 | 110 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# MoCo-CXR: MoCo Pretraining Improves Representations and Transferability of Chest X-Ray Models

This repository contains
* A modified version of [the MoCo implementation](https://github.com/facebookresearch/moco), adapted to accommodate the CheXpert dataset
* A modified version of the original implementation of [the CheXpert paper](https://arxiv.org/pdf/1901.07031.pdf)

A preprint of this work is available on [arXiv](https://arxiv.org/pdf/2010.05352.pdf)
```
@article{sowrirajanmoco,
  title={MoCo-CXR: MoCo Pretraining Improves Representation and Transferability of Chest X-ray Models},
  author={Sowrirajan, Hari and Yang, Jingbo and Ng, Andrew Y and Rajpurkar, Pranav}
}
```

This work has been presented at
* the ACM Conference on Health, Inference and Learning (CHIL 2020) workshop
* Medical Imaging with Deep Learning (MIDL 2021)

## Abstract
```
Contrastive learning is a form of self-supervision that can leverage unlabeled data to produce pretrained models. While contrastive learning has demonstrated promising results on natural image classification tasks, its application to medical imaging tasks like chest X-ray interpretation has been limited. In this work, we propose MoCo-CXR, an adaptation of the contrastive learning method Momentum Contrast (MoCo), to produce models with better representations and initializations for the detection of pathologies in chest X-rays. In detecting pleural effusion, we find that linear models trained on MoCo-CXR-pretrained representations outperform those without MoCo-CXR-pretrained representations, indicating that MoCo-CXR-pretrained representations are of higher quality. End-to-end fine-tuning experiments reveal that a model initialized via MoCo-CXR-pretraining outperforms its non-MoCo-CXR-pretrained counterpart. We find that MoCo-CXR-pretraining provides the most benefit with limited labeled training data. Finally, we demonstrate similar results on a target Tuberculosis dataset unseen during pretraining, indicating that MoCo-CXR-pretraining endows models with representations and transferability that can be applied across chest X-ray datasets and tasks.
```

## Methods

MoCo-CXR uses momentum contrast as an unsupervised training method. This method maximizes agreement between augmentations of the same image while increasing the distance to the momentum-weighted negative embeddings.

drawing

MoCo-CXR starts from ResNet-initialized weights, which are then trained in an unsupervised manner. Supervised learning is then performed on different label fractions for the CheXpert dataset and the Shenzhen dataset.
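For intuition, the sketch below condenses the momentum-contrast training step that `moco/builder.py` implements (the queue update, shuffled batch norm, and distributed details are omitted); the names here are illustrative, not this repository's API.

```python
import torch
import torch.nn.functional as F

def moco_step(encoder_q, encoder_k, queue, x_q, x_k, m=0.999, t=0.07):
    """One simplified momentum-contrast step; queue is a C x K buffer of negatives."""
    q = F.normalize(encoder_q(x_q), dim=1)                # queries: N x C
    with torch.no_grad():
        # Momentum update: the key encoder slowly trails the query encoder.
        for p_q, p_k in zip(encoder_q.parameters(), encoder_k.parameters()):
            p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)
        k = F.normalize(encoder_k(x_k), dim=1)            # keys: N x C
    l_pos = torch.einsum('nc,nc->n', q, k).unsqueeze(-1)  # agreement with the positive key
    l_neg = torch.einsum('nc,ck->nk', q, queue)           # similarity to queued negatives
    logits = torch.cat([l_pos, l_neg], dim=1) / t         # temperature-scaled logits
    labels = torch.zeros(logits.size(0), dtype=torch.long, device=q.device)
    return F.cross_entropy(logits, labels)                # InfoNCE: positive is class 0
```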
drawing


## Evaluation

Comparison of MoCo-CXR performance against the ResNet-initialized baseline when only the linear layers are fine-tuned:

drawing

Comparison of MoCo-CXR performance against the ResNet-initialized baseline when all layers are allowed to be fine-tuned:

drawing

## Checkpoints
* https://storage.googleapis.com/moco-cxr/mnn-00001.pth.tar
* https://storage.googleapis.com/moco-cxr/r8w-00001-v2.pth.tar (slightly different, but produces results similar to v1)
* https://storage.googleapis.com/moco-cxr/r8w-00001.pth.tar
* https://storage.googleapis.com/moco-cxr/r8w-0001.pth.tar
* https://storage.googleapis.com/moco-cxr/r8w-001.pth.tar
* https://storage.googleapis.com/moco-cxr/r5w-00001.pth.tar
* https://storage.googleapis.com/moco-cxr/d1w-00001.pth.tar

Note that these checkpoints follow MoCo's implementation. To re-use them in an ImageNet-like training process, you will have to "hack" the checkpoint weights in ways similar to our [model saver](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/chexpert_supervised/chexpert-model/saver/model_saver.py); a minimal sketch is given at the end of this section.

## Running the experiments

### Pre-Training
Note that the checkpoint names above encode the learning rate with the decimal point dropped; that is, 00001 means 0.0001 = 1e-4.
Our experiments were conducted on Stanford's SLURM cluster. For reference, the training script used is [here](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/training_scripts/r8w1n416.sh). Alternatively, if you are running it on a "vanilla" machine, you can reference [this script](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/training_scripts/sbatch_moco_train_local.sh). You could also reference [a generation script](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/generate_moco_training_scripts.py) if you would like to generate commands for different learning rates and/or backbone models.

### MoCo-CXR Training with CheXpert

We used splitting scripts like [this one](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/reorganize_files.py) to split the data into training and validation sets. These scripts also generate the various draws used to produce confidence intervals for the evaluation of our semi-supervised approach.

For the Shenzhen dataset, we used [this script](https://github.com/stanfordmlgroup/MoCo-CXR/blob/main/moco_pretraining/scripts/convert_to_chexpert.py) to convert the unpacked Shenzhen files into CheXpert's default format for easier experiment setup. Note that the actual CheXpert pipeline is a three-step process: training, model picking (selecting the best checkpoint), and evaluation. Each independent "draw" went through this process.
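As noted in the Checkpoints section, the MoCo-formatted weights must be renamed before a plain torchvision backbone can load them. The sketch below is illustrative only: it assumes, following the reference MoCo implementation (see `main_lincls.py` under `moco_pretraining/moco/`), that the query encoder is stored under `module.encoder_q.*`, and it assumes a ResNet-18 backbone for the `r8w` checkpoints.

```python
import torch
import torchvision.models as models

ckpt = torch.load('r8w-00001.pth.tar', map_location='cpu')
state_dict = ckpt['state_dict']
for key in list(state_dict.keys()):
    # Keep only the query encoder's backbone weights, renamed to plain
    # torchvision names; drop the momentum encoder, the queue, and the head.
    if key.startswith('module.encoder_q.') and not key.startswith('module.encoder_q.fc'):
        state_dict[key[len('module.encoder_q.'):]] = state_dict[key]
    del state_dict[key]

model = models.resnet18()
msg = model.load_state_dict(state_dict, strict=False)
print(msg.missing_keys)  # expect only the classifier head (fc.weight, fc.bias)
```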

## Additional Information

* [Shenzhen dataset](https://qims.amegroups.com/article/view/5132/6030)
* [CheXpert leaderboard](https://stanfordmlgroup.github.io/competitions/chexpert/)
* [CheXtransfer](https://www.chilconference.org/proceeding_P11.html)
* [CheXternal](https://www.chilconference.org/proceeding_P12.html)
* [VisualCheXbert](https://www.chilconference.org/proceeding_P10.html)
--------------------------------------------------------------------------------
/chexpert_supervised/.gitignore:
--------------------------------------------------------------------------------
__pycache__/
*.pyc
*~
.DS_Store
._*
*.jpg
*.pth.tar
*.ipynb_checkpoints
logs/
--------------------------------------------------------------------------------
/chexpert_supervised/README.md:
--------------------------------------------------------------------------------
# aihc-winter19-robustness
Repo for a project on the robustness of medical-image models.
Development branch for conaug-2020.

## Activate environment
The default environment to activate is defined in `CONAUG_ENV` (line 23) of chexpert-model/sbatch/conaug/sbatch_commands/envs.py.

Point it to your own virtual environment if needed, e.g.
```
source /deep/u/canliu/envs/aihc_chexpert/bin/activate
```

## chexpert-model
This directory is a fork of the original chexpert-model in our organization.

### Usage
#### Automation
##### See relevant code in chexpert-model/sbatch/conaug.

##### Generate sbatch scripts en masse:
1. Set up the finetuning config by modifying *chexpert-model/sbatch/conaug/configs/finetune.json*.
2. Specify the experiments to finetune by entering their names in *CKPT_LIST* in *chexpert-model/sbatch/conaug/script_generation.py*.
3. Generate the scripts:
```
python chexpert-model/sbatch/conaug/script_generation.py
```
with optional arguments:
```
--user_id: owner of the pretrained checkpoints. Default: the account running the script-generation code.
--epochs: number of epochs. Default: calculated automatically based on label fractions.
--cpu: CPUs per task. Default: 4.
--mem: CPU memory to request. Default: 32000.
--log_path: directory in which to log job status. Default: /sailhome//experiments.
```
Note that each group of finetuning experiments is associated with a unique timestamp.

##### Launch sbatch jobs:
1. Specify the jobs to run in the *CONFIG* dictionary in *chexpert-model/sbatch/conaug/job_management.py*.
2. Launch the jobs:
```
python chexpert-model/sbatch/conaug/job_management.py
```
with optional arguments:
```
--refresh: how frequently to refresh the screen (to print out current job status).
```

---------------
The following sections are informative but may be outdated.

#### Training
Single model (default train and val set):
```
python train.py --dataset chexpert --save_dir <save_dir> --experiment_name <experiment_name>
```
Ensemble model: training the ensemble consists of individually training 15 models. It may be convenient to use sbatch to train these models separately.
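The individually trained models are ensembled at prediction time through a JSON config passed to `--config_path` (see Testing below). The real schema lives in `predict/configs/final.json` (not shown in this excerpt); the following is a hypothetical minimal example inferred from the `CFG_*` keys in `constants/constants.py` (`task2models`, `aggregation_method`, `ckpt_path`, `is_3class`), with made-up paths:

```json
{
  "aggregation_method": "mean",
  "task2models": {
    "Pleural Effusion": [
      {"ckpt_path": "/path/to/run1/best.pth.tar", "is_3class": false},
      {"ckpt_path": "/path/to/run2/best.pth.tar", "is_3class": true}
    ]
  }
}
```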
Single model (custom train and val set): please use full paths to the images in the csvs if custom_dataset=True
```
python train.py --dataset custom --train_custom_csv <train_csv> --val_custom_csv <val_csv> --save_dir <save_dir> --experiment_name <experiment_name>
```
(please look at train_arg_parser for other flags such as gpu, number of epochs, etc.)


#### Testing
Single model (default test set):
```
python test.py --dataset chexpert --ckpt_path <ckpt_path> --phase {valid, test} --save_dir <save_dir>
```
Ensemble (default test set):
```
python test.py --dataset chexpert --config_path <config_path> --phase {valid, test} --save_dir <save_dir>
```
Single model (custom test set, separate test gt/paths): please use full paths to the images in the csvs if custom_dataset=True
```
python test.py --dataset custom --ckpt_path <ckpt_path> --phase {valid, test} --save_dir <save_dir> --test_groundtruth <gt_csv> --test_image_paths <paths_csv>
```
Single model (custom test set, test csv): please use full paths to the images in the csvs if custom_dataset=True
```
python test.py --dataset custom --ckpt_path <ckpt_path> --phase {valid, test} --save_dir <save_dir> --together True --test_csv <test_csv>
```

(please look at test_arg_parser for other flags such as save_cams)

A note on CAM generation:
`--save_cams True`: generate CAMs
`--only_competition_cams True`: only generate CAMs for the competition classes
CAMs will only be generated for classes whose groundtruth is 1.

### Reproduce CheXpert test results
`python test.py --dataset chexpert --config_path predict/configs/final.json --phase test --save_dir <save_dir>`

### Evaluating a pre-trained model
Some pre-trained models are available in `/deep/group/CheXpert/final_ckpts/`. We can try a 3-class model. Make a temporary folder `[temp]`, and do:
```
cp /deep/group/CheXpert/final_ckpts/CheXpert-3-class/best.pth.tar [temp]
cp /deep/group/CheXpert/final_ckpts/CheXpert-3-class/args.json [temp]
cd [repo]/chexpert-model/
python test.py --dataset chexpert --ckpt_path [temp]/best.pth.tar --phase {valid, test} --model_uncertainty True --save_dir <save_dir>
```
Regarding the structure of the `[temp]` folder, let `[phase]` be the phase selected previously. Then `[temp]/results/[phase]/scores.txt` contains a variety of metrics tabulated by the `Evaluator`. On the branch `mark_model_analysis`, `test.py` also saves `groundtruth.csv` and `predictions.csv` to `[temp]/results/[phase]/`.
--------------------------------------------------------------------------------
/chexpert_supervised/chexpert-model/args/__init__.py:
--------------------------------------------------------------------------------
from .test_arg_parser import TestArgParser
from .train_arg_parser import TrainArgParser
--------------------------------------------------------------------------------
/chexpert_supervised/chexpert-model/args/test_arg_parser.py:
--------------------------------------------------------------------------------
"""Define class for processing testing command-line arguments."""
import util

from .base_arg_parser import BaseArgParser


class TestArgParser(BaseArgParser):
    """Argument parser for args used only in test mode."""
    def __init__(self):
        super(TestArgParser, self).__init__()
        self.is_training = False

        self.parser.add_argument('--inference_only',
                                 action='store_true',
                                 help=('If set, then only do inference. 
Useful'+ 16 | ' when the csv has uncertainty label')) 17 | # Data args 18 | self.parser.add_argument('--phase', 19 | dest='data_args.phase', 20 | type=str, default='valid', 21 | choices=('train', 'valid', 'test')) 22 | self.parser.add_argument('--test_groundtruth', 23 | dest='data_args.gt_csv', 24 | type=str, default=None, 25 | help=('csv file if custom dataset')) 26 | self.parser.add_argument('--test_image_paths', 27 | dest='data_args.paths_csv', 28 | type=str, default=None, 29 | help=('csv file if custom dataset')) 30 | self.parser.add_argument('--together', 31 | dest='data_args.together', 32 | type=str, default=True, 33 | help=('whether we have integrated test csv')) 34 | self.parser.add_argument('--test_csv', 35 | dest='data_args.test_csv', 36 | type=str, default=None, 37 | help=('csv file for integrated test set')) 38 | # Logger args 39 | self.parser.add_argument('--save_cams', 40 | dest='logger_args.save_cams', 41 | type=util.str_to_bool, default=False, 42 | help=('If true, will save cams to ' + 43 | 'experiment_folder/cams')) 44 | self.parser.add_argument('--only_evaluation_cams', 45 | dest='logger_args.only_evaluation_cams', 46 | type=util.str_to_bool, default=True, 47 | help=('If true, will only generate cams ' + 48 | 'on evaluation labels. Only ' + 49 | 'relevant if --save_cams is True')) 50 | self.parser.add_argument('--only_competition_cams', 51 | dest='logger_args.only_competition_cams', 52 | type=util.str_to_bool, default=False, 53 | help='Whether to only output cams for' + 54 | 'competition categories.') 55 | 56 | # Model args 57 | self.parser.add_argument('--config_path', 58 | dest='model_args.config_path', 59 | type=str, default=None) 60 | self.parser.add_argument('--calibrate', 61 | dest='model_args.calibrate', 62 | type=util.str_to_bool, default=False, 63 | help='Compute calibrated probabilities.') 64 | 65 | # TODO: Somehow need this line 66 | self.parser.add_argument('--moco', dest='model_args.moco', 67 | type=util.str_to_bool, default=True, 68 | help='Using moco') -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/finetune_normal.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=deep --qos=normal 3 | #SBATCH --time=06:00:00 4 | #SBATCH --nodes=1 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=32G 7 | 8 | # only use the following on partition with GPUs 9 | #SBATCH --gres=gpu:1 10 | 11 | #SBATCH --job-name="finetune" 12 | #SBATCH --output=finetune-%j.out 13 | 14 | echo "Running finetune on uignore" 15 | 16 | python ../train.py --ckpt_path /deep/group/chexperturbed/runs/2019-04-18-22.17.36.095031__minhphu/DenseNet121_320_1e-04_uncertainty_ignored_top10/best.pth.tar \ 17 | --dataset custom \ 18 | --train_custom_csv /deep/group/chexperturbed/data/natural/Nokiadev10K_and_NokiaNORMALS507_noflux.csv \ 19 | --val_custom_csv /deep/group/chexperturbed/data/CheXpert-original/prosp500_all.csv \ 20 | --save_dir /deep/group/minhphu/dump \ 21 | --experiment_name finetune_uignore \ 22 | --batch_size 48 \ 23 | --iters_per_print 48 \ 24 | --iters_per_visual 48000 \ 25 | --iters_per_eval=4800 \ 26 | --iters_per_save=4800 \ 27 | --gpu_ids 0 \ 28 | --num_epochs=3 \ 29 | --metric_name chexpert-competition-AUROC \ 30 | --maximize_metric True \ 31 | --scale 320 \ 32 | --max_ckpts 10 \ 33 | --keep_topk True 34 | 35 | echo "Done!" 
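A note on `util.str_to_bool`, used above as the argparse `type` converter for flags such as `--save_cams`: its definition lives in the `util` package, which is outside this excerpt. A minimal sketch of the usual pattern such a converter follows:

```python
import argparse

def str_to_bool(arg):
    """Convert a command-line string to a bool (argparse `type` converter)."""
    if str(arg).lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if str(arg).lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
```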
36 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/finetune_normal2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=deep --qos=normal 3 | #SBATCH --time=60:00:00 4 | #SBATCH --nodes=4 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=64G 7 | 8 | # only use the following on partition with GPUs 9 | #SBATCH --gres=gpu:4 10 | 11 | #SBATCH --job-name="finetune" 12 | #SBATCH --output=finetune-%j.out 13 | 14 | echo "Running finetune on uignore" 15 | 16 | SAVE_DIR="/deep/group/chexperturbed/runs/2019-04-23-21.26.43.341224__minhphu" 17 | 18 | python ../train.py --ckpt_path /deep/group/chexperturbed/runs/2019-04-18-22.17.36.095031__minhphu/DenseNet121_320_1e-04_uncertainty_ignored_top10/best.pth.tar \ 19 | --dataset custom \ 20 | --train_custom_csv /deep/group/chexperturbed/data/natural/Nokiadev10K_and_NokiaNORMALS507_noflux.csv \ 21 | --val_custom_csv /deep/group/chexperturbed/data/CheXpert-original/prosp500_all.csv \ 22 | --save_dir $SAVE_DIR \ 23 | --experiment_name finetune_uignore \ 24 | --batch_size 48 \ 25 | --iters_per_print 48 \ 26 | --iters_per_visual 48000 \ 27 | --iters_per_eval=4800 \ 28 | --iters_per_save=4800 \ 29 | --gpu_ids 0 \ 30 | --num_epochs=3 \ 31 | --metric_name chexpert-competition-AUROC \ 32 | --maximize_metric True \ 33 | --scale 320 \ 34 | --max_ckpts 10 \ 35 | --keep_topk True 36 | 37 | echo "Done!" 38 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/train_chexpert_models.sh: -------------------------------------------------------------------------------- 1 | # 3-class 2 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_3-class_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --model_uncertainty=True && \ 3 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_3-class_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --model_uncertainty=True && \ 4 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_3-class_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --model_uncertainty=True 5 | 6 | # Ignore 7 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True && \ 8 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 
--iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True && \ 9 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True 10 | 11 | # Self-Train 12 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_self-train_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_self-train.csv && \ 13 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_self-train_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_self-train.csv && \ 14 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_self-train_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_self-train.csv 15 | 16 | # Ones 17 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ones_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_ones.csv && \ 18 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ones_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_ones.csv && \ 19 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_ones_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_ones.csv 20 | 21 | # Zeros 22 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 
--iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_zeros_top10 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_zeros.csv && \ 23 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_zeros_top10_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_zeros.csv && \ 24 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name DenseNet121_320_1e-04_uncertainty_zeros_top10_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True --uncertain_map_path=uncertainty_zeros.csv -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/train_intermountain_models.sh: -------------------------------------------------------------------------------- 1 | python train.py --dataset chexpert --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name DenseNet121_320_1e-04_uncertainty_ignore_top10 --num_epochs=3 --metric_name chexpert-competition-avg-AUROC --maximize_metric True --scale 320 --save_dir /deep/group/CheXpert/final_ckpts --max_ckpts 10 --keep_topk True 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/train_synthetic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=deep --qos=normal 3 | #SBATCH --time=60:00:00 4 | #SBATCH --nodes=4 5 | #SBATCH --cpus-per-task=4 6 | #SBATCH --mem=64G 7 | 8 | # only use the following on partition with GPUs 9 | #SBATCH --gres=gpu:4 10 | 11 | #SBATCH --job-name="train_synthetic" 12 | #SBATCH --output=train_synthetic-%j.out 13 | 14 | SAVE_DIR='/deep/group/chexperturbed/runs/2019-04-25-00.37.28.808433__minhphu' 15 | TRAIN_CSV='/deep/group/chexperturbed/data/CheXpert/synthetic_final/random/level_5/train_with_normal.csv' 16 | VALID_CSV='/deep/group/chexperturbed/data/CheXpert-original/prosp500_all.csv' 17 | # Ignore 18 | echo "Running Uignore..." 
19 | IGNORE_NAME='Uone' 20 | python ../train.py --dataset custom --train_custom_csv $TRAIN_CSV --val_custom_csv $VALID_CSV --save_dir $SAVE_DIR --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name ${IGNORE_NAME}_1 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --max_ckpts 10 --keep_topk True && \ 21 | python ../train.py --dataset custom --train_custom_csv $TRAIN_CSV --val_custom_csv $VALID_CSV --save_dir $SAVE_DIR --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2 --experiment_name ${IGNORE_NAME}_2 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --max_ckpts 10 --keep_topk True && \ 22 | python ../train.py --dataset custom --train_custom_csv $TRAIN_CSV --val_custom_csv $VALID_CSV --save_dir $SAVE_DIR --batch_size 48 --iters_per_print 48 --iters_per_visual 48000 --iters_per_eval=4800 --iters_per_save=4800 --gpu_ids 0,1,2,3 --experiment_name ${IGNORE_NAME}_3 --num_epochs=3 --metric_name chexpert-competition-AUROC --maximize_metric True --scale 320 --max_ckpts 10 --keep_topk True 23 | 24 | 25 | # Uone 26 | # TODO 27 | 28 | # Uzero 29 | # TODO 30 | 31 | # Self-train 32 | # TODO 33 | 34 | # 3class 35 | # TODO 36 | 37 | echo "Done!" 38 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/bash_scripts/valid_ignore.sh: -------------------------------------------------------------------------------- 1 | 2 | USER='minhphu' 3 | ROOT=/deep/group/${USER} 4 | TEMP=${ROOT}/dump 5 | 6 | cp /deep/group/CheXpert/final_ckpts/CheXpert-Ignore/best.pth.tar $TEMP 7 | cp /deep/group/CheXpert/final_ckpts/CheXpert-Ignore/args.json $TEMP 8 | cd ${ROOT}/aihc-winter19-robustness/chexpert-model/ 9 | python test.py --inference_only True \ 10 | --dataset custom \ 11 | --together True \ 12 | --test_csv /deep/group/chexperturbed/data/toy_of_CheXpert/train.csv \ 13 | --ckpt_path ${TEMP}/best.pth.tar \ 14 | --phase test \ 15 | --save_dir $TEMP \ 16 | 17 | 18 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/calibrate.py: -------------------------------------------------------------------------------- 1 | """Entry-point script to train models.""" 2 | import torch 3 | 4 | from args import TestArgParser 5 | from logger import Logger 6 | from predict import Predictor, EnsemblePredictor 7 | from saver import ModelSaver 8 | from data import get_loader 9 | from eval import Evaluator 10 | from constants import * 11 | 12 | 13 | def calibrate(args): 14 | """Run model testing.""" 15 | model_args = args.model_args 16 | data_args = args.data_args 17 | logger_args = args.logger_args 18 | 19 | # Get logger. 20 | logger = Logger(logger_args.log_path, 21 | logger_args.save_dir, 22 | logger_args.results_dir) 23 | 24 | # Get image paths corresponding to predictions for logging 25 | paths = None 26 | 27 | if model_args.config_path is not None: 28 | # Instantiate the EnsemblePredictor class for obtaining 29 | # model predictions. 30 | predictor = EnsemblePredictor(config_path=model_args.config_path, 31 | model_args=model_args, 32 | data_args=data_args, 33 | gpu_ids=args.gpu_ids, 34 | device=args.device, 35 | logger=logger) 36 | # Obtain ensemble predictions. 37 | # Caches both individual and ensemble predictions. 
38 | # We always turn off caching to ensure that we write the Path column. 39 | predictions, groundtruth, paths = predictor.predict(cache=False, 40 | return_paths=True, 41 | all_gt_tasks=True) 42 | else: 43 | # Load the model at ckpt_path. 44 | ckpt_path = model_args.ckpt_path 45 | ckpt_save_dir = Path(ckpt_path).parent 46 | model_uncertainty = model_args.model_uncertainty 47 | # Get model args from checkpoint and add them to 48 | # command-line specified model args. 49 | model_args, transform_args\ 50 | = ModelSaver.get_args(cl_model_args=model_args, 51 | dataset=data_args.dataset, 52 | ckpt_save_dir=ckpt_save_dir, 53 | model_uncertainty=model_uncertainty) 54 | model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path, 55 | gpu_ids=args.gpu_ids, 56 | model_args=model_args, 57 | is_training=False) 58 | # Instantiate the Predictor class for obtaining model predictions. 59 | predictor = Predictor(model=model, device=args.device) 60 | # Get phase loader object. 61 | return_info_dict = True 62 | loader = get_loader(phase=data_args.phase, 63 | data_args=data_args, 64 | transform_args=transform_args, 65 | is_training=False, 66 | return_info_dict=return_info_dict, 67 | logger=logger) 68 | # Obtain model predictions 69 | if return_info_dict: 70 | predictions, groundtruth, paths = predictor.predict(loader) 71 | else: 72 | predictions, groundtruth = predictor.predict(loader) 73 | #print(groundtruth) 74 | # custom function 75 | from sklearn.linear_model import LogisticRegression as LR 76 | params = [] 77 | for column in predictions: 78 | #print(predictions[column].values) 79 | #print(groundtruth[column].values) 80 | #drop corresponding rows where gt is -1 and 81 | lr = LR(C=15) 82 | to_drop = groundtruth.index[groundtruth[column] == -1].tolist() 83 | lr.fit(predictions[column].drop(to_drop).values.reshape(-1,1),groundtruth[column].drop(to_drop).values) # LR needs X to be 2-dimensional 84 | print("num_rows_used",predictions[column].drop(to_drop).values.size) 85 | #print(groundtruth[column].drop(to_drop).values.size) 86 | #print(predictions[column].values) 87 | print("coeffs", lr.coef_, lr.intercept_) 88 | p_calibrated=lr.predict_proba(predictions[column].values.reshape(-1,1)) 89 | params.append((lr.coef_, lr.intercept_)) 90 | import json 91 | with open('calibration_params.json', 'w') as f: 92 | import pandas as pd 93 | pd.Series(params).to_json(f, orient='values') 94 | 95 | #return lr 96 | 97 | if __name__ == "__main__": 98 | torch.multiprocessing.set_sharing_strategy('file_system') 99 | parser = TestArgParser() 100 | calibrate(parser.parse_args()) 101 | 102 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/calibration_params.json: -------------------------------------------------------------------------------- 1 | [[[[6.7817460615]],[-4.1531589153]],[[[1.6547258878]],[0.0203352903]],[[[9.6160614807]],[-1.5795825742]],[[[0.5744558881]],[-5.0300181584]],[[[9.7480215396]],[-3.7431941999]],[[[7.7600541565]],[-3.6481033461]],[[[15.1510516812]],[-2.8724927246]],[[[3.1205136268]],[-3.2489406983]],[[[12.4407352569]],[-2.3623841489]],[[[6.0697003679]],[-4.6369053622]],[[[6.4079661737]],[-3.1433334158]],[[[0.3846525627]],[-5.0128584793]],[[[0.1457574294]],[-4.4623561016]],[[[7.3639532052]],[-3.4352856906]]] -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/__init__.py: -------------------------------------------------------------------------------- 1 | from .grad_cam 
import GradCAM 2 | from .base_cam import BaseCAM 3 | from .ensemble_cam import EnsembleCAM 4 | from .guided_backprop import GuidedBackPropagation 5 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/base_cam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import util 4 | 5 | 6 | class BaseCAM(object): 7 | """Base class for generating CAMs. 8 | Adapted from: https://github.com/kazuto1011/grad-cam-pytorch 9 | """ 10 | def __init__(self, model, device): 11 | super(BaseCAM, self).__init__() 12 | pred_type = '_3class' if model.module.model_uncertainty else 'binary' 13 | self.device = device 14 | self.pred_type = pred_type 15 | self.model = model 16 | self.model.eval() 17 | self.inputs = None 18 | 19 | def _encode_one_hot(self, idx): 20 | one_hot = torch.zeros([1, self.preds.size()[-1]], 21 | dtype=torch.float32, device=self.device, requires_grad=True) 22 | 23 | if self.pred_type == '_3class': 24 | ind = 2 + idx * 3 # Get the index of positive class of the pathology. 25 | one_hot[0][ind] = 1.0 26 | else: 27 | one_hot[0][idx] = 1.0 28 | 29 | return one_hot 30 | 31 | def forward(self, x): 32 | self.inputs = x.to(self.device) 33 | self.model.zero_grad() 34 | self.preds = self.model(self.inputs) 35 | 36 | if self.pred_type == 'binary': 37 | self.probs = torch.sigmoid(self.preds)[0] 38 | elif self.pred_type == '_3class': 39 | self.probs = util.uncertain_logits_to_probs(self.preds)[0] 40 | else: 41 | self.probs = F.softmax(self.preds, dim=1)[0] 42 | return self.probs.detach().to('cpu').numpy() 43 | 44 | def backward(self, idx): 45 | one_hot = self._encode_one_hot(idx) 46 | self.preds.backward(gradient=one_hot, retain_graph=True) 47 | 48 | def get_cam(self, x, task_id, task=None): 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/ensemble_cam.py: -------------------------------------------------------------------------------- 1 | from .grad_cam import GradCAM 2 | 3 | import torch 4 | import numpy as np 5 | 6 | class EnsembleCAM(object): 7 | """Class for generating CAMs using an ensemble.""" 8 | def __init__(self, model, device): 9 | 10 | super(EnsembleCAM, self).__init__() 11 | 12 | self.device = device 13 | self.model = model 14 | 15 | def get_cam(self, x, task_id, task): 16 | 17 | ensemble_probs = [] 18 | cams = [] 19 | 20 | loaded_model_iterator = self.model.loaded_model_iterator(task) 21 | for loaded_model in loaded_model_iterator: 22 | grad_cam = GradCAM(loaded_model, self.device) 23 | probs = grad_cam.forward(x) 24 | 25 | grad_cam.backward(idx=task_id) 26 | 27 | cam = grad_cam.extract_cam()[0] 28 | 29 | ensemble_probs.append(probs) 30 | cams.append(cam) 31 | 32 | probs = self.model.aggregation_fn(ensemble_probs, axis=0) 33 | sorted_probs = np.sort(probs, axis=0)[::-1] 34 | idx = np.argsort(probs, axis=0)[::-1] 35 | 36 | cam = self.model.aggregation_fn(cams, axis=0) 37 | 38 | return sorted_probs, idx, cam -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/grad_cam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import torch 4 | import torch.nn.functional as F 5 | 6 | from collections import OrderedDict 7 | from .base_cam import BaseCAM 8 | 9 | 10 | # Load the dictionary of model 
configs 11 | # that for each model has the name of 12 | # the last layer before the GAP 13 | with open('cams/model_cam_configs.json') as f: 14 | MODEL_CONFIGS = json.load(f) 15 | 16 | 17 | class GradCAM(BaseCAM): 18 | """Class for generating grad CAMs. 19 | Adapted from: https://github.com/kazuto1011/grad-cam-pytorch 20 | """ 21 | def __init__(self, model, device): 22 | 23 | super(GradCAM, self).__init__(model, device) 24 | self.fmaps = OrderedDict() 25 | self.grads = OrderedDict() 26 | self.target_layer = MODEL_CONFIGS[model.module.__class__.__name__]['target_layer'] 27 | 28 | def save_fmap(m, _, output): 29 | self.fmaps[id(m)] = output.to('cpu') 30 | 31 | def save_grad(m, _, grad_out): 32 | self.grads[id(m)] = grad_out[0].to('cpu') 33 | 34 | for name, module in self.model.named_modules(): 35 | # Only put hooks on the target layer 36 | if name == self.target_layer: 37 | self.target_module_id = id(module) 38 | module.register_forward_hook(save_fmap) 39 | module.register_backward_hook(save_grad) 40 | 41 | def _find(self, outputs): 42 | 43 | # Since we've only put hooks on one layer 44 | # the target layer, we can return the value 45 | # right away 46 | return outputs[self.target_module_id] 47 | 48 | @staticmethod 49 | def _normalize(grads): 50 | return grads / (torch.norm(grads).item() + 1e-5) 51 | 52 | def _compute_grad_weights(self, grads): 53 | grads = self._normalize(grads) 54 | weights = F.adaptive_avg_pool2d(grads, 1) 55 | return weights 56 | 57 | def extract_cam(self): 58 | """ 59 | c: number of filters in final conv layer 60 | f: filter size 61 | shape of fmaps and grads : num_images x c x f x f 62 | shape of weights: num_images x c x 1 x 1 63 | shape of gcam: num_images x f x f 64 | """ 65 | 66 | fmaps = self._find(self.fmaps) 67 | grads = self._find(self.grads) 68 | weights = self._compute_grad_weights(grads) 69 | 70 | assert len(fmaps.size()) == 4 and fmaps.size()[0] == 1 71 | 72 | 73 | assert len(weights.size()) == 4 and weights.size()[0] == 1 74 | 75 | # Sum up along the filter dimension 76 | gcam = (fmaps * weights).sum(dim=1) 77 | 78 | gcam = torch.clamp(gcam, min=0, max=float('inf')) 79 | 80 | gcam -= gcam.min() 81 | gcam /= (gcam.max() + 1e-7) 82 | 83 | return gcam.detach().to('cpu').numpy() 84 | 85 | 86 | def get_cam(self, x, task_id, task=None): 87 | 88 | probs = self.forward(x) 89 | sorted_probs = np.sort(probs, axis=0)[::-1] 90 | idx = np.argsort(probs, axis=0)[::-1] 91 | self.backward(idx=task_id) 92 | cam = self.extract_cam()[0] 93 | 94 | return sorted_probs, idx, cam 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/guided_backprop.py: -------------------------------------------------------------------------------- 1 | from cams import BaseCAM 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | class GuidedBackPropagation(BaseCAM): 7 | 8 | def __init__(self, model, device, is_binary, is_3d): 9 | super(GuidedBackPropagation, self).__init__(model, device, is_binary, is_3d) 10 | self.input_grad = [] 11 | def func_b(module, grad_in, grad_out): 12 | # Cut off negative gradients 13 | if isinstance(module, nn.ReLU): 14 | return (torch.clamp(grad_in[0], min=0.0),) 15 | 16 | for module in self.model.named_modules(): 17 | module[1].register_backward_hook(func_b) 18 | 19 | 20 | def generate(self): 21 | output = self.input_grad.to('cpu').numpy()[0] 22 | return output 23 | 24 | def forward(self, x): 25 | self.inputs = x.to(self.device) 26 | 27 | def 
save_grad(grad): 28 | self.input_grad = grad.to('cpu') 29 | 30 | self.inputs.register_hook(save_grad) 31 | self.model.zero_grad() 32 | self.preds = self.model(self.inputs) 33 | 34 | if self.is_binary: 35 | self.probs = torch.sigmoid(self.preds)[0] 36 | else: 37 | self.probs = F.softmax(self.preds, dim=1)[0] 38 | self.prob, self.idx = self.probs.sort(0, True) 39 | 40 | return self.prob, self.idx 41 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/cams/model_cam_configs.json: -------------------------------------------------------------------------------- 1 | { 2 | "DenseNet121": { 3 | "target_layer": "module.model.features" 4 | }, 5 | "ResNet152": { 6 | "target_layer": "module.model.layer4.2.conv3" 7 | }, 8 | "Inceptionv4": { 9 | "target_layer": "module.model.features.21.branch3.1.conv" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/confidence_interval.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | import numpy as np 4 | import pandas as pd 5 | import pathlib 6 | import sklearn.metrics 7 | import sys 8 | 9 | import argparse 10 | 11 | from constants import NamedTasks 12 | 13 | class ConfidenceGenerator(): 14 | # Confidence level is 0.95, then we do 1 - confidence level to get 0.05 15 | def __init__(self, confidence_level): 16 | self.records = [] 17 | self.confidence_level = 1 - confidence_level 18 | 19 | @staticmethod 20 | def compute_cis(series, confidence_level): 21 | sorted_perfs = series.sort_values() 22 | lower_index = int(confidence_level/2 * len(sorted_perfs)) - 1 23 | upper_index = int((1 - confidence_level/2) * len(sorted_perfs)) - 1 24 | lower = sorted_perfs.iloc[lower_index].round(3) 25 | upper = sorted_perfs.iloc[upper_index].round(3) 26 | mean = sorted_perfs.mean().round(3) 27 | return lower, mean, upper 28 | 29 | def create_ci_record(self, perfs, name): 30 | lower, mean, upper = ConfidenceGenerator.compute_cis( 31 | perfs, self.confidence_level) 32 | record = {"name": name, 33 | "lower": lower, 34 | "mean": mean, 35 | "upper": upper, 36 | } 37 | self.records.append(record) 38 | 39 | def generate_cis(self, df): 40 | for diseases in df.columns: 41 | self.create_ci_record(df[diseases], diseases) 42 | 43 | df = pd.DataFrame.from_records(self.records) 44 | return df 45 | 46 | 47 | def confidence(bootstraps, output_path, confidence_level=0.95): 48 | cb = ConfidenceGenerator(confidence_level=confidence_level) 49 | df = cb.generate_cis(bootstraps) 50 | 51 | df.to_csv(output_path, index=False) 52 | 53 | def single_replicate_performances(gt, pred, diseases, metric, num_replicates): 54 | sample_ids = np.random.choice(len(gt), size=len(gt), replace=True) 55 | replicate_performances = {} 56 | gt_replicate = gt.iloc[sample_ids] 57 | pred_replicate = pred.iloc[sample_ids] 58 | 59 | for col in diseases: 60 | performance = metric(gt_replicate[col], pred_replicate[col]) 61 | replicate_performances[col] = performance 62 | return replicate_performances 63 | 64 | def multi_replicate_performances(gt, all_preds, diseases, metric, num_replicates): 65 | sample_ids = np.random.choice(len(gt), size=len(gt), replace=True) 66 | replicate_performances = {d: [None for i in range(len(all_preds))] for d in diseases} 67 | gt_replicate = gt.iloc[sample_ids] 68 | 69 | for i, pred in enumerate(all_preds): 70 | pred_replicate = pred.iloc[sample_ids] 71 | 72 | for col in diseases: 
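            # Score model i's predictions for this disease on the resampled replicate.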
73 | performance = metric(gt_replicate[col], pred_replicate[col]) 74 | replicate_performances[col][i] = performance 75 | 76 | averaged_rep_perf = {d: np.mean(replicate_performances[d]) for d in diseases} 77 | return averaged_rep_perf 78 | 79 | 80 | def bootstrap_metric(gt, pred, all_preds, diseases, metric, num_replicates): 81 | 82 | all_performances = [] 83 | all_multi_performances = [] 84 | for _ in range(num_replicates): 85 | single_rep_performances = single_replicate_performances(gt, pred, diseases, metric, num_replicates) 86 | multi_rep_performances = multi_replicate_performances(gt, all_preds, diseases, metric, num_replicates) 87 | 88 | all_performances.append(copy.deepcopy(single_rep_performances)) 89 | all_multi_performances.append(copy.deepcopy(multi_rep_performances)) 90 | 91 | single_performances = pd.DataFrame.from_records(all_performances) 92 | multi_performances = pd.DataFrame.from_records(all_multi_performances) 93 | 94 | return single_performances, multi_performances 95 | 96 | 97 | def compute_bootstrap_confidence_interval(gt, pred, all_preds, 98 | diseases, metric, 99 | num_replicates, confidence_level, 100 | output_path): 101 | single_bootstrap, multi_bootstrap = bootstrap_metric(gt, pred, all_preds, 102 | diseases, metric, 103 | num_replicates) 104 | 105 | confidence(single_bootstrap, 106 | output_path, 107 | confidence_level=0.95) 108 | confidence(multi_bootstrap, 109 | output_path.replace('.csv', '_multi.csv'), 110 | confidence_level=0.95) 111 | 112 | 113 | 114 | if __name__ == '__main__': 115 | 116 | parser = argparse.ArgumentParser(description="Arguments for confidence_interval.py") 117 | parser.add_argument("--tasks", nargs='+', type=str) 118 | parser.add_argument("--custom_tasks", type=str) 119 | parser.add_argument("--metric", type=str, required=True) 120 | parser.add_argument("--num_replicates", type=int, required=True) 121 | parser.add_argument("--confidence_level", type=float, required=True) 122 | parser.add_argument("--groundtruth", type=str, required=True) 123 | parser.add_argument("--prediction", type=str, required=True) 124 | parser.add_argument("--split", type=int, required=True) 125 | parser.add_argument("--num_splits", type=int, required=True) 126 | parser.add_argument("--output", type=str, required=True) 127 | args = parser.parse_args() 128 | 129 | # A redundant renaming of the arguments -- to avoid breaking the rest of the code. 
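    # Example invocation (hypothetical paths; flags as defined by the parser above):
    #   python confidence_interval.py --custom_tasks chexpert-competition \
    #       --metric AUROC --num_replicates 1000 --confidence_level 0.95 \
    #       --groundtruth gt.csv --prediction pred_it0.csv \
    #       --split 0 --num_splits 10 --output cis.csv
    # Predictions for the remaining splits are located by swapping the
    # 'it{split}' token in --prediction (see the loop over num_splits below).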
130 | if args.custom_tasks is not None: 131 | disease_names = ','.join(NamedTasks[args.custom_tasks]) 132 | else: 133 | disease_names = args.tasks 134 | metric_name = args.metric 135 | num_replicates = args.num_replicates 136 | confidence_level = args.confidence_level 137 | gt_path = args.groundtruth 138 | pred_path = args.prediction 139 | cur_iter = args.split 140 | num_iters = args.num_splits 141 | output_path = args.output 142 | 143 | print("Start confidence_interval...") 144 | # TODO JBY: Support more metrics 145 | assert metric_name == 'AUROC', 'Only AUROC is supported at the moment' 146 | 147 | diseases = disease_names.split(', ') 148 | diseases = [d.strip() for d in diseases] 149 | 150 | gt = pd.read_csv(gt_path) 151 | # gt = np.array(gt[disease_name].values.tolist()) 152 | # gt = gt[disease_name] 153 | 154 | pred = pd.read_csv(pred_path) 155 | # pred = np.array(pred[disease_name].values.tolist()) 156 | # pred = pred[disease_name] 157 | 158 | all_preds = [] 159 | for i in range(num_iters): 160 | new_pred_path = pred_path.replace(f'it{cur_iter}', f'it{i}') 161 | all_preds.append(pd.read_csv(new_pred_path)) 162 | 163 | # TODO, support more metrics 164 | 165 | print('Parsed arguments') 166 | 167 | compute_bootstrap_confidence_interval( 168 | gt, pred, all_preds, diseases, 169 | sklearn.metrics.roc_auc_score, 170 | num_replicates, confidence_level, 171 | output_path) 172 | 173 | print('Confidence interval generated') 174 | 175 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/confidence_interval_diff.py: -------------------------------------------------------------------------------- 1 | from confidence_interval import * 2 | 3 | 4 | def diff_replicate_performances(gt, all_preds1, all_preds2, diseases, metric, num_replicates): 5 | sample_ids = np.random.choice(len(gt), size=len(gt), replace=True) 6 | replicate_performances = {d: [None for i in range(len(all_preds1))] for d in diseases} 7 | gt_replicate = gt.iloc[sample_ids] 8 | 9 | #import pdb; pdb.set_trace() 10 | 11 | pred1_performances = {d: [None for i in range(len(all_preds1))] for d in diseases} 12 | for i, pred in enumerate(all_preds1): 13 | pred1_replicate = pred.iloc[sample_ids] 14 | for col in diseases: 15 | performance = metric(gt_replicate[col], pred1_replicate[col]) 16 | pred1_performances[col][i] = performance 17 | #print(f'Pred 1[{i}] => {performance}') 18 | 19 | pred2_performances = {d: [None for i in range(len(all_preds1))] for d in diseases} 20 | for i, pred in enumerate(all_preds2): 21 | pred2_replicate = pred.iloc[sample_ids] 22 | for col in diseases: 23 | performance = metric(gt_replicate[col], pred2_replicate[col]) 24 | pred2_performances[col][i] = performance 25 | #print(f'Pred 2[{i}] => {performance}') 26 | 27 | 28 | diff_rep_perf = {} 29 | for d in diseases: 30 | a1 = np.array(pred1_performances[d]) 31 | a2 = np.array(pred2_performances[d]) 32 | 33 | diff_rep_perf[d] = np.mean(a1 - a2) 34 | 35 | #import pdb; pdb.set_trace() 36 | # diff_rep_perf = {d: pred1_performances[d] - pred2_performances[d] for d in diseases} 37 | return diff_rep_perf 38 | 39 | 40 | def bootstrap_diff_metric(gt, all_preds1, all_preds2, diseases, metric, num_replicates): 41 | 42 | all_multi_performances = [] 43 | for _ in range(num_replicates): 44 | multi_rep_performances = diff_replicate_performances( 45 | gt, all_preds1, all_preds2, diseases, metric, num_replicates) 46 | 47 | all_multi_performances.append(copy.deepcopy(multi_rep_performances)) 48 | 49 | multi_performances 
= pd.DataFrame.from_records(all_multi_performances) 50 | 51 | return multi_performances 52 | 53 | 54 | def compute_bootstrap_diff_confidence_interval(gt, all_preds1, all_preds2, 55 | diseases, metric, 56 | num_replicates, confidence_level, 57 | output_path): 58 | multi_bootstrap = bootstrap_diff_metric( 59 | gt, all_preds1, all_preds2, diseases, metric, num_replicates) 60 | 61 | confidence(multi_bootstrap, 62 | output_path.replace('.csv', '_diff.csv'), 63 | confidence_level=0.95) 64 | 65 | 66 | if __name__ == '__main__': 67 | # TODO: JBY: Big hack, no proper argparser used here! 68 | # Usage: 69 | # python confidence_interval.py 70 | # [DISEASE_NAME] [METRIC_NAME] 71 | # [NUM_REPLICATES] [CONFIDENCE_LEVEL] 72 | # [GT_CSV_PATH] 73 | # [PRED1_CSV_PATH] 74 | # [PRED2_CSV_PATH] 75 | # [CUR_ITER] [NUM_ITERS] 76 | # [OUTPUT_PATH] 77 | 78 | assert len(sys.argv) == 11 79 | 80 | disease_names = sys.argv[1] 81 | metric_name = sys.argv[2] 82 | num_replicates = int(sys.argv[3]) 83 | confidence_level = float(sys.argv[4]) 84 | gt_path = sys.argv[5] 85 | pred1_path = sys.argv[6] 86 | pred2_path = sys.argv[7] 87 | cur_iter = int(sys.argv[8]) 88 | num_iters = int(sys.argv[9]) 89 | output_path = sys.argv[10] 90 | 91 | # TODO JBY: Support more metrics 92 | assert metric_name == 'AUROC', 'Only AUROC is supported at the moment' 93 | 94 | diseases = disease_names.split(', ') 95 | diseases = [d.strip() for d in diseases] 96 | 97 | gt = pd.read_csv(gt_path) 98 | # gt = np.array(gt[disease_name].values.tolist()) 99 | # gt = gt[disease_name] 100 | 101 | pred1 = pd.read_csv(pred1_path) 102 | pred2 = pd.read_csv(pred2_path) 103 | # pred = np.array(pred[disease_name].values.tolist()) 104 | # pred = pred[disease_name] 105 | 106 | all_preds1 = [] 107 | for i in range(num_iters): 108 | new_pred_path = pred1_path.replace(f'it{cur_iter}', f'it{i}') 109 | all_preds1.append(pd.read_csv(new_pred_path)) 110 | 111 | all_preds2 = [] 112 | for i in range(num_iters): 113 | new_pred_path = pred2_path.replace(f'it{cur_iter}', f'it{i}') 114 | all_preds2.append(pd.read_csv(new_pred_path)) 115 | 116 | # TODO, support more metrics 117 | 118 | print('Parsed arguments') 119 | 120 | compute_bootstrap_diff_confidence_interval( 121 | gt, all_preds1, all_preds2, diseases, 122 | sklearn.metrics.roc_auc_score, 123 | num_replicates, confidence_level, 124 | output_path) 125 | 126 | print('Confidence interval generated') 127 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/constants/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import * 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/constants/constants.py: -------------------------------------------------------------------------------- 1 | """Define constants to be used throughout the repository.""" 2 | from pathlib import Path 3 | 4 | # Main directories 5 | PROJECT_DIR = Path(__file__).parent.parent 6 | DATA_DIR = Path("/deep/group") 7 | 8 | # Datasets 9 | CHEXPERT = "chexpert" 10 | CUSTOM = "custom" 11 | CHEXPERT_SINGLE = "chexpert_single_special" 12 | CXR14 = "cxr14" 13 | SHENZHEN = "shenzhen_special" 14 | 15 | # Predict config constants 16 | CFG_TASK2MODELS = "task2models" 17 | CFG_AGG_METHOD = "aggregation_method" 18 | CFG_CKPT_PATH = "ckpt_path" 19 | CFG_IS_3CLASS = "is_3class" 20 | 21 | # Dataset constants 22 | IMAGENET_MEAN = [0.485, 0.456, 0.406] 23 | IMAGENET_STD = [0.229, 0.224, 
0.225] 24 | COL_PATH = "Path" 25 | COL_STUDY = "Study" 26 | COL_TASK = "Tasks" 27 | COL_METRIC = "Metrics" 28 | COL_VALUE = "Values" 29 | TASKS = "tasks" 30 | UNCERTAIN = -1 31 | MISSING = -2 32 | 33 | # CheXpert specific constants 34 | CHEXPERT_DATASET_NAME = "CheXpert-v1.0" 35 | CHEXPERT_PARENT_DATA_DIR = DATA_DIR / "CheXpert" 36 | CHEXPERT_SAVE_DIR = CHEXPERT_PARENT_DATA_DIR / "models/" 37 | CHEXPERT_DATA_DIR = CHEXPERT_PARENT_DATA_DIR / CHEXPERT_DATASET_NAME 38 | CHEXPERT_TEST_DIR = CHEXPERT_PARENT_DATA_DIR / "CodaLab" 39 | CHEXPERT_UNCERTAIN_DIR = CHEXPERT_PARENT_DATA_DIR / "Uncertainty" 40 | CHEXPERT_RAD_PATH = CHEXPERT_PARENT_DATA_DIR / "rad_perf_test.csv" 41 | CHEXPERT_MEAN = [.5020, .5020, .5020] 42 | CHEXPERT_STD = [.085585, .085585, .085585] 43 | CHEXPERT_TASKS = ["No Finding", 44 | "Enlarged Cardiomediastinum", 45 | "Cardiomegaly", 46 | "Lung Lesion", 47 | "Airspace Opacity", 48 | "Edema", 49 | "Consolidation", 50 | "Pneumonia", 51 | "Atelectasis", 52 | "Pneumothorax", 53 | "Pleural Effusion", 54 | "Pleural Other", 55 | "Fracture", 56 | "Support Devices" 57 | ] 58 | CHEXPERT_SINGLE_TASKS = ["No Finding", 59 | "Pleural Effusion", 60 | ] 61 | 62 | CHEXPERT_COMPETITION_TASKS = ["Atelectasis", 63 | "Cardiomegaly", 64 | "Consolidation", 65 | "Edema", 66 | "Pleural Effusion" 67 | ] 68 | CHEXPERT_COMPETITION_SINGLE_TASKS = CHEXPERT_COMPETITION_TASKS 69 | # CHEXPERT_COMPETITION_SINGLE_TASKS = ["Pleural Effusion"] 70 | 71 | SHENZHEN_TASKS = ['Tuberculosis'] 72 | 73 | # CXR14 specific constants 74 | CXR14_DATA_DIR = DATA_DIR / CXR14 75 | CXR14_TASKS = ["Cardiomegaly", 76 | "Emphysema", 77 | "Pleural Effusion", 78 | "Hernia", 79 | "Infiltration", 80 | "Mass", 81 | "Nodule", 82 | "Atelectasis", 83 | "Pneumothorax", 84 | "Pleural Thickening", 85 | "Pneumonia", 86 | "Fibrosis", 87 | "Edema", 88 | "Consolidation"] 89 | CALIBRATION_FILE = "calibration_params.json" 90 | 91 | DATASET2TASKS = {CHEXPERT: CHEXPERT_TASKS, 92 | CUSTOM: CHEXPERT_TASKS, 93 | CHEXPERT_SINGLE: CHEXPERT_TASKS, 94 | CXR14: CXR14_TASKS, 95 | SHENZHEN: SHENZHEN_TASKS} 96 | 97 | EVAL_METRIC2TASKS = {'chexpert-log_loss': CHEXPERT_TASKS, 98 | 'cxr14-log_loss': CXR14_TASKS, 99 | 'shenzhen-AUROC': SHENZHEN_TASKS, 100 | 'chexpert-competition-log_loss': CHEXPERT_COMPETITION_TASKS, 101 | 'chexpert-competition-AUROC': CHEXPERT_COMPETITION_TASKS, 102 | 'chexpert-competition-single-AUROC': CHEXPERT_COMPETITION_TASKS} 103 | 104 | NamedTasks = {'chexpert': CHEXPERT_TASKS, 105 | 'chexpert-competition': CHEXPERT_COMPETITION_TASKS, 106 | 'pleural-effusion': CHEXPERT_TASKS 107 | } 108 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .loader import get_loader 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/base_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torchvision.transforms as t 4 | from torch.utils.data import Dataset 5 | from PIL import ImageEnhance 6 | 7 | from constants import * 8 | 9 | 10 | class BaseDataset(Dataset): 11 | def __init__(self, csv_name, is_training, transform_args): 12 | self.transform_args = transform_args 13 | self.csv_name = f"{csv_name}.csv" if not csv_name.endswith(".csv") else csv_name 14 | self.is_training = is_training 15 | 16 | def get_enhance_transform(self, f, enhance_min, 
enhance_max): 17 | def do_enhancement(img): 18 | factor = np.random.uniform(enhance_min, enhance_max) 19 | enhancer = f(img) 20 | return enhancer.enhance(factor) 21 | return do_enhancement 22 | 23 | 24 | def transform(self, img): 25 | """Set the transforms to be applied when loading.""" 26 | 27 | transform_args = self.transform_args 28 | # Shorter side scaled to transform_args.scale 29 | if transform_args.maintain_ratio: 30 | transforms_list = [t.Resize(transform_args.scale)] 31 | else: 32 | transforms_list = [t.Resize((transform_args.scale, transform_args.scale))] 33 | 34 | # Data augmentation 35 | if self.is_training: 36 | if np.random.rand() < transform_args.rotate_prob: 37 | transforms_list += [t.RandomRotation((transform_args.rotate_min, 38 | transform_args.rotate_max))] 39 | 40 | if np.random.rand() < transform_args.contrast_prob: 41 | transforms_list += [self.get_enhance_transform(ImageEnhance.Contrast, 42 | transform_args.contrast_min, 43 | transform_args.contrast_max)] 44 | 45 | if np.random.rand() < transform_args.brightness_prob: 46 | transforms_list += [self.get_enhance_transform(ImageEnhance.Brightness, 47 | transform_args.brightness_min, 48 | transform_args.brightness_max)] 49 | 50 | if np.random.rand() < transform_args.sharpness_prob: 51 | transforms_list += [self.get_enhance_transform(ImageEnhance.Sharpness, 52 | transform_args.sharpness_min, 53 | transform_args.sharpness_max)] 54 | 55 | if np.random.rand() < transform_args.horizontal_flip_prob: 56 | transforms_list += [t.RandomHorizontalFlip()] 57 | 58 | if transform_args.crop != 0: 59 | transforms_list += [t.RandomCrop((transform_args.crop, transform_args.crop))] 60 | 61 | else: 62 | transforms_list += [t.CenterCrop((transform_args.crop, 63 | transform_args.crop)) 64 | if transform_args.crop else None] 65 | 66 | if transform_args.normalization == 'imagenet': 67 | normalize = t.Normalize(mean=IMAGENET_MEAN, 68 | std=IMAGENET_STD) 69 | elif transform_args.normalization == 'chexpert_norm': 70 | normalize = t.Normalize(mean=CHEXPERT_MEAN, 71 | std=CHEXPERT_STD) 72 | transforms_list += [t.ToTensor(), normalize] 73 | 74 | return t.Compose([transform for transform in transforms_list if transform])(img) 75 | 76 | def __len__(self): 77 | return len(self.labels) 78 | 79 | def __getitem__(self, index): 80 | raise NotImplementedError 81 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/chexpert_dataset.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import pandas as pd 4 | from PIL import Image 5 | 6 | import util 7 | from .base_dataset import BaseDataset 8 | from constants import * 9 | 10 | 11 | class CheXpertDataset(BaseDataset): 12 | def __init__(self, csv_name, is_training, study_level, 13 | transform_args, toy, return_info_dict, logger=None, data_args=None): 14 | # Pass in parent of data_dir because test set is in a different 15 | # directory due to dataset release, and uncertain maps are in a 16 | # different directory as well (both are under the parent directory).
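# (Concretely, with the constants defined in constants/constants.py above,
# CHEXPERT_PARENT_DATA_DIR is /deep/group/CheXpert, which holds the
# CheXpert-v1.0/ train/valid data, the CodaLab/ test set and the
# Uncertainty/ maps side by side.)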
17 | super().__init__(csv_name, is_training, transform_args) 18 | self.study_level = study_level 19 | self.toy = toy 20 | self.return_info_dict = return_info_dict 21 | self.logger = logger 22 | self.data_args = data_args 23 | 24 | self.is_train_dataset = self.csv_name == "train.csv" 25 | self.is_test_dataset = self.csv_name == "test.csv" 26 | self.is_val_dataset = self.csv_name == "valid.csv" 27 | self.is_uncertain_dataset = "uncertainty" in self.csv_name 28 | 29 | if self.is_test_dataset: 30 | self.csv_path = CHEXPERT_TEST_DIR / f"{csv_name}_image_paths.csv" 31 | elif self.is_uncertain_dataset: 32 | self.csv_path = CHEXPERT_UNCERTAIN_DIR / self.csv_name 33 | else: 34 | self.csv_path = CHEXPERT_DATA_DIR / self.csv_name 35 | 36 | if self.is_val_dataset: 37 | print("valid", self.csv_path) 38 | 39 | df = self.load_df() 40 | 41 | self.studies = df[COL_STUDY].drop_duplicates() 42 | 43 | if self.toy and self.csv_name == 'train.csv': 44 | self.studies = self.studies.sample(n=10) 45 | df = df[df[COL_STUDY].isin(self.studies)] 46 | df = df.reset_index(drop=True) 47 | 48 | # Set Study folder as index. 49 | if self.study_level: 50 | self.set_study_as_index(df) 51 | 52 | self.labels = self.get_labels(df) 53 | self.img_paths = self.get_paths(df) 54 | 55 | def load_df(self): 56 | df = pd.read_csv(Path(self.csv_path)) 57 | 58 | # Prepend the data dir to get the full path. 59 | df[COL_PATH] = df[COL_PATH].apply(lambda x: CHEXPERT_PARENT_DATA_DIR / x) 60 | if self.is_test_dataset: # adjust for the fact that images are in CodaLab 61 | df[COL_PATH] = df[COL_PATH].apply(lambda p: 62 | Path(str(p).replace(str(CHEXPERT_DATA_DIR), 63 | str(CHEXPERT_TEST_DIR)))) 64 | df[COL_STUDY] = df[COL_PATH].apply(lambda p: Path(p).parent) 65 | if self.is_test_dataset: 66 | gt_df = pd.read_csv(CHEXPERT_TEST_DIR / "test_groundtruth.csv") 67 | gt_df[COL_STUDY] = gt_df[COL_STUDY].apply(lambda s: CHEXPERT_PARENT_DATA_DIR / s) 68 | gt_df[COL_STUDY] = gt_df[COL_STUDY].apply(lambda s: Path(str(s).replace(str(CHEXPERT_DATA_DIR), 69 | str(CHEXPERT_TEST_DIR)))) 70 | df = pd.merge(df, gt_df, on=COL_STUDY, how='outer') 71 | df = df.dropna(subset=['Path']) 72 | 73 | df = df.rename(columns={"Lung Opacity": "Airspace Opacity"}).sort_values(COL_STUDY) 74 | 75 | df[CHEXPERT_TASKS] = df[CHEXPERT_TASKS].fillna(value=0) 76 | 77 | return df 78 | 79 | def set_study_as_index(self, df): 80 | df.index = df[COL_STUDY] 81 | 82 | def get_paths(self, df): 83 | return df[COL_PATH] 84 | 85 | def get_labels(self, df): 86 | # Get the labels 87 | if self.study_level: 88 | study_df = df.drop_duplicates(subset=COL_STUDY) 89 | labels = study_df[CHEXPERT_TASKS] 90 | else: 91 | labels = df[CHEXPERT_TASKS] 92 | 93 | return labels 94 | 95 | def get_study(self, index): 96 | 97 | # Get study folder path 98 | study_path = self.studies.iloc[index] 99 | 100 | # Get and transform the label 101 | label = self.labels.loc[study_path].values 102 | label = torch.FloatTensor(label) 103 | 104 | # Get and transform the images 105 | # corresponding to the study at hand 106 | img_paths = pd.Series(self.img_paths.loc[study_path]).tolist() 107 | imgs = [Image.open(path).convert('RGB') for path in img_paths] 108 | # Downscale full resolution image to 1024 in the same way as 109 | # performed in previous preprocessing, then convert back to PIL.
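# (For reference, a minimal sketch of what such a downscaling helper could
# look like; util.resize_img is this repo's own helper, so the exact
# signature shown here is an assumption:
#     def resize_img(img, target=1024):
#         h, w = img.shape[:2]  # 2-D grayscale array from cv2.imread(path, 0)
#         scale = target / max(h, w)
#         return cv2.resize(img, (int(w * scale), int(h * scale)))
# )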
110 | # imgs = [util.resize_img(cv2.imread(str(path), 0), 1024) for path in img_paths] 111 | # imgs = [Image.fromarray(img).convert('RGB') for img in imgs] 112 | 113 | imgs = [self.transform(img) for img in imgs] 114 | imgs = torch.stack(imgs) 115 | 116 | if self.return_info_dict: 117 | 118 | info_dict = {'paths': study_path} 119 | 120 | return imgs, label, info_dict 121 | 122 | return imgs, label 123 | 124 | def get_image(self, index): 125 | 126 | # Get and transform the label 127 | label = self.labels.iloc[index].values 128 | label = torch.FloatTensor(label) 129 | 130 | # Get and transform the image 131 | img_path = self.img_paths.iloc[index] 132 | img = Image.open(img_path).convert('RGB') 133 | img = self.transform(img) 134 | 135 | if self.return_info_dict: 136 | info_dict = {'paths': str(img_path)} 137 | return img, label, info_dict 138 | 139 | return img, label 140 | 141 | def __getitem__(self, index): 142 | if self.study_level: 143 | return self.get_study(index) 144 | else: 145 | return self.get_image(index) 146 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/loader.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | 3 | from .chexpert_dataset import CheXpertDataset 4 | from .custom_dataset import CustomDataset 5 | from .pad_collate import PadCollate 6 | from constants import * 7 | 8 | 9 | def get_loader(phase, data_args, transform_args, 10 | is_training, return_info_dict, 11 | logger=None): 12 | """Get PyTorch data loader. 13 | 14 | Args: 15 | phase: string name of training phase {train, valid, test}. 16 | data_args: Namespace of data arguments. 17 | transform_args: Namespace of transform arguments. 18 | is_training: Bool indicating whether in training mode. 19 | return_info_dict: Bool indicating whether to return extra info 20 | in batches. 21 | logger: Optional Logger object for printing data to stdout and file. 22 | 23 | Return: 24 | loader: PyTorch DataLoader object 25 | """ 26 | 27 | study_level = not is_training 28 | shuffle = is_training 29 | 30 | # TODO: Make this more general 31 | if data_args.dataset == "chexpert": 32 | Dataset = CheXpertDataset 33 | elif 'special' in data_args.dataset: 34 | Dataset = CustomDataset 35 | elif data_args.dataset == "custom": 36 | Dataset = CustomDataset 37 | else: 38 | raise ValueError(f"Dataset {data_args.dataset} not supported.") 39 | 40 | # Get name of csv to load data from. 41 | # uncertain_map_path will replace this name. 42 | # need to make this more general!!! 43 | #csv_name = data_args.uncertain_map_path\ 44 | # if data_args.uncertain_map_path is not None else phase 45 | 46 | if data_args.uncertain_map_path is not None and phase == 'train': 47 | csv_name = data_args.uncertain_map_path 48 | else: 49 | csv_name = phase 50 | # Instantiate the Dataset class. 
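# (Example: phase='train' together with a hypothetical
# data_args.uncertain_map_path='uncertainty_ones' resolves csv_name to
# 'uncertainty_ones'; BaseDataset then appends '.csv' and CheXpertDataset
# loads it from CHEXPERT_UNCERTAIN_DIR.)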
51 | dataset = Dataset(csv_name, is_training, study_level, transform_args, 52 | data_args.toy, return_info_dict, logger, data_args) 53 | if study_level: 54 | # Pick collate function 55 | collate_fn = PadCollate(dim=0) 56 | loader = data.DataLoader(dataset, 57 | batch_size=data_args.batch_size, 58 | shuffle=shuffle, 59 | num_workers=data_args.num_workers, 60 | collate_fn=collate_fn) 61 | else: 62 | loader = data.DataLoader(dataset, 63 | batch_size=data_args.batch_size, 64 | shuffle=shuffle, 65 | num_workers=data_args.num_workers) 66 | 67 | return loader 68 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/pad_collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pad_tensor(vec, pad, dim): 5 | """ 6 | args: 7 | vec - tensor to pad 8 | pad - the size to pad to 9 | dim - dimension to pad 10 | 11 | return: 12 | a new tensor padded to 'pad' in dimension 'dim' 13 | """ 14 | pad_size = list(vec.shape) 15 | pad_size[dim] = pad - vec.size(dim) 16 | return torch.cat([vec, torch.zeros(*pad_size)], dim=dim) 17 | 18 | 19 | class PadCollate: 20 | """ 21 | a variant of collate_fn that pads according to the longest sequence in 22 | a batch of sequences 23 | """ 24 | 25 | def __init__(self, dim=0): 26 | """ 27 | args: 28 | dim - the dimension to be padded (dimension of time in sequences) 29 | """ 30 | self.dim = dim 31 | 32 | def pad_collate(self, batch): 33 | """ 34 | args: 35 | batch - list of (tensor, label) 36 | 37 | return: 38 | a tuple containing each component of the examples in 'batch', 39 | merged (and padded) across the batch, followed by 40 | mask - a mask with 0s in positions that should be ignored 41 | """ 42 | # find longest sequence 43 | study_lens = list(map(lambda x: x[0].shape[self.dim], batch)) 44 | max_len = max(study_lens) 45 | 46 | # Pad each example's first component up to max_len 47 | num_components = max(len(x) for x in batch) 48 | batch = [(pad_tensor(x[0], pad=max_len, dim=self.dim),) + tuple(x[1:]) for x in batch] 49 | 50 | # Stack padded items and merge each component of the batch 51 | batch = tuple(self._merge(batch, component_idx=i) for i in range(num_components)) 52 | masks = [[1] * sl + [0] * (max_len - sl) for sl in study_lens] 53 | masks = torch.tensor(masks, dtype=torch.float32) 54 | 55 | return batch + (masks,) 56 | 57 | def __call__(self, batch): 58 | return self.pad_collate(batch) 59 | 60 | @staticmethod 61 | def _merge(batch, component_idx): 62 | """Merge components of a batch into a single tensor or list. 63 | 64 | Args: 65 | batch: Batch to merge. 66 | component_idx: Index of component in each example that will be merged.
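(For instance, with a batch of (imgs, label) tuples, component_idx 0 merges the padded image tensors via torch.stack and component_idx 1 merges the labels.)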
67 | 68 | Returns: 69 | Merged components 70 | """ 71 | # Group all components into list 72 | components = [x[component_idx] for x in batch] 73 | assert len(components) > 0, 'Error in pad_collate: Cannot merge a batch of size 0' 74 | first_component = components[0] 75 | 76 | # Merge based on data type of components 77 | if isinstance(first_component, dict): 78 | merged_components = {k: [d[k] for d in components] for k in first_component} 79 | elif isinstance(first_component, torch.Tensor): 80 | merged_components = torch.stack(components, dim=0) 81 | else: 82 | raise ValueError('Unexpected type in PadCollate._merge: {}'.format(type(components[0]))) 83 | 84 | return merged_components 85 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/data/task_sequences.json: -------------------------------------------------------------------------------- 1 | { 2 | "competition": { 3 | "Atelectasis": 0, 4 | "Cardiomegaly": 1, 5 | "Consolidation": 2, 6 | "Edema": 3, 7 | "Pleural Effusion": 4 8 | }, 9 | "stanford": { 10 | "No Finding": 0, 11 | "Enlarged Cardiomediastinum": 1, 12 | "Cardiomegaly": 2, 13 | "Lung Lesion": 3, 14 | "Airspace Opacity": 4, 15 | "Edema": 5, 16 | "Consolidation": 6, 17 | "Pneumonia": 7, 18 | "Atelectasis": 8, 19 | "Pneumothorax": 9, 20 | "Pleural Effusion": 10, 21 | "Pleural Other": 11, 22 | "Fracture": 12, 23 | "Support Devices": 13 24 | }, 25 | "stanford_exclude_NF": { 26 | "Enlarged Cardiomediastinum": 0, 27 | "Cardiomegaly": 1, 28 | "Lung Lesion": 2, 29 | "Airspace Opacity": 3, 30 | "Edema": 4, 31 | "Consolidation": 5, 32 | "Pneumonia": 6, 33 | "Atelectasis": 7, 34 | "Pneumothorax": 8, 35 | "Pleural Effusion": 9, 36 | "Pleural Other": 10, 37 | "Fracture": 11, 38 | "Support Devices": 12 39 | }, 40 | "nih": { 41 | "Cardiomegaly": 0, 42 | "Emphysema": 1, 43 | "Pleural Effusion": 2, 44 | "Hernia": 3, 45 | "Infiltration": 4, 46 | "Mass": 5, 47 | "Nodule": 6, 48 | "Atelectasis": 7, 49 | "Pneumothorax": 8, 50 | "Pleural Thickening": 9, 51 | "Pneumonia": 10, 52 | "Fibrosis": 11, 53 | "Edema": 12, 54 | "Consolidation": 13 55 | }, 56 | 57 | "nih_su_union": { 58 | "Pleural Effusion": 0, 59 | "Pleural Other": 1, 60 | "Infiltration": 2, 61 | "Consolidation": 3, 62 | "Mass": 4, 63 | "Support Devices": 5, 64 | "Airspace Opacity": 6, 65 | "Lung Lesion": 7, 66 | "No Finding": 8, 67 | "Atelectasis": 9, 68 | "Nodule": 10, 69 | "Pneumothorax": 11, 70 | "Enlarged Cardiomediastinum": 12, 71 | "Fracture": 13, 72 | "Edema": 14, 73 | "Emphysema": 15, 74 | "Pleural Thickening": 16, 75 | "Hernia": 17, 76 | "Pneumonia": 18, 77 | "Fibrosis": 19, 78 | "Cardiomegaly": 20 79 | }, 80 | "su_using_nih_labeller": { 81 | "Cardiomegaly": 0, 82 | "Edema": 1, 83 | "Consolidation": 2, 84 | "Pneumonia": 3, 85 | "Atelectasis": 4, 86 | "Pneumothorax": 5, 87 | "Pleural Effusion": 6 88 | }, 89 | "single_atelectasis": { 90 | "Atelectasis": 0 91 | }, 92 | "single_cardiomegaly": { 93 | "Cardiomegaly": 0 94 | }, 95 | "single_consolidation": { 96 | "Consolidation": 0 97 | }, 98 | "single_edema": { 99 | "Edema": 0 100 | }, 101 | "single_pleural_effusion": { 102 | "Pleural Effusion": 0 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .su_dataset import SUDataset 2 | from .nih_dataset import NIHDataset 3 | from .concat_dataset import ConcatDataset 4 | from .constants 
import * 5 | from .label_mapper import LabelMapper 6 | from .label_mapper import TASK_SEQUENCES 7 | from .get_loader import get_loader, get_eval_loaders 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/base_dataset.py: -------------------------------------------------------------------------------- 1 | import torchvision.transforms as transforms 2 | 3 | from dataset.constants import CXR_MEAN, CXR_STD, IMAGENET_MEAN, IMAGENET_STD 4 | from pathlib import Path 5 | from torch.utils.data import Dataset 6 | from .transforms import CLAHE 7 | from .label_mapper import TASK_SEQUENCES, LabelMapper 8 | 9 | 10 | class BaseDataset(Dataset): 11 | 12 | def __init__(self, data_dir, transform_args, split, is_training, dataset_name, tasks_to, dataset_task_sequence=None): 13 | """ Base class for CXR Dataset. 14 | Args: 15 | data_dir (string): Name of the root data directory. 16 | transform_args (Namespace): Args for data transforms 17 | split (string): Name of the dataset split to load (train, valid) 18 | dataset_name (string): Name of the dataset. Used to fetch the task sequence used for this dataset 19 | (i.e. the task sequence used when loading the csv). 20 | tasks_to (dict): The sequence of tasks 21 | we want to map all our labels to. 22 | """ 23 | 24 | assert isinstance(data_dir, str) 25 | assert isinstance(split, str) 26 | assert isinstance(dataset_name, str) 27 | assert isinstance(tasks_to, dict) 28 | 29 | self.dataset_name = dataset_name 30 | self.data_dir = Path(data_dir) 31 | self.split = split 32 | self.is_training = is_training 33 | 34 | # Create a label mapper 35 | # Get the two label sequences as two dicts: 36 | # e.g. {pathology1: 0, pathology2: 1, ...} 37 | if dataset_task_sequence is not None: 38 | self.original_tasks = TASK_SEQUENCES[dataset_task_sequence] 39 | else: 40 | self.original_tasks = TASK_SEQUENCES[dataset_name] 41 | self.target_tasks = tasks_to 42 | 43 | self.label_mapper = None 44 | 45 | if self.original_tasks != self.target_tasks: 46 | self.label_mapper = LabelMapper( 47 | self.original_tasks, 48 | self.target_tasks) 49 | 50 | self._set_transforms(transform_args) 51 | 52 | def _set_class_weights(self, labels): 53 | """Set class weights for weighted loss. 54 | 55 | Each task gets its own set of class weights. 56 | 57 | Weights are calculated by taking 1 - the relative 58 | frequency of the class (positive vs negative). 59 | 60 | Args: 61 | labels: Dataframe or numpy array containing 62 | a list of the labels. Shape should be 63 | (num_examples, num_labels) 64 | 65 | 66 | Example: 67 | 100 examples with two tasks, cardiomegaly and consolidation. 68 | 10 positive cases of cardiomegaly. 69 | 20 positive cases of consolidation. 70 | 71 | We will then have: 72 | Class weights for cardiomegaly: 73 | [1-0.9, 1-0.1] = [0.1, 0.9] 74 | Class weights for consolidation: 75 | [1-0.8, 1-0.2] = [0.2, 0.8] 76 | 77 | The first element in each list is the weight for the 78 | negative examples.
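(Note: as implemented below, class_weights stores the raw frequencies [n_count/total, p_count/total]; the 1 - frequency values in this example are their element-wise complements.)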
79 | """ 80 | 81 | # Set weights for positive vs negative examples 82 | self.p_count = (labels == 1).sum(axis=0) 83 | self.n_count = (labels == 0).sum(axis=0) 84 | 85 | if self.label_mapper is not None: 86 | self.p_count = self.label_mapper.map(self.p_count) 87 | self.n_count = self.label_mapper.map(self.n_count) 88 | 89 | self.total = self.p_count + self.n_count 90 | 91 | self.class_weights = [self.n_count / self.total, 92 | self.p_count / self.total] 93 | 94 | def _set_transforms(self, t_args): 95 | """Set the transforms 96 | 97 | Example: 98 | Image of size 1024x840. 99 | Scale to 312x256. 100 | Normalization and augmentation 101 | Random crop (or center crop) to 224x224. 102 | 103 | Note: Crop will be k * 224 and 104 | scale will be k*256. 105 | """ 106 | 107 | # Shorter side scaled to t_args.scale 108 | if t_args.maintain_ratio: 109 | transforms_list = [transforms.Resize(t_args.scale)] 110 | else: 111 | transforms_list = [transforms.Resize((t_args.scale, t_args.scale))] 112 | 113 | # Data augmentation 114 | if self.is_training: 115 | transforms_list += [transforms.RandomHorizontalFlip() if t_args.horizontal_flip else None, 116 | transforms.RandomRotation(t_args.rotate) if t_args.rotate else None, 117 | transforms.RandomCrop((t_args.crop, t_args.crop)) if t_args.crop != 0 else None] 118 | else: 119 | transforms_list += [transforms.CenterCrop((t_args.crop, t_args.crop)) if t_args.crop else None] 120 | # Normalization 121 | if t_args.clahe: 122 | transforms_list += [CLAHE(clip_limit=2.0, tile_grid_size=(8, 8))] 123 | 124 | if t_args.normalization == 'imagenet': 125 | normalize = transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) 126 | elif t_args.normalization == 'cxr_norm': 127 | normalize = transforms.Normalize(mean=CXR_MEAN, std=CXR_STD) 128 | transforms_list += [transforms.ToTensor(), normalize] 129 | 130 | self.transform = transforms.Compose([t for t in transforms_list if t]) 131 | 132 | 133 | def __len__(self): 134 | return len(self.labels) 135 | 136 | def __getitem__(self, index): 137 | raise NotImplementedError 138 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/ckpts/debugging/args.json: -------------------------------------------------------------------------------- 1 | { 2 | "NIH_data_dir": "/deep/group/rad_data", 3 | "SU_data_dir": "/deep/group/xray4all", 4 | "adam_beta_1": 0.9, 5 | "adam_beta_2": 0.999, 6 | "batch_size": 6, 7 | "best_ckpt_metric": "val_loss", 8 | "ckpt_path": "", 9 | "classes": "all", 10 | "epochs_per_eval": 1, 11 | "epochs_per_save": 3, 12 | "eval_on_nih": false, 13 | "eval_on_su": true, 14 | "gpu_ids": "0,1,2", 15 | "horizontal_flip": false, 16 | "init_method": "kaiming", 17 | "iters_per_print": 6, 18 | "iters_per_visual": 120, 19 | "label_seq": "competition", 20 | "loss_fn": "", 21 | "lr": 0.001, 22 | "lr_decay_gamma": 0.1, 23 | "lr_decay_step": 100, 24 | "lr_milestones": "50,125,250", 25 | "lr_patience": 10, 26 | "lr_scheduler": "step", 27 | "max_ckpts": 2, 28 | "max_eval": -1, 29 | "metric_name": "val_loss", 30 | "model": "DenseNet121", 31 | "model_depth": 50, 32 | "name": "debugging", 33 | "nih_train_frac": 0, 34 | "num_channels": 3, 35 | "num_classes": 14, 36 | "num_epochs": 15, 37 | "num_visuals": 4, 38 | "num_workers": 8, 39 | "optimizer": "sgd", 40 | "pretrained": true, 41 | "rotate": 0, 42 | "save_dir": "ckpts/", 43 | "scale": 256, 44 | "sgd_dampening": 0.9, 45 | "sgd_momentum": 0.9, 46 | "su_train_frac": 1, 47 | "toy": false, 48 | "weight_decay": 0.0001 49 | } 
50 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/concat_dataset.py: -------------------------------------------------------------------------------- 1 | import bisect 2 | import warnings 3 | from torch.utils.data import Dataset 4 | 5 | class ConcatDataset(Dataset): 6 | """https://pytorch.org/docs/stable/_modules/torch/utils/data/dataset.html#ConcatDataset 7 | 8 | Dataset to concatenate multiple datasets. 9 | Purpose: useful to assemble different existing datasets, possibly 10 | large-scale datasets as the concatenation operation is done in an 11 | on-the-fly manner. 12 | 13 | Arguments: 14 | datasets (sequence): List of datasets to be concatenated 15 | """ 16 | 17 | @staticmethod 18 | def cumsum(sequence): 19 | r, s = [], 0 20 | for e in sequence: 21 | l = len(e) 22 | r.append(l + s) 23 | s += l 24 | return r 25 | 26 | @staticmethod 27 | def get_class_weights(datasets): 28 | 29 | p_count = 0 30 | n_count = 0 31 | 32 | for dataset in datasets: 33 | p_count = p_count + dataset.p_count 34 | n_count = n_count + dataset.n_count 35 | 36 | total_count = p_count + n_count 37 | class_weights = [n_count / total_count, 38 | p_count / total_count] 39 | 40 | return class_weights 41 | 42 | 43 | def __init__(self, datasets): 44 | super(ConcatDataset, self).__init__() 45 | assert len(datasets) > 0, 'datasets should not be an empty iterable' 46 | self.datasets = list(datasets) 47 | self.cumulative_sizes = self.cumsum(self.datasets) 48 | self.class_weights = self.get_class_weights(self.datasets) 49 | 50 | def __len__(self): 51 | return self.cumulative_sizes[-1] 52 | 53 | def __getitem__(self, idx): 54 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 55 | if dataset_idx == 0: 56 | sample_idx = idx 57 | else: 58 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 59 | return self.datasets[dataset_idx][sample_idx] 60 | 61 | @property 62 | def cummulative_sizes(self): 63 | warnings.warn("cummulative_sizes attribute is renamed to " 64 | "cumulative_sizes", DeprecationWarning, stacklevel=2) 65 | return self.cumulative_sizes 66 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/constants.py: -------------------------------------------------------------------------------- 1 | CXR_MEAN = [.5020, .5020, .5020] 2 | CXR_STD = [.085585, .085585, .085585] 3 | 4 | IMAGENET_MEAN = [0.485, 0.456, 0.406] 5 | IMAGENET_STD = [0.229, 0.224, 0.225] 6 | 7 | COL_PATH = 'Path' 8 | COL_STUDY = 'Study' 9 | COL_SPLIT = 'DataSplit' 10 | COL_PATIENT = 'Patient' 11 | 12 | CFG_TASK2MODELS = 'task2models' 13 | CFG_AGG_METHOD = 'aggregation_method' 14 | CFG_CKPT_PATH = 'ckpt_path' 15 | CFG_IS_3CLASS = 'is_3class' 16 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/get_loader.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | 3 | from .concat_dataset import ConcatDataset 4 | from .su_dataset import SUDataset 5 | from .nih_dataset import NIHDataset 6 | from .label_mapper import TASK_SEQUENCES 7 | from .pad_collate import PadCollate 8 | 9 | def get_loader(data_args, 10 | transform_args, 11 | split, 12 | task_sequence, 13 | su_frac, 14 | nih_frac, 15 | batch_size, 16 | is_training=False, 17 | shuffle=False, 18 | study_level=False, 19 | frontal_lateral=False, 20 | return_info_dict=False): 21 | 22 | """Returns a dataset loader 23 | If
both su_frac and nih_frac are nonzero, the loader 24 | will sample both NIH and Stanford data. 25 | 26 | Args: 27 | su_frac: Float specifying what fraction of the Stanford (SU) data to load. 28 | nih_frac: Float specifying what fraction of the NIH data to load. 29 | split: String determining if this is the train, valid, test, or sample split. 30 | shuffle: If true, the loader will shuffle the data. 31 | study_level: If true, creates a loader that loads the image on the study level. 32 | Only applicable for the SU dataset. 33 | frontal_lateral: If true, loads frontal/lateral labels. 34 | Only applicable for the SU dataset. 35 | return_info_dict: If true, return a dict of info with each image. 36 | 37 | Return: 38 | loader: A DataLoader for the requested split. 39 | """ 40 | 41 | if is_training: 42 | study_level = data_args.train_on_studies 43 | 44 | datasets = [] 45 | if su_frac != 0: 46 | datasets.append( 47 | SUDataset( 48 | data_args.su_data_dir, 49 | transform_args, split=split, 50 | is_training=is_training, 51 | tasks_to=task_sequence, 52 | frac=su_frac, 53 | study_level=study_level, 54 | frontal_lateral=frontal_lateral, 55 | toy=data_args.toy, 56 | return_info_dict=return_info_dict 57 | ) 58 | ) 59 | 60 | if nih_frac != 0: 61 | datasets.append( 62 | NIHDataset( 63 | data_args.nih_data_dir, 64 | transform_args, split=split, 65 | is_training=is_training, 66 | tasks_to=task_sequence, 67 | frac=nih_frac, 68 | toy=data_args.toy 69 | ) 70 | ) 71 | 72 | if len(datasets) == 2: 73 | assert study_level is False, "Currently, you can't create concatenated datasets when training on studies" 74 | dataset = ConcatDataset(datasets) 75 | else: 76 | dataset = datasets[0] 77 | 78 | # Pick collate function 79 | if study_level: 80 | collate_fn = PadCollate(dim=0) 81 | loader = data.DataLoader(dataset, 82 | batch_size=batch_size, 83 | shuffle=shuffle, 84 | num_workers=8, 85 | collate_fn=collate_fn) 86 | else: 87 | loader = data.DataLoader(dataset, 88 | batch_size=batch_size, 89 | shuffle=shuffle, 90 | num_workers=8) 91 | 92 | return loader 93 | 94 | 95 | def get_eval_loaders(data_args, transform_args, task_sequence, batch_size, frontal_lateral, return_info_dict=False): 96 | """Returns the evaluation loaders for the SU and/or NIH 97 | validation data, depending on data_args.eval_su 98 | and data_args.eval_nih. 99 | 100 | Args: 101 | data_args: Namespace of data arguments (incl. the eval_su / eval_nih flags). 102 | transform_args: Namespace of transform arguments. 103 | task_sequence, batch_size, frontal_lateral: passed through to get_loader. 104 | return_info_dict: If true, return a dict of info with each image.
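(E.g. with data_args.eval_su=True and data_args.eval_nih=False, a single SU 'valid' loader is returned.)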
105 | 106 | Return: 107 | eval_loaders: A list of loaders 108 | 109 | """ 110 | 111 | eval_loaders = [] 112 | 113 | if data_args.eval_su: 114 | eval_loaders += [get_loader(data_args, 115 | transform_args, 116 | 'valid', 117 | task_sequence, 118 | su_frac=1, 119 | nih_frac=0, 120 | batch_size=batch_size, 121 | is_training=False, 122 | shuffle=False, 123 | study_level=not frontal_lateral, 124 | frontal_lateral=frontal_lateral, 125 | return_info_dict=return_info_dict)] 126 | 127 | if data_args.eval_nih: 128 | eval_loaders += [get_loader(data_args, 129 | transform_args, 130 | 'train', 131 | task_sequence, 132 | su_frac=0, 133 | nih_frac=1, 134 | batch_size=batch_size, 135 | is_training=False, 136 | shuffle=False, 137 | study_level=True, 138 | return_info_dict=return_info_dict), 139 | get_loader(data_args, 140 | transform_args, 141 | 'valid', 142 | task_sequence, 143 | su_frac=0, 144 | nih_frac=1, 145 | batch_size=batch_size, 146 | is_training=False, 147 | shuffle=False, 148 | study_level=True, 149 | return_info_dict=return_info_dict)] 150 | 151 | return eval_loaders 152 | 153 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/label_mapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import sys 4 | import os 5 | from pathlib import Path 6 | from collections import OrderedDict 7 | 8 | # Load the dictionary of label sequences 9 | with open(Path(__file__).parent / 'task_sequences.json') as f: 10 | TASK_SEQUENCES = {k: OrderedDict(sorted(v.items(), key=lambda x: x[1])) for k, v in json.load(f).items()} 11 | 12 | class LabelMapper: 13 | # special cases of label values 14 | UNCERTAIN = -1 15 | MISSING = -2 16 | 17 | def __init__(self, from_seq, to_seq): 18 | """Class that converts one task sequence 19 | to another task sequence (e.g. nih to stanford). 20 | 21 | The key equation is x_new = Ax + b, where 22 | A is the mapping_matrix, which puts the 1s in x into the 23 | right place in x_new, and b, below known as missing_bias, 24 | makes sure that the tasks in to_seq that don't exist 25 | in from_seq are all marked as missing (MISSING = -2). 26 | 27 | Args: 28 | from_seq: An ordered dict of the tasks (task: index) 29 | you want to map from. 30 | 31 | to_seq: An ordered dict of the tasks (task: index) 32 | you want to map to. 33 | """ 34 | # There can't be any duplicates within a task sequence. 35 | assert len(set(from_seq)) == len(from_seq) 36 | assert len(set(to_seq)) == len(to_seq) 37 | 38 | # The values 0 ..
num_pathologies need to be unique 39 | assert len(set(to_seq.values())) == len(to_seq.values()) 40 | assert len(set(from_seq.values())) == len(from_seq.values()) 41 | 42 | # store the from and to task sequences 43 | self.from_seq = from_seq 44 | self.to_seq = to_seq 45 | 46 | # create the mapping matrix 47 | self.mapping_matrix = self._get_map_matrix(from_seq, to_seq) 48 | 49 | # Each row in the mapping matrix that is all zero 50 | # corresponds to a task that does not exist in the from_seq. 51 | # We want those values to have value -2, 52 | # so they can easily be masked at a later stage. 53 | 54 | missing_tasks_indices = np.where(np.sum(self.mapping_matrix, axis=1) == 0) 55 | 56 | self.missing_bias = np.zeros(len(to_seq)) 57 | self.missing_bias[missing_tasks_indices] = LabelMapper.MISSING 58 | 59 | def map(self, label): 60 | """Maps label from self.from_seq to self.to_seq. 61 | 62 | The missing_bias makes sure that tasks that are missing 63 | in the from_seq are put as -2 in new_label. 64 | 65 | Args: 66 | label: A numpy array (a vector) with binary values, 67 | each corresponding to a binary task. Usually this task is 68 | to determine whether a specific pathology is present. 69 | 70 | Return: 71 | new_label: A numpy array with the labels whose indices correspond 72 | to the label sequence stored in self.to_seq. 73 | """ 74 | 75 | new_label = np.dot(self.mapping_matrix, label) + self.missing_bias 76 | 77 | return new_label 78 | 79 | def _get_map_matrix(self, from_seq, to_seq): 80 | """ Creates a mapping matrix between two 81 | labeling sequences. 82 | 83 | The matrix shape is (num_to_tasks, num_from_tasks). 84 | That means that if a row ends up fully empty, that class 85 | does not exist in the from_seq. If a column ends up fully 86 | empty it means that the class does not exist in the target. 87 | """ 88 | num_from_tasks = len(from_seq) 89 | num_to_tasks = len(to_seq) 90 | map_matrix = np.zeros((num_to_tasks, num_from_tasks)) 91 | 92 | for target_pathology in to_seq: 93 | to_id = to_seq[target_pathology] 94 | if target_pathology in from_seq: 95 | from_id = from_seq[target_pathology] 96 | map_matrix[to_id, from_id] = 1 97 | 98 | return map_matrix 99 | 100 | def label_overlap(self): 101 | """Utility method to check overlap 102 | between the two label_sequences""" 103 | 104 | overlap = set(self.from_seq).intersection(set(self.to_seq)) 105 | 106 | return list(overlap) 107 | 108 | @staticmethod 109 | def display(sequence, array): 110 | """Prints in easy to read format the binary array 111 | and label sequence. 112 | 113 | Put this in this class mainly for namespacing purposes. 114 | """ 115 | 116 | tasks = list(sequence) 117 | array = array.tolist() 118 | assert len(tasks) == len(array) 119 | 120 | path_label_dict = dict(zip(tasks, array)) 121 | 122 | print(json.dumps(path_label_dict, indent=4)) 123 | 124 | return dict(zip(tasks, array)) 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/nih_dataset.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from PIL import Image 6 | import torch 7 | 8 | from .base_dataset import BaseDataset 9 | 10 | class NIHDataset(BaseDataset): 11 | 12 | def __init__(self, data_dir, 13 | transform_args, split, is_training, tasks_to, frac, toy=False): 14 | """ NIH Dataset 15 | Args: 16 | data_dir (string): Name of the root data directory.
17 | transform_args (Namespace): Namespace object containing all the transform arguments. 18 | split (string): Name of the dataset split to load (train, valid or test). 19 | tasks_to (dict): The sequence of tasks. 20 | """ 21 | 22 | super().__init__(data_dir, transform_args, 23 | split, is_training, 'nih', tasks_to) 24 | 25 | self.study_level = False 26 | 27 | # Load data from csv 28 | df = self._load_df(self.data_dir, split) 29 | if toy and split == 'train': 30 | df = df.sample(n=20) 31 | df = df.reset_index(drop=True) 32 | 33 | if frac != 1 and is_training: 34 | df = df.sample(frac=frac) 35 | df = df.reset_index(drop=True) 36 | 37 | # Get labels and studies 38 | self.labels = self._get_labels(df) 39 | 40 | # Get image paths 41 | self.img_paths = self._get_paths(df) 42 | 43 | # Set transforms and class weights 44 | self._set_class_weights(self.labels) 45 | 46 | @staticmethod 47 | def _load_df(data_dir, split): 48 | 49 | if split == 'test': 50 | csv_path = data_dir / 'test420.csv' 51 | else: 52 | csv_path = data_dir / (split + '_medium.csv') 53 | 54 | df = pd.read_csv(csv_path) 55 | img_dir = data_dir / 'images' 56 | df['Path'] = df['Path'].apply(lambda x: img_dir / x) 57 | df = df.reset_index(drop=True) 58 | 59 | return df 60 | 61 | @staticmethod 62 | def _get_paths(df): 63 | """Get list of paths to images""" 64 | 65 | # Skip the first row, to stay aligned with _get_labels below 66 | return df['Path'].tolist()[1:] 67 | 68 | def _get_studies(self, df): 69 | """The NIH dataset does not have study level data""" 70 | return None 71 | def _get_labels(self, df): 72 | """Return all the labels. 73 | 74 | In the NIH dataset all labels are in one column. The 75 | different pathologies are separated with pipes, 76 | e.g.: 0|0|1|1|0|1|1|0|0|0|0|1|0|0 77 | """ 78 | 79 | labels = np.array([np.fromstring(row['Label'], sep='|', dtype=int) for i, row in df.iterrows() if i]) 80 | return labels 81 | 82 | def __getitem__(self, index): 83 | 84 | # Get and transform the label 85 | label = self.labels[index, :] 86 | if self.label_mapper is not None: 87 | label = self.label_mapper.map(label) 88 | label = torch.FloatTensor(label) 89 | 90 | # Get and transform the image 91 | img = Image.open(self.img_paths[index]).convert('RGB') 92 | img = self.transform(img) 93 | 94 | return img, label 95 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/pad_collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pad_tensor(vec, pad, dim): 5 | """ 6 | args: 7 | vec - tensor to pad 8 | pad - the size to pad to 9 | dim - dimension to pad 10 | 11 | return: 12 | a new tensor padded to 'pad' in dimension 'dim' 13 | """ 14 | pad_size = list(vec.shape) 15 | pad_size[dim] = pad - vec.size(dim) 16 | return torch.cat([vec, torch.zeros(*pad_size)], dim=dim) 17 | 18 | 19 | class PadCollate: 20 | """ 21 | a variant of collate_fn that pads according to the longest sequence in 22 | a batch of sequences 23 | """ 24 | 25 | def __init__(self, dim=0): 26 | """ 27 | args: 28 | dim - the dimension to be padded (dimension of time in sequences) 29 | """ 30 | self.dim = dim 31 | 32 | def pad_collate(self, batch): 33 | """ 34 | args: 35 | batch - list of (tensor, label) 36 | 37 | return: 38 | a tuple containing each component of the examples in 'batch', 39 | merged (and padded) across the batch, followed by 40 | mask - a mask with 0s in positions that should be ignored 41 | """ 42 | # find longest sequence 43 | study_lens = list(map(lambda x: x[0].shape[self.dim], batch)) 44 | max_len =
max(study_lens) 45 | 46 | # Pad each example's first component up to max_len 47 | num_components = max(len(x) for x in batch) 48 | batch = [(pad_tensor(x[0], pad=max_len, dim=self.dim),) + tuple(x[1:]) for x in batch] 49 | 50 | # Stack padded items and merge each component of the batch 51 | batch = tuple(self._merge(batch, component_idx=i) for i in range(num_components)) 52 | masks = [[1] * sl + [0] * (max_len - sl) for sl in study_lens] 53 | masks = torch.tensor(masks, dtype=torch.float32) 54 | 55 | return batch + (masks,) 56 | 57 | def __call__(self, batch): 58 | return self.pad_collate(batch) 59 | 60 | @staticmethod 61 | def _merge(batch, component_idx): 62 | """Merge components of a batch into a single tensor or list. 63 | 64 | Args: 65 | batch: Batch to merge. 66 | component_idx: Index of component in each example that will be merged. 67 | 68 | Returns: 69 | Merged components 70 | """ 71 | # Group all components into list 72 | components = [x[component_idx] for x in batch] 73 | assert len(components) > 0, 'Error in pad_collate: Cannot merge a batch of size 0' 74 | first_component = components[0] 75 | 76 | # Merge based on data type of components 77 | if isinstance(first_component, dict): 78 | merged_components = {k: [d[k] for d in components] for k in first_component} 79 | elif isinstance(first_component, torch.Tensor): 80 | merged_components = torch.stack(components, dim=0) 81 | else: 82 | raise ValueError('Unexpected type in PadCollate._merge: {}'.format(type(components[0]))) 83 | 84 | return merged_components 85 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/predict_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "aggregation_method": "mean", 3 | "task2models": { 4 | "No Finding": [ 5 | { 6 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 7 | "is_3class": true 8 | } 9 | ], 10 | "Enlarged Cardiomediastinum": [ 11 | { 12 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 13 | "is_3class": true 14 | } 15 | ], 16 | "Cardiomegaly": [ 17 | { 18 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 19 | "is_3class": true 20 | } 21 | ], 22 | "Lung Lesion": [ 23 | { 24 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 25 | "is_3class": true 26 | } 27 | ], 28 | "Airspace Opacity": [ 29 | { 30 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 31 | "is_3class": true 32 | } 33 | ], 34 | "Edema": [ 35 | { 36 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_ones/best.pth.tar", 37 | "is_3class": false 38 | } 39 | ], 40 | "Consolidation": [ 41 | { 42 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_ignore/best.pth.tar", 43 | "is_3class": false 44 | } 45 | ], 46 | "Pneumonia": [ 47 | { 48 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 49 | "is_3class": true 50 | } 51 | ], 52 | "Atelectasis": [ 53 | { 54 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_ones/best.pth.tar", 55 | "is_3class": false 56 | } 57 | ], 58 | "Pneumothorax": [ 59 | { 60 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 61 | "is_3class": true 62 | } 63 | ], 64 | "Pleural Effusion": [ 65 | { 66 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 67 | "is_3class": true 68 | } 69 | ], 70 | "Pleural Other": [ 71 | { 72 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 73 | "is_3class": true 74 | } 75 | ], 76 | "Fracture": [
77 | { 78 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 79 | "is_3class": true 80 | } 81 | ], 82 | "Support Devices": [ 83 | { 84 | "ckpt_path": "ckpts/DenseNet121_224_1e-04_no_hier_3class/best.pth.tar", 85 | "is_3class": true 86 | } 87 | ] 88 | } 89 | } -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/task_sequences.json: -------------------------------------------------------------------------------- 1 | { 2 | "competition": { 3 | "Atelectasis": 0, 4 | "Cardiomegaly": 1, 5 | "Consolidation": 2, 6 | "Edema": 3, 7 | "Pleural Effusion": 4 8 | }, 9 | "stanford": { 10 | "No Finding": 0, 11 | "Enlarged Cardiomediastinum": 1, 12 | "Cardiomegaly": 2, 13 | "Lung Lesion": 3, 14 | "Airspace Opacity": 4, 15 | "Edema": 5, 16 | "Consolidation": 6, 17 | "Pneumonia": 7, 18 | "Atelectasis": 8, 19 | "Pneumothorax": 9, 20 | "Pleural Effusion": 10, 21 | "Pleural Other": 11, 22 | "Fracture": 12, 23 | "Support Devices": 13 24 | }, 25 | "stanford_exclude_NF": { 26 | "Enlarged Cardiomediastinum": 0, 27 | "Cardiomegaly": 1, 28 | "Lung Lesion": 2, 29 | "Airspace Opacity": 3, 30 | "Edema": 4, 31 | "Consolidation": 5, 32 | "Pneumonia": 6, 33 | "Atelectasis": 7, 34 | "Pneumothorax": 8, 35 | "Pleural Effusion": 9, 36 | "Pleural Other": 10, 37 | "Fracture": 11, 38 | "Support Devices": 12 39 | }, 40 | "nih": { 41 | "Cardiomegaly": 0, 42 | "Emphysema": 1, 43 | "Pleural Effusion": 2, 44 | "Hernia": 3, 45 | "Infiltration": 4, 46 | "Mass": 5, 47 | "Nodule": 6, 48 | "Atelectasis": 7, 49 | "Pneumothorax": 8, 50 | "Pleural Thickening": 9, 51 | "Pneumonia": 10, 52 | "Fibrosis": 11, 53 | "Edema": 12, 54 | "Consolidation": 13 55 | }, 56 | 57 | "nih_su_union": { 58 | "Pleural Effusion": 0, 59 | "Pleural Other": 1, 60 | "Infiltration": 2, 61 | "Consolidation": 3, 62 | "Mass": 4, 63 | "Support Devices": 5, 64 | "Airspace Opacity": 6, 65 | "Lung Lesion": 7, 66 | "No Finding": 8, 67 | "Atelectasis": 9, 68 | "Nodule": 10, 69 | "Pneumothorax": 11, 70 | "Enlarged Cardiomediastinum": 12, 71 | "Fracture": 13, 72 | "Edema": 14, 73 | "Emphysema": 15, 74 | "Pleural Thickening": 16, 75 | "Hernia": 17, 76 | "Pneumonia": 18, 77 | "Fibrosis": 19, 78 | "Cardiomegaly": 20 79 | }, 80 | "su_using_nih_labeller": { 81 | "Cardiomegaly": 0, 82 | "Edema": 1, 83 | "Consolidation": 2, 84 | "Pneumonia": 3, 85 | "Atelectasis": 4, 86 | "Pneumothorax": 5, 87 | "Pleural Effusion": 6 88 | }, 89 | "single_atelectasis": { 90 | "Atelectasis": 0 91 | }, 92 | "single_cardiomegaly": { 93 | "Cardiomegaly": 0 94 | }, 95 | "single_consolidation": { 96 | "Consolidation": 0 97 | }, 98 | "single_edema": { 99 | "Edema": 0 100 | }, 101 | "single_pleural_effusion": { 102 | "Pleural Effusion": 0 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .clahe import CLAHE 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/dataset/transforms/clahe.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | from PIL import Image 4 | 5 | class CLAHE(object): 6 | """ Apply CLAHE on a single image""" 7 | 8 | def __init__(self, clip_limit=2.0, tile_grid_size=(8,8)): 9 | self.clip_limit = clip_limit 10 | self.tile_grid_size = tile_grid_size 11 | 12 | def
__call__(self, PIL_img, save=False): 13 | im_np = np.asarray(PIL_img) 14 | im_np = cv2.cvtColor(im_np, cv2.COLOR_BGR2GRAY) 15 | 16 | # create a CLAHE object (Arguments are optional) 17 | clahe = cv2.createCLAHE(self.clip_limit, self.tile_grid_size) 18 | cl1 = clahe.apply(im_np) 19 | imaged = cv2.cvtColor(cl1, cv2.COLOR_GRAY2RGB) 20 | img = Image.fromarray(imaged) 21 | 22 | if save: 23 | # Save the original and the CLAHEd image for display 24 | PIL_img.save("original.png") 25 | img.save("CLAHEd.png") 26 | return img 27 | 28 | # Not sure if the following is necessary 29 | def __repr__(self): 30 | return self.__class__.__name__ + '(clip_limit={0}, tile_grid_size={1})'.format(self.clip_limit, self.tile_grid_size) 31 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/__init__.py: -------------------------------------------------------------------------------- 1 | from .average_meter import AverageMeter 2 | from .evaluator import Evaluator 3 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/average_meter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value. 3 | 4 | Adapted from: 5 | https://github.com/pytorch/examples/blob/master/imagenet/main.py 6 | """ 7 | def __init__(self): 8 | self.avg = 0 9 | self.val = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def reset(self): 14 | self.__init__() 15 | 16 | def update(self, val, n=1): 17 | self.val = val 18 | self.sum += val * n 19 | self.count += n 20 | self.avg = self.sum / self.count 21 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/below_curve_counter.py: -------------------------------------------------------------------------------- 1 | """Define below curve counter class.""" 2 | import sklearn.metrics as sk_metrics 3 | 4 | 5 | class BelowCurveCounter(object): 6 | def __init__(self, rad_perf, task_name): 7 | self.rad_perf = rad_perf 8 | self.task_name = task_name 9 | 10 | def ROC(self, ground_truth, predictions): 11 | 12 | self.rad_perf.index = self.rad_perf['Score'] 13 | num_below_roc = 0 14 | 15 | fpr, tpr, threshold = sk_metrics.roc_curve(ground_truth, predictions) 16 | for rad_name in ['Rad1', 'Rad2', 'Rad3']: 17 | rad_sensitivity =\ 18 | self.rad_perf.loc[f'{self.task_name} Sensitivity', 19 | rad_name] 20 | rad_specificity =\ 21 | self.rad_perf.loc[f'{self.task_name} Specificity', 22 | rad_name] 23 | 24 | rad_vertical_projection, rad_horizontal_projection =\ 25 | self._project(fpr, tpr, 1 - rad_specificity, rad_sensitivity) 26 | 27 | if (rad_vertical_projection >= rad_sensitivity): 28 | num_below_roc += 1 29 | 30 | return num_below_roc 31 | 32 | def PR(self, ground_truth, predictions): 33 | self.rad_perf.index = self.rad_perf['Score'] 34 | 35 | num_below_pr = 0 36 | precision, recall, threshold =\ 37 | sk_metrics.precision_recall_curve(ground_truth, predictions) 38 | 39 | for rad_name in ['Rad1', 'Rad2', 'Rad3']: 40 | rad_sensitivity =\ 41 | self.rad_perf.loc[f'{self.task_name} Sensitivity', 42 | rad_name] 43 | rad_precision =\ 44 | self.rad_perf.loc[f'{self.task_name} Precision', 45 | rad_name] 46 | 47 | rad_vertical_projection, rad_horizontal_projection =\ 48 | self._project(recall, precision, 49 | rad_sensitivity, rad_precision) 50 | 51 | if (rad_vertical_projection >= rad_precision): 52 | num_below_pr += 1 53 | 54 |
return num_below_pr 55 | 56 | @staticmethod 57 | def _project(X, Y, rad_x, rad_y): 58 | """Find the closest points on the curve to the point in 59 | X and Y directions.""" 60 | x = 0 61 | y = 0 62 | 63 | while ((x + 2 < len(X)) and ((X[x] > rad_x and X[x + 1] > rad_x) 64 | or (X[x] < rad_x and X[x + 1] < rad_x))): 65 | x += 1 66 | while ((y + 2 < len(Y)) and ((Y[y] > rad_y and Y[y + 1] > rad_y) 67 | or (Y[y] < rad_y and Y[y + 1] < rad_y))): 68 | y += 1 69 | 70 | rad_vertical_projection =\ 71 | (Y[x + 1] - Y[x]) * (rad_x - X[x]) + Y[x] 72 | rad_horizontal_projection =\ 73 | (X[y + 1] - X[y]) * (rad_y - Y[y]) + X[y] 74 | 75 | return rad_vertical_projection, rad_horizontal_projection 76 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sklearn.metrics as sk_metrics 4 | import torch.nn as nn 5 | 6 | from .below_curve_counter import BelowCurveCounter 7 | from .loss import CrossEntropyLossWithUncertainty, MaskedLossWrapper 8 | 9 | 10 | class Evaluator(object): 11 | """Evaluator class for evaluating predictions against 12 | binary groundtruth.""" 13 | def __init__(self, logger=None, **kwargs): 14 | self.logger = logger 15 | self.kwargs = kwargs 16 | 17 | if "operating_points_path" in kwargs: 18 | self.rad_perf = pd.read_csv(kwargs["operating_points_path"]) 19 | else: 20 | self.rad_perf = None 21 | 22 | self.set_eval_functions() 23 | 24 | def evaluate(self, groundtruth, predictions, metric, threshold=0.5): 25 | """Evaluate a single metric on groundtruth and predictions.""" 26 | print("Evaluating metric: {}".format(metric)) 27 | if metric in self.summary_metrics: 28 | metric_fn = self.summary_metrics[metric] 29 | value = metric_fn(groundtruth, predictions) 30 | elif metric in self.curve_metrics: 31 | metric_fn = self.curve_metrics[metric] 32 | value = metric_fn(groundtruth, predictions) 33 | elif metric in self.point_metrics: 34 | metric_fn = self.point_metrics[metric] 35 | value = metric_fn(groundtruth, predictions > threshold) 36 | # if metric == 'precision' or metric == 'recall': 37 | # if value < 0.01: 38 | # raise ValueError(f"Metric {metric} should not have score less than 0.01") 39 | else: 40 | raise ValueError(f"Metric {metric} not supported.") 41 | 42 | return value 43 | 44 | def evaluate_tasks(self, groundtruth, predictions, threshold=0.5): 45 | """Compute evaluation metrics and curves on multiple tasks.""" 46 | metrics = {} 47 | curves = {} 48 | for task in list(predictions): 49 | print("Evaluating task: {}".format(task)) 50 | 51 | task_groundtruth = groundtruth[task] 52 | task_predictions = predictions[task] 53 | # filter out those with -1 in groundtruth 54 | non_label = task_groundtruth.index[task_groundtruth == -1.0] 55 | task_predictions = task_predictions.drop(non_label) 56 | task_groundtruth = task_groundtruth.drop(non_label) 57 | 58 | metrics.update({f"{task}:{metric}": 59 | self.evaluate(task_groundtruth, 60 | task_predictions, 61 | metric=metric) 62 | for metric in self.summary_metrics}) 63 | 64 | metrics.update({f"{task}:{metric}@thresh={threshold}": 65 | self.evaluate(task_groundtruth, 66 | task_predictions, 67 | metric=metric, 68 | threshold=threshold) 69 | for metric in self.point_metrics}) 70 | """ 71 | if self.rad_perf is not None: 72 | 73 | below_curve_counter = BelowCurveCounter(self.rad_perf, 74 | task) 75 | metrics.update({ 76 |
f'{task}:rads_below_ROC': 77 | below_curve_counter.ROC(task_groundtruth, 78 | task_predictions), 79 | f'{task}:rads_below_PR': 80 | below_curve_counter.PR(task_groundtruth, 81 | task_predictions) 82 | }) 83 | """ 84 | curves.update({f"{task}:{metric}": 85 | self.evaluate(task_groundtruth, 86 | task_predictions, 87 | metric=metric, 88 | threshold=threshold) 89 | for metric in self.curve_metrics}) 90 | 91 | return metrics, curves 92 | 93 | def evaluate_average_metric(self, metrics, evaluate_tasks, 94 | average_metric_name): 95 | """Evaluate an average metric over classes.""" 96 | 97 | # All provided names must be of the form "...-{metric_name}" 98 | metric_name = average_metric_name.split("-")[-1] 99 | 100 | average_metric = np.mean([metrics[f"{task}:{metric_name}"] 101 | for task in evaluate_tasks]) 102 | 103 | return average_metric 104 | 105 | def set_eval_functions(self): 106 | """Set the evaluation functions.""" 107 | def undefined_catcher(func, x, y): 108 | try: 109 | return func(x, y) 110 | except Exception: 111 | return np.nan 112 | 113 | # Functions that take probs as input 114 | self.summary_metrics = { 115 | 'AUPRC': lambda x, y: undefined_catcher(sk_metrics.average_precision_score, x, y), 116 | 'AUROC': lambda x, y: undefined_catcher(sk_metrics.roc_auc_score, x, y), 117 | 'log_loss': lambda x, y: undefined_catcher(sk_metrics.log_loss, x, y), 118 | } 119 | 120 | # Functions that take binary values as input 121 | self.point_metrics = { 122 | 'accuracy': lambda x, y: undefined_catcher(sk_metrics.accuracy_score, x, y), 123 | 'precision': lambda x, y: undefined_catcher(sk_metrics.precision_score, x, y), 124 | 'recall': lambda x, y: undefined_catcher(sk_metrics.recall_score, x, y), 125 | } 126 | 127 | self.curve_metrics = { 128 | 'PRC': lambda x, y: undefined_catcher(sk_metrics.precision_recall_curve, x, y), 129 | 'ROC': lambda x, y: undefined_catcher(sk_metrics.roc_curve, x, y), 130 | } 131 | 132 | def get_loss_fn(self, loss_fn_name, model_uncertainty, 133 | mask_uncertain, device): 134 | """Get the loss function used for training. 135 | 136 | Args: 137 | loss_fn_name: Name of loss function to use. 138 | model_uncertainty: Bool indicating whether to predict 139 | UNCERTAIN directly. 140 | mask_uncertain: Bool indicating whether to mask 141 | UNCERTAIN labels. 142 | device: device to compute loss on (gpu or cpu). 143 | """ 144 | print("evaluator: loss function name: {}".format(loss_fn_name)) 145 | if model_uncertainty: 146 | loss_fn = CrossEntropyLossWithUncertainty() 147 | elif loss_fn_name == 'cross_entropy': 148 | loss_fn = nn.BCEWithLogitsLoss(reduction="none" 149 | if mask_uncertain else "mean") 150 | 151 | # Apply a wrapper that masks uncertain labels. 
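# (MaskedLossWrapper, defined in eval/loss.py below, zeroes the per-element
# loss wherever the target is UNCERTAIN (-1) or MISSING (-2) and then
# renormalizes by the number of unmasked entries.)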
152 | if mask_uncertain: 153 | loss_fn = MaskedLossWrapper(loss_fn, device) 154 | 155 | else: 156 | raise ValueError("No loss function for supplied arguments.") 157 | 158 | return loss_fn 159 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/eval/loss.py: -------------------------------------------------------------------------------- 1 | """Define uncertainty cross entropy class.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from constants import * 6 | 7 | 8 | class CrossEntropyLossWithUncertainty(nn.Module): 9 | """Cross-entropy loss modified to also include uncertainty outputs.""" 10 | def __init__(self, size_average=True, reduce=True): 11 | super(CrossEntropyLossWithUncertainty, self).__init__() 12 | self.ce_loss = nn.CrossEntropyLoss(reduce=False) 13 | self.size_average = size_average 14 | self.reduce = reduce 15 | 16 | def forward(self, logits, labels): 17 | """ 18 | Args: 19 | logits: Un-normalized outputs of shape (batch_size, num_tasks, 3) 20 | labels: Labels of shape (batch_size, num_tasks) 21 | where -1 is uncertain, 0 is negative, 1 is positive. 22 | """ 23 | batch_size, last_dim = logits.size() 24 | if last_dim % 3: 25 | raise ValueError('Last dim should be divisible by 3, ' + 26 | f'got last dim of {last_dim}') 27 | num_tasks = last_dim // 3 28 | 29 | # Fuse batch and task dimensions 30 | logits = logits.view(batch_size * num_tasks, 3) 31 | # Shift labels into range [0, 2] 32 | labels = (labels + 1).type(torch.int64) 33 | # Flatten 34 | labels = labels.view(-1) 35 | 36 | # Output shape (batch_size * num_tasks,) 37 | loss = self.ce_loss(logits, labels) 38 | # Reshape and take average over batch dim 39 | loss = loss.view(batch_size, num_tasks) 40 | 41 | if self.size_average: 42 | loss = loss.mean(1) 43 | if self.reduce: 44 | loss = loss.mean(0) 45 | 46 | return loss 47 | 48 | 49 | class MaskedLossWrapper(nn.Module): 50 | 51 | def __init__(self, loss_fn, device): 52 | 53 | super().__init__() 54 | self.loss_fn = loss_fn 55 | self.device = device 56 | 57 | def _get_mask(self, targets): 58 | """Returns a mask to mask uncertain 59 | and missing labels. 
60 | 61 | The function takes advantage of the following label encoding: 62 | Negative/Positive: 0/1 63 | Uncertain: -1 64 | Missing: -2 """ 65 | 66 | mask = torch.ones(targets.shape) 67 | mask[targets == UNCERTAIN] = 0 68 | mask[targets == MISSING] = 0 69 | 70 | mask = mask.to(self.device) 71 | 72 | return mask 73 | 74 | def forward(self, logits, targets): 75 | 76 | # Apply loss function 77 | loss = self.loss_fn(logits, targets) 78 | 79 | # Apply mask to skip missing labels 80 | # and handle uncertain labels 81 | mask = self._get_mask(targets) 82 | loss = loss * mask 83 | 84 | # Average the loss 85 | loss = loss.sum() 86 | loss = loss * (1 / (mask.sum())) 87 | 88 | return loss 89 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import Logger -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/logger/logger.py: -------------------------------------------------------------------------------- 1 | """Define Logger class for logging information to stdout and disk.""" 2 | import pandas as pd 3 | import sys 4 | from tensorboardX import SummaryWriter 5 | 6 | from constants import COL_PATH, COL_TASK, COL_METRIC, COL_VALUE 7 | 8 | 9 | class Logger(object): 10 | """Class for logging output.""" 11 | def __init__(self, log_path, save_dir, results_dir=None): 12 | self.log_path = log_path 13 | self.log_file = log_path.open('w') 14 | 15 | self.tb_log_dir = save_dir / "tb" 16 | self.summary_writer = SummaryWriter(log_dir=str(self.tb_log_dir)) 17 | 18 | self.results_dir = results_dir 19 | if results_dir is not None: 20 | self.metrics_path = results_dir / "scores.txt" 21 | self.metrics_csv_path = results_dir / "scores.csv" 22 | self.metrics_file = self.metrics_path.open('w') 23 | self.predictions_path = results_dir / "predictions.csv" 24 | self.groundtruth_path = results_dir / "groundtruth.csv" 25 | 26 | def log(self, *args): 27 | self.log_stdout(*args) 28 | print(*args, file=self.log_file) 29 | self.log_file.flush() 30 | 31 | def log_metrics(self, metrics, save_csv=False): 32 | for metric, value in metrics.items(): 33 | msg = f'{metric}:\t{value}' 34 | if self.results_dir is not None: 35 | self.log_stdout(msg) 36 | print(msg, file=self.metrics_file) 37 | self.metrics_file.flush() 38 | else: 39 | self.log(f"[{msg}]") 40 | 41 | if save_csv: 42 | col_tasks = [] 43 | col_metrics = [] 44 | col_values = [] 45 | for task_metric, value in metrics.items(): 46 | # Extract task and metric from dict key 47 | tokens = task_metric.split(":") 48 | assert len(tokens) == 2, "Failed to split key on ':'!"
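            # Keys look like "<task>:<metric>", e.g. "Cardiomegaly:AUROC"
            # (point metrics carry an "@thresh=..." suffix on the metric name).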
49 | task, metric = tokens 50 | col_tasks.append(task) 51 | col_metrics.append(metric) 52 | col_values.append(value) 53 | 54 | # Assemble a DataFrame and save as CSV 55 | metrics_df = pd.DataFrame({COL_TASK: col_tasks, 56 | COL_METRIC: col_metrics, 57 | COL_VALUE: col_values}) 58 | metrics_df.to_csv(self.metrics_csv_path, index=False) 59 | 60 | def log_stdout(self, *args): 61 | print(*args, file=sys.stdout) 62 | sys.stdout.flush() 63 | 64 | def close(self): 65 | self.log_file.close() 66 | 67 | def log_scalars(self, scalar_dict, iterations, print_to_stdout=True): 68 | """Log all values in a dict as scalars to TensorBoard.""" 69 | for k, v in scalar_dict.items(): 70 | if print_to_stdout: 71 | self.log_stdout(f'[{k}: {v:.3g}]') 72 | k = k.replace(':', '/') # Group in TensorBoard by phase 73 | self.summary_writer.add_scalar(k, v, iterations) 74 | 75 | # def log_scalars2(self, scalar_dict, iterations, print_to_stdout=True): 76 | # """Log AUROC and accuracy in a dict as scalars to TensorBoard.""" 77 | # for k, v in scalar_dict.items(): 78 | # # Only prints AUROC and accuracy 79 | # if ('AUROC' in k) or ('accuracy' in k): 80 | # k = k.replace(':', '/') # Group in TensorBoard by phase 81 | # self.summary_writer.add_scalar(k, v, iterations) 82 | 83 | def log_predictions_groundtruth(self, predictions, groundtruth, 84 | paths=None): 85 | if paths is not None: 86 | predictions.insert(0, COL_PATH, paths) 87 | groundtruth.insert(0, COL_PATH, paths) 88 | 89 | predictions.to_csv(self.predictions_path, index=False) 90 | groundtruth.to_csv(self.groundtruth_path, index=False) 91 | 92 | if paths is not None: 93 | del predictions[COL_PATH] 94 | del groundtruth[COL_PATH] 95 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import * 2 | from .calibrate import Calibrator 3 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/models/calibrate.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | 4 | from pathlib import Path 5 | from sklearn.isotonic import IsotonicRegression 6 | from sklearn.calibration import _SigmoidCalibration 7 | 8 | 9 | class Calibrator(object): 10 | """Class for performing post-processing calibration techniques.""" 11 | def __init__(self, calibrator_type, calibrator_dir, task_name, eval=True): 12 | # Where to save or load calibration model 13 | self.calibrator_type = calibrator_type 14 | self.path = calibrator_dir / (f"{calibrator_type}_{task_name}.pkl") 15 | self.eval = eval 16 | 17 | if self.eval: 18 | # If in eval mode, load the calibration model 19 | self.load() 20 | 21 | def predict(self, y_prob): 22 | # Run the loaded calibration model 23 | return self.calibrator.predict(y_prob) 24 | 25 | def train(self, y_true, y_prob): 26 | if self.calibrator_type == 'isotonic': 27 | self.calibrator = IsotonicRegression(out_of_bounds='clip') 28 | elif self.calibrator_type == 'platt': 29 | self.calibrator = _SigmoidCalibration() 30 | 31 | self.calibrator.fit(y_prob, y_true) 32 | 33 | self.save() 34 | 35 | def load(self): 36 | print(f"Loading calibration model from {self.path}") 37 | with self.path.open('rb') as f: 38 | self.calibrator = pickle.load(f) 39 | 40 | def save(self): 41 | print(f"Saving calibration model to {self.path}") 42 | if not self.path.parent.exists(): 43 | 
self.path.parent.mkdir(parents=True) 44 | with self.path.open('wb') as f: 45 | pickle.dump(self.calibrator, f) 46 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/optim/__init__.py: -------------------------------------------------------------------------------- 1 | from .optimizer import Optimizer 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/predict/__init__.py: -------------------------------------------------------------------------------- 1 | from .predict import Predictor 2 | from .ensemble_predict import EnsemblePredictor 3 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/predict/configs/toy.json: -------------------------------------------------------------------------------- 1 | { 2 | "aggregation_method": "mean", 3 | "task2models": { 4 | "Atelectasis": [ 5 | { 6 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_ones_top10/iter_336000.pth.tar", 7 | "is_3class": false 8 | }, 9 | { 10 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_ones_top10/iter_384000.pth.tar", 11 | "is_3class": false 12 | } 13 | ], 14 | "Cardiomegaly": [ 15 | { 16 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_3-class_top10/iter_403200.pth.tar", 17 | "is_3class": true 18 | }, 19 | { 20 | "ckpt_path": "/deep/group/xray4all/final_ckpts/DenseNet121_320_1e-04_uncertainty_3-class_top10/iter_350400.pth.tar", 21 | "is_3class": true 22 | } 23 | ] 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/predict/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | import util 7 | 8 | NEG_INF = -1e9 9 | 10 | 11 | class Predictor(object): 12 | """Predictor class for a single model.""" 13 | def __init__(self, model, device): 14 | 15 | self.model = model 16 | self.device = device 17 | 18 | def predict(self, loader): 19 | self.model.eval() 20 | probs = [] 21 | gt = [] 22 | all_embeddings = [] 23 | if loader.dataset.return_info_dict: 24 | paths = [] 25 | with tqdm(total=len(loader.dataset)) as progress_bar: 26 | for data in loader: 27 | with torch.no_grad(): 28 | if loader.dataset.study_level: 29 | if loader.dataset.return_info_dict: 30 | inputs, targets, info_dict, mask = data 31 | else: 32 | inputs, targets, mask = data 33 | 34 | 35 | # Fuse batch size `b` and study length `s` 36 | b, s, c, h, w = inputs.size() 37 | inputs = inputs.view(-1, c, h, w) 38 | 39 | # Predict 40 | logits, embeddings = self.model(inputs.to(self.device)) 41 | all_embeddings.append(embeddings.detach().cpu().numpy()) 42 | logits = logits.view(b, s, -1) 43 | 44 | # Mask padding to negative infinity 45 | ignore_where = (mask == 0).unsqueeze(-1) 46 | ignore_where = ignore_where.repeat(1, 1, 47 | logits.size(-1)) 48 | ignore_where = ignore_where.to(self.device) 49 | logits = torch.where(ignore_where, 50 | torch.full_like(logits, NEG_INF), 51 | logits) 52 | batch_logits, _ = torch.max(logits, 1) 53 | 54 | else: 55 | if loader.dataset.return_info_dict: 56 | inputs, targets, info_dict = data 57 | else: 58 | inputs, targets = data 59 | 60 | batch_logits = self.model(inputs.to(self.device)) 61 | 62 | if 
self.model.module.model_uncertainty: 63 | batch_probs =\ 64 | util.uncertain_logits_to_probs(batch_logits) 65 | else: 66 | batch_probs = torch.sigmoid(batch_logits) 67 | 68 | probs.append(batch_probs.cpu()) 69 | gt.append(targets) 70 | if loader.dataset.return_info_dict: 71 | paths.extend(info_dict['paths']) 72 | progress_bar.update(targets.size(0)) 73 | 74 | concat = np.concatenate(all_embeddings) 75 | all_embeddings = concat.reshape(len(concat), -1) 76 | probs_concat = np.concatenate(probs) 77 | gt_concat = np.concatenate(gt) 78 | 79 | with open('cx_res18.npy', 'wb') as f: 80 | np.save(f, all_embeddings) 81 | np.save(f, gt_concat) 82 | 83 | print(probs_concat.shape) 84 | print(gt_concat.shape) 85 | tasks = self.model.module.tasks # Tasks decided at self.model.module.tasks. 86 | print(tasks) 87 | probs_df = pd.DataFrame({task: probs_concat[:, i] 88 | for i, task in enumerate(tasks)}) 89 | gt_df = pd.DataFrame({task: gt_concat[:, i] # Check how gt_df looks like. 90 | for i, task in enumerate(tasks)}) 91 | 92 | self.model.train() 93 | 94 | if loader.dataset.return_info_dict: 95 | return probs_df, gt_df, paths 96 | 97 | return probs_df, gt_df 98 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/saver/__init__.py: -------------------------------------------------------------------------------- 1 | from .model_saver import ModelSaver 2 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/scripts/get_cams.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | sys.path.append(str(Path(__file__).absolute().parent.parent)) 4 | 5 | import os 6 | 7 | import pandas as pd 8 | import cv2 9 | import torch 10 | import numpy as np 11 | from imageio import imsave 12 | 13 | import util 14 | from dataset import TASK_SEQUENCES 15 | from cams import GradCAM, EnsembleCAM 16 | from cams import GuidedBackPropagation 17 | from saver import ModelSaver 18 | from args import TestArgParser 19 | from dataset import get_loader, get_eval_loaders 20 | from dataset.constants import IMAGENET_MEAN, IMAGENET_STD 21 | 22 | def save_grad_cams(args, loader, model, output_dir, only_competition=False, only_top_task=False): 23 | """Save grad cams for all examples in a loader.""" 24 | 25 | # 'study_level' determined if the loader is returning 26 | # studies or individual images 27 | study_level = loader.dataset.study_level 28 | 29 | # NOTE: some model does not have task_sequence 30 | if hasattr(model.module, 'task_sequence'): 31 | task_sequence = model.module.task_sequence 32 | # NOTE: Right now hard code to "stanford" task_sequence, 33 | # to match the number of predictions CheXpert makes. 34 | else: 35 | # task_sequence = TASK_SEQUENCES[data_args.task_sequence] 36 | task_sequence = TASK_SEQUENCES["stanford"] 37 | print(f'WARNING: assuming that the models task sequence is \n {task_sequence}') 38 | 39 | if hasattr(model, "task2model_dicts"): 40 | grad_cam = EnsembleCAM(model, args.device) 41 | else: 42 | grad_cam = GradCAM(model, args.device) 43 | 44 | # By keeping track of the example id 45 | # we can name each folder using the example_id. 
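    # (counter advances once per study below -- or once per image in the
    # non-study-level branch -- so each example gets its own sub-directory.)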
46 | counter = 0 47 | 48 | if study_level: 49 | # for inputs_batch, labels_batch, masks_batch in loader: 50 | for inputs_batch, labels_batch, info_batch, masks_batch in loader: 51 | for i, (input_study, label_study, mask_study) in enumerate(zip(inputs_batch, labels_batch, masks_batch)): 52 | 53 | directory = f'{output_dir}/{counter}' 54 | # Loop over the views in a study 55 | view_id = 0 56 | for input_, mask_val in zip(input_study, mask_study): 57 | # Skip this image if it is just a 'padded' image 58 | if mask_val == 0: 59 | continue 60 | 61 | write_grad_cams(input_, label_study, grad_cam, directory, 62 | task_sequence, 63 | only_competition=only_competition, 64 | view_id=view_id) 65 | view_id = view_id + 1 66 | 67 | # Write label to txt and save to same folder 68 | # to make inspecting the cams easier 69 | label = np.reshape(label_study.numpy(), (1, -1)) 70 | label_df = pd.DataFrame(label, columns=list(task_sequence)) 71 | label_df["Path"] = info_batch['paths'][i] 72 | label_df["Counter"] = counter 73 | label_df.to_csv(f'{directory}/groundtruth.txt', index=False) 74 | 75 | counter = counter + 1 76 | 77 | else: 78 | for inputs, labels in loader: 79 | for input_, label in zip(inputs, labels): 80 | directory = f'{output_dir}/{counter}' 81 | write_grad_cams(input_, label, grad_cam, directory, task_sequence) 82 | 83 | counter = counter + 1 84 | 85 | def write_grad_cams(input_, label, grad_cam, 86 | directory, task_sequence, only_competition=False, only_top_task=False, view_id=None): 87 | 88 | """Creates a CAM for each image. 89 | 90 | Args: 91 | input_: Image tensor with shape (3 x h x w) 92 | grad_cam: EnsembleCAM object wrapped around GradCAM objects, which are wrapped around models. 93 | directory: The output folder for this set of CAMs. 94 | task_sequence: Ordered collection of task names, one per model output. 95 | """ 96 | if only_competition: 97 | COMPETITION_TASKS = TASK_SEQUENCES['competition'] 98 | 99 | # Get the original image by 100 | # unnormalizing (img pixels will be between 0 and 1) 101 | # img shape: c, h, w 102 | img = util.un_normalize(input_, IMAGENET_MEAN, IMAGENET_STD) 103 | 104 | # move RGB channel to last 105 | img = np.moveaxis(img, 0, 2) 106 | 107 | # Add the batch dimension 108 | # as the model requires it.
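    # (Shape goes from (c, h, w) to (1, c, h, w).)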
109 | input_ = input_.unsqueeze(0) 110 | _, channels, height, width = input_.shape 111 | num_tasks = len(task_sequence) 112 | 113 | # Create the directory for cams for this specific example 114 | if not os.path.exists(directory): 115 | os.makedirs(directory) 116 | 117 | #assert (inputs.shape[0] == 1), 'batch size must be equal to 1' 118 | with torch.set_grad_enabled(True): 119 | 120 | for task_id in range(num_tasks): 121 | task_name = list(task_sequence)[task_id] 122 | if only_competition: 123 | if task_name not in COMPETITION_TASKS: 124 | continue 125 | 126 | task = task_name.lower() 127 | task = task.replace(' ', '_') 128 | task_label = int(label[task_id].item()) 129 | if any([((task in f) and (f'v-{view_id}' in f)) for f in os.listdir(directory)]) or task_label != 1: 130 | continue 131 | 132 | probs, idx, cam = grad_cam.get_cam(input_, task_id, task_name) 133 | 134 | # Resize cam and overlay on image 135 | resized_cam = cv2.resize(cam, (height, width)) 136 | # We don't normalize since the GradCAM class has already taken care of that 137 | img_with_cam = util.add_heat_map(img, resized_cam, normalize=False) 138 | 139 | # Save a cam for this task and image 140 | # using task, prob and groundtruth in file name 141 | prob = probs[idx==task_id].item() 142 | if view_id is None: 143 | filename = f'{task}-p{prob:.3f}-gt{task_label}.png' 144 | else: 145 | filename = f'{task}-p{prob:.3f}-gt{task_label}-v-{view_id}.png' 146 | output_path = os.path.join(directory, filename) 147 | imsave(output_path, img_with_cam) 148 | 149 | 150 | # Save the original image in the same folder 151 | output_path = os.path.join(directory, f'original_image-v-{view_id}.png') 152 | img = np.uint8(img * 255) 153 | imsave(output_path, img) 154 | 155 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/scripts/get_model_size.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from prettytable import PrettyTable 4 | from torchvision import models 5 | 6 | 7 | def count_parameters(model): 8 | table = PrettyTable(["Modules", "Parameters"]) 9 | total_params = 0 10 | for name, parameter in model.named_parameters(): 11 | if not parameter.requires_grad: continue 12 | param = parameter.numel() 13 | table.add_row([name, param]) 14 | total_params+=param 15 | print(table) 16 | print(f"Total Trainable Params: {total_params}") 17 | return total_params 18 | 19 | 20 | if __name__ == '__main__' : 21 | 22 | net = models.__dict__[sys.argv[1]]() 23 | count_parameters(net) -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/test.py: -------------------------------------------------------------------------------- 1 | """Entry-point script to test models.""" 2 | import torch 3 | 4 | from args import TestArgParser 5 | from logger import Logger 6 | from predict import Predictor, EnsemblePredictor 7 | from saver import ModelSaver 8 | from data import get_loader 9 | from eval import Evaluator 10 | from constants import * 11 | from scripts.get_cams import save_grad_cams 12 | from dataset import TASK_SEQUENCES 13 | 14 | 15 | def test(args): 16 | """Run model testing.""" 17 | 18 | model_args = args.model_args 19 | data_args = args.data_args 20 | logger_args = args.logger_args 21 | 22 | # import pdb; pdb.set_trace() 23 | 24 | # Get logger.
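    # (The Logger writes a plain-text log to log_path, TensorBoard summaries
    # under save_dir/tb, and scores/predictions CSVs under results_dir;
    # see logger/logger.py.)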
25 | logger = Logger(logger_args.log_path, 26 | logger_args.save_dir, 27 | logger_args.results_dir) 28 | 29 | # Get image paths corresponding to predictions for logging 30 | paths = None 31 | 32 | if model_args.config_path is not None: 33 | # Instantiate the EnsemblePredictor class for obtaining 34 | # model predictions. 35 | predictor = EnsemblePredictor(config_path=model_args.config_path, 36 | model_args=model_args, 37 | data_args=data_args, 38 | gpu_ids=args.gpu_ids, 39 | device=args.device, 40 | logger=logger) 41 | # Obtain ensemble predictions. 42 | # Caches both individual and ensemble predictions. 43 | # We always turn off caching to ensure that we write the Path column. 44 | predictions, groundtruth, paths = predictor.predict(cache=False, 45 | return_paths=True, 46 | all_gt_tasks=True) 47 | else: 48 | # Load the model at ckpt_path. 49 | ckpt_path = model_args.ckpt_path 50 | ckpt_save_dir = Path(ckpt_path).parent 51 | model_uncertainty = model_args.model_uncertainty 52 | # Get model args from checkpoint and add them to 53 | # command-line specified model args. 54 | model_args, transform_args\ 55 | = ModelSaver.get_args(cl_model_args=model_args, 56 | dataset=data_args.dataset, 57 | ckpt_save_dir=ckpt_save_dir, 58 | model_uncertainty=model_uncertainty) 59 | 60 | # TODO JBY: in test moco should never be true. 61 | model_args.moco = args.model_args.moco 62 | model, ckpt_info = ModelSaver.load_model(ckpt_path=ckpt_path, 63 | gpu_ids=args.gpu_ids, 64 | model_args=model_args, 65 | is_training=False) 66 | 67 | # Instantiate the Predictor class for obtaining model predictions. 68 | predictor = Predictor(model=model, device=args.device) 69 | # Get phase loader object. 70 | return_info_dict = True 71 | loader = get_loader(phase=data_args.phase, 72 | data_args=data_args, 73 | transform_args=transform_args, 74 | is_training=False, 75 | return_info_dict=return_info_dict, 76 | logger=logger) 77 | # Obtain model predictions. 78 | if return_info_dict: 79 | predictions, groundtruth, paths = predictor.predict(loader) 80 | else: 81 | predictions, groundtruth = predictor.predict(loader) 82 | # print(predictions[CHEXPERT_COMPETITION_TASKS]) 83 | if model_args.calibrate: 84 | #open the json file which has the saved parameters 85 | import json 86 | with open(CALIBRATION_FILE) as f: 87 | data = json.load(f) 88 | i = 0 89 | #print(predictions) 90 | import math 91 | def sigmoid(x): 92 | return 1 / (1 + math.exp(-x)) 93 | 94 | for column in predictions: 95 | predictions[column] = predictions[column].apply \ 96 | (lambda x: sigmoid(x * data[i][0][0][0] \ 97 | + data[i][1][0])) 98 | i += 1 99 | 100 | # print(predictions[CHEXPERT_COMPETITION_TASKS]) 101 | #run forward on all the predictions in each row of predictions 102 | 103 | # Log predictions and groundtruth to file in CSV format. 104 | logger.log_predictions_groundtruth(predictions, groundtruth, paths) 105 | 106 | if not args.inference_only: 107 | # Instantiate the evaluator class for evaluating models. 108 | evaluator = Evaluator(logger, 109 | operating_points_path=CHEXPERT_RAD_PATH) 110 | # Get model metrics and curves on the phase dataset. 111 | metrics, curves = evaluator.evaluate_tasks(groundtruth, predictions) 112 | # Log metrics to stdout and file. 
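            # (With save_csv=True, log_metrics also writes scores.csv
            # alongside scores.txt in the results directory.)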
113 | logger.log_stdout(f"Writing metrics to {logger.metrics_path}.") 114 | logger.log_metrics(metrics, save_csv=True) 115 | 116 | # TODO: make this work with ensemble 117 | # TODO: investigate if the eval_loader can just be the normal loader here 118 | if logger_args.save_cams: 119 | cams_dir = logger_args.save_dir / 'cams' 120 | print(f'Save cams to {cams_dir}') 121 | save_grad_cams(args, loader, model, 122 | cams_dir, 123 | only_competition=logger_args.only_competition_cams, 124 | only_top_task=False) 125 | 126 | logger.log("=== Testing Complete ===") 127 | # Produce other visuals 128 | # TODO: This causes "unexpected error to scripts" 129 | # raise NotImplementedError() 130 | 131 | 132 | if __name__ == "__main__": 133 | torch.multiprocessing.set_sharing_strategy('file_system') 134 | parser = TestArgParser() 135 | print("Start test...") 136 | test(parser.parse_args()) 137 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/test_images.py: -------------------------------------------------------------------------------- 1 | # Create dummy csv and dummy image folders before running test 2 | import subprocess 3 | import shutil 4 | import os 5 | import glob 6 | import csv 7 | from constants import * 8 | from argparse import ArgumentParser 9 | 10 | 11 | def parse_script_args(): 12 | """Parse command line arguments. 13 | 14 | Returns: 15 | args (Namespace): Parsed command line arguments 16 | 17 | """ 18 | parser = ArgumentParser() 19 | 20 | parser.add_argument('--save_dir', 21 | type=str, default=str(CHEXPERT_SAVE_DIR), 22 | help='Directory to save model data.') 23 | 24 | parser.add_argument('--img_folder', type=str, 25 | default=None, required=True, 26 | help='Path to folder of all the images') 27 | 28 | parser.add_argument('--batch_size', 29 | type=int, default=16, 30 | help='Batch size for training / evaluation.') 31 | 32 | parser.add_argument('--ckpt_path', 33 | type=str, default=None, 34 | help=('Checkpoint path for eval.')) 35 | 36 | parser.add_argument('--config_path', 37 | type=str, default=None, 38 | help=('Path to ensemble.')) 39 | 40 | args = parser.parse_args() 41 | return args 42 | 43 | def folders_csv(folder): 44 | """Create csv and put images in folder 45 | 46 | Args: 47 | folder (str): path to all the images 48 | """ 49 | images = glob.glob(folder + "/*.jpg") 50 | rows = [] 51 | for image in images: 52 | img_path = Path(image) 53 | img_name = img_path.name 54 | new_dir = img_path.parent / img_name.rstrip('.jpg') 55 | new_dir.mkdir(exist_ok=True, parents=True) 56 | new_path = new_dir / img_name 57 | rows.append([str(new_path.absolute())] + [None] * 4 + [0] * 14) 58 | img_path.rename(new_path) 59 | with open(folder + '/dummy.csv', 'w') as csv_file: 60 | row = ["Path", "Sex", "Age", "Frontal/Lateral", "AP/PA"] \ 61 | + CHEXPERT_TASKS 62 | writer = csv.writer(csv_file) 63 | writer.writerow(row) 64 | for row in rows: 65 | writer.writerow(row) 66 | 67 | def run_test(args): 68 | """Run test on dummy csv 69 | 70 | Args: 71 | args (Namespace): Parsed command line arguments 72 | """ 73 | if args.config_path is not None: 74 | path = "--config_path" 75 | path_name = args.config_path 76 | else: 77 | path = "--ckpt_path" 78 | path_name = args.ckpt_path 79 | subprocess.run(['python', 'test.py', '--dataset', 'custom', path, path_name, 80 | '--phase', 'test', '--together', 'True', '--test_csv', 81 | str(args.img_folder + '/dummy.csv'), '--save_dir', args.save_dir]) 82 | os.remove(args.img_folder + '/dummy.csv') #remove if you 
want to keep csv 83 | os.remove(args.save_dir + '/results/test/groundtruth.csv') 84 | 85 | 86 | if __name__ == "__main__": 87 | args = parse_script_args() 88 | csv = folders_csv(args.img_folder) 89 | run_test(args) -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/test_one.py: -------------------------------------------------------------------------------- 1 | """Evaluate a ckpt or config on a test CSV. 2 | 3 | Usage: 4 | python test_one.py --model_path 5 | --csv_path 6 | --name 7 | 8 | """ 9 | import os 10 | import pandas as pd 11 | import sys 12 | 13 | from argparse import ArgumentParser 14 | from datetime import datetime 15 | from getpass import getuser 16 | from pathlib import Path 17 | from shutil import copy 18 | from subprocess import run 19 | 20 | 21 | FILE_ENDINGS = set(['.pth', '.tar', '.json']) 22 | ROOT_DIR = Path('/deep/group/chexperturbed/runs') 23 | USER_DIR = ROOT_DIR / getuser() 24 | TASKS = ['Cardiomegaly', 25 | 'Edema', 26 | 'Consolidation', 27 | 'Atelectasis', 28 | 'Pleural Effusion', 29 | 'Normal'] 30 | METRIC = 'AUROC' 31 | 32 | 33 | def parse_script_args(): 34 | """Parse command line arguments. 35 | 36 | Returns: 37 | args (Namespace): parsed command line arguments 38 | 39 | """ 40 | parser = ArgumentParser() 41 | 42 | parser.add_argument('--name', type=str, required=True, 43 | help='Name of the run') 44 | 45 | parser.add_argument('--model_path', type=str, required=True, 46 | help='Path of ckpt or config file') 47 | 48 | parser.add_argument('--csv_path', type=str, required=True, 49 | help='Path to test_csv') 50 | 51 | parser.add_argument('--is_3class', action='store_true', 52 | help='Whether this is a 3-class model') 53 | 54 | parser.add_argument('--save_cams', action='store_true', 55 | help='Whether to also generate CAMs') 56 | 57 | parser.add_argument('--gpu_ids', type=str, required=True, 58 | help='Devices to use') 59 | 60 | parser.add_argument('--inference_only', action='store_true', 61 | help='Whether to only run inference') 62 | 63 | args = parser.parse_args() 64 | args.model_path = Path(args.model_path) 65 | assert args.model_path.exists() 66 | args.csv_path = Path(args.csv_path) 67 | assert args.csv_path.exists() 68 | if args.model_path.suffix not in FILE_ENDINGS: 69 | print('Error: unrecognized file ending! Exiting.') 70 | exit() 71 | return args 72 | 73 | 74 | if __name__ == '__main__': 75 | args = parse_script_args() 76 | exp_dir = USER_DIR / args.name 77 | print('Saving run results in %s...' % str(exp_dir)) 78 | USER_DIR.mkdir(exist_ok=True, parents=True) 79 | 80 | # Don't allow experiment to proceed if already exists, to avoid clobbering 81 | try: 82 | exp_dir.mkdir(parents=True) 83 | except FileExistsError as e: 84 | print('Error: directory already exists! Exiting.') 85 | exit() 86 | 87 | # Save command for reproducibility 88 | cmd_path = exp_dir / 'cmd.txt' 89 | print('Saving command to %s...' 
% str(cmd_path)) 90 | cmd = ' '.join(['python'] + sys.argv) 91 | with open(cmd_path, 'w+') as f: 92 | f.write('%s\n' % cmd) 93 | 94 | # Testing ensemble 95 | model_path = None 96 | if args.model_path.suffix == '.json': 97 | config_dst_path = exp_dir / args.model_path.name 98 | copy(args.model_path, config_dst_path) 99 | model_path = ('--config_path', str(config_dst_path)) 100 | # Testing single model 101 | else: 102 | ckpt_dst_path = exp_dir / args.model_path.name 103 | copy(args.model_path, ckpt_dst_path) 104 | model_path = ('--ckpt_path', str(ckpt_dst_path)) 105 | args_dst_path = exp_dir / 'args.json' 106 | copy(args.model_path.parent / 'args.json', args_dst_path) 107 | 108 | test_args = ['python', 'test.py', 109 | '--dataset', 'custom', 110 | '--together', 'True', 111 | '--test_csv', args.csv_path, 112 | model_path[0], model_path[1], 113 | '--phase', 'test', 114 | '--save_dir', str(exp_dir), 115 | '--gpu_ids', args.gpu_ids] 116 | 117 | if args.is_3class: 118 | test_args += ['--model_uncertainty', 'True'] 119 | 120 | if args.save_cams: 121 | test_args += ['--save_cams', 'True'] 122 | test_args += ['--only_competition_cams', 'True'] 123 | 124 | if args.inference_only: 125 | test_args += ['--inference_only'] 126 | 127 | # Run the model, but suppress output 128 | print('Running model...') 129 | with open(os.devnull, 'w') as devnull: 130 | run(test_args, stdout=devnull) 131 | 132 | # Delete the checkpoint when done to save space 133 | if model_path[0] == '--ckpt_path': 134 | print('Deleting checkpoint...') 135 | Path(model_path[1]).unlink() 136 | 137 | # Quit if we're only doing inference 138 | if args.inference_only: 139 | exit() 140 | 141 | # Print out relevant metrics 142 | scores_path = exp_dir 143 | if model_path[0] == '--config_path': 144 | scores_path /= args.model_path.stem 145 | scores_path = scores_path / 'results' / 'test' / 'scores.csv' 146 | df = pd.read_csv(scores_path) 147 | print('Selected results:') 148 | values = [] 149 | for task in TASKS: 150 | value = float(df[(df['Metrics'] == METRIC) & 151 | (df['Tasks'] == task)]['Values']) 152 | values.append(value) 153 | print('%s (%s): %f' % (METRIC, task, value)) 154 | 155 | # Build row for spreadsheet 156 | ss_date = datetime.now().strftime('%m/%d/%Y') 157 | ss_path = str(args.model_path) 158 | ss_test_data = args.name.split('__')[-1] 159 | ss_values = [str(value) for value in values] 160 | ss_results_dir = str(exp_dir) 161 | ss_cmd = cmd 162 | ss_row = [ss_date, ss_path, ss_test_data] 163 | ss_row += ss_values + [ss_results_dir, ss_cmd] 164 | ss_row = ','.join(ss_row) 165 | print('Generated row for spreadsheet: %s' % ss_row) 166 | with open(exp_dir / 'row.txt', 'w+') as f: 167 | f.write('%s\n' % ss_row) 168 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/timeout_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import threading 3 | from time import sleep 4 | 5 | import _thread as thread 6 | 7 | 8 | def quit_function(fn_name): 9 | # print to stderr, unbuffered in Python 2. 10 | print('{0} took too long'.format(fn_name), file=sys.stderr) 11 | sys.stderr.flush() # Python 3 stderr is likely buffered. 
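    # The KeyboardInterrupt is raised in the main thread, so a caller of the
    # decorated function can catch it (see the try/except in __main__ below).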
12 | thread.interrupt_main() # raises KeyboardInterrupt 13 | # raise TimeoutError 14 | 15 | 16 | def exit_after(s): 17 | ''' 18 | use as decorator to exit process if 19 | function takes longer than s seconds 20 | ''' 21 | def outer(fn): 22 | def inner(*args, **kwargs): 23 | timer = threading.Timer(s, quit_function, args=[fn.__name__]) 24 | timer.start() 25 | try: 26 | result = fn(*args, **kwargs) 27 | finally: 28 | timer.cancel() 29 | return result 30 | return inner 31 | return outer 32 | 33 | 34 | @exit_after(5) 35 | def countdown(n): 36 | print('countdown started', flush=True) 37 | for i in range(n, -1, -1): 38 | print(i, end=', ', flush=True) 39 | sleep(1) 40 | print('countdown finished') 41 | 42 | 43 | if __name__ == '__main__': 44 | 45 | countdown(2) 46 | 47 | try: 48 | countdown(5) 49 | except: 50 | print('here') 51 | 52 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/__init__.py: -------------------------------------------------------------------------------- 1 | from util.cuda_util import * 2 | from util.io_util import * 3 | from util.image_util import * 4 | from util.model_util import * 5 | from util.label_util import * -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/cuda_util.py: -------------------------------------------------------------------------------- 1 | """Utility file for CUDA and GPU-specific functions.""" 2 | 3 | 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | 7 | 8 | def setup_gpus(gpu_ids): 9 | """Set up the GPUs and return the device to be used. 10 | 11 | Args: 12 | gpu_ids (list): list of GPU IDs 13 | 14 | Returns: 15 | device (str): the device, either 'cuda' or 'cpu' 16 | 17 | """ 18 | device = None 19 | if len(gpu_ids) > 0 and torch.cuda.is_available(): 20 | torch.cuda.set_device(gpu_ids[0]) 21 | cudnn.benchmark = True 22 | device = 'cuda' 23 | else: 24 | device = 'cpu' 25 | 26 | return device 27 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/io_util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from sys import stderr 3 | 4 | 5 | def args_to_list(csv, allow_empty, arg_type=int, allow_negative=True): 6 | """Convert comma-separated arguments to a list. 7 | 8 | Args: 9 | csv: Comma-separated list of arguments as a string. 10 | allow_empty: If True, allow the list to be empty. Otherwise return None instead of empty list. 11 | arg_type: Argument type in the list. 12 | allow_negative: If True, allow negative inputs. 13 | 14 | Returns: 15 | List of arguments, converted to `arg_type`. 16 | """ 17 | arg_vals = [arg_type(d) for d in str(csv).split(',')] 18 | if not allow_negative: 19 | arg_vals = [v for v in arg_vals if v >= 0] 20 | if not allow_empty and len(arg_vals) == 0: 21 | return None 22 | return arg_vals 23 | 24 | # TODO: Move to logger 25 | def print_err(*args, **kwargs): 26 | """Print a message to stderr.""" 27 | print(*args, file=stderr, **kwargs) 28 | 29 | 30 | def str_to_bool(arg): 31 | """Convert an argument string into its boolean value. 32 | 33 | Args: 34 | arg: String representing a bool. 35 | 36 | Returns: 37 | Boolean value for the string. 
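    Example: str_to_bool('yes') -> True; str_to_bool('0') -> False.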
38 | """ 39 | if arg.lower() in ('yes', 'true', 't', 'y', '1'): 40 | return True 41 | elif arg.lower() in ('no', 'false', 'f', 'n', '0'): 42 | return False 43 | else: 44 | raise argparse.ArgumentTypeError('Boolean value expected.') 45 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/label_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | PATH_TO_STUDY_RE = re.compile(r'(valid|train|test)/patient(\d+)/study(\d+)') 5 | 6 | 7 | def get_study_id(path): 8 | """Get a unique study ID from a (study or image) path. 9 | 10 | For example: 11 | /deep/group/xray4all/images/valid/patient64542/study1 -> valid/patient64542/study1 12 | 13 | Args: 14 | path (str): Path to convert to study_id. 15 | """ 16 | path = str(path) 17 | match = PATH_TO_STUDY_RE.search(path) 18 | return match.group(0) if match else None 19 | -------------------------------------------------------------------------------- /chexpert_supervised/chexpert-model/util/model_util.py: -------------------------------------------------------------------------------- 1 | import torch.nn.functional as F 2 | 3 | 4 | def uncertain_logits_to_probs(logits): 5 | """Convert explicit uncertainty modeling logits to probabilities P(is_abnormal). 6 | 7 | Args: 8 | logits: Input of shape (batch_size, num_tasks * 3). 9 | 10 | Returns: 11 | probs: Output of shape (batch_size, num_tasks). 12 | Position (i, j) interpreted as P(example i has pathology j). 13 | """ 14 | b, n_times_d = logits.size() 15 | d = 3 16 | if n_times_d % d: 17 | raise ValueError('Expected logits dimension to be divisible by ' + 18 | f'{d}, got size {n_times_d}.') 19 | n = n_times_d // d 20 | 21 | logits = logits.view(b, n, d) 22 | probs = F.softmax(logits[:, :, 1:], dim=-1) 23 | probs = probs[:, :, 1] 24 | 25 | return probs 26 | -------------------------------------------------------------------------------- /chexpert_supervised/environment.yml: -------------------------------------------------------------------------------- 1 | name: chexpert-baseline 2 | channels: 3 | - pytorch 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - blas=1.0=mkl 9 | - bzip2=1.0.8=h516909a_2 10 | - ca-certificates=2020.1.1=0 11 | - cairo=1.16.0=h18b612c_1001 12 | - certifi=2020.4.5.1=py37_0 13 | - cudatoolkit=10.1.243=h6bb024c_0 14 | - cycler=0.10.0=py_2 15 | - dbus=1.13.6=he372182_0 16 | - expat=2.2.9=he1b5a44_2 17 | - ffmpeg=4.0=hcdf2ecd_0 18 | - fontconfig=2.13.1=he4413a7_1000 19 | - freeglut=3.0.0=hf484d3e_1005 20 | - freetype=2.9.1=h8a8886c_1 21 | - glib=2.63.1=h3eb4bd4_1 22 | - graphite2=1.3.13=he1b5a44_1001 23 | - gst-plugins-base=1.14.0=hbbd80ab_1 24 | - gstreamer=1.14.0=hb31296c_0 25 | - harfbuzz=1.8.8=hffaf4a1_0 26 | - hdf5=1.10.2=hc401514_3 27 | - icu=58.2=hf484d3e_1000 28 | - intel-openmp=2020.1=217 29 | - jasper=2.0.14=h07fcdf6_1 30 | - joblib=0.15.1=py_0 31 | - jpeg=9b=h024ee3a_2 32 | - kiwisolver=1.2.0=py37h99015e2_0 33 | - ld_impl_linux-64=2.33.1=h53a641e_7 34 | - libedit=3.1.20181209=hc058e9b_0 35 | - libffi=3.3=he6710b0_1 36 | - libgcc-ng=9.1.0=hdf63c60_0 37 | - libgfortran=3.0.0=1 38 | - libgfortran-ng=7.3.0=hdf63c60_0 39 | - libglu=9.0.0=he1b5a44_1001 40 | - libopencv=3.4.2=hb342d67_1 41 | - libopus=1.3.1=h7b6447c_0 42 | - libpng=1.6.37=hbc83047_0 43 | - libstdcxx-ng=9.1.0=hdf63c60_0 44 | - libtiff=4.1.0=h2733197_1 45 | - libuuid=2.32.1=h14c3975_1000 46 | - libvpx=1.7.0=h439df22_0 47 | - libxcb=1.13=h14c3975_1002 48 
| - libxml2=2.9.10=he19cac6_1 49 | - lz4-c=1.9.2=he6710b0_0 50 | - matplotlib=3.1.3=py37_0 51 | - matplotlib-base=3.1.3=py37hef1b27d_0 52 | - mkl=2020.1=217 53 | - mkl-service=2.3.0=py37he904b0f_0 54 | - mkl_fft=1.0.15=py37ha843d7b_0 55 | - mkl_random=1.1.1=py37h0573a6f_0 56 | - ncurses=6.2=he6710b0_1 57 | - ninja=1.9.0=py37hfd86e86_0 58 | - numpy=1.18.1=py37h4f9e942_0 59 | - numpy-base=1.18.1=py37hde5b4d6_1 60 | - olefile=0.46=py37_0 61 | - opencv=3.4.2=py37h6fd60c2_1 62 | - openssl=1.1.1g=h7b6447c_0 63 | - pandas=1.0.3=py37h0573a6f_0 64 | - pcre=8.44=he1b5a44_0 65 | - pillow=7.1.2=py37hb39fc2d_0 66 | - pip=20.0.2=py37_3 67 | - pixman=0.38.0=h516909a_1003 68 | - pthread-stubs=0.4=h14c3975_1001 69 | - py-opencv=3.4.2=py37hb342d67_1 70 | - pyparsing=2.4.7=pyh9f0ad1d_0 71 | - pyqt=5.9.2=py37hcca6a23_4 72 | - python=3.7.7=hcff3b4d_5 73 | - python-dateutil=2.8.1=py_0 74 | - python_abi=3.7=1_cp37m 75 | - pytorch=1.4.0=py3.7_cuda10.1.243_cudnn7.6.3_0 76 | - pytz=2020.1=py_0 77 | - qt=5.9.7=h5867ecd_1 78 | - readline=8.0=h7b6447c_0 79 | - scikit-learn=0.22.1=py37hd81dba3_0 80 | - scipy=1.4.1=py37h0b6359f_0 81 | - setuptools=47.1.1=py37_0 82 | - sip=4.19.8=py37hf484d3e_0 83 | - six=1.15.0=py_0 84 | - sqlite=3.31.1=h62c20be_1 85 | - tk=8.6.8=hbc83047_0 86 | - torchvision=0.5.0=py37_cu101 87 | - tornado=6.0.4=py37h8f50634_1 88 | - wheel=0.34.2=py37_0 89 | - xorg-fixesproto=5.0=h14c3975_1002 90 | - xorg-inputproto=2.3.2=h14c3975_1002 91 | - xorg-kbproto=1.0.7=h14c3975_1002 92 | - xorg-libice=1.0.10=h516909a_0 93 | - xorg-libsm=1.2.3=h84519dc_1000 94 | - xorg-libx11=1.6.9=h516909a_0 95 | - xorg-libxau=1.0.9=h14c3975_0 96 | - xorg-libxdmcp=1.1.3=h516909a_0 97 | - xorg-libxext=1.3.4=h516909a_0 98 | - xorg-libxfixes=5.0.3=h516909a_1004 99 | - xorg-libxi=1.7.10=h516909a_0 100 | - xorg-libxrender=0.9.10=h516909a_1002 101 | - xorg-renderproto=0.11.1=h14c3975_1002 102 | - xorg-xextproto=7.3.0=h14c3975_1002 103 | - xorg-xproto=7.0.31=h14c3975_1007 104 | - xz=5.2.5=h7b6447c_0 105 | - zlib=1.2.11=h7b6447c_3 106 | - zstd=1.4.4=h0b5b093_3 107 | - pip: 108 | - absl-py==0.9.0 109 | - cachetools==4.1.0 110 | - chardet==3.0.4 111 | - future==0.18.2 112 | - google-auth==1.17.2 113 | - google-auth-oauthlib==0.4.1 114 | - grpcio==1.29.0 115 | - idna==2.9 116 | - importlib-metadata==1.6.1 117 | - markdown==3.2.2 118 | - munch==2.5.0 119 | - oauthlib==3.1.0 120 | - pretrainedmodels==0.7.4 121 | - protobuf==3.12.2 122 | - pyasn1==0.4.8 123 | - pyasn1-modules==0.2.8 124 | - pytorch-lightning==0.7.6 125 | - pyyaml==5.3.1 126 | - requests==2.23.0 127 | - requests-oauthlib==1.3.0 128 | - rsa==4.6 129 | - tensorboard==2.2.2 130 | - tensorboard-plugin-wit==1.6.0.post3 131 | - tqdm==4.46.1 132 | - urllib3==1.25.9 133 | - werkzeug==1.0.1 134 | - zipp==3.1.0 135 | 136 | -------------------------------------------------------------------------------- /image_source/contrastive_learning.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/contrastive_learning.PNG -------------------------------------------------------------------------------- /image_source/cx_all_full_ci.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/cx_all_full_ci.PNG -------------------------------------------------------------------------------- 
/image_source/cx_all_last_ci.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/cx_all_last_ci.PNG -------------------------------------------------------------------------------- /image_source/moco_flowchart_new.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/image_source/moco_flowchart_new.PNG -------------------------------------------------------------------------------- /moco_pretraining/moco/aihc_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/moco_pretraining/moco/aihc_utils/__init__.py -------------------------------------------------------------------------------- /moco_pretraining/moco/aihc_utils/image_transform.py: -------------------------------------------------------------------------------- 1 | import torchvision.transforms as transforms 2 | 3 | CXR_MEAN = [.5020, .5020, .5020] 4 | CXR_STD = [.085585, .085585, .085585] 5 | 6 | 7 | def get_transform(args, training): 8 | # Shorter side scaled to args.img_size 9 | if args.maintain_ratio: 10 | transforms_list = [transforms.Resize(args.img_size)] 11 | else: 12 | transforms_list = [transforms.Resize((args.img_size, args.img_size))] 13 | 14 | # Data augmentation 15 | if training: 16 | transforms_list += [transforms.RandomHorizontalFlip(), 17 | transforms.RandomRotation(args.rotate), 18 | transforms.RandomCrop((args.crop, args.crop)) if args.crop != 0 else None] 19 | else: 20 | transforms_list += [transforms.CenterCrop((args.crop, args.crop)) if args.crop else None] 21 | 22 | # Normalization 23 | # Seems like the arguments do not contain clahe anyways 24 | # if t_args.clahe: 25 | # transforms_list += [CLAHE(clip_limit=2.0, tile_grid_size=(8, 8))] 26 | 27 | normalize = transforms.Normalize(mean=CXR_MEAN, std=CXR_STD) 28 | transforms_list += [transforms.ToTensor(), normalize] 29 | 30 | # transform = transforms.Compose([t for t in transforms_list if t]) 31 | transform = [t for t in transforms_list if t] 32 | return transform -------------------------------------------------------------------------------- /moco_pretraining/moco/aihc_utils/storage_util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | from pathlib import Path 5 | import getpass 6 | 7 | import getpass 8 | 9 | if str(getpass.getuser()) == 'jby': 10 | STORAGE_ROOT = Path('/home/jby/chexpert_experiments') 11 | else: 12 | STORAGE_ROOT = Path('/deep/group/aihc-bootcamp-spring2020/cxr_fewer_samples/experiments') 13 | 14 | 15 | def get_storage_folder(exp_name, exp_type): 16 | 17 | try: 18 | jobid = os.environ["SLURM_JOB_ID"] 19 | except: 20 | jobid = None 21 | 22 | datestr = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') 23 | username = str(getpass.getuser()) 24 | 25 | fname = f'{exp_name}_{exp_type}_{datestr}_SLURM{jobid}' if jobid is not None else f'{exp_name}_{exp_type}_{datestr}' 26 | 27 | path_name = STORAGE_ROOT / username / fname 28 | os.makedirs(path_name) 29 | 30 | print(f'Experiment storage is at {fname}') 31 | return path_name -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/README.md: 
-------------------------------------------------------------------------------- 1 | 2 | ## MoCo: Transferring to Detection 3 | 4 | The `train_net.py` script reproduces the object detection experiments on Pascal VOC and COCO. 5 | 6 | ### Instruction 7 | 8 | 1. Install [detectron2](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). 9 | 10 | 1. Convert a pre-trained MoCo model to detectron2's format: 11 | ``` 12 | python3 convert-pretrain-to-detectron2.py input.pth.tar output.pkl 13 | ``` 14 | 15 | 1. Put the dataset under the "./datasets" directory, 16 | following the [directory structure](https://github.com/facebookresearch/detectron2/tree/master/datasets) 17 | required by detectron2. 18 | 19 | 1. Run training: 20 | ``` 21 | python train_net.py --config-file configs/pascal_voc_R_50_C4_24k_moco.yaml \ 22 | --num-gpus 8 MODEL.WEIGHTS ./output.pkl 23 | ``` 24 | 25 | ### Results 26 | 27 | Below are the results on Pascal VOC 2007 test, fine-tuned on 2007+2012 trainval for 24k iterations using Faster R-CNN with a R50-C4 backbone: 28 |
| pretrain | AP50 | AP | AP75 |
| :--- | :---: | :---: | :---: |
| ImageNet-1M, supervised | 81.3 | 53.5 | 58.8 |
| ImageNet-1M, MoCo v1, 200ep | 81.5 | 55.9 | 62.6 |
| ImageNet-1M, MoCo v2, 200ep | 82.4 | 57.0 | 63.6 |
| ImageNet-1M, MoCo v2, 800ep | 82.5 | 57.4 | 64.0 |
60 | 61 | ***Note:*** These results are means of 5 trials. Variation on Pascal VOC is large: the std of AP50, AP, AP75 is expected to be 0.2, 0.2, 0.4 in most cases. We recommend to run 5 trials and compute means. 62 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/Base-RCNN-C4-BN.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | RPN: 4 | PRE_NMS_TOPK_TEST: 6000 5 | POST_NMS_TOPK_TEST: 1000 6 | ROI_HEADS: 7 | NAME: "Res5ROIHeadsExtraNorm" 8 | BACKBONE: 9 | FREEZE_AT: 0 10 | RESNETS: 11 | NORM: "SyncBN" 12 | TEST: 13 | PRECISE_BN: 14 | ENABLED: True 15 | SOLVER: 16 | IMS_PER_BATCH: 16 17 | BASE_LR: 0.02 18 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/coco_R_50_C4_2x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-RCNN-C4-BN.yaml" 2 | MODEL: 3 | MASK_ON: True 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | INPUT: 6 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 7 | MIN_SIZE_TEST: 800 8 | DATASETS: 9 | TRAIN: ("coco_2017_train",) 10 | TEST: ("coco_2017_val",) 11 | SOLVER: 12 | STEPS: (120000, 160000) 13 | MAX_ITER: 180000 14 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/coco_R_50_C4_2x_moco.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "coco_R_50_C4_2x.yaml" 2 | MODEL: 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "See Instructions" 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | INPUT: 9 | FORMAT: "RGB" 10 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/pascal_voc_R_50_C4_24k.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-RCNN-C4-BN.yaml" 2 | MODEL: 3 | MASK_ON: False 4 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 5 | ROI_HEADS: 6 | NUM_CLASSES: 20 7 | INPUT: 8 | MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) 9 | MIN_SIZE_TEST: 800 10 | DATASETS: 11 | TRAIN: ('voc_2007_trainval', 'voc_2012_trainval') 12 | TEST: ('voc_2007_test',) 13 | SOLVER: 14 | STEPS: (18000, 22000) 15 | MAX_ITER: 24000 16 | WARMUP_ITERS: 100 17 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/configs/pascal_voc_R_50_C4_24k_moco.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "pascal_voc_R_50_C4_24k.yaml" 2 | MODEL: 3 | PIXEL_MEAN: [123.675, 116.280, 103.530] 4 | PIXEL_STD: [58.395, 57.120, 57.375] 5 | WEIGHTS: "See Instructions" 6 | RESNETS: 7 | STRIDE_IN_1X1: False 8 | INPUT: 9 | FORMAT: "RGB" 10 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/convert-pretrain-to-detectron2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 3 | 4 | import pickle as pkl 5 | import sys 6 | import torch 7 | 8 | if __name__ == "__main__": 9 | input = sys.argv[1] 10 | 11 | obj = torch.load(input, map_location="cpu") 12 | obj = obj["state_dict"] 13 | 14 | newmodel = {} 15 | for k, v in obj.items(): 16 | if not k.startswith("module.encoder_q."): 17 | continue 18 | old_k = k 19 | k = k.replace("module.encoder_q.", "") 20 | if "layer" not in k: 21 | k = "stem." + k 22 | for t in [1, 2, 3, 4]: 23 | k = k.replace("layer{}".format(t), "res{}".format(t + 1)) 24 | for t in [1, 2, 3]: 25 | k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) 26 | k = k.replace("downsample.0", "shortcut") 27 | k = k.replace("downsample.1", "shortcut.norm") 28 | print(old_k, "->", k) 29 | newmodel[k] = v.numpy() 30 | 31 | res = {"model": newmodel, "__author__": "MOCO", "matching_heuristics": True} 32 | 33 | with open(sys.argv[2], "wb") as f: 34 | pkl.dump(res, f) 35 | -------------------------------------------------------------------------------- /moco_pretraining/moco/detection/train_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 3 | 4 | import os 5 | 6 | from detectron2.checkpoint import DetectionCheckpointer 7 | from detectron2.config import get_cfg 8 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch 9 | from detectron2.evaluation import COCOEvaluator, PascalVOCDetectionEvaluator 10 | from detectron2.layers import get_norm 11 | from detectron2.modeling.roi_heads import ROI_HEADS_REGISTRY, Res5ROIHeads 12 | 13 | 14 | @ROI_HEADS_REGISTRY.register() 15 | class Res5ROIHeadsExtraNorm(Res5ROIHeads): 16 | """ 17 | As described in the MOCO paper, there is an extra BN layer 18 | following the res5 stage. 
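    The norm type is read from cfg.MODEL.RESNETS.NORM ("SyncBN" in the
    configs above) and appended to the res5 block built by the parent class.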
19 | """ 20 | def _build_res5_block(self, cfg): 21 | seq, out_channels = super()._build_res5_block(cfg) 22 | norm = cfg.MODEL.RESNETS.NORM 23 | norm = get_norm(norm, out_channels) 24 | seq.add_module("norm", norm) 25 | return seq, out_channels 26 | 27 | 28 | class Trainer(DefaultTrainer): 29 | @classmethod 30 | def build_evaluator(cls, cfg, dataset_name, output_folder=None): 31 | if output_folder is None: 32 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") 33 | if "coco" in dataset_name: 34 | return COCOEvaluator(dataset_name, cfg, True, output_folder) 35 | else: 36 | assert "voc" in dataset_name 37 | return PascalVOCDetectionEvaluator(dataset_name) 38 | 39 | 40 | def setup(args): 41 | cfg = get_cfg() 42 | cfg.merge_from_file(args.config_file) 43 | cfg.merge_from_list(args.opts) 44 | cfg.freeze() 45 | default_setup(cfg, args) 46 | return cfg 47 | 48 | 49 | def main(args): 50 | cfg = setup(args) 51 | 52 | if args.eval_only: 53 | model = Trainer.build_model(cfg) 54 | DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( 55 | cfg.MODEL.WEIGHTS, resume=args.resume 56 | ) 57 | res = Trainer.test(cfg, model) 58 | return res 59 | 60 | trainer = Trainer(cfg) 61 | trainer.resume_or_load(resume=args.resume) 62 | return trainer.train() 63 | 64 | 65 | if __name__ == "__main__": 66 | args = default_argument_parser().parse_args() 67 | print("Command Line Args:", args) 68 | launch( 69 | main, 70 | args.num_gpus, 71 | num_machines=args.num_machines, 72 | machine_rank=args.machine_rank, 73 | dist_url=args.dist_url, 74 | args=(args,), 75 | ) 76 | -------------------------------------------------------------------------------- /moco_pretraining/moco/moco/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | -------------------------------------------------------------------------------- /moco_pretraining/moco/moco/loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | from PIL import ImageFilter 3 | import random 4 | 5 | 6 | class TwoCropsTransform: 7 | """Take two random crops of one image as the query and key.""" 8 | 9 | def __init__(self, base_transform): 10 | self.base_transform = base_transform 11 | 12 | def __call__(self, x): 13 | q = self.base_transform(x) 14 | k = self.base_transform(x) 15 | return [q, k] 16 | 17 | 18 | class GaussianBlur(object): 19 | """Gaussian blur augmentation in SimCLR https://arxiv.org/abs/2002.05709""" 20 | 21 | def __init__(self, sigma=[.1, 2.]): 22 | self.sigma = sigma 23 | 24 | def __call__(self, x): 25 | sigma = random.uniform(self.sigma[0], self.sigma[1]) 26 | x = x.filter(ImageFilter.GaussianBlur(radius=sigma)) 27 | return x 28 | -------------------------------------------------------------------------------- /moco_pretraining/moco/training_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordmlgroup/MoCo-CXR/d433acabe6518b332a1345a6a1fed49f0c23c253/moco_pretraining/moco/training_tools/__init__.py -------------------------------------------------------------------------------- /moco_pretraining/moco/training_tools/combiner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import defaultdict 3 | 4 | 5 | def detach_tensor(tensor): 6 | if type(tensor) != np.ndarray: 7 | if type(tensor) == list: 8 | return np.ndarray(tensor) 9 | else: 10 | return tensor.cpu().detach().numpy() 11 | return tensor 12 | 13 | def recursive_append(target_dict, source_dict): 14 | for e in source_dict: 15 | if type(source_dict[e]) == dict: 16 | if e not in target_dict: 17 | target_dict[e] = defaultdict(list) 18 | target_dict[e] = recursive_append(target_dict[e], source_dict[e]) 19 | elif source_dict[e] is not None: 20 | if type(source_dict[e]) == list: 21 | target_dict[e].append(source_dict[e]) 22 | else: 23 | target_dict[e].append(source_dict[e].cpu()) 24 | 25 | return target_dict 26 | 27 | def recursive_concat(source_dict): 28 | for e in source_dict: 29 | if type(source_dict[e]) == dict or type(source_dict[e]) == defaultdict: 30 | source_dict[e] = recursive_concat(source_dict[e]) 31 | elif source_dict[e] is not None: 32 | source_dict[e] = np.concatenate(source_dict[e]) 33 | 34 | return source_dict -------------------------------------------------------------------------------- /moco_pretraining/moco/training_tools/evaluator.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import time 5 | import warnings 6 | import sys 7 | 8 | import numpy as np 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.parallel 12 | import torch.backends.cudnn as cudnn 13 | import torch.distributed as dist 14 | import torch.optim 15 | from sklearn.metrics import roc_auc_score 16 | from scipy.special import softmax 17 | 18 | from .meters import AverageMeter 19 | from .meters import ProgressMeter 20 | from .combiner import detach_tensor 21 | 22 | ''' 23 | def pred_accuracy(output, target, k): 24 | """Computes the accuracy over the k top predictions for the specified values of k""" 25 | 26 | output = detach_tensor(output) 27 | target = detach_tensor(target) 28 | 29 | batch_size = target.size(0) 30 | 31 | argsorted_out = np.argsort(output)[:,-k:] 32 | return np.asarray(np.any(argsorted_y.T == target, axis=0).mean(dtype='f')), 33 | 34 | 35 | _, pred = output.topk(maxk, 1, True, 
--------------------------------------------------------------------------------
/moco_pretraining/moco/training_tools/evaluator.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import random
  4 | import time
  5 | import warnings
  6 | import sys
  7 | 
  8 | import numpy as np
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.parallel
 12 | import torch.backends.cudnn as cudnn
 13 | import torch.distributed as dist
 14 | import torch.optim
 15 | from sklearn.metrics import roc_auc_score
 16 | from scipy.special import softmax
 17 | 
 18 | from .meters import AverageMeter
 19 | from .meters import ProgressMeter
 20 | from .combiner import detach_tensor
 21 | 
 22 | '''
 23 | def pred_accuracy(output, target, k):
 24 |     """Computes the accuracy over the k top predictions for the specified values of k"""
 25 | 
 26 |     output = detach_tensor(output)
 27 |     target = detach_tensor(target)
 28 | 
 29 |     batch_size = target.size(0)
 30 | 
 31 |     argsorted_out = np.argsort(output)[:,-k:]
 32 |     return np.asarray(np.any(argsorted_out.T == target, axis=0).mean(dtype='f'))
 33 | 
 34 | 
 35 |     _, pred = output.topk(maxk, 1, True, True)
 36 |     pred = pred.t()
 37 |     correct = pred.eq(target.view(1, -1).expand_as(pred))
 38 | 
 39 |     res = []
 40 |     correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
 41 |     res.append(correct_k.mul_(100.0 / batch_size))
 42 |     return res[0]  # Seems like we only want the 1st
 43 | '''
 44 | 
 45 | 
 46 | def decorator_detach_tensor(function):
 47 |     def wrapper(*args, **kwargs):
 48 |         # TODO Find a simple way to handle this business ...
 49 |         # If is eval, or if fast debug, or
 50 |         # is train and not heavy, or is train and heavy
 51 |         output = detach_tensor(args[0])
 52 |         target = detach_tensor(args[1])
 53 |         args = args[2:]
 54 | 
 55 |         result = function(output, target, *args, **kwargs)
 56 |         return result
 57 |     return wrapper
 58 | 
 59 | @decorator_detach_tensor
 60 | def topk_acc(output, target, k):
 61 |     """Computes the accuracy over the k top predictions for the specified values of k"""
 62 |     argsorted_out = np.argsort(output)[:,-k:]
 63 |     matching = np.asarray(np.any(argsorted_out.T == target, axis=0))
 64 |     return matching.mean(dtype='f')
 65 | 
 66 | 
 67 | @decorator_detach_tensor
 68 | def compute_auc_binary(output, target):
 69 |     # assuming output and target are all vectors for binary case
 70 |     try:
 71 |         o = softmax(output, axis=1)
 72 |         auc = roc_auc_score(target, o[:,1])
 73 |     except Exception:  # e.g. only one class present in target
 74 |         return -1
 75 |     return auc
 76 | 
 77 | 
 78 | class Evaluator:
 79 | 
 80 |     def __init__(self, model, loss_func, metrics, loaders, args):
 81 | 
 82 |         self.model = model
 83 |         self.loss_func = loss_func
 84 |         self.metrics = metrics
 85 |         self.loaders = loaders
 86 |         self.args = args
 87 | 
 88 |         self.metric_best_vals = {metric: 0 for metric in self.metrics}
 89 | 
 90 | 
 91 |     def evaluate(self, eval_type, epoch):
 92 | 
 93 |         print(f'==> Evaluation for {eval_type}, epoch {epoch}')
 94 | 
 95 |         loader = self.loaders[eval_type]
 96 | 
 97 |         batch_time = AverageMeter('Time', ':6.3f')
 98 |         losses = AverageMeter('Loss', ':.4e')
 99 | 
100 |         metric_meters = {metric: AverageMeter(metric, self.metrics[metric]['format']) \
101 |                          for metric in self.metrics}
102 |         list_meters = [metric_meters[m] for m in metric_meters]
103 | 
104 |         progress = ProgressMeter(
105 |             len(loader),
106 |             [batch_time, losses, *list_meters],
107 |             prefix=f'{eval_type}@Epoch {epoch}: ')
108 | 
109 |         # switch to evaluate mode
110 |         self.model.eval()
111 |         all_output = []
112 |         all_gt = []
113 | 
114 |         with torch.no_grad():
115 |             end = time.time()
116 |             for i, (images, target) in enumerate(loader):
117 |                 if self.args.gpu is not None:
118 |                     images = images.cuda(self.args.gpu, non_blocking=True)
119 |                 target = target.cuda(self.args.gpu, non_blocking=True)
120 |                 all_gt.append(target.cpu())
121 | 
122 |                 # compute output
123 |                 output = self.model(images)
124 |                 all_output.append(output.cpu())
125 | 
126 |                 loss = self.loss_func(output, target)
127 | 
128 |                 # JBY: For simplicity do losses first
129 |                 losses.update(loss.item(), images.size(0))
130 | 
131 |                 for metric in self.metrics:
132 |                     args = [output, target, *self.metrics[metric]['args']]
133 |                     metric_func = globals()[self.metrics[metric]['func']]
134 |                     result = metric_func(*args)
135 | 
136 |                     metric_meters[metric].update(result, images.size(0))
137 | 
138 |                 # measure elapsed time
139 |                 batch_time.update(time.time() - end)
140 |                 end = time.time()
141 | 
142 |                 if i % self.args.print_freq == 0:
143 |                     progress.display(i)
144 | 
145 |             # TODO: this should also be done with the ProgressMeter
146 |             # print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
147 |             #       .format(top1=top1, top5=top5))
148 |             progress.display(i + 1)
149 | 
150 |         all_output = np.concatenate(all_output)
151 |         all_gt = np.concatenate(all_gt)
152 | 
153 |         for metric in self.metrics:
154 |             args = [all_output, all_gt, *self.metrics[metric]['args']]
155 |             metric_func = globals()[self.metrics[metric]['func']]
156 |             result = metric_func(*args)
157 | 
158 |             metric_meters[metric].update(result, images.size(0))
159 | 
160 |             self.metric_best_vals[metric] = max(metric_meters[metric].avg,
161 |                                                 self.metric_best_vals[metric])
162 | 
163 |         progress.display(i + 1, summary=True)
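
Evaluator resolves each metric function by name through globals(), so `metrics` is a dict keyed by display name. A minimal config that satisfies the keys the class reads ('func', 'args', 'format'); the display names and format strings here are illustrative, the function names are the ones defined above:

metrics = {
    'Acc@1': {'func': 'topk_acc', 'args': [1], 'format': ':6.2f'},
    'Acc@5': {'func': 'topk_acc', 'args': [5], 'format': ':6.2f'},
    'AUC':   {'func': 'compute_auc_binary', 'args': [], 'format': ':6.3f'},
}
# evaluator = Evaluator(model, torch.nn.CrossEntropyLoss(), metrics,
#                       {'valid': valid_loader}, args)
# evaluator.evaluate('valid', epoch=0)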
--------------------------------------------------------------------------------
/moco_pretraining/moco/training_tools/meters.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | class AverageMeter(object):
 4 |     """Computes and stores the average and current value"""
 5 |     def __init__(self, name, fmt=':f'):
 6 |         self.name = name
 7 |         self.fmt = fmt
 8 |         self.reset()
 9 | 
10 |     def reset(self):
11 |         self.val = 0
12 |         self.avg = 0
13 |         self.sum = 0
14 |         self.count = 0
15 | 
16 |     def update(self, val, n=1):
17 |         if type(val) == torch.Tensor:
18 |             val = val.item()
19 | 
20 |         self.val = val
21 |         self.sum += val * n
22 |         self.count += n
23 |         self.avg = self.sum / self.count
24 | 
25 |     def str_val(self):
26 |         if self.name == 'Loss':
27 |             fmtstr = '{name} {val' + self.fmt + '}\n'
28 |         else:
29 |             fmtstr = '{name} {val' + self.fmt + '}'
30 |         return fmtstr.format(**self.__dict__)
31 | 
32 |     def __str__(self):
33 |         if self.name == 'Loss':
34 |             fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})\n'
35 |         else:
36 |             fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
37 |         return fmtstr.format(**self.__dict__)
38 | 
39 | 
40 | class ProgressMeter(object):
41 |     def __init__(self, num_batches, meters, prefix=""):
42 |         self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
43 |         self.meters = meters
44 |         self.prefix = prefix
45 | 
46 |     def display(self, batch, summary=False):
47 |         entries = [self.prefix + self.batch_fmtstr.format(batch)]
48 |         if not summary:
49 |             entries += [str(meter) for meter in self.meters]
50 |             print('\t'.join(entries))
51 |         else:
52 |             entries += [meter.str_val() for meter in self.meters]
53 |             print('Summary: ' + '\t'.join(entries))
54 | 
55 |     def _get_batch_fmtstr(self, num_batches):
56 |         num_digits = len(str(num_batches))
57 |         fmt = '{:' + str(num_digits) + 'd}'
58 |         return '[' + fmt + '/' + fmt.format(num_batches) + ']'
59 | 
60 | 
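
Typical wiring, mirroring how evaluator.py uses these meters (`loader` and `step` are placeholders for a real data loader and forward pass):

losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
progress = ProgressMeter(len(loader), [losses, top1], prefix='Valid: ')
for i, (images, target) in enumerate(loader):
    loss, acc = step(images, target)           # placeholder forward pass
    losses.update(loss, images.size(0))        # running sum weighted by batch size
    top1.update(acc, images.size(0))
    if i % 10 == 0:
        progress.display(i)                    # e.g. "Valid: [ 10/196]  Loss ..."
progress.display(len(loader), summary=True)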
--------------------------------------------------------------------------------
/moco_pretraining/scripts/convert_to_chexpert.py:
--------------------------------------------------------------------------------
  1 | '''File created to reorganize the Montgomery and Shenzhen datasets to fit
  2 | the torchvision.ImageFolder class
  3 | '''
  4 | 
  5 | from collections import defaultdict
  6 | import copy
  7 | import os
  8 | import pprint as pp
  9 | import random
 10 | import re
 11 | import shutil
 12 | import sys
 13 | 
 14 | import pandas as pd
 15 | from pathlib import Path
 16 | from tqdm import tqdm
 17 | 
 18 | # semi-supervised ratios from 2^-7 to 2^-1
 19 | ALL_SEMI_RATIO = [0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5]
 20 | TEST_RATIO = 0.15
 21 | VAL_RATIO = 0.1
 22 | 
 23 | 
 24 | def print_summary(df, name):
 25 |     total_len = len(df)
 26 |     no_finding = len(df[df['No Finding'] == 1])
 27 |     tb = len(df[df['Tuberculosis'] == 1])
 28 | 
 29 |     print(f'CSV: {name}, total: {total_len}, No Finding: {no_finding}, Tuberculosis: {tb}')
 30 | 
 31 | 
 32 | def convert_shenzhen(root_folder):
 33 | 
 34 |     RE_SEX_AGE = re.compile(r'(?P<sex>.*al)[e]?[\s|,]*(?P<age>[0-9]+)[yr]?[s]?')
 35 |     RE_FNAME = re.compile(r'CHNCXR\_(?P<idx>[0-9]+)\_(?P<lbl>[0|1])\.txt')
 36 | 
 37 |     root_path = Path(root_folder)
 38 | 
 39 |     key_words = ['upper', 'lower', 'left', 'right', 'bilateral', 'atb', 'ptb', 'stb']
 40 | 
 41 |     # readings = {'healthy': [], 'disease': []}
 42 |     parsed = []
 43 |     for i, f in tqdm(enumerate(os.listdir(root_path / 'ClinicalReadings'))):
 44 | 
 45 |         f_result = RE_FNAME.search(f)
 46 |         pid = f_result.groupdict()['idx']
 47 |         lbl = f_result.groupdict()['lbl']
 48 | 
 49 |         data = {
 50 |             'Study': None,
 51 |             'Age': None,
 52 |             'Sex': None,
 53 |             'No Finding': None,
 54 |             'Tuberculosis': None,
 55 |             'Path': None
 56 |         }
 57 | 
 58 |         disease = None
 59 |         with open(root_path / 'ClinicalReadings' / f, 'r') as txt:
 60 |             lines = txt.readlines()
 61 | 
 62 |             # if len(lines) > 3:
 63 |             #     import pdb; pdb.set_trace()
 64 | 
 65 |             for l in lines:
 66 |                 result = RE_SEX_AGE.search(l)
 67 | 
 68 |                 if result:
 69 |                     age = int(result.groupdict()['age'])
 70 |                     sex = result.groupdict()['sex'].lower()
 71 | 
 72 |                     data['Age'] = age
 73 |                     data['Sex'] = sex
 74 |                 else:
 75 |                     l = l.strip().lower()
 76 | 
 77 |                     if len(l) > 0:
 78 |                         if 'normal' in l:
 79 |                             assert lbl == '0'
 80 |                             disease = False
 81 |                         else:
 82 |                             if lbl != '1':
 83 |                                 import pdb; pdb.set_trace()
 84 | 
 85 |                             disease = False
 86 |                             for k in key_words:
 87 |                                 if k in l:
 88 |                                     disease = True
 89 | 
 90 |                             if 'pleuritis' in l:
 91 |                                 disease = True
 92 | 
 93 |         assert disease is not None
 94 | 
 95 |         if disease:
 96 |             data['No Finding'] = 0
 97 |             data['Tuberculosis'] = 1
 98 |         else:
 99 |             data['No Finding'] = 1
100 |             data['Tuberculosis'] = 0
101 | 
102 | 
103 |         fname = root_path / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1' / 'view1_frontal.jpg'
104 |         study = Path('shenzhen') / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1'
105 |         data['Study'] = study
106 |         data['Path'] = fname
107 | 
108 |         parsed.append(data)
109 | 
110 |     val_rows = []
111 |     test_rows = []
112 | 
113 |     ratios = ALL_SEMI_RATIO + [1]
114 |     fine_tune_splitted_rows = {s: [] for s in ratios}
115 |     for stuff in tqdm(parsed):
116 |         rnd = random.random()
117 | 
118 |         if rnd < VAL_RATIO:
119 |             val_rows.append(stuff)
120 |         elif rnd < VAL_RATIO + TEST_RATIO:
121 |             test_rows.append(stuff)
122 |         else:
123 |             rnd = random.random()
124 | 
125 |             for s in ratios:
126 |                 if rnd < s:
127 |                     fine_tune_splitted_rows[s].append(stuff)
128 | 
129 |     df = pd.DataFrame(val_rows)
130 |     df.to_csv(root_path / f'chexpert_like_val.csv')
131 |     print_summary(df, 'validation')
132 | 
133 |     df = pd.DataFrame(test_rows)
134 |     df.to_csv(root_path / f'chexpert_like_test.csv')
135 |     print_summary(df, 'test')
136 | 
137 |     for s in ratios:
138 |         df = pd.DataFrame(fine_tune_splitted_rows[s])
139 |         df.to_csv(root_path / f'chexpert_like_{s}.csv')
140 |         print_summary(df, f'semi_{s}')
141 | 
142 | if __name__ == '__main__':
143 |     # Usage:
144 |     #     python convert_to_chexpert.py moco/shenzhen 23
145 |     # Try 17, 28, 20
146 | 
147 |     random.seed(sys.argv[2])
148 | 
149 |     convert_shenzhen(sys.argv[1])
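
The two patterns above drive all of the parsing, so it helps to see them against sample inputs (the strings below are illustrative, shaped like Shenzhen metadata):

m = RE_FNAME.search('CHNCXR_0042_1.txt')
# m.groupdict() -> {'idx': '0042', 'lbl': '1'}   (lbl 1 = abnormal, 0 = normal)
m = RE_SEX_AGE.search('male 34yrs')
# m.groupdict() -> {'sex': 'mal', 'age': '34'}   (the trailing "e" is matched outside the group)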
want email notification
 19 | ####SBATCH --mail-user=youremailaddress
 20 | ####SBATCH --mail-type=ALL
 21 | 
 22 | # list out some useful information
 23 | echo "SLURM_JOBID="$SLURM_JOBID
 24 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
 25 | echo "SLURM_NNODES"=$SLURM_NNODES
 26 | echo "SLURMTMPDIR="$SLURMTMPDIR
 27 | echo "working directory = "$SLURM_SUBMIT_DIR
 28 | 
 29 | # sample job
 30 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
 31 | echo NPROCS=$NPROCS
 32 | 
 33 | cd ../moco; python main_moco.py -a SB_MODEL \\
 34 |     --lr SB_LR --batch-size SB_BATCH_SIZE \\
 35 |     --epochs SB_EPOCHS \\
 36 |     --world-size 1 --rank 0 \\
 37 |     --mlp --moco-t 0.2 SB_FROM_IMAGENET \\
 38 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \\
 39 |     --aug-setting chexpert --rotate SB_ROTATION --maintain-ratio \\
 40 |     --train_data /deep/group/data/moco/chexpert-proper-test/data/full_train \\
 41 |     --exp-name SB_EXPNAME
 42 | 
 43 | # done
 44 | echo "Done"
 45 | '''
 46 | 
 47 | BASH_SCRIPT = \
 48 | '''cd /home/jby/aihc-spring20-fewer/moco; python main_moco.py -a SB_MODEL \\
 49 |     --lr SB_LR --batch-size SB_BATCH_SIZE \\
 50 |     --world-size 1 --rank 0 \\
 51 |     --mlp --moco-t 0.2 SB_FROM_IMAGENET \\
 52 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \\
 53 |     --aug-setting chexpert --rotate SB_ROTATION --maintain-ratio \\
 54 |     --train_data /home/jby/CheXpert/full_train \\
 55 |     --exp-name SB_EXPNAME 2>&1 | tee /home/jby/chexpert_experiments/jby/SB_EXPNAME_log.txt
 56 | '''
 57 | 
 58 | 
 59 | LR_SHORT = {
 60 |     1e-7: '1n7',
 61 |     1e-6: '1n6',
 62 |     5e-5: '5n5',
 63 |     3e-5: '3n5',
 64 |     2e-5: '2n5',
 65 |     1e-5: '1n5',
 66 |     1e-4: '1n4',
 67 |     1e-3: '1n3',
 68 |     1e-2: '1n2',
 69 |     5e-2: '5n2',
 70 |     5e-4: '5n4'
 71 | }
 72 | 
 73 | MODEL_SHORT_NAME_MAP = {'resnet18': 'r8',
 74 |                         'resnet50': 'r5',
 75 |                         'densenet121': 'd1'}
 76 | 
 77 | def gen_script(model, lr, batch_size, imagenet, epoch, gcp):
 78 | 
 79 |     today = datetime.datetime.now()
 80 |     strtoday = today.strftime('%Y%m%dh%H')
 81 | 
 82 |     sb_model = model
 83 |     sb_lr = str(lr)
 84 |     sb_epoch = str(epoch)
 85 |     sb_batch_size = str(batch_size)
 86 |     sb_from_imagenet = '--from-imagenet' if imagenet else ''
 87 |     sb_rotation = str(10)
 88 |     sb_jobname = f'{MODEL_SHORT_NAME_MAP[sb_model]}{"w" if imagenet else "o"}{LR_SHORT[lr]}{batch_size}'
 89 |     sb_expname = f'{sb_jobname}_{strtoday}'
 90 | 
 91 |     if not gcp:
 92 |         script = SBATCH_SCRIPT
 93 |     else:
 94 |         script = BASH_SCRIPT
 95 | 
 96 |     script = script.replace('SB_JOBNAME', sb_jobname)
 97 |     script = script.replace('SB_MODEL', sb_model)
 98 |     script = script.replace('SB_LR', sb_lr)
 99 |     script = script.replace('SB_EPOCHS', sb_epoch)  # must match the full placeholder, or a stray "S" is left behind
100 |     script = script.replace('SB_BATCH_SIZE', sb_batch_size)
101 |     script = script.replace('SB_FROM_IMAGENET', sb_from_imagenet)
102 |     script = script.replace('SB_ROTATION', sb_rotation)
103 |     script = script.replace('SB_EXPNAME', sb_expname)
104 | 
105 |     fname = f'{sb_jobname}{"_local" if gcp else ""}.sh'
106 |     with open(f'training_scripts/{fname}', 'w') as f:
107 |         f.write(script)
108 | 
109 | if __name__ == '__main__':
110 | 
111 |     GCP = False
112 | 
113 |     os.makedirs('training_scripts', exist_ok=True)
114 |     # densenet121: 32
115 |     # resnet50: 32
116 |     # resnet18: 128
117 | 
118 |     BATCH_SIZE_MAP = {
119 |         'resnet18': 24,
120 |         'resnet50': 24,
121 |         'densenet121': 24,
122 |     }
123 | 
124 |     LR_EPOCH_MAP = {
125 |         1e-5: 20,
126 |         1e-4: 20,
127 |         1e-2: 35
128 |     }
129 | 
130 |     # for model in ['densenet121', 'resnet18', 'resnet50']:
131 |     for model in ['resnet18']:
132 |         for imagenet in [True, False]:
133 |             for lr in [1e-5, 1e-4, 1e-2]:
134 |                 if not imagenet:
135 |                     actual_lr = lr * 5
136 |                 else:
137 |                     actual_lr = lr
138 | 
139 |                 gen_script(model, actual_lr, BATCH_SIZE_MAP[model], imagenet, LR_EPOCH_MAP[lr], gcp=GCP)
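
For orientation, the job and experiment names concatenate the model tag, an ImageNet flag, the short learning-rate code, and the batch size. For example (values chosen to match the checked-in r8w1n416.sh further below):

# gen_script('resnet18', 1e-4, 16, imagenet=True, epoch=20, gcp=False)
#   -> sb_jobname = 'r8' + 'w' + '1n4' + '16' = 'r8w1n416'
#   -> writes training_scripts/r8w1n416.sh with an exp-name like 'r8w1n416_20200911h13'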
--------------------------------------------------------------------------------
/moco_pretraining/scripts/parse_log.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | LOG_RE = re.compile(r'Epoch: \[([0-9]+)\](\[[0-9]+\\[0-9]+\])(\s(\w+)(\[0-9]+\.[0-9]+)\s\(([0-9]+\.[0-9]+)\))')
 4 | 
 5 | # Epoch:\s+\[([0-9]+)\]\[([0-9]+)\/([0-9]+)\](\s+([a-zA-z@]+)\s+(-?[\d.]+(?:e-?\d+)?)\s+\([\s]*(-?[\d.]+(?:e-?\d+)?)\))+
 6 | # \s+([a-zA-z@]+)\s+(-?[\d.]+(?:e-?\d+)?)\s+\([\s]*(-?[\d.]+(?:e-?\d+)?)\)
 7 | # Epoch: [125][1420/1569] Time  0.836 ( 1.043)    Data  0.000 ( 0.276)    Loss 5.7912e+00 (5.8516e+00)    Acc@1 100.00 ( 92.73)   Acc@5 100.00 ( 97.18)
 8 | 
 9 | def analyze_log():
10 |     pass  # TODO: parsing loop not implemented; see the sketch below
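
A minimal sketch of what analyze_log could look like, matching log lines shaped like the sample comment above (the per-metric pattern and the return shape are assumptions, not from the source):

import re

EPOCH_RE = re.compile(r'Epoch:\s+\[(\d+)\]\[(\d+)/(\d+)\]')
METRIC_RE = re.compile(r'([A-Za-z@15]+)\s+(-?[\d.]+(?:e[+-]?\d+)?)\s+\(\s*(-?[\d.]+(?:e[+-]?\d+)?)\)')

def analyze_log(path):
    rows = []
    with open(path) as f:
        for line in f:
            head = EPOCH_RE.search(line)
            if not head:
                continue
            # keep the running averages (the parenthesized numbers)
            metrics = {name: float(avg) for name, _val, avg in METRIC_RE.findall(line)}
            rows.append((int(head.group(1)), metrics))
    return rows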
--------------------------------------------------------------------------------
/moco_pretraining/scripts/resize.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Convert all images in $1 to $2 (500x500; the ! flag ignores aspect ratio)
 3 | 
 4 | mkdir -p "$2"
 5 | 
 6 | for filename in "$1"/*; do
 7 |     # echo $filename
 8 |     convert "$filename" -resize 500x500! "$2/$(basename "$filename")"
 9 | done
--------------------------------------------------------------------------------
/moco_pretraining/scripts/shenzhen_mutiple_split.py:
--------------------------------------------------------------------------------
  1 | '''File created to reorganize the Montgomery and Shenzhen datasets to fit
  2 | the torchvision.ImageFolder class
  3 | '''
  4 | 
  5 | from collections import defaultdict
  6 | import copy
  7 | import os
  8 | import pprint as pp
  9 | import random
 10 | import re
 11 | import shutil
 12 | import sys
 13 | 
 14 | import pandas as pd
 15 | from pathlib import Path
 16 | from tqdm import tqdm
 17 | 
 18 | # semi-supervised ratios from 2^-7 to 2^-1
 19 | ALL_SEMI_RATIO = [0.0078125, 0.015625, 0.03125, 0.0625, 0.125, 0.25, 0.5]
 20 | 
 21 | SEMI_ITERATIONS = { 0.0078125: 12,
 22 |                     0.015625: 10,
 23 |                     0.03125: 8,
 24 |                     0.0625: 8,
 25 |                     0.125: 4,
 26 |                     0.25: 4,
 27 |                     0.5: 4,
 28 |                     1: 1
 29 |                   }
 30 | 
 31 | TEST_RATIO = 0.20
 32 | VAL_RATIO = 0.15
 33 | TOTAL_TRAIN_RATIO = 1 - TEST_RATIO - VAL_RATIO
 34 | TOTAL = 662
 35 | 
 36 | 
 37 | def verify_one_split(df, name, ratio=None):
 38 |     if df is None:
 39 |         return False
 40 | 
 41 |     total_len = len(df)
 42 | 
 43 |     if ratio is not None:
 44 |         if not (TOTAL * ratio > total_len * 0.95 and TOTAL * ratio < total_len * 1.05):
 45 |             print(f'Split {name} has incorrect number of items {total_len}')
 46 |             return False
 47 | 
 48 |     if 'Tuberculosis' not in df:
 49 |         return False
 50 | 
 51 |     # no_finding = len(df[df['No Finding'] == 0])
 52 |     tb = len(df[df['Tuberculosis'] == 1])
 53 |     no_tb = len(df[df['Tuberculosis'] == 0])
 54 | 
 55 |     if tb == 0 or no_tb == 0:
 56 |         print(f'Split {name} has a ratio of infinity, which is BAD')
 57 |         return False
 58 | 
 59 |     ratio = no_tb / tb
 60 | 
 61 |     if ratio > 0.9 and ratio < 1.2:
 62 |         return True
 63 |     else:
 64 |         print(f'Split {name} has a ratio of {ratio}, which is BAD')
 65 |         return False
 66 | 
 67 | def print_summary(df, name):
 68 |     total_len = len(df)
 69 |     no_finding = len(df[df['No Finding'] == 1])
 70 |     tb = len(df[df['Tuberculosis'] == 1])
 71 | 
 72 |     print(f'CSV: {name}, total: {total_len}, No Finding: {no_finding}, Tuberculosis: {tb}')
 73 | 
 74 | 
 75 | def perform_split(root_path, parsed):
 76 | 
 77 |     okay = False
 78 |     while not okay:
 79 |         val_rows = []
 80 |         test_rows = []
 81 |         train_rows = []
 82 | 
 83 |         try:
 84 |             for stuff in tqdm(parsed):
 85 |                 rnd = random.random()
 86 | 
 87 |                 if rnd < VAL_RATIO:
 88 |                     val_rows.append(stuff)
 89 |                 elif rnd < VAL_RATIO + TEST_RATIO:
 90 |                     test_rows.append(stuff)
 91 |                 else:
 92 |                     train_rows.append(stuff)
 93 | 
 94 |             val_df = pd.DataFrame(val_rows)
 95 |             assert verify_one_split(val_df, 'val', ratio=VAL_RATIO)
 96 |             val_df.to_csv(root_path / f'chexpert_like_val.csv')
 97 |             print_summary(val_df, 'validation')
 98 | 
 99 |             test_df = pd.DataFrame(test_rows)
100 |             assert verify_one_split(test_df, 'test', ratio=TEST_RATIO)
101 |             test_df.to_csv(root_path / f'chexpert_like_test.csv')
102 |             print_summary(test_df, 'test')
103 | 
104 |             okay = True
105 |         except AssertionError:
106 |             pass
107 | 
108 |     ratios = ALL_SEMI_RATIO + [1]
109 |     for s in ratios:
110 |         for it in range(SEMI_ITERATIONS[s]):
111 | 
112 |             df = None
113 |             name = f'{s}_{it}'
114 |             while not verify_one_split(df, name):
115 |                 items = []
116 |                 for item in train_rows:
117 |                     rnd = random.random()
118 |                     if rnd < s:
119 |                         items.append(item)
120 | 
121 |                 df = pd.DataFrame(items)
122 |                 verify_one_split(df, name, s * TOTAL_TRAIN_RATIO)
123 | 
124 |             df.to_csv(root_path / f'chexpert_like_{name}.csv')
125 |             print_summary(df, name)
126 | 
127 | 
128 | def convert_shenzhen(root_folder):
129 | 
130 |     RE_SEX_AGE = re.compile(r'(?P<sex>.*al)[e]?[\s|,]*(?P<age>[0-9]+)[yr]?[s]?')
131 |     RE_FNAME = re.compile(r'CHNCXR\_(?P<idx>[0-9]+)\_(?P<lbl>[0|1])\.txt')
132 | 
133 |     root_path = Path(root_folder)
134 | 
135 |     key_words = ['upper', 'lower', 'left', 'right', 'bilateral', 'atb', 'ptb', 'stb']
136 | 
137 |     # readings = {'healthy': [], 'disease': []}
138 |     parsed = []
139 |     for i, f in tqdm(enumerate(os.listdir(root_path / 'ClinicalReadings'))):
140 | 
141 |         f_result = RE_FNAME.search(f)
142 |         pid = f_result.groupdict()['idx']
143 |         lbl = f_result.groupdict()['lbl']
144 | 
145 |         data = {
146 |             'Study': None,
147 |             'Age': None,
148 |             'Sex': None,
149 |             'No Finding': None,
150 |             'Tuberculosis': None,
151 |             'Path': None
152 |         }
153 | 
154 |         disease = None
155 |         with open(root_path / 'ClinicalReadings' / f, 'r') as txt:
156 |             lines = txt.readlines()
157 | 
158 |             # if len(lines) > 3:
159 |             #     import pdb; pdb.set_trace()
160 | 
161 |             for l in lines:
162 |                 result = RE_SEX_AGE.search(l)
163 | 
164 |                 if result:
165 |                     age = int(result.groupdict()['age'])
166 |                     sex = result.groupdict()['sex'].lower()
167 | 
168 |                     data['Age'] = age
169 |                     data['Sex'] = sex
170 |                 else:
171 |                     l = l.strip().lower()
172 | 
173 |                     if len(l) > 0:
174 |                         if 'normal' in l:
175 |                             assert lbl == '0'
176 |                             disease = False
177 |                         else:
178 |                             if lbl != '1':
179 |                                 import pdb; pdb.set_trace()
180 | 
181 |                             disease = False
182 |                             for k in key_words:
183 |                                 if k in l:
184 |                                     disease = True
185 | 
186 |                             if 'pleuritis' in l:
187 |                                 disease = True
188 | 
189 |         assert disease is not None
190 | 
191 |         if disease:
192 |             data['No Finding'] = 0
193 |             data['Tuberculosis'] = 1
194 |         else:
195 |             data['No Finding'] = 1
196 |             data['Tuberculosis'] = 0
197 | 
198 | 
199 |         fname = root_path / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1' / 'view1_frontal.jpg'
200 |         study = Path('shenzhen') / 'shenzhentest' / 'test' / f'patient{pid}' / 'study1'
201 |         data['Study'] = study
202 |         data['Path'] = fname
203 | 
204 |         parsed.append(data)
205 | 
206 |     perform_split(root_path, parsed)
207 | 
208 | if __name__ == '__main__':
209 |     # Usage:
210 |     #     python shenzhen_mutiple_split.py moco/shenzhen
211 |     # Try 17, 28, 20
212 | 
213 |     convert_shenzhen(sys.argv[1])
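
Given SEMI_ITERATIONS, the script writes one training CSV per (ratio, draw) pair on top of the val/test splits; a quick way to list the expected file names:

for s, n_draws in SEMI_ITERATIONS.items():
    for it in range(n_draws):
        print(f'chexpert_like_{s}_{it}.csv')
# e.g. chexpert_like_0.0078125_0.csv ... chexpert_like_0.0078125_11.csv,
#      chexpert_like_0.5_0.csv ... chexpert_like_0.5_3.csv, chexpert_like_1_0.csv,
# plus chexpert_like_val.csv and chexpert_like_test.csv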
--------------------------------------------------------------------------------
/moco_pretraining/scripts/split_into_train_val.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import shutil
 4 | import random
 5 | import pandas as pd
 6 | 
 7 | random.seed(2020)
 8 | 
 9 | TRAIN_RATIO = 0.7
10 | 
11 | 
12 | def split_folder(source_folder, target_train, target_val):
13 | 
14 |     os.makedirs(target_train, exist_ok=True)
15 |     os.makedirs(target_val, exist_ok=True)
16 | 
17 |     for label in os.listdir(source_folder):
18 |         os.makedirs(os.path.join(target_train, label), exist_ok=True)
19 |         os.makedirs(os.path.join(target_val, label), exist_ok=True)
20 | 
21 |     allocation = []
22 |     for label in os.listdir(source_folder):
23 |         if os.path.isfile(os.path.join(source_folder, label)):
24 |             continue
25 |         for fname in os.listdir(os.path.join(source_folder, label)):
26 | 
27 |             source = os.path.join(source_folder, label, fname)
28 |             train_path = os.path.join(target_train, label, fname)
29 |             val_path = os.path.join(target_val, label, fname)
30 | 
31 |             if random.random() < TRAIN_RATIO:
32 |                 shutil.copy(source, train_path)
33 |                 # all_train[label].append(train_path)
34 |                 allocation.append({'orig_path': source, 'new_path': train_path,
35 |                                    'train': 1, 'val': 0})
36 |             else:
37 |                 shutil.copy(source, val_path)
38 |                 # all_val[label].append(val_path)
39 |                 allocation.append({'orig_path': source, 'new_path': val_path,
40 |                                    'train': 0, 'val': 1})
41 | 
42 |     df = pd.DataFrame(allocation)
43 |     df.to_csv(os.path.join(source_folder, 'assignment.csv'))
44 | 
45 | 
46 | if __name__ == '__main__':
47 | 
48 |     source = sys.argv[1]
49 |     train = sys.argv[2]
50 |     val = sys.argv[3]
51 | 
52 |     split_folder(source, train, val)
53 | 
54 | 
55 | 
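
The script assumes a torchvision.ImageFolder layout, one subfolder per class; an illustrative invocation and the layout it produces (paths are placeholders):

# python split_into_train_val.py shenzhen/all shenzhen/train shenzhen/val
#
# shenzhen/all/healthy/a.jpg   -> shenzhen/train/healthy/a.jpg  (p = 0.7)
# shenzhen/all/disease/b.jpg   -> shenzhen/val/disease/b.jpg    (p = 0.3)
# shenzhen/all/assignment.csv  <- records every file's destination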
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/r8w1n416.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=32000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:1
 9 | 
10 | #SBATCH --job-name="r8w1n416"
11 | #SBATCH --output=exp_logs/r8w1n416-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_moco.py -a resnet18 \
29 |     --lr 0.0001 --batch-size 16 \
30 |     --epochs 20 \
31 |     --world-size 1 --rank 0 \
32 |     --mlp --moco-t 0.2 --from-imagenet \
33 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
34 |     --aug-setting chexpert --rotate 10 --maintain-ratio \
35 |     --train_data /deep/group/data/moco/chexpert-proper-test/data/full_train \
36 |     --exp-name r8w1n416_20200911h13
37 | 
38 | # done
39 | echo "Done"
40 | 
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_lincls_template.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=64000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:2
 9 | 
10 | #SBATCH --job-name="REPLACE_JOB_NAME"
11 | #SBATCH --output=REPLACE_OUTPUT_PATH-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_lincls.py -a resnet18 --lr REPLACE_LR \
29 |     --batch-size 48 \
30 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
31 |     --pretrained REPLACE_CHECKPOINT \
32 |     --world-size 1 --rank 0 REPLACE_COS \
33 |     --train_data REPLACE_TRAIN \
34 |     --val_data REPLACE_VALID \
35 |     --test_data REPLACE_TEST \
36 |     --from-imagenet REPLACE_SEMI \
37 |     --binary \
38 |     --aug-setting chexpert --rotate --maintain-ratio \
39 |     --exp-name REPLACE_EXP_NAME
40 | 
41 | echo "Done"
42 | 
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_moco_lincls.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=8
 5 | #SBATCH --mem=120000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:4
 9 | 
10 | #SBATCH --job-name="moco-v1-lincls"
11 | #SBATCH --output=exp_logs/v1-lincls-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_lincls.py -a resnet50 --lr 30.0 --batch-size 256 \
29 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
30 |     --world-size 1 --rank 0 \
31 |     --train_data chexpert-v10-small-as-imagenet/data/actual_train \
32 |     --val_data chexpert-v10-small-as-imagenet/data/actual_valid \
33 |     --test_data chexpert-v10-small-as-imagenet/data/valid \
34 |     --from-imagenet \
35 |     --exp-name moco_v1_lincls
36 | # done
37 | echo "Done"
38 | 
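
For context on --pretrained: the upstream MoCo linear-evaluation script strips the momentum branch and the projection head from the saved checkpoint before loading, roughly as below (following the reference MoCo main_lincls.py; the checkpoint name is a placeholder, and this fork's version may differ in details):

import torch

checkpoint = torch.load('checkpoint_0019.pth.tar', map_location='cpu')
state_dict = checkpoint['state_dict']
for k in list(state_dict.keys()):
    # keep only the query encoder's backbone weights, renamed to plain keys
    if k.startswith('module.encoder_q') and not k.startswith('module.encoder_q.fc'):
        state_dict[k[len('module.encoder_q.'):]] = state_dict[k]
    del state_dict[k]
# model.load_state_dict(state_dict, strict=False)  # only the new fc layer should be missing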
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_moco_train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=32000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:1
 9 | 
10 | #SBATCH --job-name="dense121"
11 | #SBATCH --output=exp_logs/dense121-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_moco.py -a densenet121 \
29 |     --lr 1e-4 --batch-size 16 \
30 |     --world-size 1 --rank 0 \
31 |     --mlp --moco-t 0.2 \
32 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
33 |     --from-imagenet \
34 |     --aug-setting chexpert --rotate --maintain-ratio \
35 |     --train_data data/full_train \
36 |     --exp-name dense121
37 | 
38 | # done
39 | echo "Done"
40 | 
--------------------------------------------------------------------------------
/moco_pretraining/scripts/training_scripts/sbatch_moco_train_local.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #SBATCH --partition=deep
 3 | #SBATCH --nodes=1
 4 | #SBATCH --cpus-per-task=4
 5 | #SBATCH --mem=32000
 6 | 
 7 | # only use the following on partition with GPUs
 8 | #SBATCH --gres=gpu:1
 9 | 
10 | #SBATCH --job-name="densenet121"
11 | #SBATCH --output=exp_logs/densenet121-%j.out
12 | 
13 | # only use the following if you want email notification
14 | ####SBATCH --mail-user=youremailaddress
15 | ####SBATCH --mail-type=ALL
16 | 
17 | # list out some useful information
18 | echo "SLURM_JOBID="$SLURM_JOBID
19 | echo "SLURM_JOB_NODELIST"=$SLURM_JOB_NODELIST
20 | echo "SLURM_NNODES"=$SLURM_NNODES
21 | echo "SLURMTMPDIR="$SLURMTMPDIR
22 | echo "working directory = "$SLURM_SUBMIT_DIR
23 | 
24 | # sample job
25 | NPROCS=`srun --nodes=${SLURM_NNODES} bash -c 'hostname' |wc -l`
26 | echo NPROCS=$NPROCS
27 | 
28 | cd ../moco; python main_moco.py -a densenet121 \
29 |     --lr 1e-4 --batch-size 16 \
30 |     --world-size 1 --rank 0 \
31 |     --mlp --moco-t 0.2 \
32 |     --dist-url 'tcp://localhost:10001' --multiprocessing-distributed \
33 |     --aug-setting chexpert --rotate --maintain-ratio \
34 |     --train_data data/full_train \
35 |     --exp-name densenet121
36 | 
37 | # done
38 | echo "Done"
39 | 
--------------------------------------------------------------------------------