├── KneeLocalizer
    ├── oulukneeloc
    │   ├── svm_model.npy
    │   ├── __init__.py
    │   ├── proposals.py
    │   └── detector.py
    ├── MANIFEST.in
    ├── create_conda_env.sh
    ├── setup.py
    ├── LICENSE.txt
    └── README.md
├── requirements.txt
├── LICENSE.md
├── README.md
├── constants_and_util.py
├── image_processing.py
└── non_image_data_processing.py


/KneeLocalizer/oulukneeloc/svm_model.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/epierson9/pain-disparities/HEAD/KneeLocalizer/oulukneeloc/svm_model.npy


--------------------------------------------------------------------------------
/KneeLocalizer/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md LICENSE.txt MANIFEST.in
2 | 
3 | recursive-include oulukneeloc *.npy
4 | include create_conda_env.sh
5 | 


--------------------------------------------------------------------------------
/KneeLocalizer/oulukneeloc/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
# Location of the SVM model file, which sits in the same directory as this
# module.
_PACKAGE_DIR = os.path.dirname(__file__)
SVM_MODEL_PATH = os.path.join(_PACKAGE_DIR, 'svm_model.npy')
6 | 


--------------------------------------------------------------------------------
/KneeLocalizer/create_conda_env.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | conda create -y -n knee_localizer python=3.6
 4 | conda install -y -n knee_localizer numpy opencv scipy
 5 | source activate knee_localizer
 6 | 
 7 | pip install pip -U
 8 | pip install pydicom
 9 | pip install tqdm
10 | 
11 | pip install -e .
12 | 


--------------------------------------------------------------------------------
/KneeLocalizer/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
 4 | setup(
 5 |     name='oulu-knee-localizer',
 6 |     version='0.1',
 7 |     author='Aleksei Tiulpin',
 8 |     author_email='aleksei.tiulpin@oulu.fi',
 9 |     packages=find_packages(),
10 |     include_package_data=True,
11 |     license='LICENSE.txt',
12 |     long_description=open('README.md').read(),
13 | )
14 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | backcall==0.1.0
 2 | cycler==0.10.0
 3 | decorator==4.3.2
 4 | ipykernel==5.1.0
 5 | ipython==6.4.0
 6 | ipython-genutils==0.2.0
 7 | jedi==0.13.2
 8 | jupyter-client==5.2.4
 9 | jupyter-core==4.4.0
10 | kiwisolver==1.0.1
11 | matplotlib==2.2.2
12 | notebook==5.7.4
13 | numpy==1.14.1
14 | opencv-python==3.4.3.18
15 | pandas==0.23.3
16 | parso==0.3.3
17 | patsy==0.5.0
18 | pexpect==4.6.0
19 | pickleshare==0.7.5
20 | Pillow==5.0.0
21 | prompt-toolkit==1.0.15
22 | ptyprocess==0.6.0
23 | pydicom==1.2.2
24 | Pygments==2.3.1
25 | pyparsing==2.2.0
26 | python-dateutil==2.5.0
27 | pytz==2018.5
28 | pyzmq==17.1.2
29 | scikit-learn==0.20.0
30 | scipy==1.0.1
31 | seaborn==0.9.0
32 | simplegeneric==0.8.1
33 | six==1.10.0
34 | statsmodels==0.9.0
35 | torch==0.3.1
36 | torchsummary==1.5.1
37 | torchvision==0.2.0
38 | tornado==5.1.1
39 | tqdm==4.30.0
40 | traitlets==4.3.2
41 | wcwidth==0.1.7
42 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Emma Pierson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/KneeLocalizer/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Aleksei Tiulpin
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/KneeLocalizer/README.md:
--------------------------------------------------------------------------------
 1 | # Code for the paper from SCIA'17: A novel method for automatic localization of joint area on knee plain radiographs
 2 | 
 3 | # Description
 4 | This repository contains the code for automatic knee joint detection on plain radiographs. It can be used to process a very large number of knee X-rays and generate bounding boxes (up to 6 000 000 per day on a high-end computer).
 5 | 
 6 | Our package processes the data in batch mode using multiple worker processes. To run it on your machine, you have to install the conda environment. For that, simply execute `create_conda_env.sh`.
 7 | 
 8 | # How to run
 9 | Run the script as follows:
10 | ```
11 | cd oulukneeloc
12 | python detector.py --path_input <dir with DICOM files> \
13 |                    --fname_output <file to write the results>
14 | ```
15 | 
16 | The script will produce 120 mm bounding boxes and save them to the specified file
17 | (by default, `../detection_results.txt`).
18 | 
19 | # How to cite
20 | If you use our package in your own research, please cite us:
21 | 
22 | ```
23 | @inproceedings{tiulpin2017novel,
24 |   title={A novel method for automatic localization of joint area on knee plain radiographs},
25 |   author={Tiulpin, Aleksei and Thevenot, Jerome and Rahtu, Esa and Saarakkala, Simo},
26 |   booktitle={Scandinavian Conference on Image Analysis},
27 |   pages={290--301},
28 |   year={2017},
29 |   organization={Springer}
30 | }
31 | ```
32 | 


--------------------------------------------------------------------------------
/KneeLocalizer/oulukneeloc/proposals.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pydicom as dicom
 3 | from traceback import print_exc
 4 | 
 5 | def read_dicom(filename):
 6 |     """Read DICOM file and convert it to a decent quality uint8 image.
 7 | 
 8 |     Parameters
 9 |     ----------
10 |     filename: str
11 |         Existing DICOM file filename.
12 |     """
13 |     try:
14 |         data = dicom.read_file(filename)
15 |         img = np.frombuffer(data.PixelData, dtype=np.uint16).copy()
16 | 
17 |         if data.PhotometricInterpretation == 'MONOCHROME1':
18 |             img = img.max() - img
19 |         img = img.reshape((data.Rows, data.Columns))
20 |         if hasattr(data, 'ImagerPixelSpacing') and type(data.ImagerPixelSpacing) is list:
21 |             pixel_spacing = data.ImagerPixelSpacing[0]
22 |         else:
23 |             pixel_spacing = data.PixelSpacing[0]
24 |         return img, pixel_spacing
25 |     except:
26 |         print_exc()
27 |         return None
28 | 
29 | 
def preprocess_xray(img, cut_min=5., cut_max=99.):
    """Preprocess the X-ray image using histogram clipping and global contrast normalization.

    Parameters
    ----------
    img: ndarray
        Input image. It is not modified; a float64 copy is processed.
    cut_min: float
        Lowest percentile which is used to cut the image histogram.
    cut_max: float
        Highest percentile.

    Returns
    -------
    ndarray
        uint8 image of the same shape, with the clipped intensity range
        stretched to 0..255. An image whose clipped range is empty (e.g. a
        constant image) is returned as all zeros.
    """

    img = img.astype(np.float64)

    lim1, lim2 = np.percentile(img, [cut_min, cut_max])

    # Clip the histogram tails in place, then shift the lower limit to zero.
    np.clip(img, lim1, lim2, out=img)
    img -= lim1

    # Guard against a zero dynamic range: dividing by zero would produce NaNs
    # whose cast to uint8 is undefined. In that case img is already all zeros.
    peak = img.max()
    if peak > 0:
        img /= peak
        img *= 255

    return img.astype(np.uint8, casting='unsafe')
54 | 
55 | 
def get_joint_y_proposals(img, av_points=11, margin=0.25):
    """Return Y-coordinates of the joint approximate locations.

    Parameters
    ----------
    img: ndarray
        2D image (rows x columns).
    av_points: int
        Length of the moving-average window used to smooth the row profile
        derivative.
    margin: float
        Fraction of the image height skipped at both the top and the bottom.

    Returns
    -------
    ndarray
        Row indices, in the coordinates of the full image, of every 10th
        entry among the strongest responses of the smoothed derivative.
    """

    R, C = img.shape

    # Build a per-row intensity profile by summing the middle third of the
    # columns, restricted to the rows between the top and bottom margins.
    segm_line = np.sum(img[int(R * margin):int(R * (1 - margin)),
                           int(C / 3):int(C - C / 3)], axis=1)
    # Differentiate the profile, smooth it with a box filter of av_points
    # samples (dropping the first av_points - 1 samples of the full
    # convolution) and take the absolute value.
    # NOTE(review): if img has an unsigned integer dtype (the callers in
    # detector.py pass the uint8 output of preprocess_xray), the row sums are
    # presumably unsigned too and np.diff would wrap negative differences
    # around instead of producing negative numbers -- confirm before relying
    # on the "absolute derivative" interpretation.
    segm_line = np.abs(np.convolve(
        np.diff(segm_line), np.ones((av_points, )) / av_points)[(av_points-1):])

    # Sort responses in descending order and keep the strongest
    # int(0.1 * R * (1 - 2 * margin)) of them.
    peaks = np.argsort(segm_line)[::-1][:int(0.1 * R * (1 - 2 * margin))]
    # Keep every 10th peak and shift by the top margin to get rows of the
    # full image.
    return peaks[::10] + int(R * margin)
71 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | Code to generate results in "An algorithmic approach to reducing unexplained pain disparities in underserved populations". Please contact emmap1@cs.stanford.edu with any questions. 
 3 | 
 4 | ## Citation
 5 | 
 6 | If you use this code, please cite: 
 7 | 
 8 | Emma Pierson, David M. Cutler, Jure Leskovec, Sendhil Mullainathan, and Ziad Obermeyer. An algorithmic approach to reducing unexplained pain disparities in underserved populations. *Nature Medicine*, 2021.
 9 | 
10 | If you use the OAI data, please cite: 
11 | 
12 | Michael C. Nevitt, David T. Felson, and Gayle Lester. The Osteoarthritis Initiative. 2006.
13 | 
14 | ## Regenerating results
15 | 
16 | 1. **Setting up virtualenv**. Our code is run in a virtual environment using Python 3.5.2. You can set up the environment by using `virtualenv -p python3.5 YOUR_PATH_TO_VIRTUALENV`, activating the virtualenv via `source YOUR_PATH_TO_VIRTUALENV/bin/activate`, and then installing packages via `pip install -r requirements.txt`. Make sure the virtual environment is activated prior to running any of the steps below.  If you want to run `main_results_for_public_repo.ipynb` you will additionally need to run `python -m ipykernel install --user --name=knee` to install a kernel for the IPython notebook; make sure to use this kernel when running the notebook. 
17 | 
18 | Additionally, if you are going to run `image_processing.py`, our code makes use of the KneeLocalizer [repo](https://github.com/MIPT-Oulu/KneeLocalizer) to crop knees in some of our initial experiments (not the final paper). For your convenience, we've provided a copy of this repo, since we made slight modifications to it to allow it to run on the OAI data. After setting up the virtualenv, please cd into the KneeLocalizer directory and run `python setup.py install`. Please let us know if you have any issues with this dependency, which is not essential to reproduce our analysis. 
19 | 
20 | 2. **Data processing**. We provide the code needed to regenerate the processed data from raw OAI data (which can be downloaded at [https://nda.nih.gov/oai/](https://nda.nih.gov/oai/)). Data was processed on a computer with several terabytes of RAM and hundreds of cores. We do not know whether the OAI data format provided online will remain constant over time - eg, folder names may change - so please contact us if you have any questions. 
21 | 
22 |     - a. Process the original DICOM files into a pickle of numpy arrays. This can be done by running `python image_processing.py`. (We recommend running this in a screen session or similar because it takes a while). 
23 |     - b. Write out the individual images as separate files because the original pickle is too large. This can be done by running 
24 |         `python image_processing.py --normalization_method our_statistics --show_both_knees_in_each_image True --downsample_factor_on_reload None --write_out_image_data True --seed_to_further_shuffle_train_test_val_sets None --crop_to_just_the_knee False`. Again, we recommend running this in a screen session. Note this actually writes out four datasets, not three - train, val, test, and a blinded hold out set. As described in the paper, all exploratory analysis on the paper was performed using only the train, val, and test sets. However, for the final analysis, we retrained models on the train+test sets and evaluated on the blinded hold out set. The four datasets can be combined into three using the method `rename_blinded_test_set_files` in `constants_and_util.py`. 
25 | 
26 | 3. **Set paths.** You will need to set paths suitable for your system in constants_and_util.py. Please see the "Please set these paths for your system" comment in `constants_and_util.py`, and the associated capitalized variables. 
27 | 
28 | 4. **Training models.** Neural network experiments are performed using `python train_models.py EXPERIMENT_NAME`. (For valid experiment names, see the `train_one_model` method.) Running this script will train neural net models indefinitely (after a model is trained and saved, training for a new one begins) which is useful for ensembling models. Models in the paper were trained using four Nvidia XP GPUs. Specific experiments discussed in the results are: 
29 | 
30 |     - `train_best_model_continuous`: Trains models to predict pain using the best-performing config. 
31 | 
32 |     - `hold_out_one_imaging_site`: Trains the models using data from all but one imaging site to confirm results generalize across sites. 
33 | 
34 |     - `predict_klg`: Train the models to predict KLG rather than pain (using same config as in `train_best_model_continuous`) and show that our results are comparable to previous ones. 
35 | 
36 |     - `increase_diversity`: Assess the effect of altering the racial or socioeconomic diversity in the train dataset while keeping dataset size constant. 
37 | 
38 | 5. **Analyzing models and generating figures for paper**. Once models have been run, figures and results in the paper can be reproduced by running `main_results_for_public_repo.ipynb`. Running this notebook takes a while (about a day) because of the number of bootstrap iterations, so we recommend running it in a screen session using, eg, `jupyter nbconvert --execute --ExecutePreprocessor.timeout=-1 --to notebook main_results_for_public_repo.ipynb`. A much faster approach is to run only the cells you need to reproduce the results of interest; alternately, you can reduce the number of bootstrap iterations. Note that running cells which call the class `non_image_data_processing.NonImageData` will require downloading the original non-image data from the OAI (but these files are much smaller and faster to process than the image files). 
39 | 
40 | ## Files
41 | 
42 | **constants_and_util.py**: Constants and general utility methods. 
43 | 
44 | **non_image_data_processing.py**: Processes non-image data.
45 | 
46 | **image_processing.py**: Processes image data and combines with non-image data. 
47 | 
48 | **train_models.py**: Trains the neural network models used in analysis. 
49 | 
50 | **analysis.py**: Helper methods for analysis used in the paper. 
51 | 
52 | **main_results_for_public_repo.ipynb**: Generates the figures and numerical results in the paper using the methods in `analysis.py`. 
53 | 
54 | **requirements.txt**: Packages used in the virtualenv. 
55 | 


--------------------------------------------------------------------------------
/KneeLocalizer/oulukneeloc/detector.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import argparse
  4 | from multiprocessing import Pool, cpu_count
  5 | 
  6 | import numpy as np
  7 | import cv2
  8 | from tqdm import tqdm
  9 | 
 10 | from oulukneeloc import SVM_MODEL_PATH
 11 | from oulukneeloc.proposals import (read_dicom, get_joint_y_proposals,
 12 |                                    preprocess_xray)
 13 | 
 14 | 
 15 | class KneeLocalizer:
 16 |     def __init__(self, svm_model_path=SVM_MODEL_PATH, size_mm=120):
 17 |         super().__init__()
 18 |         self.win_size = (64, 64)
 19 |         self.win_stride = (64, 64)
 20 |         self.block_size = (16, 16)
 21 |         self.block_stride = (8, 8)
 22 |         self.cell_size = (8, 8)
 23 |         self.padding = (0, 0)
 24 |         self.nbins = 9
 25 |         self.scales = [3.2, 3.3, 3.4, 3.6, 3.8]
 26 |         self.step = 95
 27 | 
 28 |         self.size_mm = size_mm
 29 |         self.svm_w, self.svm_b = np.load(svm_model_path, encoding='bytes')
 30 | 
 31 |     def predict(self, fileobj, spacing=None):
 32 |         """Localize the left and the right knee joints in PA X-ray image.
 33 | 
 34 |         Parameters
 35 |         ----------
 36 |         fileobj: str or ndarray
 37 |             Filename of the DICOM image, or already extracted uint16 ndarray.
 38 |         spacing: float or None
 39 |             Spacing extracted from the previously read DICOM.
 40 | 
 41 |         Returns
 42 |         -------
 43 |         detections: list of lists
 44 |             The first list has the bbox for the left knee joint.
 45 |             The second list has the bbox for the right knee joint.
 46 |         """
 47 | 
 48 |         if isinstance(fileobj, str):
 49 |             tmp = read_dicom(fileobj)
 50 |             if tmp is None:
 51 |                 return None
 52 |             if len(tmp) != 2:
 53 |                 return None
 54 |             img, spacing = tmp
 55 |             img = preprocess_xray(img)
 56 |         elif isinstance(fileobj, np.ndarray):
 57 |             img = fileobj
 58 |             if spacing is None:
 59 |                 raise ValueError
 60 |         else:
 61 |             raise ValueError
 62 | 
 63 |         R, C = img.shape
 64 |         split_point = C // 2
 65 |         spacing = float(spacing)
 66 |         assert spacing > 0
 67 | 
 68 |         right_leg = img[:, :split_point]
 69 |         left_leg = img[:, split_point:]
 70 | 
 71 |         sizepx = int(self.size_mm / spacing)  # Proposal size
 72 | 
 73 |         # We will store the coordinates of the top left and
 74 |         # the bottom right corners of the bounding box
 75 |         hog = cv2.HOGDescriptor(self.win_size,
 76 |                                 self.block_size,
 77 |                                 self.block_stride,
 78 |                                 self.cell_size,
 79 |                                 self.nbins)
 80 | 
 81 |         # Make proposals for the right leg
 82 |         R, C = right_leg.shape
 83 |         displacements = range(-C // 4, 1 * C // 4 + 1, self.step)
 84 |         prop = get_joint_y_proposals(right_leg)
 85 |         best_score = -np.inf
 86 | 
 87 |         for y_coord in prop:
 88 |             for x_displ in displacements:
 89 |                 for scale in self.scales:
 90 |                     if C / 2 + x_displ - R / scale / 2 >= 0:
 91 |                         # Candidate ROI
 92 |                         roi = np.array([C / 2 + x_displ - R / scale / 2,
 93 |                                         y_coord - R / scale / 2,
 94 |                                         R / scale, R / scale], dtype=np.int)
 95 |                         x1, y1 = roi[0], roi[1]
 96 |                         x2, y2 = roi[0] + roi[2], roi[1] + roi[3]
 97 |                         patch = cv2.resize(img[y1:y2, x1:x2], (64, 64))
 98 | 
 99 |                         hog_descr = hog.compute(patch, self.win_stride, self.padding)
100 |                         score = np.inner(self.svm_w, hog_descr.ravel()) + self.svm_b
101 | 
102 |                         if score > best_score:
103 |                             jc = np.array([C / 2 + x_displ, y_coord])
104 |                             best_score = score
105 | 
106 |         roi_R = np.array([jc[0] - sizepx // 2,
107 |                           jc[1] - sizepx // 2,
108 |                           jc[0] + sizepx // 2,
109 |                           jc[1] + sizepx // 2]).round().astype(np.int)
110 | 
111 |         # Make proposals for the left leg
112 |         R, C = left_leg.shape
113 |         displacements = range(-C // 4, 1 * C // 4 + 1, self.step)
114 |         prop = get_joint_y_proposals(left_leg)
115 |         best_score = -np.inf
116 | 
117 |         for y_coord in prop:
118 |             for x_displ in displacements:
119 |                 for scale in self.scales:
120 |                     if split_point + x_displ + R / scale / 2 < img.shape[1]:
121 |                         roi = np.array([split_point + C / 2 + x_displ - R / scale / 2,
122 |                                         y_coord - R / scale / 2,
123 |                                         R / scale, R / scale], dtype=np.int)
124 |                         x1, y1 = roi[0], roi[1]
125 |                         x2, y2 = roi[0] + roi[2], roi[1] + roi[3]
126 |                         patch = np.fliplr(cv2.resize(img[y1:y2, x1:x2], (64, 64)))
127 | 
128 |                         hog_descr = hog.compute(patch, self.win_stride, self.padding)
129 |                         score = np.inner(self.svm_w, hog_descr.ravel()) + self.svm_b
130 | 
131 |                         if score > best_score:
132 |                             jc = np.array([split_point + C / 2 + x_displ, y_coord])
133 |                             best_score = score
134 | 
135 |         roi_L = np.array([jc[0] - sizepx // 2,
136 |                           jc[1] - sizepx // 2,
137 |                           jc[0] + sizepx // 2,
138 |                           jc[1] + sizepx // 2]).round().astype(np.int)
139 | 
140 |         return [roi_L.tolist(), roi_R.tolist()], img
141 | 
142 | 
def worker(fname, path_input, localizer):
    """Detect both knee joints in one DICOM file and format the result line.

    Parameters
    ----------
    fname: str
        File name inside ``path_input``.
    path_input: str
        Directory containing the DICOM files.
    localizer: KneeLocalizer
        Localizer whose ``predict`` method is used.

    Returns
    -------
    str
        ``fname`` followed by the four left-knee and the four right-knee
        bounding box coordinates, separated by spaces. All eight coordinates
        are ``-1`` when the file cannot be read or the detection fails.
    """
    failed = [[-1] * 4, [-1] * 4]

    tmp = read_dicom(os.path.join(path_input, fname))
    if tmp is None:
        ret = [fname, ] + [-1, ] * 4 + [-1, ] * 4
        return ' '.join([str(e) for e in ret])

    img, spacing = tmp
    img = preprocess_xray(img)
    try:
        result = localizer.predict(img, spacing)
    except Exception:
        print('Error finding the knee joints')
        result = None

    if result is None:
        detections = failed
    elif isinstance(result, tuple):
        # predict() returns ``(detections, img)``; only the bounding boxes are
        # written out. Concatenating the image itself (as the previous code
        # effectively did) fails.
        detections = result[0]
    else:
        # Tolerate a localizer that returns the bare list of bounding boxes.
        detections = result
    return ' '.join(map(str, [fname, ] + detections[0] + detections[1]))
160 | 
161 | 
def parse_args():
    """Parse the command line arguments of the detector script.

    Returns
    -------
    argparse.Namespace
        ``path_input`` and ``fname_output``, both converted to absolute paths.
    """
    parser = argparse.ArgumentParser()
    # The input directory has no sensible default; without ``required`` a
    # missing value reached os.path.abspath(None) and crashed with a TypeError
    # instead of a usage message.
    parser.add_argument('--path_input', "--dir", required=True,
                        help='directory with DICOM files')
    parser.add_argument('--fname_output', "--output",
                        default='../detection_results.txt',
                        help='file to write the results to')

    args = parser.parse_args()
    args.path_input = os.path.abspath(args.path_input)
    args.fname_output = os.path.abspath(args.fname_output)
    return args
172 | 
173 | 
if __name__ == "__main__":
    # Batch mode: run the localizer on every file of the input directory in
    # a pool of worker processes and write one result line per file.
    from functools import partial

    args = parse_args()

    ts_start = time.time()

    localizer = KneeLocalizer()

    # Bind the fixed arguments with functools.partial of the module-level
    # ``worker``. A function defined inside this ``__main__`` guard does not
    # exist in child processes that re-import the module (spawn start
    # method), so it could not be resolved there.
    worker_partial = partial(worker, path_input=args.path_input,
                             localizer=localizer)

    fnames = os.listdir(args.path_input)

    # imap preserves the input order, so results line up with ``fnames``.
    with Pool(cpu_count()) as pool:
        res = list(tqdm(pool.imap(worker_partial, fnames), total=len(fnames)))

    with open(args.fname_output, 'w') as f:
        f.writelines(entry + '\n' for entry in res)

    ts_end = time.time() - ts_start
    print('Script execution took {} seconds'.format(ts_end))
196 | 


--------------------------------------------------------------------------------
/constants_and_util.py:
--------------------------------------------------------------------------------
  1 | import matplotlib
  2 | matplotlib.use('Agg')
  3 | import warnings 
  4 | import os
  5 | import pandas as pd
  6 | import copy
  7 | from scipy.stats import pearsonr
  8 | import random
  9 | from collections import Counter
 10 | import numpy as np
 11 | import pickle
 12 | import platform
 13 | import sys
 14 | import subprocess
 15 | import time
 16 | import platform
 17 | import getpass
 18 | 
 19 | 
 20 | node_name = platform.node().split('.')[0]
 21 | print("Running code on %s with Python version %s" % (node_name, sys.version.split()[0]))
 22 | 
 23 | pd.set_option('max_columns', 500)
 24 | warnings.filterwarnings("ignore", message="numpy.dtype size changed")
 25 | warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
 26 | warnings.filterwarnings("ignore", message="This call to matplotlib.use() has no effect because the backend has already been chosen")
 27 | 
 28 | 
# Global experiment settings.
# USE_HELD_OUT_TEST_SET selects which processed-image directory is used in
# the original author's branch below.
USE_HELD_OUT_TEST_SET = True
MODEL_NAME = 'resnet18'
TOTAL_PEOPLE = 4796  # presumably the number of OAI participants -- TODO confirm
N_BOOTSTRAPS = 1000  # presumably the number of bootstrap iterations used in the analysis -- TODO confirm

# Path configuration. The original author's account ('emmap1') gets the
# cluster-specific paths; every other user must fill in the placeholder paths
# in the else-branch. The asserts after this block run at import time.
if getpass.getuser() == 'emmap1':
    # Do not modify this code; it is the original logic the authors used to process the images/run models, maintained for reproducibility.
    REPROCESS_RAW_DATA = True # set this to False if you just want to work with the processed data, and don't need to reprocess it. 
    assert node_name in ['hyperion', 'hyperion2', 'hyperion3', 'rambo', 'trinity', 'turing1', 'turing2']
    assert sys.version.split()[0] == '3.5.2'
    NODE_TO_USE_TO_STORE_IMAGES_FOR_GPU = 'hyperion3'
    assert NODE_TO_USE_TO_STORE_IMAGES_FOR_GPU in ['turing2', 'hyperion3']
    BASE_NON_IMAGE_DATA_DIR = '/dfs/dataset/tmp/20180910-OAI/data/emma_downloaded_oai_data_9112018/'
    BASE_IMAGE_DATA_DIR = '/dfs/dataset/tmp/20180910-OAI/data/'
    BASE_MURA_DIR = '/dfs/dataset/tmp/20180910-OAI/data/mura_pretrained_weights/'
    DFS_BASE_IMAGES_PATH = os.path.join(BASE_IMAGE_DATA_DIR, 'processed_image_data', 'individual_images')
    FITTED_MODEL_DIR = os.path.join(BASE_IMAGE_DATA_DIR, 'fitted_models')

    assert os.path.exists(DFS_BASE_IMAGES_PATH)
    # NOTE(review): every node in this list triggers the exception below, and
    # 'turing3' / 'turing4' cannot pass the node_name assert above.
    if node_name in ['rambo', 'trinity', 'hyperion2', 'hyperion', 'turing1', 'turing2', 'turing3', 'turing4']:
        # if we are on rambo or trinity, we are reprocessing the images.
        INDIVIDUAL_IMAGES_PATH = DFS_BASE_IMAGES_PATH
        raise Exception("This is likely the wrong computer to be processing things on; it uses the wrong test set and you should be using hyperion3. Unless you are testing or regenerating data, something may be wrong.")
    else:
        # Pick the node-local copy of the processed images, with or without
        # the held-out test set.
        if NODE_TO_USE_TO_STORE_IMAGES_FOR_GPU == 'hyperion3':
            if USE_HELD_OUT_TEST_SET:
                INDIVIDUAL_IMAGES_PATH = '/lfs/hyperion3/0/emmap1/oai_image_data/processed_image_data_new_with_held_out_test_set_april_2019/'
            else:
                INDIVIDUAL_IMAGES_PATH = '/lfs/hyperion3/0/emmap1/oai_image_data/processed_image_data/'
        elif NODE_TO_USE_TO_STORE_IMAGES_FOR_GPU == 'turing2':
            if USE_HELD_OUT_TEST_SET:
                INDIVIDUAL_IMAGES_PATH = '/lfs/turing2/0/emmap1/oai_image_data/processed_image_data_new_with_held_out_test_set_april_2019/'
            else:
                INDIVIDUAL_IMAGES_PATH = '/lfs/turing2/0/emmap1/oai_image_data/processed_image_data/'
        else:
            raise Exception("invalid place to store images for GPU")
else:
    # Please modify variables / paths here. 
    if sys.version.split()[0] != '3.5.2':
        print("Warning: running code with a Python version which differs from original Python version (3.5.2)")
    REPROCESS_RAW_DATA = False # set this to False if you just want to work with the processed data, and don't need to reprocess it. 
    
    # Please set these paths for your system. 
    INDIVIDUAL_IMAGES_PATH = 'THIS_IS_A_TEMPORARY_PATH_PLEASE_REPLACE_ME' # points to the directory which stores the processed data, so you should download the processed data into this folder. If you are reprocessing the raw data, the individual images will be stored in this folder. 
    FITTED_MODEL_DIR = 'THIS_IS_A_TEMPORARY_PATH_PLEASE_REPLACE_ME' # This is where you store the fitted models.  Please create three empty subdirectories in this directory: "configs", "results", and "model_weights". 
    
    # Only need to set these paths if you are reprocessing raw data. 
    BASE_NON_IMAGE_DATA_DIR = 'THIS_IS_A_TEMPORARY_PATH_PLEASE_REPLACE_ME' # Set this path to point to the directory where you downloaded the NON-IMAGE OAI data - eg, it should contain folders like "AllClinical_ASCII". 
    BASE_IMAGE_DATA_DIR = 'THIS_IS_A_TEMPORARY_PATH_PLEASE_REPLACE_ME' # Set this path to point to the directory where you downloaded the IMAGE OAI data - eg, it should contain folders like "00m" for each timepoint. 

# Import-time sanity checks: importing this module raises AssertionError
# while the configured directories (or the three model subdirectories) do not
# exist, e.g. while the placeholder paths above are still in place.
assert os.path.exists(INDIVIDUAL_IMAGES_PATH), 'You need to set INDIVIDUAL_IMAGES_PATH; see "Please set these paths for your system" comment in constants_and_util.py'
assert os.path.exists(FITTED_MODEL_DIR), 'You need to set FITTED_MODEL_DIR; see "Please set these paths for your system" comment in constants_and_util.py. After setting this directory, please create empty subdirectories called "configs", "results", and "model_weights" within it'
assert os.path.exists(os.path.join(FITTED_MODEL_DIR, 'configs')) and os.path.exists(os.path.join(FITTED_MODEL_DIR, 'results')) and os.path.exists(os.path.join(FITTED_MODEL_DIR, 'model_weights')), 'Please create empty subdirectories called "configs","results", and "model_weights" within %s' % FITTED_MODEL_DIR

if REPROCESS_RAW_DATA:
    # these paths are primarily used in reprocessing data; they store the non-image data and image data. 
    assert os.path.exists(BASE_NON_IMAGE_DATA_DIR), 'If you are reprocessing raw data, you need to set BASE_NON_IMAGE_DATA_DIR; see "Please set these paths for your system" comment in constants_and_util.py'
    assert os.path.exists(BASE_IMAGE_DATA_DIR), 'If you are reprocessing raw data, you need to set BASE_IMAGE_DATA_DIR; see "Please set these paths for your system" comment in constants_and_util.py'

# Target size of the resampled images; the assert below requires it to be
# square.
RESAMPLED_IMAGE_SIZE = [1024, 1024]
# Half of the full resampled size along each dimension.
CROPPED_KNEE_RESAMPLED_IMAGE_SIZE = [int(0.5 * RESAMPLED_IMAGE_SIZE[0]), int(0.5 * RESAMPLED_IMAGE_SIZE[1])] # smaller because it's just the knee. 
                                     
assert RESAMPLED_IMAGE_SIZE[0] == RESAMPLED_IMAGE_SIZE[1]
# Keyword arguments for the image dataset; presumably consumed by the image
# processing / training code -- verify against the callers.
IMAGE_DATASET_KWARGS = {'desired_image_type':'Bilateral PA Fixed Flexion Knee',
                'normalization_method':'our_statistics',
                'max_images_to_load':1000000000}
# Column-name lists; presumably they refer to columns of the processed
# non-image data -- verify against non_image_data_processing.py.
GAPS_OF_INTEREST_COLS = ['race_black', 'binarized_income_at_least_50k', 'binarized_education_graduated_college', 'is_male']
CLINICAL_CONTROL_COLUMNS = ['xrosfm', 'xrscfm','xrcyfm', 'xrjsm', 'xrchm','xrostm','xrsctm','xrcytm','xrattm','xrkl','xrosfl','xrscfl','xrcyfl', 'xrjsl','xrchl','xrostl','xrsctl','xrcytl','xrattl']
OTHER_KOOS_PAIN_SUBSCORES = ['koos_function_score', 'koos_quality_of_life_score', 'koos_symptoms_score']
 98 | MEDICATION_CODES = {'V00RXACTM':'Acetaminophen', 
 99 |                         'V00RXANALG':'Analgesic',
100 |                         'V00RXASPRN':'Aspirin',
101 |                         'V00RXBISPH':'Bisphosphonate',
102 |                         'V00RXCHOND':'Chondroitin',
103 |                         'V00RXCLCTN':'Calcitonin',
104 |                         'V00RXCLCXB':'Celecoxib',
105 |                         'V00RXCOX2':'COX_II',
106 |                         'V00RXFLUOR':'Fluoride',
107 |                         'V00RXGLCSM':'Glucosamine',
108 |                         'V00RXIHYAL':'Injected_hyaluronic_acid',
109 |                         'V00RXISTRD':'Injected_corticosteroid',
110 |                         'V00RXMSM':'Methylsulfonylmethane',
111 |                         'V00RXNARC':'Narcotic_analgesic',
112 |                         'V00RXNSAID':'NSAID',
113 |                         'V00RXNTRAT':'Nitrate',
114 |                         'V00RXOSTRD':'Oral_corticosteroid',
115 |                         'V00RXOTHAN':'Other_analgesic',
116 |                         'V00RXRALOX':'Raloxifene',
117 |                         'V00RXRFCXB':'Rofecoxib',
118 |                         'V00RXSALIC':'Salicylate',
119 |                         'V00RXSAME':'S_adenosylmethionine',
120 |                         'V00RXTPRTD':'Teriparatide',
121 |                         'V00RXVIT_D':'Vitamin_D',
122 |                         'V00RXVLCXB':'Valdecoxib'
123 |     }
124 | 
# Variable-name prefixes by visit:
#   V00 = enrollment visit, V01 = 12-month follow-up, V02 = 18-month interim visit,
#   V03 = 24-month follow-up, V04 = 30-month follow-up, V05 = 36-month follow-up,
#   V06 = 48-month follow-up, V08 = 72-month follow-up, V10 = 96-month follow-up.
# See also the document ClinicalDataGettingStartedOverview.pdf.
CLINICAL_WAVES_TO_FOLLOWUP = {
    '00': '00 month follow-up: Baseline',
    '01': '12 month follow-up',
    '03': '24 month follow-up',
    '05': '36 month follow-up',
    '06': '48 month follow-up',
    '07': '60 month follow-up',
    '08': '72 month follow-up',
    '09': '84 month follow-up',
    '10': '96 month follow-up',
    '11': '108 month follow-up',
}

# Follow-up labels of the timepoints that are kept.
TIMEPOINTS_TO_FILTER_FOR = [
    '12 month follow-up',
    '24 month follow-up',
    '36 month follow-up',
    '48 month follow-up',
    '00 month follow-up: Baseline',
]

# Wave codes (keys of CLINICAL_WAVES_TO_FOLLOWUP) of the timepoints that are kept.
WAVES_WE_ARE_USING = ['00', '01', '03', '05', '06']

# The two descriptions of "timepoints we keep" must agree.
assert set(TIMEPOINTS_TO_FILTER_FOR) == set([CLINICAL_WAVES_TO_FOLLOWUP[wave] for wave in WAVES_WE_ARE_USING])
147 | 
# Fraction of all people assigned to each split, expressed relative to TOTAL_PEOPLE:
# roughly 500 people each for val and test, 1500 for the hold-out set, and everyone else for train.
# NOTE: the exact float expressions matter, because the split boundaries elsewhere are computed as
# int(fraction * n); do not "simplify" them.
TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS = {'train_frac':(TOTAL_PEOPLE - 1500. - 1000.)/TOTAL_PEOPLE,
                                  'val_frac':(500. + 1e-3)/TOTAL_PEOPLE, # the extra 1e-3 is a small hack so that train_frac + val_frac + test_frac does not get rounded down when converted to an integer cutoff.
                                  'test_frac':500./TOTAL_PEOPLE, 
                                  'hold_out_frac':1500./TOTAL_PEOPLE} 

# Guard: if the hold-out fraction is ever changed, it may only be made smaller.
assert TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS['hold_out_frac'] <= 1500./TOTAL_PEOPLE # if you do change hold out set, must make it smaller. 
154 | 
155 | 
# Maps image timepoint directory names (e.g. "00m") to follow-up labels.
IMAGE_TIMEPOINT_DIRS_TO_FOLLOWUP = {
    '00m': '00 month follow-up: Baseline',
    '12m': '12 month follow-up',
    '18m': '18 month follow-up',
    '24m': '24 month follow-up',
    '30m': '30 month follow-up',
    '36m': '36 month follow-up',
    '48m': '48 month follow-up',
    '72m': '72 month follow-up',
    '96m': '96 month follow-up',
}

# Thresholds used to binarize the KOOS and WOMAC pain scores.
KOOS_BINARIZATION_THRESH = 86.1
WOMAC_BINARIZATION_THRESH = 3.
167 | 
# Groups of covariate terms; entries wrapped in C(...) are written in formula syntax for categorical variables.
AGE_RACE_SEX_SITE = [
    'C(age_at_visit)*C(p02sex)',
    'C(p02hisp)',
    "C(p02race, Treatment(reference='1: White or Caucasian'))",
    'C(v00site)',
]
AGE_SEX_SITE_NO_RACE = [
    'C(age_at_visit)*C(p02sex)',
    'C(v00site)',
]

KNEE_INJURY_OR_SURGERY = ['C(knee_surgery)', 'C(knee_injury)']

# Each medical-history column is wrapped as a categorical term.
MEDICAL_HISTORY = [
    'C(%s)' % history_column
    for history_column in (
        'hrtat', 'hrtfail', 'bypleg', 'stroke', 'asthma', 'lung', 'ulcer',
        'diab', 'kidfxn', 'ra', 'polyrh', 'livdam', 'cancer',
    )
]

# Pain reported at sites other than the knee.
OTHER_PAIN = [
    'left_hip_pain_more_than_half_of_days',
    'right_hip_pain_more_than_half_of_days',
    'how_often_bothered_by_back_pain',
    'left_foot_pain_more_than_half_of_days',
    'right_foot_pain_more_than_half_of_days',
    'left_ankle_pain_more_than_half_of_days',
    'right_ankle_pain_more_than_half_of_days',
    'left_shoulder_pain_more_than_half_of_days',
    'right_shoulder_pain_more_than_half_of_days',
    'left_elbow_pain_more_than_half_of_days',
    'right_elbow_pain_more_than_half_of_days',
    'left_wrist_pain_more_than_half_of_days',
    'right_wrist_pain_more_than_half_of_days',
    'left_hand_pain_more_than_half_of_days',
    'right_hand_pain_more_than_half_of_days',
]

RISK_FACTORS = ['C(cigarette_smoker)', 'C(drinks_per_week)', 'C(v00maritst)']

BMI = [
    "C(current_bmi, Treatment(reference='18.5-25'))",
    "C(max_bmi, Treatment(reference='18.5-25'))",
]

# Each MRI column is wrapped as a categorical term.
MRI = [
    'C(%s)' % mri_column
    for mri_column in (
        'bml2plusl', 'bml2plusm', 'bml2pluspf',
        'car11plusl', 'car11plusm', 'car11pluspf',
        'menextl', 'menextm', 'mentearl', 'mentearm',
    )
]

FRACTURES_AND_FALLS = [
    'fractured_spine',
    'fractured_hip',
    'fractured_bone',
    'fell_in_last_12_months',
]

TIMEPOINT_AND_SIDE = [
    "C(visit, Treatment(reference='00 month follow-up: Baseline'))",
    'side',
    'C(dominant_leg)',
]
198 | 
def validate_folder_contents(path):
    """
    Check that a processed-image folder holds exactly the expected files and return the highest image number.

    The folder must contain 'image_codes.pkl', 'non_image_data.csv', and files named
    image_0.npy ... image_K.npy with no gaps. Returns K.
    Raises ValueError if a metadata file is missing or there are no image files,
    and AssertionError if the image numbering is not contiguous from 0.
    """
    remaining_filenames = os.listdir(path)
    # list.remove raises ValueError if a required metadata file is absent.
    for metadata_filename in ('image_codes.pkl', 'non_image_data.csv'):
        remaining_filenames.remove(metadata_filename)

    # Everything left must be an image file; strip the prefix and suffix to recover its number.
    image_numbers = []
    for filename in remaining_filenames:
        stripped_name = filename.replace('image_', '').replace('.npy', '')
        image_numbers.append(int(stripped_name))
    image_numbers.sort()

    highest_image_number = max(image_numbers)
    assert image_numbers == list(range(highest_image_number + 1))
    return highest_image_number
210 | 
211 | 
def rename_blinded_test_set_files(base_dir, inner_folder, running_for_real=False):
    """
    Take the four folders 'BLINDED_HOLD_OUT_DO_NOT_USE', 'test', 'train', 'val' 
    and combine them into three:
    test + train -> train
    val -> val
    BLINDED_HOLD_OUT_DO_NOT_USE -> test

    IMPORTANT: as written, this function raises an Exception on its first statement,
    so nothing below that line runs. The guard is deliberate because the rest of the
    function moves and deletes files.

    Parameters:
    base_dir: directory that must contain exactly the four dataset folders listed above.
    inner_folder: name of the subfolder inside each dataset folder that holds the processed data.
    running_for_real: if False, the shell commands are only printed and no files are written,
        moved, or deleted; if True, they are executed.

    Sample usage: 
    rename_blinded_test_set_files('/lfs/hyperion3/0/emmap1/oai_image_data/processed_image_data_new_with_held_out_test_set_april_2019/', 
                                'show_both_knees_True_downsample_factor_None_normalization_method_our_statistics', 
                                running_for_real=True)

    """
    # Safety guard: everything after this line is currently unreachable.
    raise Exception("This method is destructive. Do you actually want to run it?")
    print("Relabeling folders in %s" % base_dir)
    expected_datasets = ['BLINDED_HOLD_OUT_DO_NOT_USE', 'test', 'train', 'val']
    assert sorted(os.listdir(base_dir)) == sorted(expected_datasets)
    # Validate every dataset folder before touching anything.
    for dataset in expected_datasets:
        full_folder_path = os.path.join(base_dir, dataset, inner_folder)
        assert os.path.exists(full_folder_path)
        max_image_number = validate_folder_contents(full_folder_path)
        print("Maximum image number in %s: %i (total images is this + 1)" % (dataset, max_image_number))

    # combine test + train set into train set. 
    full_train_path = os.path.join(base_dir, 'train', inner_folder)
    full_test_path = os.path.join(base_dir, 'test', inner_folder)
    max_train_image = validate_folder_contents(full_train_path)
    max_test_image = validate_folder_contents(full_test_path)

    # Concatenate the non-image rows (train first, then test) and renumber the index from 0.
    train_non_image_data =  pd.read_csv(os.path.join(full_train_path, 'non_image_data.csv'), index_col=0)
    test_non_image_data =  pd.read_csv(os.path.join(full_test_path, 'non_image_data.csv'), index_col=0)
    combined_non_image_data = pd.concat([train_non_image_data, test_non_image_data])
    combined_non_image_data.index = range(len(combined_non_image_data))

    # Concatenate the image codes in the same order (train first, then test).
    train_image_codes = pickle.load(open(os.path.join(full_train_path, 'image_codes.pkl'), 'rb'))
    test_image_codes = pickle.load(open(os.path.join(full_test_path, 'image_codes.pkl'), 'rb'))
    combined_image_codes = train_image_codes + test_image_codes

    # Sanity checks: rows and codes line up, and the counts match the number of image files.
    ensure_barcodes_match(combined_non_image_data, combined_image_codes)
    assert len(combined_image_codes) == len(combined_non_image_data)
    assert len(combined_image_codes) == (max_train_image + 1) + (max_test_image + 1)

    print("Moving test images to train folder")
    for i in range(max_test_image + 1):
        old_path = os.path.join(full_test_path, 'image_%i.npy' % i)
        # Test image i is renumbered to follow the last existing train image.
        new_path = os.path.join(full_train_path, 'image_%i.npy' % (max_train_image + 1 + i))
        assert os.path.exists(old_path)
        assert not os.path.exists(new_path)
        cmd = 'mv %s %s' % (old_path, new_path)
        print(cmd)
        if running_for_real:
            os.system(cmd)

    print("Moving test non-image data to train folder")
    if running_for_real:
        # Overwrites the train folder's metadata files with the combined versions.
        combined_non_image_data.to_csv(os.path.join(full_train_path, 'non_image_data.csv'))
        pickle.dump(combined_image_codes, open(os.path.join(full_train_path, 'image_codes.pkl'), 'wb'))
        
    print("Renaming blinded held out set to test")
    full_blinded_held_out_path =  os.path.join(base_dir, 'BLINDED_HOLD_OUT_DO_NOT_USE', inner_folder)
    if running_for_real:
        # Delete what is left of the old test folder, then move the blinded hold-out data into its place.
        os.system('rm -rf %s' % full_test_path)
        os.system('mv %s %s' % (full_blinded_held_out_path, full_test_path))
    expected_datasets = ['test', 'train', 'val']
    # NOTE(review): only the inner folder is moved above, so the outer 'BLINDED_HOLD_OUT_DO_NOT_USE'
    # directory appears to remain in base_dir (and nothing is moved at all in a dry run); this
    # assertion therefore looks like it would fail unless that directory is removed separately -- confirm.
    assert sorted(os.listdir(base_dir)) == sorted(expected_datasets)
    print("Done relabeling folders")
279 | 
280 | 
def binarize_koos(koos_arr):
    """
    Binarize KOOS scores: 1.0 where the score is at or below KOOS_BINARIZATION_THRESH, else 0.0.
    """
    at_or_below_threshold = koos_arr <= KOOS_BINARIZATION_THRESH
    # Multiplying by 1. converts the boolean result to floats.
    return 1. * at_or_below_threshold
283 | 
def binarize_womac(womac_arr):
    """
    Binarize WOMAC scores: 1.0 where the score is strictly above WOMAC_BINARIZATION_THRESH, else 0.0.
    """
    above_threshold = womac_arr > WOMAC_BINARIZATION_THRESH
    # Multiplying by 1. converts the boolean result to floats.
    return 1. * above_threshold
286 | 
def get_all_ids():
    """
    Read AllClinical00.txt (pipe-separated) and return its 'ID' column as a sorted list of ints.
    Asserts that the ids are unique and that there are exactly TOTAL_PEOPLE of them.
    """
    clinical_file_path = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'AllClinical_ASCII', 'AllClinical00.txt')
    clinical_table = pd.read_csv(clinical_file_path, sep='|')

    ids = list(clinical_table['ID'].values.astype(int))
    ids.sort()

    number_of_distinct_ids = len(set(ids))
    assert number_of_distinct_ids == len(ids)
    assert len(ids) == TOTAL_PEOPLE
    return ids
297 | 
def make_train_val_test_hold_out_set(seed_to_further_shuffle_train_test_val_sets):
    """
    Get the lists of ids in the train / val / test / blinded hold-out sets.

    All ids are shuffled with a fixed seed (0) and then cut into four consecutive slices
    according to TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS.

    seed_to_further_shuffle_train_test_val_sets: if None, the original split is returned.
        Otherwise the ids in the train/val/test portion (everything except the blinded
        hold-out set) are shuffled again with this seed, as a robustness check to see how
        much results vary across splits.

    Returns a dict with keys 'train_ids', 'val_ids', 'test_ids', 'BLINDED_HOLD_OUT_DO_NOT_USE_ids'.

    Side effects: the shuffled id order and the hold-out ids are cached as pickles in
    BASE_NON_IMAGE_DATA_DIR the first time this runs; on later runs the freshly computed
    values are asserted to equal the cached ones.
    """

    if seed_to_further_shuffle_train_test_val_sets is not None:
        print("Attention: further shuffling with random seed %s" % str(seed_to_further_shuffle_train_test_val_sets))
    train_frac = TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS['train_frac']
    val_frac = TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS['val_frac']
    test_frac = TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS['test_frac']
    hold_out_frac = TRAIN_VAL_TEST_HOLD_OUT_FRACTIONS['hold_out_frac']
    assert np.allclose(train_frac + val_frac + test_frac + hold_out_frac, 1)
    ids = get_all_ids()
    n = len(ids)
    random.Random(0).shuffle(ids)

    # make sure the ids are in the same order as before (random seeds are the same). 
    shuffled_id_path = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'shuffled_ids.pkl')
    if os.path.exists(shuffled_id_path):
        # Context managers close the file handles, which the bare open() calls never did.
        with open(shuffled_id_path, 'rb') as cached_file:
            previously_cached_ids = pickle.load(cached_file)
        assert ids == previously_cached_ids
    else:
        with open(shuffled_id_path, 'wb') as cached_file:
            pickle.dump(ids, cached_file)

    # Slice boundaries. The expressions are kept exactly as before so the split does not change.
    train_end = int(train_frac*n)
    val_end = int((train_frac + val_frac)*n)
    test_end = int((train_frac + val_frac + test_frac)*n)

    if seed_to_further_shuffle_train_test_val_sets is not None:
        # if seed is not None, further shuffle everything but the blinded hold out set. 
        train_test_val_ids = copy.deepcopy(ids[:test_end])
        random.Random(seed_to_further_shuffle_train_test_val_sets).shuffle(train_test_val_ids)
        ids[:test_end] = train_test_val_ids

    results = {'train_ids':ids[:train_end], 
               'val_ids':ids[train_end:val_end], 
               'test_ids':ids[val_end:test_end], 
               'BLINDED_HOLD_OUT_DO_NOT_USE_ids':ids[test_end:]}

    # Every id must land in exactly one split.
    assert sorted(results['train_ids'] + results['val_ids'] + results['test_ids'] + results['BLINDED_HOLD_OUT_DO_NOT_USE_ids']) == sorted(ids)
    
    # The blinded hold-out set must never change between runs.
    blinded_hold_out_set_path = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'blinded_hold_out_set_ids.pkl')
    if os.path.exists(blinded_hold_out_set_path):
        with open(blinded_hold_out_set_path, 'rb') as cached_file:
            previously_cached_hold_out_set_ids = pickle.load(cached_file)
        assert results['BLINDED_HOLD_OUT_DO_NOT_USE_ids'] == previously_cached_hold_out_set_ids
    else:
        with open(blinded_hold_out_set_path, 'wb') as cached_file:
            pickle.dump(results['BLINDED_HOLD_OUT_DO_NOT_USE_ids'], cached_file)
    for k in results:
        print("Number of ids in %s set: %i" % (k.replace('_ids', ''), len(results[k])))
    return results
348 | 
def copy_data_from_hyperion_to_turing():
    """
    Copy both processed image data folders from hyperion3 to turing2 via scp.
    Must be run on the node named 'turing2' (asserted).
    """
    assert node_name == 'turing2'
    scp_commands = (
        'scp -r emmap1@hyperion3:/lfs/hyperion3/0/emmap1/oai_image_data/processed_image_data/* /lfs/turing2/0/emmap1/oai_image_data/processed_image_data/',
        'scp -r emmap1@hyperion3:/lfs/hyperion3/0/emmap1/oai_image_data/processed_image_data_new_with_held_out_test_set_april_2019/* /lfs/turing2/0/emmap1/oai_image_data/processed_image_data_new_with_held_out_test_set_april_2019/',
    )
    # Run the copies one after the other; exit statuses are not inspected.
    for scp_command in scp_commands:
        os.system(scp_command)
    print("Successfully copied images")
354 | 
def copy_data_from_dfs_to_hyperion(substrings_to_copy, datasets_to_copy=['test', 'val', 'train', 'BLINDED_HOLD_OUT_DO_NOT_USE']):
    """
    Move processed image data from DFS to hyperion because it loads way faster. 

    IMPORTANT: this function is deprecated and raises an Exception on its first statement,
    so nothing below that line runs. A second guard further down exists because the
    remaining code deletes files.

    Parameters:
    substrings_to_copy: a DFS folder is copied only if its name contains at least one of these substrings
        (folders whose name contains 'random_seed' are always skipped).
    datasets_to_copy: names of the dataset folders to process. The default is a list that the code
        never mutates.
    """
    # Deprecation guard: everything after this line is currently unreachable.
    raise Exception("This is deprecated. You should update so it works with either turing or hyperion")
    original_dfs_folders = None
    assert USE_NEW_DATA_ON_HYPERION_WITH_HELD_OUT_TEST_SET

    # Second guard: the loop below runs "rm -rf" on the destination folders.
    raise Exception("Do not use this method lightly! It deletes files! Remove this exception if you really want to use it.")

    for dataset in datasets_to_copy:
        # Empty the destination folder for this dataset before copying.
        print("Removing data from %s" % os.path.join(HYPERION_BASE_IMAGES_PATH, dataset))
        print('rm -rf %s/*' % os.path.join(HYPERION_BASE_IMAGES_PATH, dataset))
        os.system('rm -rf %s/*' % os.path.join(HYPERION_BASE_IMAGES_PATH, dataset))
        dfs_folders = sorted(os.listdir(os.path.join(DFS_BASE_IMAGES_PATH, dataset)))
        

        # Keep folders matching any requested substring, excluding the 'random_seed' ones.
        dfs_folders = [a for a in dfs_folders 
                       if any([substring_to_copy in a for substring_to_copy in substrings_to_copy])
                       and 'random_seed' not in a]

        # Every dataset must yield the same list of folders as the first dataset processed.
        if original_dfs_folders is not None:
            assert original_dfs_folders == dfs_folders
        else:
            original_dfs_folders = dfs_folders

        for dfs_folder in dfs_folders:
            original_full_path = os.path.join(DFS_BASE_IMAGES_PATH, dataset, dfs_folder)
            new_full_path = os.path.join(HYPERION_BASE_IMAGES_PATH, dataset, dfs_folder)
            cmd = 'cp -r %s/ %s/' % (original_full_path, new_full_path)
            print(cmd)
            t0 = time.time()
            # communicate() blocks until the copy finishes, so the timing below covers the whole copy.
            p1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) # https://stackoverflow.com/a/38956698/9477154
            p1.communicate()
            print("Command %s completed in %2.3f seconds" % (cmd, time.time() - t0))
    print("Successfully completed all copying.")
391 | 
def get_combined_dataframe(non_image_dataset, clinical_assessments):
    """
    Returns a combined dataframe: clinical_assessments merged with the knee pain scores and
    with every other dataframe in non_image_dataset.processed_dataframes.

    non_image_dataset: object with a processed_dataframes dict (name -> dataframe). It must contain
        'all_knee_pain_scores'; the entries 'kxr_sq_bu' and 'all_knee_pain_scores' are skipped in the
        control-dataframe loop.
    clinical_assessments: the original dataframe that selects the subset of rows.
        Each row should have a unique id, visit, and side.

    Rows missing socioeconomic status or age/race/sex data are dropped before returning.

    Side effects: prints progress and a per-column missing-data report, and sets the pandas
    option display.max_rows to 500.
    """
    combined_data = copy.deepcopy(clinical_assessments)
    print("Number of datapoints with clinical assessments: %i" % len(combined_data))
    # merge with pain scores. 
    combined_data = pd.merge(combined_data, 
                           non_image_dataset.processed_dataframes['all_knee_pain_scores'], 
                           how='inner', 
                           on=['id', 'visit', 'side'])
    # Every remaining row must have both pain subscores.
    assert len(combined_data.dropna(subset=['koos_pain_subscore', 'womac_pain_subscore'])) == len(combined_data)
    old_len = len(combined_data)

    # Now merge with a lot of control dataframes. 
    original_order = copy.deepcopy(combined_data[['id', 'visit', 'side']]) # debugging sanity check: sometimes the merge changes the join order. 
    for control_dataframe in sorted(list(non_image_dataset.processed_dataframes.keys())):
        if control_dataframe in ['kxr_sq_bu', 'all_knee_pain_scores']:
            continue
        df_to_merge_with = copy.deepcopy(non_image_dataset.processed_dataframes[control_dataframe])
        # Merge on whichever of the key columns this control dataframe actually has.
        cols_to_merge_on = [a for a in ['id', 'visit', 'side'] if a in df_to_merge_with.columns] # changing this doesn't make a difference.         
        if control_dataframe == 'david_mri_data':
            join_type = 'left' # we are lacking rows for some images for MRI data, and we don't want to cut these out. 
        else:
            join_type = 'inner'

        print("Performing a %s join with %s using columns %s" % (join_type, control_dataframe, cols_to_merge_on))
        combined_data = pd.merge(combined_data, 
                             df_to_merge_with, 
                             how=join_type, 
                             on=cols_to_merge_on)
        # A merge must neither drop nor duplicate rows.
        assert len(combined_data) == old_len
        assert len(combined_data[['id', 'visit', 'side']].drop_duplicates()) == len(combined_data)
        if not original_order.equals(combined_data[['id', 'visit', 'side']]):
            print("Alert! Order of dataframe changed after merge. Old order:")
            print(original_order.head())
            print("new order:")
            print(combined_data[['id', 'visit', 'side']].head())
            original_order = copy.deepcopy(combined_data[['id', 'visit', 'side']])

    # Use the fully qualified option name: the abbreviated pattern 'max_rows' matches more than
    # one option on recent pandas versions, which makes set_option raise OptionError.
    pd.set_option('display.max_rows', 500)
    print("Prior to dropping people missing socioeconomic status data, %i rows" % len(combined_data))
    combined_data = combined_data.dropna(
        subset=['binarized_education_graduated_college', 'binarized_income_at_least_50k'])
    print("After dropping people missing socioeconomic status data, %i rows" % len(combined_data))
    combined_data = combined_data.dropna(subset=['p02hisp', 'p02race', 'p02sex', 'age_at_visit'])
    print("After dropping people missing age/race/sex data, %i rows" % len(combined_data))

    # Report the fraction of missing values in each column, most-missing first.
    missing_data_fracs_by_col = [
        {'col':c, 'missing_data':pd.isnull(combined_data[c]).mean()}
        for c in combined_data.columns]

    missing_data_fracs_by_col = pd.DataFrame(missing_data_fracs_by_col) 
    print(missing_data_fracs_by_col.sort_values(by='missing_data')[::-1])

    return combined_data
455 | 
def find_image_barcodes_that_pass_qc(non_image_dataset):
    """
    Collect the barcodes of all x-ray images which pass QC, across every dataframe in
    non_image_dataset.original_dataframes whose name contains 'xray'.
    Note: this returns the LONG barcodes (12 characters), as a set of strings.
    """
    accepted_qc_values = ["'Y': QCd and found to be acceptable", 
                          "'YD': Not QCd and accepted by default"]
    # P is a very rare value
    recognized_qc_values = accepted_qc_values + ["'NR': QCd unacceptable, chosen for release", "'NA': QCd unacceptable, no better available", "P"]

    barcodes_so_far = set()
    for dataframe_name in sorted(non_image_dataset.original_dataframes):
        if 'xray' not in dataframe_name:
            continue
        visit_id = dataframe_name.replace('xray', '')
        assert visit_id in CLINICAL_WAVES_TO_FOLLOWUP
        xray_df = copy.deepcopy(non_image_dataset.original_dataframes[dataframe_name])
        accept_column = 'v%saccept' % visit_id

        number_of_p_values = (xray_df[accept_column] == 'P').sum()
        assert number_of_p_values < 10
        print("Warning: %i values in xray dataset %s are P, a value which should occur rarely" % (number_of_p_values, dataframe_name))

        # Every non-missing QC value must be one we know about.
        assert xray_df[accept_column].dropna().map(lambda x: x in recognized_qc_values).all()
        row_passed_qc = xray_df[accept_column].map(lambda x: x in accepted_qc_values)
        visit_barcodes = xray_df['v%sxrbarcd' % visit_id].loc[row_passed_qc].values

        # Barcodes must be unique within a visit and must not repeat across visits.
        unique_visit_barcodes = set(visit_barcodes)
        assert len(unique_visit_barcodes) == len(visit_barcodes)
        assert unique_visit_barcodes.isdisjoint(barcodes_so_far)
        barcodes_so_far = barcodes_so_far | unique_visit_barcodes

    # first digit is truncated; it's a 0 -- so we add it back in. 
    long_barcodes = ['0' + str(int(short_barcode)) for short_barcode in barcodes_so_far]
    assert all([len(long_barcode) == 12 for long_barcode in long_barcodes])
    return set(long_barcodes)
482 | 
def ensure_barcodes_match(combined_df, image_codes):
    """
    Sanity check: make sure non-image data matches image data. 

    For every row position, the code "<barcode>*<side>" built from the dataframe's 'barcdbu' and
    'side' columns must equal the entry of image_codes at the same position. An 11-character
    barcode gets a leading '0' prepended first. Raises Exception on the first mismatch and
    AssertionError if the lengths differ.
    """
    print("Ensuring that barcodes line up.")
    number_of_rows = len(combined_df)
    assert number_of_rows == len(image_codes)
    for position, expected_code in enumerate(image_codes):
        row = combined_df.iloc[position]
        barcode_text = str(row['barcdbu'])
        # Restore the leading zero if it has been lost.
        barcode_text = '0' + barcode_text if len(barcode_text) == 11 else barcode_text
        code_from_row = '*'.join([barcode_text, str(row['side'])])

        if expected_code != code_from_row:
            raise Exception("Barcode mismatch at index %i, %s != %s" % (position, expected_code, code_from_row))
    print("All %i barcodes line up." % number_of_rows)
499 | 
def match_image_dataset_to_non_image_dataset(image_dataset, non_image_dataset, swap_left_and_right=False):
    """
    Pair up clinical ratings with knee images.

    Given an image dataset + a non-image dataset, returns a tuple of
    a) a dataframe of clinical ratings (only rows which have a matched image),
    b) a list of images, one per row of that dataframe, and
    c) a list of "<barcode>*<side>" codes, one per row of that dataframe.
    There should be no missing data in any of them.

    swap_left_and_right: if True, the 'left_knee' image is matched to the "*right" key and
    the 'right_knee' image to the "*left" key.
    """

    # Filter for clinical assessments for images that pass QC.
    clinical_assessments = copy.deepcopy(non_image_dataset.processed_dataframes['kxr_sq_bu'])
    assert clinical_assessments['barcdbu'].map(lambda x: len(x) == 12).all()
    print(clinical_assessments.head())
    print("Prior to filtering for images that pass QC, %i images" % len(clinical_assessments))
    acceptable_barcodes = find_image_barcodes_that_pass_qc(non_image_dataset)
    barcode_is_acceptable = clinical_assessments['barcdbu'].map(lambda x: x in acceptable_barcodes)
    clinical_assessments = clinical_assessments.loc[barcode_is_acceptable]
    # this doesn't filter out a lot of clinical assessments, even though a lot of values in the xray01 etc
    # datasets are NA, because those values are already filtered out of the kxr_sq_bu -- you can't assign
    # image scores to an image which isn't available.
    print("After filtering for images that pass QC, %i images" % len(clinical_assessments))

    combined_df = get_combined_dataframe(non_image_dataset, clinical_assessments)

    # Map each "<barcode>*<side>" key to the position of its row in combined_df.
    row_keys = list(combined_df['barcdbu'].map(str) + '*' + combined_df['side'])
    key_to_row_position = {row_key: row_position for row_position, row_key in enumerate(row_keys)}

    number_of_rows = len(combined_df)
    matched_images = [None] * number_of_rows
    image_codes = [None] * number_of_rows

    number_of_images = len(image_dataset.images)
    for image_number in range(number_of_images):
        if image_number % 1000 == 0:
            print('Image %i/%i' % (image_number, number_of_images))
        image = image_dataset.images[image_number]
        barcode_text = str(image['barcode'])
        if swap_left_and_right:
            left_key, right_key = barcode_text + '*right', barcode_text + '*left'
        else:
            left_key, right_key = barcode_text + '*left', barcode_text + '*right'

        # Left knee first, then right knee; each row may be filled at most once.
        for lookup_key, knee_field in ((left_key, 'left_knee'), (right_key, 'right_knee')):
            if lookup_key not in key_to_row_position:
                continue
            row_position = key_to_row_position[lookup_key]
            assert matched_images[row_position] is None
            matched_images[row_position] = image[knee_field].copy()
            image_codes[row_position] = lookup_key

    combined_df['has_matched_image'] = [matched is not None for matched in matched_images]
    print("Fraction of clinical x-ray ratings with matched images")
    print(combined_df[['has_matched_image', 'visit', 'side']].groupby(['visit', 'side']).agg(['mean', 'sum']))

    # Keep only the rows (and corresponding images / codes) which found an image.
    idxs_to_keep = [
        row_position
        for row_position, has_image in enumerate(combined_df['has_matched_image'].values)
        if has_image
    ]
    combined_df = combined_df.iloc[idxs_to_keep]
    combined_df.index = range(len(combined_df))
    matched_images = [matched_images[row_position] for row_position in idxs_to_keep]
    image_codes = [image_codes[row_position] for row_position in idxs_to_keep]

    ensure_barcodes_match(combined_df, image_codes)
    print("Total number of images matched to clinical ratings: %i" % len(matched_images))
    assert all([matched is not None for matched in matched_images])
    assert combined_df['has_matched_image'].all()
    return combined_df, matched_images, image_codes
559 | 
560 | 
561 | 
562 | 
563 | 


--------------------------------------------------------------------------------
/image_processing.py:
--------------------------------------------------------------------------------
   1 | import os
   2 | 
   3 | import time
   4 | from traceback import print_exc
   5 | import cv2
   6 | import numpy as np
   7 | from constants_and_util import *
   8 | import matplotlib.pyplot as plt
   9 | 
  10 | import non_image_data_processing
  11 | from scipy.stats import spearmanr
  12 | from traceback import print_exc
  13 | import random
  14 | import pickle
  15 | from sklearn.linear_model import Lasso
  16 | import seaborn as sns
  17 | import datetime
  18 | import sys
  19 | import statsmodels.api as sm
  20 | from scipy.ndimage.filters import gaussian_filter
  21 | import gc
  22 | 
  23 | import torch
  24 | import pydicom
  25 | from pydicom.data import get_testdata_files
  26 | from torchvision import datasets, models, transforms
  27 | import torchsummary
  28 | import torch.nn as nn
  29 | from torch.utils.data import Dataset, DataLoader
  30 | from PIL import Image
  31 | sys.path.append('KneeLocalizer/oulukneeloc/')
  32 | from detector import KneeLocalizer
  33 | 
  34 | 
  35 | 
  36 | def get_directories(path):
  37 |     """ 
  38 |     Small helper method: list the directories along a given path.
  39 |     Checked. 
  40 |     """
  41 |     return sorted([a for a in os.listdir(path) if os.path.isdir(os.path.join(path, a))])
  42 | 
  43 | def is_valid_date(s):
  44 |     """
  45 |     asserts that s is in fact a date. 
  46 |     Should be an 8-character string in yyyymmdd format. 
  47 |     Checked. 
  48 |     """
  49 |     if not len(s) == 8:
  50 |         print(s)
  51 |         return False
  52 |     year = int(s[:4])
  53 |     month = int(s[4:6])
  54 |     day = int(s[6:])
  55 |     try:
  56 |         datetime.datetime(year, month, day)
  57 |     except:
  58 |         print_exc()
  59 |         return False
  60 |     if year > 2017:
  61 |         return False
  62 |     return True
  63 | 
  64 | class XRayImageDataset:
  65 |     """
  66 |     Class for loading data.
  67 |     """
  68 |     def __init__(self, 
  69 |         desired_image_type, 
  70 |         normalization_method, 
  71 |         reprocess_all_images,
  72 |         show_both_knees_in_each_image,
  73 |         crop_to_just_the_knee,
  74 |         make_plot=False,
  75 |         max_images_to_load=None, 
  76 |         use_small_data=False, 
  77 |         downsample_factor_on_reload=None):
  78 |         """
  79 |         Creates the dataset. 
  80 |         desired_image_type: the type of x-ray part you want (for example, Bilateral PA Fixed Flexion Knee)
  81 |         normalization_method: specifies how to z-score each image. 
  82 |         reprocess_all_images: whether to rerun the whole pipeline or just load the processed pkl. 
  83 |         make_plot: whether to plot a random sample of images. 
  84 |         max_images_to_load: set to a small number to test. 
  85 |         downsample_factor_on_reload: how much we further downsample the (already downsampled) images when we reload them. This is a little messy. 
  86 |         Originally we save the images as 1024x1024; we can further downsample them. 
  87 | 
  88 |         The pipeline, then, is: 
  89 |         (before saving)
  90 |         1. Load all diacom images and downsample each image. 
  91 |         2. Scale each image to 0-1 and compute statistics of dataset. 
  92 |         3. Save all the images. 
  93 |         (after reloading saved images)
  94 |         4. Cut each image in half (or flip, or crop)
  95 |         5. If desired, further downsample each image. 
  96 |         6. Turn each image into RGB (ie, give it three channels) and normalize the images (z-score etc). 
  97 |         """
  98 | 
  99 |         self.images = []
 100 |         self.desired_image_type = desired_image_type
 101 |         assert self.desired_image_type == 'Bilateral PA Fixed Flexion Knee' # not sure pipeline will work for other body parts 
 102 |         self.normalization_method = normalization_method
 103 |         self.make_plot = make_plot
 104 |         self.reprocess_all_images = reprocess_all_images
 105 |         self.downsample_factor_on_reload = downsample_factor_on_reload
 106 |         self.show_both_knees_in_each_image = show_both_knees_in_each_image
 107 |         self.knee_localizer = KneeLocalizer()
 108 |         self.crop_to_just_the_knee = crop_to_just_the_knee
 109 | 
 110 |         if use_small_data:
 111 |             self.processed_image_path = os.path.join(BASE_IMAGE_DATA_DIR, 'processed_image_data', 'small_data.pkl')
 112 |         else:
 113 |             self.processed_image_path = os.path.join(BASE_IMAGE_DATA_DIR, 'processed_image_data', 'data.pkl')
 114 | 
 115 |         self.extra_margin_for_each_image = 1.1 # how much extra margin to give the left/right images. 
 116 | 
 117 |         if max_images_to_load is not None:
 118 |             self.max_images_to_load = max_images_to_load
 119 |         else:
 120 |             self.max_images_to_load = 99999999999
 121 |         if self.reprocess_all_images:
 122 |             print("Reprocessing all images from scratch")
 123 |             self.load_all_images() # load images into numpy arrays from dicom
 124 | 
 125 |             # put images on 0-1 scale. Do this separately for the cropped knee images and the full images. 
 126 |             # Note: it is important to do this for cropped knees separately because they are not on the same scale. 
 127 |             # The external package that we uses loads them as 8-bit rather than 16-bit or something. 
 128 |             self.diacom_image_statistics = {}
 129 |             self.compute_dataset_image_statistics_and_divide_by_max(just_normalize_cropped_knees=False)
 130 |             self.compute_dataset_image_statistics_and_divide_by_max(just_normalize_cropped_knees=True)
 131 | 
 132 |             for i in range(len(self.images)):
 133 |                 # don't save extra images
 134 |                 self.images[i]['unnormalized_image_array'] = None
 135 |             print("Number of images: %i" % len(self.images))
 136 |             pickle.dump({'images':self.images, 'image_statistics':self.diacom_image_statistics}, open(self.processed_image_path, 'wb'))
 137 |             print("Successfully processed and saved images")
 138 |         else:
 139 |             print("loading images from %s" % self.processed_image_path)
 140 |             reloaded_data = pickle.load(open(self.processed_image_path, 'rb'))
 141 |             self.images = reloaded_data['images']
 142 |             if not self.crop_to_just_the_knee:
 143 |                 if not self.show_both_knees_in_each_image:
 144 |                     self.cut_images_in_two() # cut into left + right images. 
 145 |                 else:
 146 |                     self.flip_right_images() # if you want both knees in one image, flip the right images so knees are on same side. 
 147 |             else:
 148 |                 for i in range(len(self.images)):
 149 |                     assert self.images[i]['cropped_left_knee'].max() <= 1
 150 |                     assert self.images[i]['cropped_right_knee'].max() <= 1
 151 |                     assert self.images[i]['cropped_left_knee'].min() >= 0
 152 |                     assert self.images[i]['cropped_right_knee'].min() >= 0
 153 | 
 154 |                     self.images[i]['left_knee_scaled_to_zero_one'] = self.images[i]['cropped_left_knee'].copy()
 155 |                     self.images[i]['right_knee_scaled_to_zero_one'] = self.images[i]['cropped_right_knee'][:, ::-1].copy()
 156 |                     self.images[i]['cropped_left_knee'] = None
 157 |                     self.images[i]['cropped_right_knee'] = None
 158 | 
 159 | 
 160 |             if self.downsample_factor_on_reload is not None:
 161 |                 for i in range(len(self.images)):
 162 |                     for side in ['left', 'right']:
 163 |                         orig_shape = self.images[i]['%s_knee_scaled_to_zero_one' % side].shape
 164 |                         assert len(orig_shape) == 2
 165 |                         new_shape = (int(orig_shape[0] * self.downsample_factor_on_reload), 
 166 |                             int(orig_shape[1] * self.downsample_factor_on_reload))
 167 | 
 168 |                         # https://stackoverflow.com/questions/21248245/opencv-image-resize-flips-dimensions
 169 |                         # confusing: open cv resize flips image dimensions, so if image is not a square we have to flip the shape we want. 
 170 |                         new_shape = new_shape[::-1] 
 171 |                         self.images[i]['%s_knee_scaled_to_zero_one' % side] = cv2.resize(self.images[i]['%s_knee_scaled_to_zero_one' % side],
 172 |                          dsize=tuple(new_shape))
 173 |             self.diacom_image_statistics = reloaded_data['image_statistics']
 174 |             print("Image statistics are", reloaded_data['image_statistics'])
 175 |             self.make_images_RGB_and_zscore() # z-score. The reason we do this AFTER processing is that we don't want to save the image 3x. 
 176 |             #self.plot_pipeline_examples(25) # make sanity check plots
 177 |             print("Successfully loaded %i images" % len(self.images))
 178 | 
 179 |     def crop_to_knee(self, dicom_image_path):
 180 |         results = self.knee_localizer.predict(dicom_image_path)
 181 |         if results is None:
 182 |             print("Warning: was not able to identify bounding boxes for this knee.")
 183 |             return None, None
 184 |         bounding_boxes, image = results
 185 |         l_bounding_box, r_bounding_box = bounding_boxes
 186 |         # IMPORTANT AND CONFUSING: THE IMAGE ON THE LEFT IS THE RIGHT KNEE.
 187 |         # Per email: "Confusingly, the knee on the right of the image is the patient's left knee."
 188 |         assert l_bounding_box[0] > r_bounding_box[0]
 189 |         assert l_bounding_box[2] > r_bounding_box[2]
 190 |         left_knee = image[l_bounding_box[1]:l_bounding_box[3], l_bounding_box[0]:l_bounding_box[2]]
 191 |         right_knee = image[r_bounding_box[1]:r_bounding_box[3], r_bounding_box[0]:r_bounding_box[2]] 
 192 |         print("Size of left knee prior to resizing is", left_knee.shape)
 193 |         print("Size of right knee prior to resizing is", right_knee.shape)
 194 |         if min(left_knee.shape) == 0 or min(right_knee.shape) == 0:
 195 |             print("Warning: was not able to identify bounding boxes for this knee.")
 196 |             return None, None
 197 | 
 198 |         left_knee = self.resize_image(left_knee, CROPPED_KNEE_RESAMPLED_IMAGE_SIZE)
 199 |         right_knee = self.resize_image(right_knee, CROPPED_KNEE_RESAMPLED_IMAGE_SIZE)
 200 | 
 201 |         print("Size of left knee after resizing is", left_knee.shape)
 202 |         print("Size of right knee after resizing is", right_knee.shape)
 203 |         return left_knee, right_knee
 204 |         
 205 |     def load_all_images(self):
 206 |         """
 207 |         loop over the nested subfolders + load images. 
 208 |         """
 209 |         for timepoint_dir in get_directories(BASE_IMAGE_DATA_DIR):
 210 |             if timepoint_dir not in IMAGE_TIMEPOINT_DIRS_TO_FOLLOWUP:
 211 |                 continue
 212 |             # confirmed that this set of directories is consistent with website that provides information about data. 
 213 |             base_dir_for_timepoint = os.path.join(BASE_IMAGE_DATA_DIR, timepoint_dir)
 214 |             # for some reason some directories are nested -- /dfs/dataset/tmp/20180910-OAI/data/48m/48m/48m -- 
 215 |             while timepoint_dir in get_directories(base_dir_for_timepoint):
 216 |                 print("%s directory is found in %s; concatenating and looking in the nested directory" % (timepoint_dir, base_dir_for_timepoint))
 217 |                 base_dir_for_timepoint = os.path.join(base_dir_for_timepoint, timepoint_dir)
 218 |             for cohort_folder in get_directories(base_dir_for_timepoint):
 219 |                 # A value of "C" for letter [X] indicates that the images are from participants are in the initial 2686 participants in Group C of the OAI cohort, 
 220 |                 # and a value of "E" represents the remaining 2110 participants from the cohort.
 221 |                 print(cohort_folder)
 222 |                 if timepoint_dir in ['18m']:
 223 |                     assert cohort_folder.split('.')[1] in ['D']
 224 |                     assert len(get_directories(base_dir_for_timepoint)) == 1
 225 |                 elif timepoint_dir in ['30m']:
 226 |                     assert cohort_folder.split('.')[1] in ['G']
 227 |                     assert len(get_directories(base_dir_for_timepoint)) == 1
 228 |                 else:
 229 |                     assert cohort_folder.split('.')[1] in ['C', 'E']
 230 |                     assert len(get_directories(base_dir_for_timepoint)) == 2
 231 |                 participants = get_directories(os.path.join(base_dir_for_timepoint, 
 232 |                                                             cohort_folder))
 233 |                 for participant in participants:
 234 |                     participant_path = os.path.join(base_dir_for_timepoint, 
 235 |                                                cohort_folder, 
 236 |                                                participant)
 237 |                     dates = get_directories(participant_path)
 238 |                     # Each individual participant's folder contains subfolders for each date on which a participant had images 
 239 |                     # (format of folder name is yyyymmdd).
 240 |                     for date in dates:
 241 |                         assert is_valid_date(date)
 242 |                         date_path = os.path.join(base_dir_for_timepoint, 
 243 |                                                cohort_folder, 
 244 |                                                participant, 
 245 |                                                date)
 246 |                         # There is one more level of sub- folders below this level: 
 247 |                         # one sub-folder for each image series acquired on that date. 
 248 |                         # These sub-folders have unique 8-digit identifiers that are assigned 
 249 |                         # to the image series in the central OAI imaging database maintained 
 250 |                         # at Synarc, Inc. 
 251 |                         # If the 8-digit identifier begins with 0 then the folder contains x-ray images, 
 252 |                         # and if it starts with 1, then the folder contains MR images.
 253 |                         all_image_series = get_directories(date_path)
 254 |                         assert all([a[0] in ['0', '1'] for a in all_image_series])
 255 |                         for image_series in all_image_series:
 256 |                             is_xray = image_series[0] == '0'
 257 |                             image_series_dir = os.path.join(date_path, 
 258 |                                                     image_series)
 259 |                             if is_xray:
 260 |                                 if len(self.images) >= self.max_images_to_load:
 261 |                                     print("Loaded the maximum number of images: %i" % len(self.images))
 262 |                                     return
 263 |                                 assert os.listdir(image_series_dir) == ['001']
 264 |                                 image_path = os.path.join(image_series_dir, '001')
 265 |                                 diacom_image = self.load_diacom_file(image_path, 
 266 |                                     desired_image_type=self.desired_image_type)
 267 | 
 268 |                                 
 269 |                                 if diacom_image is not None:
 270 |                                     cropped_left_knee, cropped_right_knee = self.crop_to_knee(image_path)
 271 |                                     if (cropped_left_knee is None) or (cropped_right_knee is None):
 272 |                                         print("Warning: unable to crop knee image.")
 273 | 
 274 |                                     image_array = self.get_resized_pixel_array_from_dicom_image(diacom_image)
 275 |                                     self.images.append({'timepoint_dir':timepoint_dir, 
 276 |                                         'full_path':image_path,
 277 |                                         'cohort_folder':cohort_folder, 
 278 |                                         'visit':diacom_image.ClinicalTrialTimePointDescription,
 279 |                                         'id':int(participant), 
 280 |                                         'date':date, 
 281 |                                         'image_series':image_series, 
 282 |                                         'body_part':diacom_image.BodyPartExamined, 
 283 |                                         'series_description':diacom_image.SeriesDescription,
 284 |                                         'unnormalized_image_array':image_array, 
 285 |                                         'cropped_left_knee':cropped_left_knee, 
 286 |                                         'cropped_right_knee':cropped_right_knee,
 287 |                                         # Users may also want to identify the specific image that was assessed to generate the data for an anatomic site and time point and merge the image assessment data with meta-data about that image (please see Appendix D for example SAS code). Individual images (radiographs, MRI series) are identified by a unique barcode. The barcode is recorded in the AccessionNumber in the DICOM header of the image.
 288 |                                         'barcode':diacom_image.AccessionNumber
 289 |                                         })
 290 |     def plot_pipeline_examples(self, n_examples):
 291 |         """
 292 |         plot n_examples random images to make sure pipeline looks ok. 
 293 |         Checked. 
 294 |         """
 295 |         print("Plotting pipeline examples")
 296 |         for i in range(n_examples):
 297 |             idx = random.choice(range(len(self.images)))
 298 |             plt.figure(figsize=[15, 5])
 299 | 
 300 |             original_diacom_image = self.load_diacom_file(self.images[idx]['full_path'], self.images[idx]['series_description'])
 301 |             plt.subplot(131)
 302 |             plt.imshow(original_diacom_image.pixel_array, cmap='bone')
 303 |             plt.colorbar()
 304 |             
 305 |             zscore_range = 2
 306 |             plt.subplot(132)
 307 |             plt.imshow(self.images[idx]['left_knee'][0, :, :], cmap='bone', clim=[-zscore_range, zscore_range])
 308 |             plt.title("Left knee")
 309 |             plt.colorbar()
 310 | 
 311 |             plt.subplot(133)
 312 |             plt.imshow(self.images[idx]['right_knee'][0, :, :], cmap='bone', clim=[-zscore_range, zscore_range])
 313 |             plt.title("Right knee")
 314 |             plt.colorbar()
 315 | 
 316 |             plt.subplots_adjust(wspace=.3, hspace=.3)
 317 |             plt.savefig('example_images/pipeline_example_%i.png' % i, dpi=300)
 318 |             plt.show()
 319 |     
 320 |     def cut_image_in_half(self, image_arr):
 321 |         """
 322 |         Cut the image into left + right knees. 
 323 |         Checked. 
 324 |         """
 325 |         half_image = RESAMPLED_IMAGE_SIZE[1] / 2.
 326 |        
 327 |         border_of_image_on_the_left = int(half_image * self.extra_margin_for_each_image)
 328 |         border_of_image_on_the_right = RESAMPLED_IMAGE_SIZE[1] - int(half_image * self.extra_margin_for_each_image)
 329 | 
 330 |         image_on_the_left = image_arr[:, :border_of_image_on_the_left].copy()
 331 |         image_on_the_right = image_arr[:, border_of_image_on_the_right:].copy()
 332 | 
 333 |         # flip left image so symmetric
 334 |         image_on_the_left = image_on_the_left[:, ::-1]
 335 |         assert image_on_the_left.shape == image_on_the_right.shape
 336 | 
 337 |         # IMPORTANT AND CONFUSING: THE IMAGE ON THE LEFT IS THE RIGHT KNEE.
 338 |         # Per email: "Confusingly, the knee on the right of the image is the patient's left knee."
 339 |         right_knee = image_on_the_left
 340 |         left_knee = image_on_the_right
 341 | 
 342 |         return left_knee, right_knee
 343 | 
 344 |     def cut_images_in_two(self):
 345 |         """
 346 |         Loop over all images and cut each in two. 
 347 |         """
 348 |         for i in range(len(self.images)):
 349 |             self.images[i]['left_knee_scaled_to_zero_one'], self.images[i]['right_knee_scaled_to_zero_one'] = self.cut_image_in_half(self.images[i]['image_array_scaled_to_zero_one'])
 350 |             self.images[i]['image_array_scaled_to_zero_one'] = None
 351 | 
 352 |     def flip_right_images(self):
 353 |         for i in range(len(self.images)):
 354 |             self.images[i]['left_knee_scaled_to_zero_one'] = self.images[i]['image_array_scaled_to_zero_one'].copy()
 355 |             self.images[i]['right_knee_scaled_to_zero_one'] = self.images[i]['image_array_scaled_to_zero_one'][:, ::-1].copy()
 356 |             self.images[i]['image_array_scaled_to_zero_one'] = None
 357 | 
 358 | 
 359 |     def resize_image(self, original_array, new_size):
 360 |         """
 361 |         resample the image to new_size. Checked. 
 362 |         """
 363 |         assert len(original_array.shape) == 2
 364 |         print("Resizing image from %s to %s" % (original_array.shape, new_size))
 365 |         new_array = cv2.resize(original_array, dsize=tuple(new_size), interpolation=cv2.INTER_CUBIC)
 366 |         return new_array
 367 | 
 368 |     def load_diacom_file(self, filename, desired_image_type):
 369 |         """
 370 |         load a matplotlib array from the pydicom file filename. Checked. 
 371 |         Drawn heavily from this documentation example: 
 372 |         https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom.html#sphx-glr-auto-examples-input-output-plot-read-dicom-py
 373 |         """
 374 |         dataset = pydicom.dcmread(filename)
 375 |         
 376 |         if dataset.SeriesDescription != desired_image_type:
 377 |             return None
 378 | 
 379 |         print("Image %i" % len(self.images))
 380 |         print("Filename.........:", filename)
 381 |         pat_name = dataset.PatientName
 382 |         display_name = pat_name.family_name + ", " + pat_name.given_name
 383 |         print("Patient's name...: %s" % display_name)
 384 |         print("Patient id.......: %s" % dataset.PatientID)
 385 |         print("Modality.........: %s" % dataset.Modality)
 386 |         print("Study Date.......: %s" % dataset.StudyDate)
 387 |         print("Body part examined: %s" % dataset.BodyPartExamined)
 388 |         print("Series description: %s" % dataset.SeriesDescription) # eg, Bilateral PA Fixed Flexion Knee
 389 |         print("Accession number: %s" % dataset.AccessionNumber) # this is the barcode. 
 390 |         print("ClinicalTrialTimePointDescription: %s" % dataset.ClinicalTrialTimePointDescription)
 391 |         print("ClinicalTrialTimePointID: %s" % dataset.ClinicalTrialTimePointID)
 392 | 
 393 |         if 'PixelData' in dataset:
 394 |             rows = int(dataset.Rows)
 395 |             cols = int(dataset.Columns)
 396 |             print("Image size.......: {rows:d} x {cols:d}, {size:d} bytes".format(
 397 |                 rows=rows, cols=cols, size=len(dataset.PixelData)))
 398 |             if 'PixelSpacing' in dataset:
 399 |                 print("Pixel spacing....:", dataset.PixelSpacing)
 400 | 
 401 |         return dataset
 402 | 
 403 |     def get_resized_pixel_array_from_dicom_image(self, diacom_image):
 404 |         """
 405 |         Extract pydicom pixel array and resize. Checked. 
 406 |         Per documentation, "The pixel_array property returns a NumPy array"
 407 |         """
 408 |         arr = self.resize_image(diacom_image.pixel_array, RESAMPLED_IMAGE_SIZE) * 1.0
 409 |         assert len(arr.shape) == 2
 410 |         return arr
 411 | 
 412 |     def compute_dataset_image_statistics_and_divide_by_max(self, just_normalize_cropped_knees):
 413 |         """
 414 |         Put images into the zero-one range by dividing by the maximum value. 
 415 |         Also compute statistics of the images: mean and std. 
 416 | 
 417 |         Note: it is important to do this for cropped knees separately because they are not on the same scale. 
 418 |         The external package that we uses loads them as 8-bit rather than 16-bit or something. 
 419 | 
 420 |         Checked. 
 421 |         """
 422 |         print("\n\nNow computing overall dataset statistics")
 423 |         print("Just analyze cropped knees: %s" % just_normalize_cropped_knees)
 424 | 
 425 |         all_pixel_arrays = []
 426 |         for i in range(len(self.images)):
 427 |             if just_normalize_cropped_knees:
 428 |                 if self.images[i]['cropped_right_knee'] is not None:
 429 |                     all_pixel_arrays.append(self.images[i]['cropped_right_knee'])
 430 |                     all_pixel_arrays.append(self.images[i]['cropped_left_knee'])
 431 |             else:
 432 |                 all_pixel_arrays.append(self.images[i]['unnormalized_image_array'])
 433 |                 
 434 |         all_pixel_arrays = np.array(all_pixel_arrays)
 435 |         arr_max =  np.max(all_pixel_arrays)
 436 |         assert np.min(all_pixel_arrays) >= 0
 437 |         
 438 |         if just_normalize_cropped_knees:
 439 |             suffix = 'cropped_knee_only'
 440 |         else:
 441 |             suffix = 'full_image'
 442 | 
 443 |         self.diacom_image_statistics['max_%s' % suffix] = 1.0*arr_max
 444 | 
 445 |         for i in range(len(self.images)):
 446 |             if just_normalize_cropped_knees:
 447 |                 if self.images[i]['cropped_right_knee'] is not None:
 448 |                     self.images[i]['cropped_right_knee'] = self.images[i]['cropped_right_knee'] / arr_max
 449 |                     self.images[i]['cropped_left_knee'] = self.images[i]['cropped_left_knee'] / arr_max
 450 |             else:
 451 |                 self.images[i]['image_array_scaled_to_zero_one'] = self.images[i]['unnormalized_image_array'] / arr_max
 452 |         self.diacom_image_statistics['mean_of_zero_one_data_%s' % suffix] = np.mean(all_pixel_arrays) / arr_max
 453 |         self.diacom_image_statistics['std_of_zero_one_data_%s' % suffix] = np.std(all_pixel_arrays) / arr_max
 454 |         for k in self.diacom_image_statistics.keys():
 455 |             print(k, self.diacom_image_statistics[k])
 456 |     
 457 |     def make_images_RGB_and_zscore(self):
 458 |         """
 459 |         Normalize each image by z-scoring. 
 460 |         Checked. 
 461 |         """
 462 |         print("Computing normalized images")
 463 |         assert self.normalization_method in ['imagenet_statistics', 'our_statistics', 'zscore_individually']
 464 |         
 465 |         def normalize_array(arr, mean_to_use, std_to_use):
 466 |             assert len(mean_to_use) == 3
 467 |             assert len(std_to_use) == 3
 468 |             new_arr = arr.copy()
 469 |             for k in range(3):
 470 |                 new_arr[k, :, :] = (new_arr[k, :, :] - mean_to_use[k]) / std_to_use[k]
 471 |             return new_arr
 472 | 
 473 |         for i in range(len(self.images)):
 474 |             for side in ['left', 'right']:
 475 |                 original_image = self.images[i]['%s_knee_scaled_to_zero_one' % side]
 476 | 
 477 |                 rgb_image = np.array([original_image, original_image, original_image])
 478 |                 
 479 |                 # determine what the size of the image ought to be. 
 480 | 
 481 |                 if self.crop_to_just_the_knee:
 482 |                     original_reloaded_image_size = CROPPED_KNEE_RESAMPLED_IMAGE_SIZE[0]
 483 |                 else:
 484 |                     original_reloaded_image_size = RESAMPLED_IMAGE_SIZE[0]
 485 | 
 486 |                 if self.downsample_factor_on_reload is not None:
 487 |                     downsampled_size = int(original_reloaded_image_size * self.downsample_factor_on_reload)
 488 |                 else:
 489 |                     downsampled_size = original_reloaded_image_size
 490 | 
 491 |                 if self.show_both_knees_in_each_image or self.crop_to_just_the_knee:
 492 |                     assert rgb_image.shape == tuple([3, downsampled_size, downsampled_size])
 493 |                 else:
 494 |                     assert rgb_image.shape == tuple([3, downsampled_size, int(downsampled_size * self.extra_margin_for_each_image / 2.)])
 495 |                 if self.normalization_method == 'imagenet_statistics':
 496 |                     mean_to_use = [0.485, 0.456, 0.406]
 497 |                     std_to_use = [0.229, 0.224, 0.225]
 498 |                 elif self.normalization_method == 'our_statistics':
 499 |                     if self.crop_to_just_the_knee:
 500 |                         mean_to_use = [self.diacom_image_statistics['mean_of_zero_one_data_cropped_knee_only']] * 3
 501 |                         std_to_use = [self.diacom_image_statistics['std_of_zero_one_data_cropped_knee_only']] * 3
 502 |                     else:
 503 |                         mean_to_use = [self.diacom_image_statistics['mean_of_zero_one_data_full_image']] * 3
 504 |                         std_to_use = [self.diacom_image_statistics['std_of_zero_one_data_full_image']] * 3
 505 |                 elif self.normalization_method == 'zscore_individually':
 506 |                     mean_to_use = [original_image.mean()] * 3
 507 |                     std_to_use = [original_image.std()] * 3
 508 |                 else:
 509 |                     raise Exception("invalid image normalization method")
 510 | 
 511 |                 self.images[i]['%s_knee' % side] = normalize_array(
 512 |                     rgb_image, 
 513 |                     mean_to_use, 
 514 |                     std_to_use)
 515 |                 self.images[i]['%s_knee_scaled_to_zero_one' % side] = None
 516 | 
 517 | def compare_contents_files_to_loaded_images(image_dataset, series_description):
 518 |     """
 519 |     Sanity check: make sure the images we loaded are the images which are supposed to be there
 520 |     according to the contents file. 
 521 |     """
 522 |     barcodes_in_image_dataset = [a['barcode'][5:] for a in image_dataset.images]
 523 |     assert all([len(a) == 7 for a in barcodes_in_image_dataset])
 524 |     # Every x-ray image has a unique 12 digit barcode associated with it and the first 5 digits are always 01660.
 525 |     # so we look at the last 7 digits. 
 526 |     assert len(barcodes_in_image_dataset) == len(set(barcodes_in_image_dataset))
 527 |     barcodes_in_image_dataset = set(barcodes_in_image_dataset)
 528 |     print("Total number of barcodes in image dataset: %i" % len(barcodes_in_image_dataset))
 529 |     all_barcodes_in_contents_dir = set()                     
 530 |     for image_timepoint_dir in sorted(IMAGE_TIMEPOINT_DIRS_TO_FOLLOWUP):
 531 |         content_filename = os.path.join(BASE_IMAGE_DATA_DIR, image_timepoint_dir, 'contents.csv')
 532 |         d = pd.read_csv(content_filename, dtype={'Barcode':str})
 533 |         
 534 |         d['SeriesDescription'] = d['SeriesDescription'].map(lambda x:x.strip())
 535 |         d = d.loc[d['SeriesDescription'] == series_description]
 536 |         # several contents files are, unfortunately, inconsistently formatted. 
 537 |         if 'Barcode' not in d.columns:
 538 |             d['Barcode'] = d['AccessionNumber'].map(lambda x:str(x)[4:])
 539 |         elif image_timepoint_dir == '72m':
 540 |             d['Barcode'] = d['Barcode'].map(lambda x:str(x)[4:])
 541 |         else:
 542 |             needs_leading_0 = d['Barcode'].map(lambda x:len(x) == 6)
 543 |             d.loc[needs_leading_0, 'Barcode'] = '0' + d.loc[needs_leading_0, 'Barcode'] 
 544 |         if len(d) > 0:
 545 |             assert d['Barcode'].map(lambda x:len(x) == 7).all()
 546 |             assert len(set(d['Barcode'])) == len(d)
 547 |         all_barcodes_in_contents_dir = all_barcodes_in_contents_dir.union(set(d['Barcode']))
 548 |         n_properly_loaded = d['Barcode'].map(lambda x:x in barcodes_in_image_dataset).sum()
 549 |         
 550 |         print("%-5i/%-5i images in %s match to our dataset" % (n_properly_loaded,
 551 |                                                       len(d),
 552 |                                                       content_filename))
 553 | 
 554 |     print("Warning: The following images have barcodes in our dataset but do not appear in contents file")
 555 |     print("This appears to be due to barcodes that differ by 1 in a very small number of images")
 556 |     print([a for a in barcodes_in_image_dataset if a not in all_barcodes_in_contents_dir])
 557 |     assert sum([a not in all_barcodes_in_contents_dir for a in barcodes_in_image_dataset]) <= 5
 558 | 
 559 | def check_consistency_with_enrollees_table(image_dataset, non_image_dataset):
 560 |     """
 561 |     Check consistency between the images we have and the images the enrollees table thinks we should have. 
 562 |     THIS IS NOT CURRENTLY WORKING AND WE ARE NOT USING IT.
 563 |     """
 564 |     raise Exception("Not using at present because the enrollees data is weird and the image data shows good concordance with other files. If you use this, check it.")
 565 |     print(Counter([a['visit'] for a in image_dataset.images]))
 566 |     for timepoint in ['00', '01', '03', '05', '06', '08']:
 567 |         df = copy.deepcopy(non_image_dataset.original_dataframes['enrollees'])
 568 |         all_ids_in_enrollees_table = set(df['id'])
 569 |         def has_knee_xray(s):
 570 |             
 571 |             assert s in {'0: No', 
 572 |                          '2: Yes, Knee Xray only', 
 573 |                          '1: Yes, Knee MR only', 
 574 |                          '.: Missing Form/Incomplete Workbook', 
 575 |                          '3: Yes, Knee MR and knee xray'}
 576 |             return s in ['2: Yes, Knee Xray only', '3: Yes, Knee MR and knee xray']
 577 |         df['has_knee_xray'] = (df['v%simagesc' % timepoint].map(has_knee_xray) | 
 578 |                                df['v%simagese' % timepoint].map(has_knee_xray))
 579 |         people_who_should_have_xrays = set(list(df['id'].loc[df['has_knee_xray']].map(int)))
 580 |         
 581 |         # now figure out who actually does. 
 582 |         people_who_actually_have_xrays = set()
 583 |         timepoints_to_visit_names = {'00':'Screening Visit', 
 584 |         '01':'12 month Annual Visit', 
 585 |         '03':'24 month Annual Visit', 
 586 |         '05':'36 month Annual Visit', 
 587 |         '06':'48 month Annual Visit', 
 588 |         '08':'72 month Annual Visit'}
 589 |         for image in image_dataset.images:
 590 |             if (image['visit'] == timepoints_to_visit_names[timepoint] and 
 591 |                 image['id'] in all_ids_in_enrollees_table):
 592 |                 people_who_actually_have_xrays.add(image['id'])
 593 |         print("%i/%i who should have knee xrays at timepoint %s actually do" % (
 594 |             len([a for a in people_who_should_have_xrays if a in people_who_actually_have_xrays]),
 595 |             len(people_who_should_have_xrays),
 596 |             timepoint))
 597 |         have_ids_and_not_in_enrollees_table = [a for a in people_who_actually_have_xrays if a not in people_who_should_have_xrays]
 598 |         if len(have_ids_and_not_in_enrollees_table) > 0:
 599 |             print("Warning: %i people in our dataset has x-rays and does not appear in enrollees table as someone who should" % 
 600 |                 len(have_ids_and_not_in_enrollees_table))
 601 | 
 602 | class PretrainedTorchModel:
 603 |     """
 604 |     class for loading pretrained Torch models.
 605 |     Checked.  
 606 |     """
 607 |     def __init__(self, model_name, layer_of_interest_name, use_adaptive_pooling):
 608 |         assert model_name in ['resnet18', 
 609 |         'resnet34', 'resnet50', 'resnet101', 'resnet152']
 610 |         self.model_name = model_name
 611 |         self.layer_of_interest_name = layer_of_interest_name
 612 |         self.use_adaptive_pooling = use_adaptive_pooling
 613 |         if 'resnet' in model_name:
 614 |             assert self.layer_of_interest_name in ['avgpool'] # could also try something like "layer3"
 615 |             if model_name == 'resnet18':
 616 |                 self.model = models.resnet18(pretrained=True)
 617 |                 self.embedding_size = [512]
 618 |             elif model_name == 'resnet34':
 619 |                 self.model = models.resnet34(pretrained=True)
 620 |                 self.embedding_size = [512]
 621 |             elif model_name == 'resnet50':
 622 |                 self.model = models.resnet50(pretrained=True)
 623 |                 self.embedding_size = [2048]
 624 |             elif model_name == 'resnet101':
 625 |                 self.model = models.resnet101(pretrained=True)
 626 |                 self.embedding_size = [2048]
 627 |             elif model_name == 'resnet152':
 628 |                 self.model = models.resnet152(pretrained=True)
 629 |                 self.embedding_size = [2048]
 630 |             else:
 631 |                 raise Exception("%s is not a valid model" % model_name)
 632 |             if self.use_adaptive_pooling:
 633 |                 print("Using adaptive pooling")
 634 |                 self.model.avgpool = nn.AdaptiveAvgPool2d(1) # see eg http://forums.fast.ai/t/ideas-behind-adaptive-max-pooling/12634. Basically this automatically computes the appropriate size for the window. 
 635 |             self.model.cuda()
 636 |         else:
 637 |             raise Exception("%s is not a valid model" % model_name)
 638 | 
 639 |         # Use the model object to select the desired layer
 640 |         self.layer_of_interest = self.model._modules.get(self.layer_of_interest_name)
 641 | 
 642 |         self.model.eval()
 643 |         print("model")
 644 |         print(self.model)
 645 |         
 646 | 
 647 |     def get_embedding(self, input_data):
 648 |         # Load the pretrained model
 649 |         # https://becominghuman.ai/extract-a-feature-vector-for-any-image-with-pytorch-9717561d1d4c
 650 |         # 1. Create a vector of zeros that will hold our feature vector
 651 |         my_embedding = torch.zeros(*self.embedding_size)
 652 | 
 653 |         # 2. Define a function that will copy the output of a layer
 654 |         def copy_data(m, i, o):
 655 |             my_embedding.copy_(o.data.squeeze())
 656 | 
 657 |         # 3. Attach that function to our selected layer
 658 |         h = self.layer_of_interest.register_forward_hook(copy_data)
 659 | 
 660 |         # 4. Run the model on our transformed image
 661 |         self.model(input_data)
 662 | 
 663 |         # 5. Detach our copy function from the layer
 664 |         h.remove()
 665 | 
 666 |         # 6. Return the feature vector, 
 667 |         # converted to numpy and flattened. 
 668 |         return my_embedding.numpy().flatten()
 669 | 
 670 | def convert_to_torch_tensor(arr):
 671 |     """
 672 |     convert to torch tensor.
 673 |     Checked. 
 674 |     """
 675 |     input_data = torch.from_numpy(arr).float()
 676 |     input_data = input_data.unsqueeze(0)
 677 |     input_data = torch.autograd.Variable(input_data).cuda()
 678 |     return input_data
 679 | 
 680 | def generate_embeddings_for_images_from_pretrained_model(images, 
 681 |     torch_model_name, 
 682 |     model_layer):
 683 |     """
 684 |     Given a list of images, generates embeddings for the images using a pretrained neural net. 
 685 |     Two different embedding methods: use_adaptive_pooling, which modifies the neural net to work with different image sizes
 686 |     and rescale, which resamples the image. 
 687 | 
 688 |     Checked. 
 689 |     """
 690 |     assert torch_model_name in ['resnet18', 
 691 |         'resnet34', 'resnet50', 'resnet101', 'resnet152']
 692 |     embedding_method_to_embeddings = {}
 693 |     for embedding_method in ['use_adaptive_pooling', 'rescale']:
 694 |         embedding_method_to_embeddings[embedding_method] = []
 695 |         print("Embedding method: %s" % embedding_method)
 696 |         assert embedding_method in ['use_adaptive_pooling', 'rescale']
 697 |         use_adaptive_pooling = embedding_method == 'use_adaptive_pooling'
 698 |         torch_model = PretrainedTorchModel(model_name=torch_model_name, 
 699 |                                        layer_of_interest_name=model_layer, 
 700 |                                        use_adaptive_pooling=use_adaptive_pooling)      
 701 |         for idx, image in enumerate(images):
 702 |             if idx % 1000 == 0:
 703 |                 print(idx, len(images))
 704 |             if embedding_method == 'rescale':
 705 |                 resized_images = []
 706 |                 for k in range(3):
 707 |                     resized_images.append(cv2.resize(image[k, :, :], (224,224)))
 708 |                 image = np.array(resized_images)
 709 |             torch_tensor = convert_to_torch_tensor(image)
 710 |             embedding = torch_model.get_embedding(torch_tensor)
 711 |             embedding_method_to_embeddings[embedding_method].append(embedding)
 712 |         embedding_method_to_embeddings[embedding_method] = np.array(embedding_method_to_embeddings[embedding_method])
 713 |         print("Size of image embeddings is", embedding_method_to_embeddings[embedding_method].shape)
 714 |     return embedding_method_to_embeddings
 715 | 
 716 | def predict_yhat_from_embeddings(all_train_embeddings, 
 717 |     all_test_embeddings, 
 718 |     train_combined_df, 
 719 |     test_combined_df):
 720 |     """
 721 |     Given train + test embeddings, and train and test datasets which include pain scores
 722 |     Comes up with train and test predictions using lasso. 
 723 |     Checked. 
 724 |     """
 725 |     assert list(all_train_embeddings.keys()) == list(all_test_embeddings.keys())
 726 |     all_yhat = []
 727 |     for y_col in ['koos_pain_subscore', 'womac_pain_subscore']:
 728 |         for alpha in [10 ** a for a in np.arange(-3, 4, .5)]:
 729 |             for embedding_method in all_train_embeddings.keys():
 730 |                 print("Embedding method %s" % embedding_method)
 731 |                 train_Y = copy.deepcopy(train_combined_df[y_col].values)
 732 |                 test_Y = copy.deepcopy(test_combined_df[y_col].values)
 733 |                 train_X = copy.deepcopy(all_train_embeddings[embedding_method])
 734 |                 test_X = copy.deepcopy(all_test_embeddings[embedding_method])
 735 |                 linear_model = Lasso(alpha=alpha)
 736 |                 linear_model.fit(train_X, train_Y)
 737 |                 num_nnz_coefficients = (np.abs(linear_model.coef_) > 1e-6).sum()
 738 |                 print("Number of nonzero coefficients: %i" % num_nnz_coefficients)
 739 |                 if num_nnz_coefficients == 0:
 740 |                     continue
 741 | 
 742 |                 train_yhat = linear_model.predict(train_X)
 743 |                 test_yhat = linear_model.predict(test_X)
 744 |                 train_r, train_p = pearsonr(train_yhat, train_Y)
 745 |                 test_r, test_p = pearsonr(test_yhat, test_Y)
 746 |                 
 747 |                 all_yhat.append({'train_yhat':train_yhat, 
 748 |                                 'test_yhat':test_yhat, 
 749 |                                 'train_r':train_r, 
 750 |                                 'test_r':test_r, 
 751 |                                 'train_p':train_p, 
 752 |                                 'test_p':test_p, 
 753 |                                 'alpha':alpha, 
 754 |                                 'embedding_method':embedding_method, 
 755 |                                 'y_col':y_col
 756 |                                 })
 757 |                 print("\n\n**Embedding method %s, alpha=%2.3f; train r: %2.3f (p=%2.3e); test r: %2.3f; (p=%2.3e)" % (embedding_method, 
 758 |                                                                                                                       alpha, 
 759 |                                                                                                                       train_r, 
 760 |                                                                                                                       train_p, 
 761 |                                                                                                                       test_r, 
 762 |                                                                                                                       test_p))
 763 | 
 764 |                 # quick plot to give a sense of results. 
 765 |                 plt.figure(figsize=[8, 8])
 766 |                 sns.regplot(test_Y, test_yhat, x_jitter=.2)
 767 |                 plt.xlabel("Test Y")
 768 |                 plt.ylabel("Test Yhat")
 769 |                 if y_col == 'womac_pain_subscore':
 770 |                     plt.ylim([0, 20])
 771 |                     plt.xlim([0, 20])
 772 |                 else:
 773 |                     plt.ylim([0, 100])
 774 |                     plt.xlim([0, 100])
 775 |                 plt.show()
 776 | 
 777 |                 # are results driven by only a single visit or a single side? 
 778 |                 for visit in sorted(list(set(test_combined_df['visit']))):
 779 |                     idxs = (test_combined_df['visit'] == visit).values
 780 |                     r, p = pearsonr(test_yhat[idxs], test_Y[idxs])
 781 |                     print("Visit %s, test r %2.3f (n = %i)" % (visit, r, idxs.sum()))
 782 |                 for side in ['left', 'right']:
 783 |                     idxs = (test_combined_df['side'] == side).values
 784 |                     r, p = pearsonr(test_yhat[idxs], test_Y[idxs])
 785 |                     print("Side %s, test r %2.3f (n = %i)" % (side, r, idxs.sum()))
 786 |     all_yhat = pd.DataFrame(all_yhat)
 787 |     return all_yhat
 788 | 
 789 | def delete_old_images_from_dfs():
 790 |     """
 791 |     remove the old image files when we regenerate images so we don't have any old stuff lying around. 
 792 |     This command takes a while to run. 
 793 |     """
 794 |     raise Exception("Do not use this method lightly! It deletes files! Remove this exception if you really want to use it.")
 795 |     assert node_name in ['rambo', 'trinity']
 796 |     for dataset in ['train', 'val', 'test', 'BLINDED_HOLD_OUT_DO_NOT_USE']:
 797 |         base_path_to_delete = os.path.join(INDIVIDUAL_IMAGES_PATH, dataset)
 798 |         if os.path.exists(base_path_to_delete):
 799 |             cmd = 'rm -rf %s/' % base_path_to_delete
 800 |             print("Deleting all files from directory %s" % base_path_to_delete)
 801 |             os.system(cmd)
 802 |         # make a new folder, because we've deleted the old folder. The reason we have to do it this way is 
 803 |         # if we don't delete the folder but only the files within it, 
 804 |         # we get an error during the deletion command because there are too many image files. 
 805 |         cmd = 'mkdir %s' % base_path_to_delete
 806 |         os.system(cmd)
 807 | 
 808 | def get_base_dir_for_individual_image(dataset, 
 809 |     show_both_knees_in_each_image, 
 810 |     downsample_factor_on_reload, 
 811 |     normalization_method, 
 812 |     seed_to_further_shuffle_train_test_val_sets, 
 813 |     crop_to_just_the_knee):
 814 |     """
 815 |     Get the path for an image. 
 816 |     """
 817 |     assert seed_to_further_shuffle_train_test_val_sets is None # this is deprecated; don't let us use it accidentally. 
 818 |     assert dataset in ['train', 'val', 'test', 'BLINDED_HOLD_OUT_DO_NOT_USE']
 819 |     assert show_both_knees_in_each_image in [True, False]
 820 |     assert downsample_factor_on_reload in [None, 0.7, 0.5, 0.3]
 821 |     assert normalization_method in ['imagenet_statistics', 'our_statistics', 'zscore_individually']
 822 |     assert crop_to_just_the_knee in [True, False]
 823 | 
 824 |     if show_both_knees_in_each_image:
 825 |         assert not crop_to_just_the_knee
 826 | 
 827 | 
 828 |     if seed_to_further_shuffle_train_test_val_sets is None:
 829 |         random_seed_suffix = ''
 830 |     else:
 831 |         random_seed_suffix = '_random_seed_%i' % seed_to_further_shuffle_train_test_val_sets
 832 |     
 833 |     if not crop_to_just_the_knee:
 834 |         base_dir = os.path.join(INDIVIDUAL_IMAGES_PATH, 
 835 |                 dataset, 
 836 |                 'show_both_knees_%s_downsample_factor_%s_normalization_method_%s%s'  % (
 837 |                     show_both_knees_in_each_image, 
 838 |                     downsample_factor_on_reload, 
 839 |                     normalization_method, 
 840 |                     random_seed_suffix))
 841 |     else:
 842 |         base_dir = os.path.join(INDIVIDUAL_IMAGES_PATH, 
 843 |                 dataset, 
 844 |                 'crop_to_just_the_knee_downsample_factor_%s_normalization_method_%s%s'  % (
 845 |                     downsample_factor_on_reload, 
 846 |                     normalization_method, 
 847 |                     random_seed_suffix))
 848 | 
 849 |     return base_dir
 850 |         
 851 | 
 852 | def write_out_individual_images_for_one_dataset(write_out_image_data, 
 853 |     normalization_method, 
 854 |     show_both_knees_in_each_image, 
 855 |     downsample_factor_on_reload, 
 856 |     seed_to_further_shuffle_train_test_val_sets, 
 857 |     crop_to_just_the_knee):
 858 |     """
 859 |     If we actually want to train several neural nets simultaneously, the entire image dataset is too large to fit in memory. 
 860 |     So, after loading the whole image dataset, we also write out each image into a separate file. 
 861 |     We save the images several different ways -- with different preprocessing and downsampling sizes. 
 862 |     Checked. 
 863 |     """    
 864 |     image_dataset_kwargs = copy.deepcopy(IMAGE_DATASET_KWARGS)
 865 |     image_dataset_kwargs['reprocess_all_images'] = False
 866 |     image_dataset_kwargs['use_small_data'] = False
 867 |     image_dataset_kwargs['normalization_method'] = normalization_method
 868 |     image_dataset_kwargs['downsample_factor_on_reload'] = downsample_factor_on_reload
 869 |     image_dataset_kwargs['show_both_knees_in_each_image'] = show_both_knees_in_each_image
 870 |     image_dataset_kwargs['crop_to_just_the_knee'] = crop_to_just_the_knee
 871 |     image_dataset = XRayImageDataset(**image_dataset_kwargs)
 872 |     for dataset in ['train', 'val', 'test', 'BLINDED_HOLD_OUT_DO_NOT_USE']:
 873 |         print("Writing out individual images for %s" % dataset)
 874 |         base_path = get_base_dir_for_individual_image(dataset=dataset, 
 875 |                                                       show_both_knees_in_each_image=show_both_knees_in_each_image, 
 876 |                                                       downsample_factor_on_reload=downsample_factor_on_reload, 
 877 |                                                       normalization_method=normalization_method, 
 878 |                                                       seed_to_further_shuffle_train_test_val_sets=seed_to_further_shuffle_train_test_val_sets, 
 879 |                                                       crop_to_just_the_knee=crop_to_just_the_knee)
 880 |         if os.path.exists(base_path):
 881 |             raise Exception('base path %s should not exist' % base_path)
 882 |         time.sleep(3)
 883 | 
 884 |         while not os.path.exists(base_path):
 885 |             # for some reason this command occasionally fails; make it more robust. 
 886 |             os.system('mkdir %s' % base_path)
 887 |             time.sleep(10)
 888 | 
 889 |         if dataset == 'BLINDED_HOLD_OUT_DO_NOT_USE':
 890 |             i_promise_i_really_want_to_use_the_blinded_hold_out_set = True
 891 |         else:
 892 |             i_promise_i_really_want_to_use_the_blinded_hold_out_set = False
 893 | 
 894 |         non_image_dataset = non_image_data_processing.NonImageData(what_dataset_to_use=dataset, 
 895 |                                                                    timepoints_to_filter_for=TIMEPOINTS_TO_FILTER_FOR, 
 896 |                                                                    seed_to_further_shuffle_train_test_val_sets=seed_to_further_shuffle_train_test_val_sets, 
 897 |                                                                    i_promise_i_really_want_to_use_the_blinded_hold_out_set=i_promise_i_really_want_to_use_the_blinded_hold_out_set)
 898 |         combined_df, matched_images, image_codes = match_image_dataset_to_non_image_dataset(image_dataset, non_image_dataset)
 899 |         ensure_barcodes_match(combined_df, image_codes)
 900 |         assert combined_df['visit'].map(lambda x:x in TIMEPOINTS_TO_FILTER_FOR).all()
 901 |         
 902 |         non_image_csv_outfile = os.path.join(base_path, 'non_image_data.csv')
 903 |         combined_df.to_csv(non_image_csv_outfile)
 904 |         if write_out_image_data:
 905 |             ensure_barcodes_match(combined_df, image_codes)
 906 |             pickle.dump(image_codes, open(os.path.join(base_path, 'image_codes.pkl'), 'wb'))
 907 |             for i in range(len(combined_df)):
 908 |                 image_path = os.path.join(base_path, 'image_%i.npy' % i)
 909 |                 np.save(image_path, matched_images[i])
 910 |                 print("%s image %i/%i written out to %s" % (dataset, i + 1, len(combined_df), image_path))
 911 |     print("Successfully wrote out all images.")
 912 | 
 913 | def write_out_image_datasets_in_parallel():
 914 |     """
 915 |     Parallelize the writing out of images since it takes a while. This can be run on rambo. 
 916 |     Each job writes out the images for one normalization_method,show_both_knees_in_each_image,downsample_factor_on_reload. 
 917 |     This undoubtedly is not the CPU or memory-efficient way to do it, but whatever. 
 918 | 
 919 |     This does not write out the cropped-knee datasets or different random seed datasets; I wrote separate methods to do taht. 
 920 |     """
 921 |     dataset_idx = 1
 922 |     n_currently_running = 0
 923 |     for normalization_method in ['imagenet_statistics', 'our_statistics', 'zscore_individually']:
 924 |         for show_both_knees_in_each_image in [True]:
 925 |             for downsample_factor_on_reload in [None, 0.7, 0.5, 0.3]:
 926 |                 for crop_to_just_the_knee in [False]:
 927 |                     cmd = 'nohup python -u image_processing.py --normalization_method %s --show_both_knees_in_each_image %s --downsample_factor_on_reload %s --write_out_image_data True --seed_to_further_shuffle_train_test_val_sets None --crop_to_just_the_knee %s > processing_outfiles/image_processing_dataset_%i.out &' % (
 928 |                         normalization_method, 
 929 |                         show_both_knees_in_each_image, 
 930 |                         downsample_factor_on_reload, 
 931 |                         crop_to_just_the_knee, 
 932 |                         dataset_idx)
 933 | 
 934 |                     print("Now running command %s" % cmd)
 935 |                     dataset_idx += 1
 936 |                     n_currently_running += 1
 937 |                     os.system(cmd)
 938 |                     if n_currently_running >= 4:
 939 |                         time.sleep(6 * 3600)
 940 |                         n_currently_running = 0
 941 | 
 942 | 
 943 | def write_out_datasets_shuffled_with_different_random_seed():
 944 |     """
 945 |     Write out a couple additional shuffled datasets. Robustness check to make sure our main results are consistent across train sets. 
 946 |     """
 947 |     raise Exception("This is deprecated; we now can just reshuffle the train/test/val sets using the original dataset.")
 948 |     dataset_idxs = [int(a.split('_')[-1].replace('.out', '')) for a in os.listdir('processing_outfiles')]
 949 |     dataset_idx = max(dataset_idxs) + 1
 950 |     n_currently_running = 0
 951 |     for normalization_method in ['our_statistics']:
 952 |         for show_both_knees_in_each_image in [True]:
 953 |             for downsample_factor_on_reload in [None]:
 954 |                 for random_seed in range(1, 5):
 955 |                 
 956 |                     cmd = 'nohup python -u image_processing.py --normalization_method %s --show_both_knees_in_each_image %s --downsample_factor_on_reload %s --write_out_image_data True --seed_to_further_shuffle_train_test_val_sets %i --crop_to_just_the_knee False > processing_outfiles/image_processing_dataset_%i.out &' % (
 957 |                         normalization_method, 
 958 |                         show_both_knees_in_each_image, 
 959 |                         downsample_factor_on_reload, 
 960 |                         random_seed,
 961 |                         dataset_idx)
 962 |                     print("Now running command %s" % cmd)
 963 |                     dataset_idx += 1
 964 |                     n_currently_running += 1
 965 |                     os.system(cmd)
 966 |                     if n_currently_running >= 1:
 967 |                         time.sleep(6 * 3600)
 968 |                         n_currently_running = 0
 969 | 
 970 | def write_out_datasets_cropped_to_just_the_knee():
 971 |     """
 972 |     Write out cropped knee datasets. 
 973 |     """
 974 |     dataset_idx = 1
 975 |     for normalization_method in ['imagenet_statistics', 'our_statistics', 'zscore_individually']:
 976 |         for downsample_factor_on_reload in [None, 0.5]:
 977 |             cmd = 'nohup python -u image_processing.py --normalization_method %s --show_both_knees_in_each_image False --downsample_factor_on_reload %s --write_out_image_data True --seed_to_further_shuffle_train_test_val_sets None --crop_to_just_the_knee True > processing_outfiles/image_processing_dataset_%i.out &' % (
 978 |                         normalization_method, 
 979 |                         downsample_factor_on_reload, 
 980 |                         dataset_idx)
 981 |             print("Now running command %s" % cmd)
 982 |             dataset_idx += 1
 983 |             os.system(cmd)
 984 | 
 985 | 
 986 | 
 987 | 
 988 | def random_horizontal_vertical_translation(img, max_horizontal_translation, max_vertical_translation):
 989 |     """
 990 |     Translates the image horizontally/vertically by a fraction of its width/length. 
 991 |     To keep the image the same size + scale, we add a background color to fill in any space created. 
 992 |     """
 993 |     assert max_horizontal_translation >= 0 and max_horizontal_translation <= 1
 994 |     assert max_vertical_translation >= 0 and max_vertical_translation <= 1
 995 |     if max_horizontal_translation == 0 and max_vertical_translation == 0:
 996 |         return img
 997 | 
 998 |     img = img.copy()
 999 | 
1000 |     assert len(img.shape) == 3
1001 |     assert img.shape[0] == 3
1002 |     assert img.shape[1] >= img.shape[2]
1003 |     
1004 |     height = img.shape[1]
1005 |     width = img.shape[2]
1006 | 
1007 |     translated_img = img
1008 |     horizontal_translation = int((random.random() - .5) * max_horizontal_translation * width)
1009 |     vertical_translation = int((random.random() - .5) * max_vertical_translation * height)
1010 |     background_color = img[:, -10:, -10:].mean(axis=1).mean(axis=1)
1011 | 
1012 |     # first we translate the image. 
1013 |     if horizontal_translation != 0:
1014 |         if horizontal_translation > 0:
1015 |             translated_img = translated_img[:, :, horizontal_translation:] # this cuts off pixels on the left of the image
1016 |         else:
1017 |             translated_img = translated_img[:, :, :horizontal_translation] # this cuts off pixels on the right of the image
1018 | 
1019 |     if vertical_translation != 0:
1020 |         if vertical_translation > 0:
1021 |             translated_img = translated_img[:, vertical_translation:, :] # this cuts off pixels on the top of the image
1022 |         else:
1023 |             translated_img = translated_img[:, :vertical_translation, :] # this cuts off pixels on the bottom of the image. 
1024 | 
1025 |     # then we keep the dimensions the same. 
1026 |     new_height = translated_img.shape[1]
1027 |     new_width = translated_img.shape[2]
1028 |     new_image = []
1029 |     for i in range(3): # loop over RGB
1030 |         background_square = np.ones([height, width]) * background_color[i]
1031 |         if horizontal_translation < 0:
1032 |             if vertical_translation < 0:
1033 |                 # I don't really know if the signs here matter all that much -- it's just whether we're putting the translated 
1034 |                 # images on the left or right. 
1035 |                 background_square[-new_height:, -new_width:] = translated_img[i, :, :]
1036 |             else:
1037 |                 background_square[:new_height, -new_width:] = translated_img[i, :, :]
1038 |         else:
1039 |             if vertical_translation < 0:
1040 |                 background_square[-new_height:, :new_width] = translated_img[i, :, :]
1041 |             else:
1042 |                 background_square[:new_height, :new_width] = translated_img[i, :, :]
1043 |         new_image.append(background_square)
1044 |     new_image = np.array(new_image)
1045 | 
1046 |     return new_image
1047 | 
1048 | class PytorchImagesDataset(Dataset):
1049 |     """
1050 |     A class for loading in images one at a time. 
1051 |     Follows pytorch dataset tutorial: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
1052 |     """
1053 | 
1054 |     def __init__(self, 
1055 |         dataset, 
1056 |         downsample_factor_on_reload, 
1057 |         normalization_method, 
1058 |         show_both_knees_in_each_image,
1059 |         y_col, 
1060 |         transform, 
1061 |         seed_to_further_shuffle_train_test_val_sets,
1062 |         crop_to_just_the_knee,
1063 |         max_horizontal_translation=None, 
1064 |         max_vertical_translation=None, 
1065 |         additional_features_to_predict=None,
1066 |         use_very_very_small_subset=False, 
1067 |         load_only_single_klg=None, 
1068 |         blur_filter=None):
1069 |         """
1070 |         Args:
1071 |             dataset: train, val, or test. 
1072 |             downsample_factor_on_reload, normalization_method -- same as in image processing. 
1073 |             y_col: what we're trying to predict. 
1074 |             transform: how to augment the loaded images. 
1075 |         """
1076 |         assert dataset in ['train', 'val', 'test', 'BLINDED_HOLD_OUT_DO_NOT_USE']
1077 |         assert downsample_factor_on_reload in [None, 0.7, 0.5, 0.3]
1078 |         assert y_col in ['koos_pain_subscore', 
1079 |         'womac_pain_subscore', 
1080 |         'binarized_koos_pain_subscore', 
1081 |         'binarized_womac_pain_subscore',
1082 |         'xrkl', 
1083 |         'koos_pain_subscore_residual',
1084 |         'binarized_education_graduated_college', 
1085 |         'binarized_income_at_least_50k']
1086 | 
1087 |         assert normalization_method in ['imagenet_statistics', 'our_statistics', 'zscore_individually']
1088 |         assert transform in [None, 'random_translation', 'random_translation_and_then_random_horizontal_flip']
1089 |         assert (max_horizontal_translation is None) == (transform is None)
1090 |         assert (max_vertical_translation is None) == (transform is None)
1091 |         if show_both_knees_in_each_image == True:
1092 |             assert transform != 'random_translation_and_then_random_horizontal_flip'
1093 |              
1094 |         self.dataset = dataset
1095 |         self.downsample_factor_on_reload = downsample_factor_on_reload
1096 |         self.normalization_method = normalization_method
1097 |         self.show_both_knees_in_each_image = show_both_knees_in_each_image
1098 |         self.crop_to_just_the_knee = crop_to_just_the_knee
1099 |         self.use_very_very_small_subset = use_very_very_small_subset
1100 |         self.max_horizontal_translation = max_horizontal_translation
1101 |         self.max_vertical_translation = max_vertical_translation
1102 |         self.seed_to_further_shuffle_train_test_val_sets = seed_to_further_shuffle_train_test_val_sets
1103 |         self.clinical_control_columns = CLINICAL_CONTROL_COLUMNS
1104 |         self.additional_features_to_predict = additional_features_to_predict
1105 |         self.load_only_single_klg = load_only_single_klg
1106 |         self.blur_filter = blur_filter
1107 | 
1108 |         if seed_to_further_shuffle_train_test_val_sets is None:
1109 |             self.base_dir_for_images = get_base_dir_for_individual_image(dataset=self.dataset,
1110 |                                                                         show_both_knees_in_each_image=self.show_both_knees_in_each_image,
1111 |                                                                         downsample_factor_on_reload=self.downsample_factor_on_reload,
1112 |                                                                         normalization_method=self.normalization_method, 
1113 |                                                 seed_to_further_shuffle_train_test_val_sets=self.seed_to_further_shuffle_train_test_val_sets, 
1114 |                                                 crop_to_just_the_knee=self.crop_to_just_the_knee)
1115 |             self.image_codes = pickle.load(open(os.path.join(self.base_dir_for_images, 'image_codes.pkl'), 'rb'))
1116 |             self.non_image_data = pd.read_csv(os.path.join(self.base_dir_for_images, 'non_image_data.csv'), index_col=0)
1117 |         else:
1118 |             # We need to (somewhat hackily) paste the train, val, and test sets together. 
1119 |             print("Alert! Random seed is %i" % self.seed_to_further_shuffle_train_test_val_sets)
1120 |             assert dataset in ['train', 'val', 'test']
1121 |             shuffled_ids = make_train_val_test_hold_out_set(self.seed_to_further_shuffle_train_test_val_sets)
1122 |             ids_we_are_using = set(shuffled_ids[dataset + '_ids'])
1123 |             self.non_image_data = []
1124 |             self.image_codes = []
1125 |             self.new_image_paths = []
1126 |             for dataset_2 in ['train', 'val', 'test']:
1127 |                 base_dir_for_dataset = get_base_dir_for_individual_image(dataset=dataset_2,
1128 |                                                                         show_both_knees_in_each_image=self.show_both_knees_in_each_image,
1129 |                                                                         downsample_factor_on_reload=self.downsample_factor_on_reload,
1130 |                                                                         normalization_method=self.normalization_method, 
1131 |                                                 seed_to_further_shuffle_train_test_val_sets=None, 
1132 |                                                 crop_to_just_the_knee=self.crop_to_just_the_knee)
1133 |                 non_image_data_from_dataset = pd.read_csv(os.path.join(base_dir_for_dataset, 'non_image_data.csv'), index_col=0)
1134 |                 image_codes_from_this_dataset = pickle.load(open(os.path.join(base_dir_for_dataset, 'image_codes.pkl'), 'rb'))
1135 |                 idxs_from_this_dataset = non_image_data_from_dataset['id'].map(lambda x:x in ids_we_are_using).values
1136 |                 self.non_image_data.append(non_image_data_from_dataset.loc[idxs_from_this_dataset])
1137 |                 self.image_codes += list(np.array(image_codes_from_this_dataset)[idxs_from_this_dataset])
1138 | 
1139 |                 # for loading individual images, we just create a new data structure, image paths, which has a list of paths that you need. 
1140 |                 image_numbers_for_dataset = np.arange(len(non_image_data_from_dataset))[idxs_from_this_dataset]
1141 |                 image_paths_for_dataset = [os.path.join(base_dir_for_dataset, 'image_%i.npy' % i) for i in image_numbers_for_dataset]
1142 |                 self.new_image_paths += image_paths_for_dataset
1143 |                 assert len(image_paths_for_dataset) == len(image_numbers_for_dataset) == idxs_from_this_dataset.sum()
1144 |                 print("Number of new images added to dataset from original %s dataset: %i; IDs: %i" % (
1145 |                     dataset_2, len(image_paths_for_dataset), len(set(non_image_data_from_dataset.loc[idxs_from_this_dataset, 'id'].values))))
1146 |             self.non_image_data = pd.concat(self.non_image_data)
1147 |             self.non_image_data.index = range(len(self.non_image_data))
1148 |             assert set(self.non_image_data['id']) - ids_we_are_using == set([])
1149 |             print("Reconstructed dataset %s with %i rows and %i IDs" % (dataset, len(self.non_image_data), len(set(self.non_image_data['id']))))
1150 | 
1151 |         if self.additional_features_to_predict is not None:
1152 |             self.additional_feature_array = copy.deepcopy(self.non_image_data[self.additional_features_to_predict].values)
1153 |             for i in range(len(self.additional_features_to_predict)):
1154 |                 not_nan = ~np.isnan(self.additional_feature_array[:, i])
1155 |                 std = np.std(self.additional_feature_array[not_nan, i], ddof=1)
1156 |                 mu = np.mean(self.additional_feature_array[not_nan, i])
1157 |                 print("Z-scoring additional feature %s with mean %2.3f and std %2.3f" % (
1158 |                     self.additional_features_to_predict[i], mu, std))
1159 |                 self.additional_feature_array[:, i] = (self.additional_feature_array[:, i] - mu) / std
1160 | 
1161 | 
1162 |         if 'binarized_' in y_col:
1163 |             if 'koos' in y_col:
1164 |                 assert y_col not in list(self.non_image_data.columns)
1165 |                 self.non_image_data[y_col] = binarize_koos(self.non_image_data['koos_pain_subscore'].values)
1166 |                 print("Using binary column %s as y_col, a fraction %2.3f are positive (high pain) examples <= threshold %2.3f" % 
1167 |                 (y_col, self.non_image_data[y_col].mean(), KOOS_BINARIZATION_THRESH))
1168 |             elif 'womac' in y_col:
1169 |                 assert y_col not in list(self.non_image_data.columns)
1170 |                 self.non_image_data[y_col] = binarize_womac(self.non_image_data['womac_pain_subscore'].values)
1171 |                 print("Using binary column %s as y_col, a fraction %2.3f are positive (high pain) examples > threshold %2.3f" % 
1172 |                 (y_col, self.non_image_data[y_col].mean(), WOMAC_BINARIZATION_THRESH))
1173 | 
1174 |         # add column with residual. 
1175 |         if y_col == 'koos_pain_subscore_residual':
1176 |             assert len(self.non_image_data[['koos_pain_subscore', 'xrkl']].dropna()) == len(self.non_image_data)
1177 |             pain_kl_model = sm.OLS.from_formula('koos_pain_subscore ~ C(xrkl)', data=self.non_image_data).fit()
1178 |             assert 'koos_pain_subscore_residual' not in self.non_image_data.columns
1179 |             self.non_image_data['koos_pain_subscore_residual'] = self.non_image_data['koos_pain_subscore'].values - pain_kl_model.predict(self.non_image_data).values
1180 |             print(pain_kl_model.summary())
1181 |             
1182 |         self.y_col = y_col
1183 |         self.transform = transform
1184 |         print("Dataset %s has %i rows" % (dataset, len(self.non_image_data)))
1185 |         ensure_barcodes_match(self.non_image_data, self.image_codes)
1186 |         if self.show_both_knees_in_each_image:
1187 |             self.spot_check_ensure_original_images_match()
1188 |         
1189 |             
1190 |     
1191 |     def spot_check_ensure_original_images_match(self):
1192 |         """
1193 |         Sanity check: make sure we're loading the right images, as measured by a high correlation between the processed images and the original dicom images. 
1194 |         Can only do this if there's been relatively little preprocessing -- eg, no dramatic cropping of images. 
1195 | 
1196 |         Images are not necessarily identical because we have done some preprocessing (eg, smoothing or downsampling) but should be very highly correlated. 
1197 |         """
1198 |         necessary_path = os.path.join(BASE_IMAGE_DATA_DIR, '00m')
1199 |         if not os.path.exists(necessary_path):
1200 |             print("Warning: not spot-checking that images match original raw data because necessary path %s does not exist" % necessary_path)
1201 |             print("If you want to spot-check, you need to download the raw data and store it at this path")
1202 |             return
1203 | 
1204 |         print("Spot checking that images match.")
1205 |         contents_df = pd.read_csv(os.path.join(BASE_IMAGE_DATA_DIR, '00m/contents.csv'))
1206 |         idxs_to_sample = [a for a in range(len(self.non_image_data)) if self.non_image_data.iloc[a]['visit'] in ['00 month follow-up: Baseline']]
1207 |         all_correlations = []
1208 |         for random_idx in random.sample(idxs_to_sample, 10):
1209 |             row = self.non_image_data.iloc[random_idx][['id', 'side', 'barcdbu', 'visit']]
1210 |             #print(row)
1211 |             barcode = int(row['barcdbu'].astype(str)[-7:])
1212 |             folder = str(contents_df.loc[(contents_df['SeriesDescription'] == 'Bilateral PA Fixed Flexion Knee') & 
1213 |                     (contents_df['Barcode'] == barcode)].iloc[0]['Folder'])
1214 |             original_image_path = os.path.join(BASE_IMAGE_DATA_DIR, '00m', folder, '001')
1215 |             original_image = pydicom.dcmread(original_image_path)
1216 |             if self.seed_to_further_shuffle_train_test_val_sets is None:
1217 |                 our_image_path = os.path.join(self.base_dir_for_images, 'image_%i.npy' % random_idx)
1218 |             else:
1219 |                 our_image_path = self.new_image_paths[random_idx]
1220 |             our_test_image = np.load(our_image_path)[0, :, :].squeeze()
1221 |             original_image = cv2.resize(original_image.pixel_array, dsize=tuple(our_test_image.shape)[::-1], interpolation=cv2.INTER_CUBIC)
1222 |             if row['side'] == 'right':
1223 |                 original_image = original_image[:, ::-1]
1224 |             all_correlations.append(spearmanr(original_image.flatten(), our_test_image.flatten())[0])
1225 |             print("Correlation between original and reloaded image is", all_correlations[-1])
1226 |         assert np.median(all_correlations) >= .99
1227 |         assert np.mean(all_correlations) >= .97
1228 |         print("Image spot check image passed.")
1229 | 
1230 |     def __len__(self):
1231 |         if self.use_very_very_small_subset:
1232 |             return 500
1233 |         if self.load_only_single_klg is not None:
1234 |             raise Exception("This is not an option you should be using.")
1235 |             return (self.non_image_data['xrkl'].values == self.load_only_single_klg).sum()
1236 |         return len(self.non_image_data)
1237 | 
1238 |     def __getitem__(self, idx):
1239 |         if self.seed_to_further_shuffle_train_test_val_sets is None:
1240 |             image_path = os.path.join(self.base_dir_for_images, 'image_%i.npy' % idx)
1241 |         else:
1242 |             image_path = self.new_image_paths[idx]
1243 |         image = np.load(image_path)
1244 |         if self.transform:
1245 |             assert self.transform in ['random_translation_and_then_random_horizontal_flip', 'random_translation']
1246 |             image = random_horizontal_vertical_translation(image, self.max_horizontal_translation, self.max_vertical_translation)
1247 |             if self.transform == 'random_translation_and_then_random_horizontal_flip':
1248 |                 if random.random() < 0.5:
1249 |                     image = image[:, :, ::-1].copy()
1250 |         if self.blur_filter is not None:
1251 |             assert self.blur_filter > 0 and self.blur_filter < 1 # this argument is the downsample fraction
1252 |             downsample_frac = self.blur_filter
1253 |             new_image = []
1254 |             for i in range(3):
1255 |                 img = image[i, :, :].copy()
1256 |                 original_size = img.shape # note have to reverse arguments for cv2. 
1257 |                 img2 = cv2.resize(img, (int(original_size[1] * downsample_frac), int(original_size[0] * downsample_frac)))
1258 |                 new_image.append(cv2.resize(img2, tuple(original_size[::-1])))
1259 |                 #image[i, :, :] = gaussian_filter(image[i, :, :], sigma=self.gaussian_blur_filter)
1260 |             new_image = np.array(new_image)
1261 |             assert new_image.shape == image.shape
1262 |             image = new_image
1263 |         if self.additional_features_to_predict is not None:
1264 |             additional_features = self.additional_feature_array[idx, :]
1265 |             additional_features_are_not_nan = ~np.isnan(additional_features)
1266 |             additional_features[~additional_features_are_not_nan] = 0
1267 |             additional_features_are_not_nan = additional_features_are_not_nan * 1.
1268 |         else:
1269 |             additional_features = []
1270 |             additional_features_are_not_nan = []
1271 | 
1272 |         yval = self.non_image_data[self.y_col].iloc[idx]
1273 |         
1274 |         klg = self.non_image_data['xrkl'].iloc[idx]
1275 |         assert klg in [0, 1, 2, 3, 4]
1276 |         klg_coding = np.array([0., 0., 0., 0., 0.])
1277 |         klg_coding[int(klg)] = 1.
1278 |         klg = klg_coding
1279 | 
1280 |         binarized_education_graduated_college = self.non_image_data['binarized_education_graduated_college'].iloc[idx]
1281 |         assert binarized_education_graduated_college in [0, 1]
1282 | 
1283 |         binarized_income_at_least_50k = self.non_image_data['binarized_income_at_least_50k'].iloc[idx]
1284 |         assert binarized_income_at_least_50k in [0, 1]
1285 | 
1286 |         site = self.non_image_data['v00site'].iloc[idx]
1287 |         assert site in ['A', 'B', 'C', 'D', 'E']
1288 | 
1289 |         assert ~np.isnan(yval)
1290 | 
1291 |         sample = {'image': image, 
1292 |         'y':yval, 
1293 |         'klg':klg,
1294 |         'binarized_education_graduated_college':binarized_education_graduated_college,
1295 |         'binarized_income_at_least_50k':binarized_income_at_least_50k,
1296 |         'additional_features_to_predict':additional_features, 
1297 |         'additional_features_are_not_nan':additional_features_are_not_nan, 
1298 |         'site':site}
1299 |         return sample
1300 | 
if __name__ == '__main__':
    # Script entry point. 
    # With command-line flags: write out the individual images for one dataset configuration. 
    # Without flags: reprocess all the original images into an XRayImageDataset. 
    from traceback import print_exc
    import argparse

    def str2bool(x):
        """Convert the literal string 'True' or 'False' to a bool; anything else is an error."""
        if x not in ('True', 'False'):
            raise ValueError("Expected 'True' or 'False' but got %r" % (x,))
        return x == 'True'

    def none_or(cast, x):
        """Map the literal string 'None' to None; otherwise convert x with cast."""
        return None if x == 'None' else cast(x)

    if len(sys.argv) > 1:
        parser = argparse.ArgumentParser()
        # Every flag is needed to describe the dataset configuration, so let argparse report a missing one 
        # with a usage message rather than failing later on float(None) / int(None). 
        for flag_name in ['write_out_image_data', 
                          'normalization_method', 
                          'show_both_knees_in_each_image', 
                          'downsample_factor_on_reload', 
                          'seed_to_further_shuffle_train_test_val_sets', 
                          'crop_to_just_the_knee']:
            parser.add_argument('--%s' % flag_name, type=str, required=True)
        args = parser.parse_args()

        downsample_factor_on_reload = none_or(float, args.downsample_factor_on_reload)
        seed_to_further_shuffle_train_test_val_sets = none_or(int, args.seed_to_further_shuffle_train_test_val_sets)

        write_out_individual_images_for_one_dataset(write_out_image_data=str2bool(args.write_out_image_data), 
                        normalization_method=args.normalization_method, 
                        show_both_knees_in_each_image=str2bool(args.show_both_knees_in_each_image), 
                        downsample_factor_on_reload=downsample_factor_on_reload, 
                        seed_to_further_shuffle_train_test_val_sets=seed_to_further_shuffle_train_test_val_sets, 
                        crop_to_just_the_knee=str2bool(args.crop_to_just_the_knee))
    else:
        image_dataset = XRayImageDataset(reprocess_all_images=True, show_both_knees_in_each_image=True, crop_to_just_the_knee=False, **IMAGE_DATASET_KWARGS)
        # DEPRECATED COMMENTS. 
        # Step 1: clear out old images on /dfs.
        #delete_old_images_from_dfs()
        # Step 2: reprocess the original DICOM images into a pkl.
        # image_dataset = XRayImageDataset(reprocess_all_images=True, show_both_knees_in_each_image=True, crop_to_just_the_knee=False, **IMAGE_DATASET_KWARGS)
        # Step 3: write out individual images on /dfs. 
        #write_out_image_datasets_in_parallel()
        #time.sleep(6 * 3600)
        # Step 4: (somewhat optional) write out images cropped to the knee. 
        #time.sleep(6 * 3600)
        #write_out_datasets_cropped_to_just_the_knee()        
        #time.sleep(8 * 3600)

    #compare_contents_files_to_loaded_images(image_dataset, IMAGE_DATASET_KWARGS['desired_image_type'])
1348 | 
1349 |     
1350 | 
1351 | 
1352 | 


--------------------------------------------------------------------------------
/non_image_data_processing.py:
--------------------------------------------------------------------------------
   1 | from constants_and_util import *
   2 | import os
   3 | import pandas as pd
   4 | import copy
   5 | from scipy.stats import pearsonr
   6 | from collections import Counter
   7 | import datetime
   8 | 
   9 | class NonImageData():
  10 |     """
  11 |     Class for loading the non-image data. 
  12 |     Requires an argument to specify train val test or BLINDED_HOLD_OUT_SET. 
  13 |     """
  14 |     def __init__(self, 
  15 |         what_dataset_to_use, 
  16 |         timepoints_to_filter_for, 
  17 |         seed_to_further_shuffle_train_test_val_sets=None,
  18 |         i_promise_i_really_want_to_use_the_blinded_hold_out_set=False, 
  19 |         filter_out_special_values_in_mri_data=False):
  20 |         """
  21 |         Load raw data, turn it into processed data, and do some validations. Checked. 
  22 |         Raw data was downloaded from https://ndar.nih.gov/oai/full_downloads.html
  23 | 
  24 |         Minor note: this method raises a "DtypeWarning: Columns (5) have mixed types." warning. This is caused by a file in a column we do not use in a timepoint we do not use. It could be fixed by using 
  25 | 
  26 |         pd.read_csv('/dfs/dataset/tmp/20180910-OAI/data/emma_downloaded_oai_data_9112018/MRI MetaAnalysis_ASCII/MRI10.txt', 
  27 |                 sep='|', 
  28 |                 dtype={'V10MQCCMNT':str})
  29 |         """
  30 |         assert what_dataset_to_use in ['train', 'val', 'test', 'BLINDED_HOLD_OUT_DO_NOT_USE', 'all']
  31 |         if not i_promise_i_really_want_to_use_the_blinded_hold_out_set:
  32 |             assert what_dataset_to_use not in ['BLINDED_HOLD_OUT_DO_NOT_USE', 'all'] # just a sanity check to make sure we don't accidentally use these. 
  33 |         self.seed_to_further_shuffle_train_test_val_sets = seed_to_further_shuffle_train_test_val_sets
  34 |         self.what_dataset_to_use = what_dataset_to_use
  35 |         self.clinical_base_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'AllClinical_ASCII')
  36 |         self.semiquantitative_xray_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 
  37 |             'X-Ray Image Assessments_ASCII', 
  38 |             'Semi-Quant Scoring_ASCII')
  39 |         self.semiquantitative_mri_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 
  40 |             'MR Image Assessment_ASCII', 
  41 |             'Semi-Quant Scoring_ASCII')
  42 |         self.xray_metadata_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'X-Ray MetaAnalysis_ASCII')
  43 |         self.mri_metadata_dir = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'MRI MetaAnalysis_ASCII')
  44 |         self.original_dataframes = {} # store the original CSVs
  45 |         self.processed_dataframes = {} # store the processed data
  46 |         self.col_mappings = {}
  47 |         self.missing_data_val = '.: Missing Form/Incomplete Workbook'
  48 |         self.filter_out_special_values_in_mri_data = filter_out_special_values_in_mri_data
  49 |         # From the OAI quantitative x-ray notes: 
  50 |         # The variable SIDE denotes whether the row of data is for a right side image (SIDE=1) or a left side image (SIDE=2)
  51 |         self.side_mappings = {1:'right', 2:'left'}
  52 |         if timepoints_to_filter_for is None:
  53 |             self.timepoints_to_filter_for = TIMEPOINTS_TO_FILTER_FOR
  54 |             print("Set timepoints to filter for to", TIMEPOINTS_TO_FILTER_FOR)
  55 |         else:
  56 |             self.timepoints_to_filter_for = timepoints_to_filter_for
  57 |             
  58 |         # load various dataframes 
  59 |         self.load_clinical_data()
  60 |         self.load_semiquantitative_xray_data()
  61 |         self.load_xray_metadata()
  62 |         self.load_semiquantitative_mri_data()
  63 |         self.load_mri_metadata()
  64 | 
  65 |         # make processed dataframes. 
  66 |         
  67 |         self.make_nonstandard_interventions_dataframe()
  68 |         self.make_medications_dataframe()
  69 |         self.make_400m_walk_dataframe()
  70 |         self.make_redundant_knee_xray_variable_dataframe()
  71 |         self.make_knee_pain_dataframe()
  72 |         self.make_other_koos_subscores_dataframe()
  73 |         self.make_per_person_controls_dataframe()
  74 |         self.make_previous_injury_dataframe()
  75 |         self.make_previous_surgery_dataframe()
  76 |         self.make_previous_knee_replacement_dataframe()
  77 |         self.make_bmi_dataframe()
  78 |         self.make_drinking_and_smoking_dataframe()
  79 |         self.make_medical_history_dataframe()
  80 |         self.make_pain_dataframe_for_all_other_types_of_pain()
  81 |         self.make_age_dataframe()
  82 |         self.make_dominant_leg_dataframe()
  83 |         self.make_previous_fracture_or_fall_dataframe()
  84 |         self.make_processed_mri_data()
  85 | 
  86 |         # some validation. 
  87 |         self.validate_processed_data()
  88 |         self.validate_ids()
  89 |         
  90 |         if self.what_dataset_to_use != 'all':
  91 |             self.filter_for_correct_set()
  92 |         self.filter_out_timepoints()
  93 |         self.filter_out_visits_too_far_from_xray_imaging()
  94 |         print("Successfully loaded non-image data.")
  95 | 
  96 | 
  97 | 
  98 |     def filter_out_timepoints(self):
  99 |         """
 100 |         Remove datapoints from processed dataframes if they're not in timepoints_to_filter_for.
 101 |         """
 102 |         print("Filtering for timepoints", self.timepoints_to_filter_for)
 103 |         for k in sorted(self.processed_dataframes.keys()):
 104 |             if 'visit' in self.processed_dataframes[k].columns:
 105 |                 print("\nLength of %s prior to filtering: %i" % (k, len(self.processed_dataframes[k])))
 106 |                 assert pd.isnull(self.processed_dataframes[k]['visit']).sum() == 0
 107 |                 print("Values of visit prior to filtering", sorted(list(set(self.processed_dataframes[k]['visit']))))
 108 |                 if not all([a in list(set(self.processed_dataframes[k]['visit'].dropna())) for a in self.timepoints_to_filter_for]):
 109 |                     raise Exception("There is a problem with the visit column in %s: not all the timepoints we want are present." % k)
 110 |                 if not all([a in CLINICAL_WAVES_TO_FOLLOWUP.values() for a in list(set(self.processed_dataframes[k]['visit'].dropna()))]):
 111 |                     raise Exception("There is a problem with the visit column in %s: not all values in the column are valid visits." % k)
 112 | 
 113 |                 self.processed_dataframes[k] = self.processed_dataframes[k].loc[self.processed_dataframes[k]['visit'].map(
 114 |                     (lambda x:x in self.timepoints_to_filter_for))]
 115 | 
 116 |                 self.processed_dataframes[k].index = range(len(self.processed_dataframes[k]))
 117 |                 print("Length of %s after filtering: %i" % (k, len(self.processed_dataframes[k])))
 118 |                 print("Values of visit after filtering", sorted(list(set(self.processed_dataframes[k]['visit']))))
 119 |             else:
 120 |                 print("Not filtering for visit for dataframe %s because no visit column" % k)
 121 | 
 122 | 
 123 |     def filter_for_correct_set(self):
 124 |         """
 125 |         Make sure our dataset contains only the right dataset (eg, train set etc). Checked. 
 126 |         """
 127 |         print("Filtering for %s set." % self.what_dataset_to_use)
 128 |         ids = make_train_val_test_hold_out_set(seed_to_further_shuffle_train_test_val_sets=self.seed_to_further_shuffle_train_test_val_sets)
 129 |         ids = ids[self.what_dataset_to_use + '_ids']
 130 |         self.all_ids = sorted(ids)
 131 |         id_set = set(ids)
 132 |         print('****Filtering unprocessed data for %s set.' % self.what_dataset_to_use)
 133 |         for k in sorted(self.original_dataframes.keys()):
 134 |             assert 'ID' not in self.original_dataframes[k].columns
 135 |             if 'id' in self.original_dataframes[k].columns:
 136 |                 orig_length = len(self.original_dataframes[k])
 137 |                 self.original_dataframes[k] = self.original_dataframes[k].loc[self.original_dataframes[k]['id'].map(lambda x:x in id_set)]
 138 |                 print("After filtering, number of rows in %s goes from %i -> %i" % (k, orig_length, len(self.original_dataframes[k])))
 139 |                 assert orig_length != len(self.original_dataframes[k])
 140 |         print('\n****Filtering processed data for %s set.' % self.what_dataset_to_use)
 141 |         for k in sorted(self.processed_dataframes.keys()):
 142 |             assert 'ID' not in self.processed_dataframes[k].columns
 143 |             if 'id' in self.processed_dataframes[k].columns:
 144 |                 orig_length = len(self.processed_dataframes[k])
 145 |                 self.processed_dataframes[k] = self.processed_dataframes[k].loc[self.processed_dataframes[k]['id'].map(lambda x:x in id_set)]
 146 |                 print("After filtering, number of rows in %s goes from %i -> %i" % (k, orig_length, len(self.processed_dataframes[k])))
 147 |                 assert orig_length != len(self.processed_dataframes[k])
 148 | 
 149 |     def validate_processed_data(self):
 150 |         """
 151 |         Make sure there are no missing data values in the processed data. Checked. 
 152 |         """
 153 |         for k in self.processed_dataframes:
 154 |             assert 'id' in self.processed_dataframes[k].columns
 155 |             print("Validating id column for %s" % k)
 156 |             assert pd.isnull(self.processed_dataframes[k]['id']).sum() == 0
 157 |             if 'visit' in self.processed_dataframes[k].columns:
 158 |                 print("Validating visit column for %s" % k)
 159 |                 assert pd.isnull(self.processed_dataframes[k]['visit']).sum() == 0
 160 |                 assert self.processed_dataframes[k]['visit'].map(lambda x:x in CLINICAL_WAVES_TO_FOLLOWUP.values()).all()
 161 |             if 'side' in self.processed_dataframes[k].columns:
 162 |                 print("Validating side column for %s" % k)
 163 |                 assert pd.isnull(self.processed_dataframes[k]['side']).sum() == 0
 164 |                 assert self.processed_dataframes[k]['side'].map(lambda x:x in ['left', 'right']).all()
 165 |             for c in self.processed_dataframes[k].columns:
 166 |                 assert self.processed_dataframes[k][c].map(lambda x:str(x) == self.missing_data_val).sum() == 0
 167 | 
 168 |     def load_all_text_files_in_directory(self, base_dir, datasets_to_skip):
 169 |         """
 170 |         Given a base directory, and datasets to skip, loads in the relevant datasets to self.original_dataframes.
 171 |         Column names + dataset names are stored in lowercase. 
 172 |         Checked. 
 173 |         """
 174 |         print("Base directory: %s" % base_dir)
 175 |         skipped_datasets = [] # make sure we actually skipped all the datasets we want to skip. 
 176 |         for filename in sorted(os.listdir(base_dir)):
 177 |             if filename[-4:] == '.txt':
 178 |                 dataset_name = filename.replace('.txt', '').lower()
 179 |                 if dataset_name in datasets_to_skip:
 180 |                     skipped_datasets.append(dataset_name)
 181 |                     continue
 182 |                 full_path = os.path.join(base_dir, filename)
 183 |                 d = pd.read_csv(full_path, sep='|')
 184 |                 d.columns = d.columns.map(lambda x:x.lower())
 185 |                 assert len(d.columns) == len(set(d.columns))
 186 |                 print("%s has %i columns, %i rows" % (filename, len(d.columns), len(d)))
 187 |                 assert dataset_name not in self.original_dataframes # don't add same dataset twice. 
 188 |                 self.original_dataframes[dataset_name] = d
 189 |                 self.col_mappings[dataset_name] = {} # in case we want to map column names to anything else, this is a data dictionary. 
 190 |         assert sorted(datasets_to_skip) == sorted(skipped_datasets) 
 191 | 
 192 |     def concatenate_dataframes_from_multiple_timepoints(self, dataset_substring, columns_to_subset_on=None, visit_numbers_to_skip=None):
 193 |         """
 194 |         Takes all datasets in original_dataframes that contain dataset_substring, takes the columns in columns_to_subset_on, 
 195 |         and adds a column called "visit" which denotes which visit it is. 
 196 |         Checked. 
 197 |         """
 198 |         print('Combining dataframes with substring %s' % dataset_substring)
 199 |         dataframes_to_concatenate = []
 200 |         expected_columns = None
 201 |         for dataset_name in sorted(self.original_dataframes):
 202 |             if dataset_substring in dataset_name:
 203 |                 visit_number = dataset_name.replace(dataset_substring, '') # this should be something like 00. 
 204 |                 if visit_numbers_to_skip is not None and visit_number in visit_numbers_to_skip:
 205 |                     continue
 206 |                 visit = CLINICAL_WAVES_TO_FOLLOWUP[visit_number]
 207 |                 print("Adding visit=%s to dataframe %s" % (visit, dataset_name))                
 208 |                 dataset_copy = copy.deepcopy(self.original_dataframes[dataset_name])
 209 |                 # make sure each field has a consistent prefix (eg, v00) indicating that it comes from the right timepoint. 
 210 |                 # there are some exceptions: fields like id, and fields with p01 or p02, which indicate pre-enrollment measurements. 
 211 |                 assert all(['v%s' % visit_number in a for a in dataset_copy.columns if a not in ['id', 'side', 'readprj', 'version'] and a[:3] not in ['p01', 'p02']])
 212 |                 dataset_copy.columns = dataset_copy.columns.map(lambda x:x.replace('v%s' % visit_number, ''))
 213 |                 # if desired, subset the columns. 
 214 |                 if columns_to_subset_on is not None:
 215 |                     dataset_copy = dataset_copy[columns_to_subset_on]
 216 | 
 217 |                 # make sure columns stay consistent. 
 218 |                 if expected_columns is None:
 219 |                     expected_columns = list(dataset_copy.columns)
 220 |                 else:
 221 |                     assert expected_columns == list(dataset_copy.columns)
 222 |                 dataset_copy['visit'] = visit
 223 |                 dataframes_to_concatenate.append(dataset_copy)
 224 |         combined_data = pd.concat(dataframes_to_concatenate)
 225 |         combined_data.index = range(len(combined_data))
 226 |         print("Number of rows in combined data: %i" % len(combined_data))
 227 |         return combined_data
 228 | 
 229 |     def load_clinical_data(self):
 230 |         print("\n***Loading all clinical data.")
 231 |         # skip allclinical02 and allclinical04 because they have very little data.
 232 |         self.load_all_text_files_in_directory(self.clinical_base_dir, datasets_to_skip=['allclinical02', 'allclinical04'])
 233 | 
 234 |     def map_to_date(self, x):
 235 |         # sometimes X-ray dates are missing because, as documentation notes
 236 |         # "In addition, x-ray date and all QC variables have been set to missing .A for numeric variables, 
 237 |         # blank for text variables) when an x-ray was acquired, but is not available."
 238 |         # So this date is fairly often NA. But that's okay, because that only occurs (confirmed this) 
 239 |         # if the ACCEPT variable is NA anyway, so the data gets filtered out subsequently in find_image_barcodes_that_pass_qc
 240 |         if x is not None and str(x) != 'nan':
 241 |             return datetime.datetime.strptime(x, '%m/%d/%Y')
 242 |         return None
 243 | 
    def filter_out_visits_too_far_from_xray_imaging(self):
        """
        Remove visit rows whose recorded dates are too far from the x-ray date.

        For each of the visits '00', '01', '03', '05', '06':
          - attaches each participant's 'Bilateral PA Fixed Flexion Knee' x-ray date to the
            allclinical dataframe for that visit;
          - compares every pair of date columns for the visit and flags an id as bad when any
            pair is more than THRESHOLD_IN_DAYS (90) days apart.

        The per-visit sets of bad ids are stored on self.visits_too_far_from_xray_screening.
        Then, for every dataframe in self.processed_dataframes that has a 'visit' column, rows
        whose (id, visit) matches a bad id for that visit are dropped. Summary statistics are
        printed along the way.
        """
        print("\n\n***Filtering out visits too far from x-rays.")
        THRESHOLD_IN_DAYS = 90
        visits_to_bad_ids = {}
        for visit_substring in ['00', '01', '03', '05', '06']:
            # work on copies so the original dataframes are left untouched.
            allclinical_df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit_substring])
            xray_df = copy.deepcopy(self.original_dataframes['xray%s' % visit_substring])
            # only keep the knee x-ray exam type; then build an id -> x-ray date lookup.
            xray_df = xray_df.loc[xray_df['v%sexamtp' % visit_substring] == 'Bilateral PA Fixed Flexion Knee']
            xray_date_dict = dict(zip(xray_df['id'].values, 
                                     xray_df['v%sxrdate' % visit_substring].values))

            def return_date_in_dict_if_possible(xray_date_dict, x):
                """Return the x-ray date for id x, or a 1900 placeholder date (with a warning) if x has none."""
                if x in xray_date_dict:
                    return xray_date_dict[x]
                else:
                    print("Warning! ID %i not in dict." % x) # this happens only once. 
                    # placeholder far in the past; presumably this makes the id exceed the threshold below.
                    return '01/01/1900'
            allclinical_df['v%sxrdate' % visit_substring] = allclinical_df['id'].map(lambda x:return_date_in_dict_if_possible(xray_date_dict, x))

            # xrdate: Date x-ray completed (calc). 
            # p01svdate: Date Screening Visit completed. 
            # v00evdate: Date Enrollment Visit completed. 
            # v01fvdate: Follow-up visit date. 

            # baseline has screening + enrollment + x-ray dates; followups have followup-visit + x-ray dates.
            if visit_substring == '00':
                all_date_cols = ['p01svdate', 'v00evdate', 'v00xrdate']
            else:
                all_date_cols = ['v%sfvdate' % visit_substring, 'v%sxrdate' % visit_substring]
            
            print("\n\n%s visit" % CLINICAL_WAVES_TO_FOLLOWUP[visit_substring])

            # At followup, there are some people missing dates for x-rays because they didn't have them. 
            # We don't filter them out at this stage because they are filtered out subsequently. 
            # We do verify that a) very few people are missing any date data at the initial timepoint (00) and 
            # b) everyone missing allclinical data is also missing x-ray data, so should be filtered out. 
            if visit_substring != '00':
                xr_missing_date = pd.isnull(allclinical_df['v%sxrdate' % visit_substring].map(lambda x:self.map_to_date(x)))
                allclinical_missing_date = pd.isnull(allclinical_df['v%sfvdate' % visit_substring].map(lambda x:self.map_to_date(x)))
                assert (allclinical_missing_date & (~xr_missing_date)).sum() == 0 # make sure there's no one who has x-rays without coming in for followup in allclinical. 
            else:
                for k in all_date_cols:
                    assert pd.isnull(allclinical_df[k].map(lambda x:self.map_to_date(x))).mean() < .005
            
            # accumulates ids flagged by any pair of date columns for this visit.
            bad_ids = None
            
            # ids must be unique so that one row corresponds to one participant.
            assert len(set(allclinical_df['id'])) == len(allclinical_df)
            for i in range(len(all_date_cols)):
                print("Fraction of date column %s which cannot be mapped to a date: %2.3f" % 
                    (all_date_cols[i], 
                    pd.isnull(allclinical_df[all_date_cols[i]].map(lambda x:self.map_to_date(x))).mean()))
                # compare column i against every earlier column j, i.e. each unordered pair once.
                for j in range(i):
                    print('***gaps between %s and %s' % (all_date_cols[i], all_date_cols[j]))

                    # absolute gap in days between the two dates; rows with a missing date presumably
                    # come out as NaN (reported as "missing data" below) and so never exceed the threshold.
                    days_between = np.abs((allclinical_df[all_date_cols[i]].map(lambda x:self.map_to_date(x)) - 
                                    allclinical_df[all_date_cols[j]].map(lambda x:self.map_to_date(x))).map(lambda x:x.days))
                    print("Mean: %2.3f; median %2.3f; greater than 30 days %2.3f; greater than 60 days %2.3f; greater than 90 days %2.5f; missing data %2.5f" % (
                                                                                                        days_between.mean(), 
                                                                                                        days_between.median(), 
                                                                                                        (days_between > 30).mean(), 
                                                                                                        (days_between > 60).mean(), 
                                                                                                        (days_between > 90).mean(), 
                                                                                                        np.isnan(days_between).mean()))
                    if bad_ids is None:
                        bad_ids = set(allclinical_df.loc[days_between > THRESHOLD_IN_DAYS, 'id'].values)
                    else:
                        bad_ids = bad_ids.union(set(allclinical_df.loc[days_between > THRESHOLD_IN_DAYS, 'id'].values))
                    
            visits_to_bad_ids[visit_substring] = bad_ids
            print("Total number of IDs filtered out for visit: %i/%i" % (len(bad_ids), len(allclinical_df)))
        self.visits_too_far_from_xray_screening = visits_to_bad_ids

        # apply the filter to every processed dataframe that is broken down by visit.
        for k in self.processed_dataframes:
            if 'visit' in self.processed_dataframes[k].columns:
                rows_to_filter_out = None
                for visit in self.visits_too_far_from_xray_screening:
                    # a row is bad if its id was flagged for this visit AND the row belongs to this visit.
                    bad_rows_for_visit = (self.processed_dataframes[k]['id'].map(lambda x:x in self.visits_too_far_from_xray_screening[visit]) & 
                                          (self.processed_dataframes[k]['visit'] == CLINICAL_WAVES_TO_FOLLOWUP[visit]))
                    if rows_to_filter_out is None:
                        rows_to_filter_out = bad_rows_for_visit
                    else:
                        rows_to_filter_out = rows_to_filter_out | bad_rows_for_visit
                self.processed_dataframes[k] = self.processed_dataframes[k].loc[~rows_to_filter_out]
                print("For dataframe %s, filtered out %i/%i rows as too far from x-ray date" % (k, rows_to_filter_out.sum(), len(rows_to_filter_out)))
 327 | 
 328 |     def make_drinking_and_smoking_dataframe(self):
 329 |         """
 330 |         Risk factors at baseline. 
 331 |         """
 332 |         df = copy.deepcopy(self.original_dataframes['allclinical00'])
 333 |         
 334 |         # cigarette smoking. 
 335 |         df['cigarette_smoker'] = df['v00smoker']
 336 |         df.loc[df['cigarette_smoker'] == '3: Current, but never regular', 'cigarette_smoker'] = '1: Current'
 337 |         df.loc[df['cigarette_smoker'] == self.missing_data_val, 'cigarette_smoker'] = None
 338 |         print('Cigarette smoker: ', Counter(df['cigarette_smoker']))
 339 |         
 340 |         # drinks per week
 341 |         df['drinks_per_week'] = df['v00drnkamt']
 342 |         df.loc[df['drinks_per_week'] == self.missing_data_val, 'drinks_per_week'] = None
 343 |         print('Drinks per week: ', Counter(df['drinks_per_week']))
 344 |         
 345 |         self.processed_dataframes['drinking_and_smoking'] = df[['id', 'drinks_per_week', 'cigarette_smoker']]
 346 | 
    def make_medical_history_dataframe(self):
        """
        Build a per-(id, visit) table of cumulative medical-history indicators and store it
        under self.processed_dataframes['medical_history'].

        Used to replicate David's regressions as a sanity check, but not actually for any analysis in the paper. 
        
        Currently someone is defined as a 1 if they report having a disease prior to the timepoint
        Defined as missing if they are missing disease data at baseline and don't report having it subsequently. 
        Defined as false otherwise. 
        
        Not entirely sure this is the right way to do this. There's a lot of missing data for RA at baseline. Regarding RA: people are supposed to be excluded if they have it for sure. But I guess v00ra may or may not indicate RA, as defined by the study -- perhaps they think some people are giving unreliable answers, and that accounts for the missing data? 

        "Participants who report that a doctor has told them they have RA, SLE, psoriatic arthritis, ankylosing spondylitis or another inflammatory arthritis will be asked about use of specific medications that are used primarily for RA and other forms of inflammatory arthritis: e.g. gold, methotrexate, etanercept, infliximab, leflunamide, plaquenil, etc. If the person has ever used any of these medications, they will be excluded. If the participant reports having RA or inflammatory arthritis but none of these medications have been used, they will be asked about symptoms of RA and excluded if the responses are suggestive of RA"

        This includes a couple of other covariates David actually doesn't use in his regression. 
        """
        print("\n\n***Making dataframe of medical history.")
        all_dfs = []
        medical_conditions = ['hrtat', 'hrtfail', 'bypleg', 'stroke', 'asthma', 'lung', 
                                        'ulcer', 'diab', 'kidfxn', 'ra', 'polyrh', 'livdam', 'cancer']
        
        # we omit ALZDZ even though it's in David's script because it doesn't appear to be in our data. 
        
        all_ids = list(self.original_dataframes['allclinical00']['id'])
        # condition -> ids that have reported the condition at any visit processed so far (cumulative).
        has_disease = {}
        # condition -> ids whose baseline (visit 00) answer is the missing-data value.
        nas_at_baseline = {}
        for condition in medical_conditions:
            has_disease[condition] = set([])
            nas_at_baseline[condition] = set([])
        for visit in WAVES_WE_ARE_USING:
            df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit])
            for condition in medical_conditions:
                if visit == '00':
                    # baseline answers are strings ('1: Yes' / '0: No' / missing-data value).
                    has_disease_idxs = df['v%s%s' % (visit, condition)] == '1: Yes'
                    # validate_col presumably checks that the column only contains the listed values.
                    self.validate_col(df['v%s%s' % (visit, condition)], ['1: Yes', '0: No', self.missing_data_val])
                    nas_at_baseline_idxs = df['v%s%s' % (visit, condition)] == self.missing_data_val
                    nas_at_baseline[condition] = set(df.loc[nas_at_baseline_idxs, 'id'])
                    print('Proportion missing data for %-10s at visit 00: %2.3f' % (condition, nas_at_baseline_idxs.mean()))
                elif visit in ['03', '06']:
                    # at these visits the answers are numeric (0 / 1), with nulls for missing data.
                    has_disease_idxs = df['v%s%s' % (visit, condition)] == 1.0
                    self.validate_col(df['v%s%s' % (visit, condition)], [0, 1])
                    print("Proportion missing data for %-10s at visit %s: %2.3f" % (condition, visit, pd.isnull(df['v%s%s' % (visit, condition)]).mean()))
                else:
                    # unfortunately, don't appear to have data for these visits. 
                    continue
                has_disease_ids = set(df.loc[has_disease_idxs, 'id'])
                has_disease[condition] = has_disease[condition].union(has_disease_ids)
                    
            # one row per baseline id for this visit, using the cumulative sets built so far.
            df_for_visit = pd.DataFrame({'id':all_ids, 'visit':CLINICAL_WAVES_TO_FOLLOWUP[visit]})
            for condition in medical_conditions:
                has_disease_idxs = df_for_visit['id'].map(lambda x:x in has_disease[condition])
                df_for_visit[condition] = has_disease_idxs.values * 1.
                nas_at_baseline_idxs = df_for_visit['id'].map(lambda x:x in nas_at_baseline[condition])
                # missing at baseline and never reported since -> missing rather than 0.
                df_for_visit.loc[nas_at_baseline_idxs & (~has_disease_idxs), condition] = None
            all_dfs.append(df_for_visit)
        combined_df = pd.concat(all_dfs)
        combined_df.index = range(len(combined_df))
        print(combined_df.groupby('visit').mean())
        self.processed_dataframes['medical_history'] = combined_df
 404 | 
    def make_previous_fracture_or_fall_dataframe(self):
        """
        Build a per-(id, visit) table of falls and fractures and store it under
        self.processed_dataframes['fractures_and_falls'].

        Output columns: id, visit, fell_in_last_12_months, fractured_bone, fractured_hip,
        fractured_spine.

        Fractures are cumulatively defined: currently someone is defined as a 1 if they report having a fracture prior to the timepoint. 
        Defined as missing if they are missing data at baseline and don't report having it subsequently. 
        Defined as false otherwise. 
        
        Falls occur in the last 12 months and are thus not cumulatively defined. 
        """
        print("Making fracture and fall dataframe!")
        all_ids = list(self.original_dataframes['allclinical00']['id'])
        # condition -> ids that reported the fracture at any visit processed so far (cumulative).
        have_fracture = {}
        # condition -> ids whose baseline (visit 00) answer is the missing-data value.
        nas_at_baseline = {}
        all_dfs = []
        for condition in ['fractured_bone', 'fractured_hip', 'fractured_spine']:
            have_fracture[condition] = set([])
            nas_at_baseline[condition] = set([])
            
        for visit in WAVES_WE_ARE_USING:
            # get the DF we need data from
            df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit])
            
            # construct df for visit. 
            df_for_visit = pd.DataFrame({'id':all_ids})
            df_for_visit['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
            
            # Do falls. This is different from fractures because it's non-cumulative. 
            fall_col = 'v%sfall' % visit
            if visit in ['00', '01']:
                # visits 00 and 01 use string answers; later visits use numeric 0 / 1 with nulls.
                # validate_col presumably checks that the column only contains the listed values.
                self.validate_col(df[fall_col], ['1: Yes', '0: No', self.missing_data_val])
                fell_ids = set(df.loc[df[fall_col] == '1: Yes', 'id'].values)
                fall_missing_data_ids = set(df.loc[df[fall_col] == self.missing_data_val, 'id'].values)
            else:
                fell_ids = set(df.loc[df[fall_col] == 1.0, 'id'].values)
                self.validate_col(df[fall_col], [0, 1])
                fall_missing_data_ids = set(df.loc[pd.isnull(df[fall_col]), 'id'].values)
            df_for_visit['fell_in_last_12_months'] = df_for_visit['id'].map(lambda x:x in fell_ids)
            df_for_visit.loc[df_for_visit['id'].map(lambda x:x in fall_missing_data_ids), 'fell_in_last_12_months'] = None
            
            
            # Do fractures. 
            # ids reporting each fracture type at this visit only; merged into have_fracture below.
            got_fracture_at_timepoint = {}
            for condition in have_fracture.keys():
                # default to "nobody", so conditions with no data at this visit add nothing.
                got_fracture_at_timepoint[condition] = set([])
                if condition == 'fractured_bone':

                    # column name differs between baseline and followup visits.
                    if visit == '00':
                        col = 'v00bonefx'
                    else:
                        col = 'v%sbonfx' % visit
                    if visit in ['01', '00']:
                        got_fracture_at_timepoint[condition] = df.loc[df[col] == '1: Yes', 'id'].values
                        self.validate_col(df[col], ['1: Yes', '0: No', self.missing_data_val])
                    else:
                        got_fracture_at_timepoint[condition] = df.loc[df[col] == 1.0, 'id'].values
                        self.validate_col(df[col], [0, 1])
                    if visit == '00':
                        nas_at_baseline[condition] = df.loc[df[col] == self.missing_data_val, 'id'].values
                elif condition == 'fractured_hip':
                    if visit == '00':
                        col = 'v00hipfx'
                        got_fracture_at_timepoint[condition] = df.loc[df[col] == '1: Yes', 'id'].values
                        nas_at_baseline[condition] = df.loc[df[col] == self.missing_data_val, 'id'].values
                        self.validate_col(df[col], ['1: Yes', '0: No', self.missing_data_val])
                    else:
                        # can't find hip fracture data at subsequent timepoints. 
                        # (moves on to the next condition; the empty default set above is kept.)
                        continue
                elif condition == 'fractured_spine':
                    if visit == '00':
                        col = 'v00spnfx' 
                    else:
                        col = 'v%sbonfx6' % visit
                    if visit in ['01', '00']:
                        got_fracture_at_timepoint[condition] = df.loc[df[col] == '1: Yes', 'id'].values
                        self.validate_col(df[col], ['1: Yes', '0: No', self.missing_data_val])
                    else:
                        got_fracture_at_timepoint[condition] = df.loc[df[col] == 1.0, 'id'].values
                        self.validate_col(df[col], [0, 1])
                    if visit == '00':
                        nas_at_baseline[condition] = df.loc[df[col] == self.missing_data_val, 'id'].values
                else:
                    raise Exception("not a valid disease")
                    
            for condition in have_fracture.keys():
                # fold this visit's reports into the cumulative set, then fill in the visit's column.
                have_fracture[condition] = have_fracture[condition].union(got_fracture_at_timepoint[condition])
                df_for_visit[condition] = df_for_visit['id'].map(lambda x:x in have_fracture[condition])
                na_idxs = df_for_visit['id'].map(lambda x:x in nas_at_baseline[condition] )
                # missing at baseline and no fracture reported so far -> missing rather than False.
                df_for_visit.loc[na_idxs & (~df_for_visit[condition]), condition] = None
                
            
            all_dfs.append(df_for_visit)
        combined_df = pd.concat(all_dfs)
        combined_df.index = range(len(combined_df))
        print("Average values by visit")
        print(combined_df[[a for a in combined_df.columns if a != 'id']].groupby('visit').mean())
        print("NAs by visit")
        print(combined_df[[a for a in combined_df.columns if a != 'id']].groupby('visit').agg(lambda x:np.mean(pd.isnull(x))))
        self.processed_dataframes['fractures_and_falls'] = combined_df  
 502 | 
 503 |     def make_400m_walk_dataframe(self):
 504 |         """
 505 |         Stats about how quickly they can walk. Only have data for three timepoints.
 506 |         """
 507 |         walk_cols = ['400mtr', '400excl', '400mcmp', '400mtim']
 508 |         walk_df = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical', 
 509 |                                                             columns_to_subset_on=['id'] + walk_cols, 
 510 |                                                             visit_numbers_to_skip=['01', '05', '07', '08', '09','10', '11'])
 511 |         ids = sorted(list(set(walk_df['id'])))
 512 |         
 513 |         print(Counter(walk_df['400excl'].dropna()))
 514 |         print(Counter(walk_df['400mcmp'].dropna()))
 515 |         walk_df['400excl'] = walk_df['400excl'].map(lambda x:str(x) not in ['0.0', '0: Not excluded'])
 516 |         walk_df['400mcmp'] = walk_df['400mcmp'].map(lambda x:str(x) in ['1.0', '1: Completed test without stopping'])
 517 | 
 518 |         print("After processing")
 519 |         print(Counter(walk_df['400excl'].dropna()))
 520 |         print(Counter(walk_df['400mcmp'].dropna()))
 521 |         for c in walk_df.columns:
 522 |             assert (walk_df[c].astype(str) == self.missing_data_val).sum() == 0
 523 |         print(walk_df.head())
 524 | 
 525 | 
 526 |         # Add timepoints for '01' and '05' for consistency with other processing (just fill out other columns with None). 
 527 |         for timepoint in ['01', '05']:
 528 |             timepoint_df = pd.DataFrame({'id':ids, 'visit':CLINICAL_WAVES_TO_FOLLOWUP[timepoint]})
 529 |             for col in walk_cols:
 530 |                 timepoint_df[col] = None
 531 |             timepoint_df = timepoint_df[walk_df.columns]
 532 |             walk_df = pd.concat([walk_df, timepoint_df])
 533 |         self.processed_dataframes['400m_walk'] = walk_df
 534 | 
 535 | 
 536 | 
 537 |     def make_redundant_knee_xray_variable_dataframe(self):
 538 |         """
 539 |         A couple extra variables that Sendhil noticed at baseline and wanted to pull just in case. 
 540 |         """
 541 |         cols = ['P01SV%sKOST', 'P01SV%sKJSL', 'P01SV%sKJSM']
 542 |         new_col_names = ['knee_osteophytes', 
 543 |                          'knee_lateral_joint_space_narrowing', 
 544 |                          'knee_medial_joint_space_narrowing']
 545 |                          
 546 |         cols = [col.lower() for col in cols]
 547 |         left_cols = [col % 'l' for col in cols]
 548 |         right_cols = [col % 'r' for col in cols]
 549 |         
 550 |         left_df = self.original_dataframes['allclinical00'][['id'] + left_cols].copy()
 551 |         right_df = self.original_dataframes['allclinical00'][['id'] + right_cols].copy()
 552 |         
 553 |         left_df.columns = ['id'] + new_col_names
 554 |         right_df.columns = ['id'] + new_col_names
 555 |                              
 556 |         left_df['side'] = 'left'
 557 |         right_df['side'] = 'right'
 558 |         
 559 |         redundant_knee_xray_clinical_features = pd.concat([left_df, right_df])
 560 |         redundant_knee_xray_clinical_features.index = range(len(redundant_knee_xray_clinical_features))
 561 |         for c in new_col_names:
 562 |             if c == 'id':
 563 |                 continue
 564 |             print(c)
 565 |             assert pd.isnull(redundant_knee_xray_clinical_features[c]).sum() == 0
 566 |             redundant_knee_xray_clinical_features.loc[
 567 |                 redundant_knee_xray_clinical_features[c] == self.missing_data_val, 
 568 |                 c] = None
 569 |             print(redundant_knee_xray_clinical_features[c].value_counts())
 570 |             print("Missing data fraction: %2.3f" % pd.isnull(redundant_knee_xray_clinical_features[c]).mean())
 571 |         
 572 |         self.processed_dataframes['redundant_knee_xray_clinical_features'] = redundant_knee_xray_clinical_features
 573 |     def make_dominant_leg_dataframe(self):
 574 |         """
 575 |         Checked. 
 576 |         Don’t use timepoint info (ie, we define this using allclinical00 only) because lots of missing data at 
 577 |         subsequent timepoints and seems like there are causality problems.  
 578 |         """
 579 |         print("\n\n***Making dominant leg dataframe")
 580 |         right_leg_df = copy.deepcopy(self.original_dataframes['allclinical00'][['id', 'v00kikball']])
 581 |         right_leg_df.columns = ['id', 'dominant_leg']
 582 |         missing_data_idxs = (right_leg_df['dominant_leg'] == self.missing_data_val).values
 583 |         left_leg_df = copy.deepcopy(right_leg_df)
 584 |         
 585 |         right_leg_df['dominant_leg'] = right_leg_df['dominant_leg'].map(lambda x:'right' in x.lower())
 586 |         left_leg_df['dominant_leg'] = left_leg_df['dominant_leg'].map(lambda x:'left' in x.lower())
 587 |         
 588 |         left_leg_df.loc[missing_data_idxs, 'dominant_leg'] = None
 589 |         right_leg_df.loc[missing_data_idxs, 'dominant_leg'] = None
 590 |         
 591 |         left_leg_df['side'] = 'left'
 592 |         right_leg_df['side'] = 'right'
 593 |         
 594 |         combined_df = pd.concat([left_leg_df, right_leg_df])
 595 |         combined_df.index = range(len(combined_df))
 596 |         
 597 |         print(combined_df[['side', 'dominant_leg']].groupby('side').agg(['mean', 'size']))
 598 |         print("Missing data: %2.3f" % pd.isnull(combined_df['dominant_leg']).mean())
 599 |         
 600 |         self.processed_dataframes['dominant_leg'] = combined_df
 601 | 
 602 |     def make_bmi_dataframe(self):
 603 |         """
 604 |         Computes current and max BMI as categorical variables. Only uses baseline numbers. 
 605 |         Checked. 
 606 |         """
 607 |         print("\n\nComputing current amd max BMI.")
 608 | 
 609 |         current_weight_col = 'p01weight'
 610 |         max_weight_col = 'v00wtmaxkg'
 611 |         current_height_col = 'p01height'
 612 |         desired_cols = ['id'] + [current_weight_col, max_weight_col, current_height_col]
 613 |         bmi_df = copy.deepcopy(self.original_dataframes['allclinical00'][desired_cols])
 614 |         
 615 |         bmi_df['current_bmi'] = bmi_df[current_weight_col] / ((bmi_df[current_height_col] / 1000.) ** 2)
 616 |         bmi_df['max_bmi'] = bmi_df[max_weight_col] / ((bmi_df[current_height_col] / 1000.) ** 2)
 617 |         bmi_df = bmi_df[['id', 'current_bmi', 'max_bmi']]    
 618 |         def map_bmi_to_david_cats(x):
 619 |             if x < 18.5:
 620 |                 return '<18.5'
 621 |             elif x < 25:
 622 |                 return '18.5-25'
 623 |             elif x < 30:
 624 |                 return '25-30'
 625 |             elif x < 35:
 626 |                 return '30-35'
 627 |             elif x >= 35:
 628 |                 return '>=35'
 629 |             else:
 630 |                 return None
 631 |             
 632 |         bmi_not_nan = (~pd.isnull(bmi_df['current_bmi'])) & (~pd.isnull(bmi_df['max_bmi']))
 633 |         bmi_max_smaller_than_current = bmi_not_nan & (bmi_df['current_bmi'] > bmi_df['max_bmi'])
 634 |         print('Warning: proportion %2.3f of rows have current BMI > max BMI. Setting max to current.' % 
 635 |               bmi_max_smaller_than_current.mean()) # this is likely caused by fact that max BMI is self-reported, while current BMI I assume is weighed at the site. 
 636 |         bmi_df.loc[bmi_max_smaller_than_current, 'max_bmi'] = bmi_df.loc[bmi_max_smaller_than_current, 'current_bmi'].values
 637 |         assert (bmi_not_nan & (bmi_df['current_bmi'] > bmi_df['max_bmi'])).sum() == 0
 638 | 
 639 |         print(bmi_df[['current_bmi', 'max_bmi']].describe())
 640 |         bmi_df['current_bmi'] = bmi_df['current_bmi'].map(map_bmi_to_david_cats)
 641 |         bmi_df['max_bmi'] = bmi_df['max_bmi'].map(map_bmi_to_david_cats)
 642 |         
 643 |         print('Counts of values for current BMI are', Counter(bmi_df['current_bmi']))
 644 |         print('Counts of values for max BMI are', Counter(bmi_df['max_bmi']))
 645 |         self.processed_dataframes['bmi'] = bmi_df
 646 | 
 647 |     def make_previous_knee_replacement_dataframe(self):
 648 |         print("\n\nComputing previous knee replacements/arthroplasties")
 649 |         # "ever have replacement where all or part of joint was replaced"
 650 |         self.processed_dataframes['knee_replacement'] = self.make_previous_injury_or_surgery_dataframe(
 651 |             baseline_substring='krs', 
 652 |             followup_substring='krs',
 653 |             col_name='knee_replacement', 
 654 |             set_missing_baseline_to_0=True, 
 655 |             waves_to_skip='06'
 656 |             )
 657 |         df_to_concat = self.processed_dataframes['knee_replacement'].loc[self.processed_dataframes['knee_replacement']['visit'] == '36 month follow-up'].copy()
 658 |         df_to_concat['visit'] = '48 month follow-up'
 659 |         self.processed_dataframes['knee_replacement'] = pd.concat([self.processed_dataframes['knee_replacement'], df_to_concat])
 660 |         self.processed_dataframes['knee_replacement'].index = range(len(self.processed_dataframes['knee_replacement']))
 661 | 
 662 |     def make_previous_injury_dataframe(self):
 663 |         print("\n\nComputing previous injuries to knees!")
 664 |         self.processed_dataframes['knee_injury'] = self.make_previous_injury_or_surgery_dataframe(
 665 |             baseline_substring='inj', 
 666 |             followup_substring='inj',
 667 |             col_name='knee_injury')
 668 | 
 669 |     def make_previous_surgery_dataframe(self):
 670 |         print("\n\nComputing previous surgeries to knees!")
 671 |         self.processed_dataframes['knee_surgery'] = self.make_previous_injury_or_surgery_dataframe(
 672 |             baseline_substring='ksurg', 
 673 |             followup_substring='ksrg',
 674 |             col_name='knee_surgery')
 675 | 
 676 |     def make_age_dataframe(self):
 677 |         print("\n\n***Creating combined age dataframe")
 678 |         combined_df = []
 679 |         for visit in WAVES_WE_ARE_USING:
 680 |             age_df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit][['id', 'v%sage' % visit]])
 681 |             age_df.columns = ['id', 'age_at_visit']
 682 |             age_df['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
 683 |             combined_df.append(age_df)
 684 |         
 685 |         def convert_age_to_categorical_variable(age):
 686 |             assert not (age < 45)
 687 |             assert not (age > 85)
 688 |             if age < 50 and age >= 45:
 689 |                 return '45-49'
 690 |             if age < 55:
 691 |                 return '50-54'
 692 |             if age < 60:
 693 |                 return '55-59'
 694 |             if age < 65:
 695 |                 return '60-64'
 696 |             if age < 70:
 697 |                 return '65-69'
 698 |             if age < 75:
 699 |                 return '70-74'
 700 |             if age < 80:
 701 |                 return '75-79'
 702 |             if age < 85:
 703 |                 return '80-84'
 704 |             assert np.isnan(age)
 705 |             return None
 706 |             
 707 |         combined_df = pd.concat(combined_df)
 708 |         combined_df['age_at_visit'] = combined_df['age_at_visit'].map(convert_age_to_categorical_variable)
 709 |         print(Counter(combined_df['age_at_visit']))
 710 |         self.processed_dataframes['age_at_visit'] = combined_df
 711 | 
 712 |     def make_other_pain_dataframe(self, type_of_pain):
 713 |         """
 714 |         Helper method to make the combined pain dataframe. 
 715 |         Returns things as strings. 
 716 |         """
 717 |         assert type_of_pain in ['hip', 'back', 
 718 |                                 'foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']
 719 |         
 720 |         combined_df = []
 721 |         for visit in WAVES_WE_ARE_USING:        
 722 |             # first have to identify cols of interest. 
 723 |             if type_of_pain == 'hip':
 724 |                 if visit == '00':
 725 |                     cols_of_interest = ['p01hp%s12cv' % side for side in ['l', 'r']]
 726 |                 else:
 727 |                     cols_of_interest = ['v%shp%s12cv' % (visit, side) for side in ['l', 'r']]
 728 |                 col_names_to_use = ['id', 
 729 |                                     'left_hip_pain_more_than_half_of_days', 
 730 |                                     'right_hip_pain_more_than_half_of_days']
 731 |             elif type_of_pain == 'back':
 732 |                 if visit == '00':
 733 |                     cols_of_interest = ['p01bp30oft']
 734 |                 else: 
 735 |                     cols_of_interest = ['v%sbp30oft' % visit]
 736 |                 col_names_to_use = ['id', 'how_often_bothered_by_back_pain']
 737 |             elif type_of_pain in ['foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']:
 738 |                 pain_abbrv = type_of_pain[0]
 739 |                 if visit == '00':
 740 |                     cols_of_interest = ['p01ojpn%s%s' % (side, pain_abbrv) for side in ['l', 'r']]
 741 |                 else:
 742 |                     cols_of_interest = ['v%sojpn%s%s' % (visit, side, pain_abbrv) for side in ['l', 'r']]
 743 |                 col_names_to_use = ['id', 
 744 |                                     'left_%s_pain_more_than_half_of_days' % type_of_pain, 
 745 |                                     'right_%s_pain_more_than_half_of_days' % type_of_pain]
 746 |             else:
 747 |                 raise Exception("Your pain is invalid :(")
 748 |            
 749 |             # select columns. 
 750 |             pain_df = copy.deepcopy(self.original_dataframes['allclinical%s' % visit][['id'] + cols_of_interest])
 751 |             
 752 |             # do mapping. 
 753 |             if type_of_pain == 'hip':
 754 |                 if visit == '00' or visit == '01':
 755 |                     for col in cols_of_interest:
 756 |                         self.validate_col(pain_df[col], ['1: Yes', '0: No', self.missing_data_val])
 757 |                 else:
 758 |                     for col in cols_of_interest:
 759 |                         self.validate_col(pain_df[col], [0, 1])
 760 |                         pain_df[col] = pain_df[col].replace({np.nan:self.missing_data_val, 
 761 |                                                              1:'1: Yes',
 762 |                                                              0:'0: No'}).astype(str)
 763 |                 for col in cols_of_interest:
 764 |                     self.validate_col(pain_df[col], [self.missing_data_val, '1: Yes', '0: No'])
 765 | 
 766 |             elif type_of_pain == 'back':
 767 |                 if visit == '00' or visit == '01':
 768 |                     for col in cols_of_interest:
 769 |                         self.validate_col(pain_df[col], ['1: Some of the time', '0: Rarely', 
 770 |                             '2: Most of the time', '3: All of the time', self.missing_data_val])
 771 |                 else:
 772 |                     for col in cols_of_interest:
 773 |                         self.validate_col(pain_df[col], [0, 1, 2, 3])
 774 |                         pain_df[col] = pain_df[col].replace({1:'1: Some of the time', 
 775 |                                                              0:'0: Rarely', 
 776 |                                                              2:'2: Most of the time', 
 777 |                                                              3:'3: All of the time', 
 778 |                                                             np.nan:self.missing_data_val}).astype(str)
 779 |                 for col in cols_of_interest:
 780 |                     self.validate_col(pain_df[col], ['0: Rarely', '1: Some of the time', '2: Most of the time', '3: All of the time', self.missing_data_val])
 781 |                 
 782 |                 
 783 |             elif type_of_pain in ['foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']:
 784 |                 if visit == '00' or visit == '01':
 785 |                     for col in cols_of_interest:
 786 |                         self.validate_col(pain_df[col], ['1: Yes', '0: No', self.missing_data_val])
 787 |                 else:
 788 |                     for col in cols_of_interest:
 789 |                         self.validate_col(pain_df[col], [0, 1])
 790 |                         pain_df[col] = pain_df[col].replace({None:self.missing_data_val, 
 791 |                                                             1:'1: Yes'}).astype(str)
 792 |                 for col in cols_of_interest:        
 793 |                     self.validate_col(pain_df[col], [self.missing_data_val, '1: Yes'])
 794 |                 
 795 |             pain_df.columns = col_names_to_use
 796 |             pain_df['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
 797 |             combined_df.append(pain_df)
 798 | 
 799 |         combined_df = pd.concat(combined_df)
 800 |         combined_df.index = range(len(combined_df))
 801 | 
 802 |         # Set missing values to None for consistency with the rest of data processing. 
 803 |         for col in combined_df.columns:
 804 |             if col == 'visit' or col == 'id':
 805 |                 continue
 806 |             assert type(combined_df[col].iloc[0]) is str
 807 |             assert pd.isnull(pain_df[col]).sum() == 0
 808 |             print("Setting values of %s in column %s to None" % (self.missing_data_val, col))
 809 |             combined_df.loc[combined_df[col] == self.missing_data_val, col] = None
 810 | 
 811 |         return combined_df
 812 | 
 813 |     def make_nonstandard_interventions_dataframe(self):
 814 |         """
 815 |         Make dataframe of 0-1 indicators whether someone has had other interventions for pain 
 816 |         which are not standard in medical practice. 
 817 |         """
 818 |         print("Processing interventions data")
 819 |         interventions = ["V00ACUTCV", "V00ACUSCV", "V00CHELCV", "V00CHIRCV", 
 820 |                        "V00FOLKCV", "V00HOMECV", "V00MASSCV", "V00DIETCV", 
 821 |                        "V00VITMCV", "V00RUBCV", "V00CAPSNCV", "V00BRACCV", 
 822 |                        "V00YOGACV", "V00HERBCV", "V00RELACV", "V00SPIRCV", 
 823 |                        "V00OTHCAMC", "V00OTHCAM"]
 824 |         cols = ['id'] + [a.lower() for a in interventions]
 825 |         df = self.original_dataframes['allclinical00'][cols].copy()
 826 |         
 827 |         for c in df.columns:
 828 |             if c != 'id':
 829 |                 self.validate_col(df[c], ['0: No', '1: Yes', self.missing_data_val])
 830 |                 
 831 |                 nan_idxs = df[c].map(lambda x:x in self.missing_data_val).values
 832 |                 intervention_idxs = df[c] == '1: Yes'
 833 |                 df[c] = 0.
 834 |                 df.loc[intervention_idxs, c] = 1.
 835 |                 df.loc[nan_idxs, c] = None
 836 |         print("Missing data")
 837 |         print(df.agg(lambda x:np.mean(pd.isnull(x))))
 838 |         print("Fraction with other interventions")
 839 |         print(df.mean())
 840 | 
 841 |         self.processed_dataframes['nonstandard_interventions'] = df
 842 |         
 843 |     def make_medications_dataframe(self):
 844 |         """
 845 |         Make dataframe of 0-1 indicators whether someone is taking medication. 
 846 |         """
 847 |         print("Processing medications data")
 848 |         medications = ["V00RXACTM", "V00RXANALG", "V00RXASPRN", "V00RXBISPH", 
 849 |                        "V00RXCHOND", "V00RXCLCTN", "V00RXCLCXB", "V00RXCOX2", 
 850 |                        "V00RXFLUOR", "V00RXGLCSM", "V00RXIHYAL", "V00RXISTRD", 
 851 |                        "V00RXMSM", "V00RXNARC", "V00RXNSAID", "V00RXNTRAT", 
 852 |                        "V00RXOSTRD", "V00RXOTHAN", "V00RXRALOX", "V00RXRFCXB", 
 853 |                        "V00RXSALIC", "V00RXSAME", "V00RXTPRTD", "V00RXVIT_D", "V00RXVLCXB"]
 854 |         medications = [a.replace('V00', '').lower() for a in medications]
 855 |         med_df = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical', 
 856 |                                                                 columns_to_subset_on=['id'] + medications, 
 857 |                                                                 visit_numbers_to_skip=['07', '08', '09', '10', '11'])
 858 |         for c in med_df.columns:
 859 |             if c != 'id' and c != 'visit':
 860 |                 self.validate_col(med_df[c].map(lambda x:str(x)), ['1.0', '0.0', 
 861 |                                                                    '0: Not used in last 30 days', 
 862 |                                                                    '1: Used in last 30 days', 
 863 |                                                                    self.missing_data_val, 
 864 |                                                                    'nan'])
 865 |                 nan_idxs = med_df[c].map(lambda x:str(x) in [self.missing_data_val, 'nan']).values
 866 |                 took_idxs = med_df[c].map(lambda x:str(x) in ['1: Used in last 30 days', '1.0']).values
 867 |                 med_df[c] = 0.
 868 |                 med_df.loc[took_idxs, c] = 1.
 869 |                 med_df.loc[nan_idxs, c] = None
 870 |         print("Missing data")
 871 |         print(med_df.groupby('visit').agg(lambda x:np.mean(pd.isnull(x))))
 872 |         print("Fraction taking medication")
 873 |         print(med_df.groupby('visit').mean())
 874 |             
 875 |         self.processed_dataframes['medications'] = med_df
 876 |         
 877 |     def make_pain_dataframe_for_all_other_types_of_pain(self):
 878 |         print("\n\n\n***Creating dataframe for all other types of pain")
 879 |         for i, other_type_of_pain in enumerate(['hip', 'back', 
 880 |                                 'foot', 'ankle', 'shoulder', 'elbow', 'wrist', 'hand']):
 881 |             if i == 0:
 882 |                 combined_pain_df = self.make_other_pain_dataframe(other_type_of_pain)
 883 |                 original_len = len(combined_pain_df)
 884 |             else:
 885 |                 combined_pain_df = pd.merge(combined_pain_df, 
 886 |                                             self.make_other_pain_dataframe(other_type_of_pain), 
 887 |                                             how='inner', 
 888 |                                             on=['id', 'visit'])
 889 |                 assert len(combined_pain_df) == original_len
 890 |                 assert len(combined_pain_df[['id', 'visit']].drop_duplicates() == original_len)
 891 | 
 892 |         print("Missing data by timepoint")
 893 |         print(combined_pain_df.groupby('visit').agg(lambda x:np.mean(pd.isnull(x))))
 894 |                 
 895 |         self.processed_dataframes['other_pain'] = combined_pain_df
 896 | 
 897 |     def validate_col(self, col, expected_values):
 898 |         if not (col.dropna().map(lambda x:x not in expected_values).sum() == 0):
 899 |             print("Error: unexpected value in column. Expected values:")
 900 |             print(expected_values)
 901 |             print("Actual values")
 902 |             print(sorted(list(set(col.dropna()))))
 903 |             assert False
 904 | 
 905 |     def make_previous_injury_or_surgery_dataframe(self, baseline_substring, followup_substring, col_name, set_missing_baseline_to_0=False, waves_to_skip=None):
 906 |         """
 907 |         While the code in this method refers to "injury", we actually use it to define both injuries + surgeries. 
 908 |         baseline_substring identifies the column used in allclinical00
 909 |         followup_substring identifies the column in subsequent clinical dataframes
 910 |         col_name is the name we want to give the column. 
 911 | 
 912 |         Set someone to True if they report an injury at any previous timepoint. 
 913 |         Set them to NA if they don't report an injury and are missing data for the first timepoint 
 914 |         Set them to False otherwise. 
 915 |         (some followup people are missing data, so we might have a few false negatives who didn't report an injury, but it should be small). 
 916 |         Checked. 
 917 |         """
 918 |         
 919 |         ids_who_report_injury_at_any_timepoint = {'left':set([]), 'right':set([])}
 920 |         ids_with_nas_at_first_timepoint = {'left':set([]), 'right':set([])}
 921 |         all_dfs = []
 922 |         if waves_to_skip is None:
 923 |             waves_to_skip = []
 924 | 
 925 |         for visit in WAVES_WE_ARE_USING:
 926 |             if visit in waves_to_skip:
 927 |                 continue
 928 |             if visit == '00':
 929 |                 left_col = 'p01%sl' % baseline_substring
 930 |                 right_col = 'p01%sr' % baseline_substring
 931 |             else:
 932 |                 left_col = 'v%s%sl12' % (visit, followup_substring)
 933 |                 right_col = 'v%s%sr12' % (visit, followup_substring)
 934 |             df_to_use = copy.deepcopy(self.original_dataframes['allclinical%s' % visit][['id', left_col, right_col]])
 935 |             df_to_use.columns = ['id', 'left_side', 'right_side']
 936 |             assert len(set(df_to_use['id'])) == len(df_to_use)
 937 |             df_to_use['visit'] = CLINICAL_WAVES_TO_FOLLOWUP[visit]
 938 |             if visit == '00':
 939 |                 all_ids = set(df_to_use['id'])
 940 |             else:
 941 |                 assert set(df_to_use['id']) == all_ids
 942 | 
 943 |             dfs_by_knee = {}
 944 |             for side in ['left', 'right']:
 945 |                 dfs_by_knee[side] = copy.deepcopy(df_to_use[['id', 'visit', '%s_side' % side]])
 946 |                 dfs_by_knee[side].columns = ['id', 'visit', col_name]
 947 |                 dfs_by_knee[side]['side'] = side
 948 |                 
 949 |                 # map to bools.
 950 |                 if visit == '00' or visit == '01':
 951 |                     self.validate_col(dfs_by_knee[side][col_name], ['1: Yes', '0: No', self.missing_data_val])
 952 |                     knee_injury_at_this_timepoint = set(dfs_by_knee[side]['id'].loc[
 953 |                         dfs_by_knee[side][col_name] == '1: Yes'])
 954 |                     
 955 |                 else:
 956 |                     knee_injury_at_this_timepoint = set(dfs_by_knee[side]['id'].loc[
 957 |                         dfs_by_knee[side][col_name] == 1])
 958 |                     self.validate_col(dfs_by_knee[side][col_name], [0, 1])
 959 |                 if visit == '00':
 960 |                     na_ids = set(dfs_by_knee[side]['id'].loc[dfs_by_knee[side][col_name] == self.missing_data_val])
 961 |                     if set_missing_baseline_to_0:
 962 |                         ids_with_nas_at_first_timepoint[side] = set([])
 963 |                         print("Warning: setting %i missing datapoints for baseline to 0" % len(na_ids))
 964 |                     else:
 965 |                         ids_with_nas_at_first_timepoint[side] = na_ids
 966 |                 
 967 |                 # update list of people who report an injury. 
 968 |                 ids_who_report_injury_at_any_timepoint[side] = ids_who_report_injury_at_any_timepoint[side].union(knee_injury_at_this_timepoint)
 969 |                 
 970 |                 # set people to True if report injury at any timepoint. 
 971 |                 dfs_by_knee[side][col_name] = dfs_by_knee[side]['id'].map(lambda x:x in ids_who_report_injury_at_any_timepoint[side])
 972 |                 # set people to NA if False and missing data at initial timepoint 
 973 |                 dfs_by_knee[side].loc[dfs_by_knee[side]['id'].map(lambda x:(x in ids_with_nas_at_first_timepoint[side]) & 
 974 |                                                                   (x not in ids_who_report_injury_at_any_timepoint[side])),
 975 |                                       col_name] = None
 976 |                 
 977 |                 
 978 |                 dfs_by_knee[side].index = range(len(dfs_by_knee[side]))
 979 |                 all_dfs.append(dfs_by_knee[side].copy())
 980 |                 print("At timepoint %s, rate for %s leg: %i=1, %i=0, %i are missing" % (CLINICAL_WAVES_TO_FOLLOWUP[visit],
 981 |                                                                                       side, 
 982 |                                                                                       (dfs_by_knee[side][col_name] == 1).sum(), 
 983 |                                                                                       (dfs_by_knee[side][col_name] == 0).sum(),
 984 |                                                                                       pd.isnull(dfs_by_knee[side][col_name]).sum()))
 985 |             
 986 |         combined_df = pd.concat(all_dfs)
 987 |         combined_df.index = range(len(combined_df))
 988 |         assert len(combined_df[['id', 'visit', 'side']].drop_duplicates()) == len(combined_df)
 989 |         print("Average values")
 990 |         print(combined_df[[col_name, 'visit', 'side']].groupby(['side', 'visit']).agg(['mean', 'size']))
 991 |         print("Missing data")
 992 |         print(combined_df[[col_name, 'visit', 'side']].groupby(['side', 'visit']).agg(lambda x:np.mean(pd.isnull(x))))
 993 | 
 994 |         return combined_df
 995 | 
 996 |     def make_other_koos_subscores_dataframe(self):
 997 |         """
 998 |         Make dataframe of other Koos pain subscores. 
 999 |         Each row is one visit for one side for one id. 
1000 |         Other koos_symptoms_score is knee specific. Everything else is the same for both. 
1001 |         """
1002 |         print("Making other koos subscores dataframe")
1003 |         
1004 |         base_cols = {'koosfsr':'koos_function_score', 
1005 |         'koosqol':'koos_quality_of_life_score', 
1006 |         'koosym':'koos_symptoms_score'}
1007 | 
1008 |         left_cols = copy.deepcopy(base_cols)
1009 |         right_cols = copy.deepcopy(base_cols)
1010 | 
1011 |         left_cols['koosyml'] = left_cols['koosym']
1012 |         right_cols['koosymr'] = right_cols['koosym']
1013 |         del left_cols['koosym']
1014 |         del right_cols['koosym']
1015 | 
1016 |         dfs_to_concat = []
1017 |         for side in ['left', 'right']:
1018 |             if side == 'left':
1019 |                 cols_to_use = left_cols
1020 |             else:
1021 |                 cols_to_use = right_cols
1022 | 
1023 |             old_col_names = sorted(cols_to_use.keys())
1024 |             new_col_names = [cols_to_use[a] for a in old_col_names]
1025 |             all_koos_scores_for_side = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical', 
1026 |                 columns_to_subset_on=['id'] + old_col_names)
1027 |             assert list(all_koos_scores_for_side.columns) == ['id'] + old_col_names + ['visit']
1028 |             all_koos_scores_for_side.columns = ['id'] + new_col_names + ['visit']
1029 |             all_koos_scores_for_side['side'] = side
1030 |             dfs_to_concat.append(all_koos_scores_for_side)
1031 |         final_df = pd.concat(dfs_to_concat)
1032 |         final_df.index = range(len(final_df))
1033 |         
1034 |         def map_blank_strings_to_none(x):
1035 |             # small helper method: empty strings become none, otherwise cast to float. 
1036 |             if len(str(x).strip()) == 0:
1037 |                 return None
1038 |             return float(x)
1039 | 
1040 |         for c in sorted(base_cols.values()):
1041 |             final_df[c] = final_df[c].map(map_blank_strings_to_none)
1042 | 
1043 |         print('means by column and visit')
1044 |         print(final_df[['visit', 'side'] + list(base_cols.values())].groupby(['visit', 'side']).mean())
1045 |         for c in base_cols.values():
1046 |             print('missing data fraction for %s is %2.3f' % (c, pd.isnull(final_df[c]).mean()))
1047 |         for c1 in base_cols.values():
1048 |             for c2 in base_cols.values():
1049 |                 if c1 > c2:
1050 |                     good_idxs = ~(pd.isnull(final_df[c1]) | pd.isnull(final_df[c2]))
1051 |                     print("Correlation between %s and %s: %2.3f" % (
1052 |                         c1, 
1053 |                         c2, 
1054 |                         pearsonr(final_df.loc[good_idxs, c1], final_df.loc[good_idxs, c2])[0]))
1055 | 
1056 | 
1057 |         self.processed_dataframes['other_koos_subscores'] = final_df
1058 | 
1059 | 
1060 | 
1061 |     def make_knee_pain_dataframe(self):
1062 |         """
1063 |         Extract Koos and Womac knee pain scores 
1064 |         Koos scores are transformed to a 0–100 scale, with zero representing extreme knee problems and 100 representing no knee problems as is common in orthopaedic assessment scales and generic measures. 
1065 |         http://www.koos.nu/koosfaq.html
1066 |         Womac scores: Higher scores on the WOMAC indicate worse pain, stiffness, and functional limitations. 
1067 |         https://www.physio-pedia.com/WOMAC_Osteoarthritis_Index
1068 |         Checked. 
1069 |         """
1070 |         all_left_knee_pain_scores = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical', 
1071 |             columns_to_subset_on=['id', 'kooskpl', 'womkpl'])
1072 |         assert list(all_left_knee_pain_scores.columns) == ['id', 'kooskpl', 'womkpl', 'visit']
1073 |         all_left_knee_pain_scores.columns = ['id', 'koos_pain_subscore', 'womac_pain_subscore', 'visit']
1074 |         all_left_knee_pain_scores['side'] = 'left'
1075 | 
1076 |         all_right_knee_pain_scores = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring='allclinical', 
1077 |             columns_to_subset_on=['id', 'kooskpr', 'womkpr'])
1078 |         assert list(all_right_knee_pain_scores.columns) == ['id', 'kooskpr', 'womkpr', 'visit']
1079 |         all_right_knee_pain_scores.columns = ['id', 'koos_pain_subscore', 'womac_pain_subscore', 'visit']
1080 |         all_right_knee_pain_scores['side'] = 'right'
1081 |         all_knee_pain_scores = pd.concat([all_left_knee_pain_scores, all_right_knee_pain_scores])
1082 |         for k in ['koos_pain_subscore', 'womac_pain_subscore']:
1083 |             all_knee_pain_scores[k] = all_knee_pain_scores[k].map(lambda x:float(x) if len(str(x).strip()) > 0 else None)
1084 |         print("Number of knee pain scores: %i" % len(all_knee_pain_scores))
1085 |         print("Womac scores not missing data: %i; koos not missing data: %i" % (len(all_knee_pain_scores['koos_pain_subscore'].dropna()), 
1086 |             len(all_knee_pain_scores['womac_pain_subscore'].dropna())))
1087 |         for timepoint in sorted(list(set(all_knee_pain_scores['visit']))):
1088 |             df_for_timepoint = copy.deepcopy(all_knee_pain_scores.loc[all_knee_pain_scores['visit'] == timepoint])
1089 |             print("Timepoint %s, fraction womac scores complete: %2.3f; koos scores complete %2.3f" % (timepoint, 
1090 |                 1 - pd.isnull(df_for_timepoint['womac_pain_subscore']).mean(), 
1091 |                 1 - pd.isnull(df_for_timepoint['koos_pain_subscore']).mean()))
1092 | 
1093 |         all_knee_pain_scores = all_knee_pain_scores.dropna()
1094 |         print("Number of knee pain scores not missing data: %i" % len(all_knee_pain_scores))
1095 |         print("Correlation between KOOS and WOMAC scores is %2.3f" % pearsonr(all_knee_pain_scores['koos_pain_subscore'], 
1096 |             all_knee_pain_scores['womac_pain_subscore'])[0])
1097 |         self.processed_dataframes['all_knee_pain_scores'] = all_knee_pain_scores
1098 | 
1099 |     def make_per_person_controls_dataframe(self):
1100 |         """
1101 |         Extract covariates which are person-specific (eg, income). 
1102 |         Checked.
1103 |         """
1104 |         print("\n***Making dataset of per-person controls.")
1105 |         missing_data_val = self.missing_data_val
1106 | 
1107 |         # Income, education, marital status. Each row is one person. 
1108 |         all_clinical00_d = copy.deepcopy(self.original_dataframes['allclinical00'][['id', 'v00income', 'v00edcv', 'v00maritst']])
1109 |         for c in ['v00income', 'v00edcv']:
1110 |             val_counts = Counter(all_clinical00_d[c])
1111 |             for val in sorted(val_counts.keys()):
1112 |                 print('%-50s %2.1f%%' % (val, 100.*val_counts[val] / len(all_clinical00_d)))
1113 |             missing_data_idxs = all_clinical00_d[c] == missing_data_val
1114 |             if c == 'v00edcv':
1115 |                 col_name = 'binarized_education_graduated_college'
1116 |                 all_clinical00_d[col_name] = (all_clinical00_d[c] >= '3: College graduate') * 1.
1117 |             elif c == 'v00income':
1118 |                 col_name = 'binarized_income_at_least_50k'
1119 |                 all_clinical00_d[col_name] = (all_clinical00_d[c] >= '4: $50K to < $100K') * 1.
1120 |             all_clinical00_d.loc[missing_data_idxs, col_name] = None
1121 |             all_clinical00_d.loc[missing_data_idxs, c] = None
1122 |             print("Binarizing into column %s with mean %2.3f and %2.3f missing data" % (col_name, 
1123 |                 all_clinical00_d[col_name].mean(), 
1124 |                 pd.isnull(all_clinical00_d[col_name]).mean()))
1125 | 
1126 |         all_clinical00_d.loc[all_clinical00_d['v00maritst'] == missing_data_val, 'v00maritst'] = None
1127 | 
1128 | 
1129 |         # Gender + race + site. 
1130 |         enrollees_path = os.path.join(BASE_NON_IMAGE_DATA_DIR, 'General_ASCII')
1131 |         self.load_all_text_files_in_directory(enrollees_path, datasets_to_skip=[])
1132 |         race_sex_site = copy.deepcopy(self.original_dataframes['enrollees'][['id', 'p02hisp', 'p02race', 'p02sex', 'v00site']])
1133 | 
1134 | 
1135 |         for c in race_sex_site.columns:
1136 |             if c == 'id':
1137 |                 continue
1138 |             missing_data_idxs = race_sex_site[c] == missing_data_val
1139 |             race_sex_site.loc[missing_data_idxs, c] = None
1140 | 
1141 |         race_sex_site['race_black'] = (race_sex_site['p02race'] == '2: Black or African American') * 1.
1142 |         race_sex_site.loc[pd.isnull(race_sex_site['p02race']), 'race_black'] = None
1143 |         print("Proportion of missing data for race (this will be dropped): %2.3f; proportion black: %2.3f" % 
1144 |             (pd.isnull(race_sex_site['race_black']).mean(), 
1145 |              race_sex_site['race_black'].mean()))
1146 | 
1147 |         assert len(race_sex_site) == TOTAL_PEOPLE
1148 |         assert len(all_clinical00_d) == TOTAL_PEOPLE
1149 |         assert len(set(race_sex_site['id'])) == len(race_sex_site)
1150 |         assert len(set(all_clinical00_d['id'])) == len(all_clinical00_d)
1151 |         assert sorted(list(race_sex_site['id'])) == sorted(list(all_clinical00_d['id']))
1152 | 
1153 |         d = pd.merge(race_sex_site, all_clinical00_d, on='id', how='inner')
1154 |         assert len(d) == TOTAL_PEOPLE
1155 |         assert len(set(d['id'])) == len(d)
1156 | 
1157 |         print("All columns in per-person dataframe")
1158 |         for c in d.columns:
1159 |             if c == 'id':
1160 |                 continue
1161 |             print("\nSummary stats for column %s" % c)
1162 |             print("Missing data: %2.1f%%" % (pd.isnull(d[c]).mean() * 100))
1163 |             val_counts = Counter(d[c].dropna())
1164 |             for val in sorted(val_counts.keys()):
1165 |                 print('%-50s %2.1f%%' % (val, 100.*val_counts[val] / len(d[c].dropna())))
1166 | 
1167 |         self.processed_dataframes['per_person_covariates'] = d
1168 | 
1169 |     def make_processed_mri_data(self):
1170 |         """
1171 |         Process MRI data, roughly following David's methodology. 
1172 |         Essentially, to get each processed column, we take the max of a bunch of raw columns, then threshold that max. (So the processed variable is binary.)
1173 | 
1174 |         Various data peculiarities: 
1175 |         1. Appears that most patients are actually lacking the MOAKS data. Asked David about this, seems fine. 
1176 |         2. "For pooling MOAKS readings from different reading projects please read the documentation for the kMRI_SQ_MOAKS_BICLxx datasets very carefully." Took another look, seems fine. 
1177 |         3. what about special values of 0.5 or -0.5? These values occur quite rarely. Verified that they don't change our results. 
1178 |         4. Asymmetry in which knees are rated (some projects only rated one knee...) -- this seems unavoidable. 
1179 |         """
1180 |         print("Processing MRI data as David did!")
1181 | 
1182 |         concatenated_mri = self.concatenate_dataframes_from_multiple_timepoints('kmri_sq_moaks_bicl')
1183 |         
1184 |         processed_cols = {'car11plusm':{'cols':['mcmfmc', 'mcmfmp', 'mcmtma', 'mcmtmc', 'mcmtmp'], 
1185 |                                      'thresh':1.1}, 
1186 |                           'car11plusl':{'cols':['mcmflc', 'mcmflp', 'mcmtla','mcmtlc','mcmtlp'], 
1187 |                                         'thresh':1.1},
1188 |                           'car11pluspf':{'cols':['mcmfma', 'mcmfla','mcmpm', 'mcmpl'], 
1189 |                                          'thresh':1.1}, 
1190 |                           'bml2plusm':{'cols':['mbmsfmc', 'mbmsfmp', 'mbmstma', 'mbmstmc', 'mbmstmp'], 
1191 |                                        'thresh':2.0}, 
1192 |                           'bml2plusl':{'cols':['mbmsflc', 'mbmsflp', 'mbmstla', 'mbmstlc', 'mbmstlp'], 
1193 |                                        'thresh':2.0},
1194 |                           'bml2pluspf':{'cols':['mbmsfma','mbmsfla','mbmspm','mbmspl'], 
1195 |                                        'thresh':2.0},
1196 |                           'mentearm':{'cols':['mmtma', 'mmtmb', 'mmtmp'], 
1197 |                                       'thresh':2.0},
1198 |                           'mentearl':{'cols':['mmtla', 'mmtlb', 'mmtlp'], 
1199 |                                       'thresh':2.0},
1200 |                           'menextm':{'cols':['mmxmm', 'mmxma'], 
1201 |                                       'thresh':2.0},
1202 |                           'menextl':{'cols':['mmxll', 'mmxla'], 
1203 |                                       'thresh':2.0}
1204 |                          }
1205 |         side_mappings = {'2: Left':'left', '1: Right':'right', 1:'right', 2:'left'}
1206 |         concatenated_mri['side'] = concatenated_mri['side'].map(lambda x:side_mappings[x])
1207 |         print('Side variable for MRI', Counter(concatenated_mri['side']))
1208 |         self.validate_col(concatenated_mri['side'], ['right', 'left'])
1209 | 
1210 |         # we have multiple readings for each knee. Sort by number of missing values, keep the duplicate with fewest missing values. 
1211 |         all_necessary_cols = []
1212 |         for col in processed_cols:
1213 |              all_necessary_cols += processed_cols[col]['cols']
1214 | 
1215 | 
1216 |         def map_mri_to_float(x):
1217 |             if x == self.missing_data_val:
1218 |                 return None
1219 |             if str(x) == 'nan':
1220 |                 return None
1221 |             if type(x) is float:
1222 |                 return x
1223 |             return float(x.split(':')[0])
1224 | 
1225 |         if self.filter_out_special_values_in_mri_data:
1226 |             # just a sanity check which we do not use by default in main processing. 
1227 |             # Basically, I was uncertain of whether we wanted to simply threshold all values, as is done in a previous analysis
1228 |             # even though values of 0.5 and -0.5 indicate change over time. So I wrote code so we could filter these rows out
1229 |             # and verify that it didn't change results. 
1230 |             special_values = np.array([False for a in range(len(concatenated_mri))])
1231 |             for col in all_necessary_cols:
1232 |                 values_in_col = concatenated_mri[col].map(lambda x:map_mri_to_float(x))
1233 |                 special_values_in_col = concatenated_mri[col].map(lambda x:map_mri_to_float(x) in [0.5, -0.5, -1]).values
1234 |                 
1235 |                 print(Counter(values_in_col[~np.isnan(values_in_col)]))
1236 |                 special_values = special_values | special_values_in_col
1237 |                 print("Fraction of special values in %s: %2.3f (n=%i); cumulative fraction %2.3f" % (col, 
1238 |                     special_values_in_col.mean(), 
1239 |                     special_values_in_col.sum(), 
1240 |                     special_values.mean()))
1241 | 
1242 |             print("Fraction of special values in MRI data: %2.3f." % special_values.mean())
1243 |             concatenated_mri = concatenated_mri.loc[~special_values]
1244 |             concatenated_mri.index = range(len(concatenated_mri))
1245 | 
1246 |         missing_data = ((concatenated_mri[all_necessary_cols] == self.missing_data_val).sum(axis=1) +
1247 |                         pd.isnull(concatenated_mri[all_necessary_cols]).sum(axis=1))
1248 |         concatenated_mri['num_missing_fields'] = missing_data.values
1249 |         concatenated_mri = concatenated_mri.sort_values(by='num_missing_fields')
1250 |         print("Prior to dropping duplicate readings for same side, person, and timepoint, %i rows" % 
1251 |               len(concatenated_mri))
1252 |         concatenated_mri = concatenated_mri.drop_duplicates(subset=['id', 'side', 'visit'], keep='first')
1253 |         print("After dropping duplicate readings for same side, person, and timepoint, %i rows" % 
1254 |               len(concatenated_mri))
1255 |                                       
1256 |         
1257 | 
1258 |         original_cols_already_used = set([]) # sanity check: make sure we're not accidentally using raw columns in two different processed columns. 
1259 |         for processed_col_name in processed_cols:
1260 |             original_cols = processed_cols[processed_col_name]['cols']
1261 |             processed_col_vals = []
1262 |             for c in original_cols:
1263 |                 assert c not in original_cols_already_used
1264 |                 original_cols_already_used.add(c)
1265 |                 concatenated_mri[c] = concatenated_mri[c].map(map_mri_to_float).astype(float)
1266 |                 print(concatenated_mri[c].value_counts(dropna=False)/len(concatenated_mri))
1267 |             for i in range(len(concatenated_mri)):
1268 |                 vals_to_max = concatenated_mri.iloc[i][original_cols].values
1269 |                 not_null = ~pd.isnull(vals_to_max)
1270 |                 if not_null.sum() > 0:
1271 |                     max_val = np.max(vals_to_max[not_null])
1272 |                     processed_col_vals.append(max_val >= processed_cols[processed_col_name]['thresh'])
1273 |                 else:
1274 |                     processed_col_vals.append(None)
1275 |                 
1276 |             concatenated_mri[processed_col_name] = processed_col_vals
1277 |             concatenated_mri[processed_col_name] = concatenated_mri[processed_col_name].astype('float')
1278 |         concatenated_mri = concatenated_mri[['id', 'side', 'visit', 'readprj'] + sorted(list(processed_cols.keys()))]
1279 |         print("Average values")
1280 |         print(concatenated_mri.groupby(['visit', 'side']).mean())
1281 |         print("missing data")
1282 |         print(concatenated_mri.groupby(['visit', 'side']).agg(lambda x:np.mean(pd.isnull(x))))
1283 |         concatenated_mri.index = range(len(concatenated_mri))
1284 |         self.processed_dataframes['david_mri_data'] = concatenated_mri
1285 | 
1286 |     def load_semiquantitative_xray_data(self):
1287 |         """
1288 |         Load in all the semiquantitative x-ray ratings. 
1289 |         Checked.
1290 |         """
1291 |         print("\n***Loading all semi-quantitative x-ray data.")
1292 |         dataset_substring = 'kxr_sq_bu'
1293 |         datasets_to_skip = [a.replace('.txt', '') for a in os.listdir(self.semiquantitative_xray_dir) if dataset_substring not in a and '.txt' in a]
1294 |         self.load_all_text_files_in_directory(self.semiquantitative_xray_dir, datasets_to_skip=datasets_to_skip)
1295 |         
1296 |         for dataset_name in sorted(self.original_dataframes):
1297 |             if dataset_substring in dataset_name:
1298 |                 # From the OAI notes: 
1299 |                 # Please note that although some participants are coded READPRJ=42, they are in fact participants in Project 37. Users should recode these participants from READPRJ=42 to READPRJ=37.
1300 |                 miscoded_project_idxs = self.original_dataframes[dataset_name]['readprj'] == 42
1301 |                 self.original_dataframes[dataset_name].loc[miscoded_project_idxs, 'readprj'] = 37
1302 |                 self.original_dataframes[dataset_name]['side'] = self.original_dataframes[dataset_name]['side'].map(lambda x:self.side_mappings[x])
1303 |         combined_data = self.concatenate_dataframes_from_multiple_timepoints(dataset_substring)
1304 |         
1305 |         # drop a very small number of rows with weird barcodes. 
1306 |         print("prior to dropping semiquantitative data missing a barcode, %i rows" % len(combined_data))
1307 |         combined_data = combined_data.dropna(subset=['barcdbu'])
1308 |         combined_data = combined_data.loc[combined_data['barcdbu'] != 'T']
1309 |         combined_data['barcdbu'] = combined_data['barcdbu'].map(lambda x:'0'+str(int(x)))
1310 |         assert (combined_data['barcdbu'].map(len) == 12).all()
1311 |         assert (combined_data['barcdbu'].map(lambda x:x[:4] == '0166')).all()
1312 |         print("After dropping, %i rows" % len(combined_data))
1313 |         
1314 |         # From the notes: "the variables uniquely identifying a record in these datasets are ID, SIDE, and READPRJ"
1315 |         assert len(combined_data.drop_duplicates(subset=['id', 'side', 'visit', 'readprj'])) == len(combined_data)
1316 | 
1317 |         # but we don't actually want multiple readings (from different projects) for a given knee and timepoint;
1318 |         # it appears that each timepoint is pretty exclusively read by a single project, so we just use the 
1319 |         # predominant project at each timepoint. 
1320 |         filtered_for_project = []
1321 | 
1322 |         def timepoint_a_less_than_or_equal_to_b(a, b):
1323 |             valid_timepoints = ['00 month follow-up: Baseline', 
1324 |             '12 month follow-up', 
1325 |             '24 month follow-up', 
1326 |             '36 month follow-up', 
1327 |             '48 month follow-up', 
1328 |             '72 month follow-up',
1329 |             '96 month follow-up']
1330 |             assert (a in valid_timepoints) and (b in valid_timepoints)
1331 |             a_idx = valid_timepoints.index(a)
1332 |             b_idx = valid_timepoints.index(b)
1333 |             return a_idx <= b_idx
1334 | 
1335 |         for timepoint in sorted(list(set(combined_data['visit']))):
1336 |             if timepoint == '72 month follow-up':
1337 |                 print("Skipping %s because not sure how to fill in missing data; there is lots of missing data even for people with KLG >= 2" % timepoint)
1338 |                 continue
1339 |             timepoint_idxs = combined_data['visit'] == timepoint
1340 |             df_for_timepoint = combined_data.loc[timepoint_idxs]
1341 |             readings_for_15 = set(df_for_timepoint.loc[df_for_timepoint['readprj'] == 15, 'id'])
1342 |             readings_for_37 = set(df_for_timepoint.loc[df_for_timepoint['readprj'] == 37, 'id'])
1343 |             # This illustrates that it is safe to take one project or the other for each timepoint. 
1344 |             # Many people do have readings for both projects. But I think it is cleaner to be consistent in the project used for timepoints 0 - 48m. 
1345 |             # Project 37 is done only on  a weird sample of people, so attempting to merge somehow would lead to an inconsistent definition of image variables
1346 |             # on a non-random subset of the population. However, note that this means that our definitions of some image variables don't quite line up 
1347 |             # with the definitions of image variables in allclinical00: eg, their knee lateral joint space narrowing appears to be some kind of max of the two projects. This is fine, because we don't use those variables for analysis.
1348 |             print("%s: %i people had readings for 15 but not 37; %i had readings for 37 but not 15; %i had readings for both" % (
1349 |                 timepoint, 
1350 |                 len(readings_for_15 - readings_for_37), 
1351 |                 len(readings_for_37 - readings_for_15), 
1352 |                 len(readings_for_37.intersection(readings_for_15))))
1353 |             if timepoint in ['00 month follow-up: Baseline', 
1354 |             '12 month follow-up', 
1355 |             '24 month follow-up', 
1356 |             '36 month follow-up', 
1357 |             '48 month follow-up']:
1358 |                 df_for_timepoint = df_for_timepoint.loc[df_for_timepoint['readprj'] == 15]
1359 |             elif timepoint in ['72 month follow-up', '96 month follow-up']:
1360 |                 df_for_timepoint = df_for_timepoint.loc[df_for_timepoint['readprj'] == 37]
1361 |             else:
1362 |                 raise Exception("invalid timepoint")
1363 | 
1364 |             print("Filling in missing values for %s as 0" % timepoint)
1365 |             # Fill in missing data.
1366 |             # KLG and OARSI JSN grades are available for all participants in this project at all available time points. Scores for other IRFs (osteophytes, subchondral sclerosis, cysts and attrition) are available only in participants with definite radiographic OA at least one knee at one (or more) of the time points.
1367 |             # Following this, we say you should have data if you have had KLG >= 2 at this timepoint or earlier.
1368 |             participants_who_have_had_definite_radiographic_oa = set(combined_data['id'].loc[
1369 |                     combined_data['visit'].map(lambda x:timepoint_a_less_than_or_equal_to_b(x, timepoint)) & 
1370 |                     (combined_data['xrkl'] >= 2)])
1371 | 
1372 |             people_who_are_missing_data_but_should_have_data = None
1373 |             for c in df_for_timepoint.columns:
1374 |                 missing_data_idxs = pd.isnull(df_for_timepoint[c]).values
1375 |                 people_who_should_have_data = df_for_timepoint['id'].map(lambda x:x in participants_who_have_had_definite_radiographic_oa).values
1376 |                 if c[0] == 'x':
1377 |                     if c not in ['xrjsl', 'xrjsm', 'xrkl']:
1378 |                         print("Filling in missing data for %i values in column %s" % (missing_data_idxs.sum(), c))
1379 |                         # fill in data as 0 for those we don't expect to have it. 
1380 |                         df_for_timepoint.loc[missing_data_idxs & (~people_who_should_have_data), c] = 0
1381 | 
1382 |                         # keep track of those who are missing data but shouldn't be, so we can drop them later.
1383 |                         if people_who_are_missing_data_but_should_have_data is None:
1384 |                             people_who_are_missing_data_but_should_have_data = (missing_data_idxs & people_who_should_have_data)
1385 |                         else:
1386 |                             people_who_are_missing_data_but_should_have_data = (missing_data_idxs & people_who_should_have_data) | people_who_are_missing_data_but_should_have_data
1387 | 
1388 |                     else:
1389 |                         print("NOT filling in missing data for %i values in column %s" % (missing_data_idxs.sum(), c))
1390 |                     print("Fraction of missing data %2.3f; non-missing values:" % pd.isnull(df_for_timepoint[c]).mean(), Counter(df_for_timepoint[c].dropna()))
1391 |                 if c in ['id', 'side', 'readprj', 'version']:
1392 |                     assert missing_data_idxs.sum() == 0
1393 |             print("Prior to dropping missing data in x-ray image scoring for %s, %i points" % (timepoint, len(df_for_timepoint)))
1394 |             df_for_timepoint = df_for_timepoint.loc[~people_who_are_missing_data_but_should_have_data]
1395 |             # In total, this line drops about 1% of values for timepoints baseline - 48 m, which isn't the end of the world. 
1396 |             print("After dropping people who should be scored for other attributes but aren't, %i timepoints (%2.1f%% of values are bad)" % (len(df_for_timepoint), people_who_are_missing_data_but_should_have_data.mean() * 100))
1397 |             df_for_timepoint = df_for_timepoint.dropna(subset=['xrkl'])
1398 |             print("After dropping missing data in xrkl for %s, %i points" % (timepoint, len(df_for_timepoint)))
1399 |             
1400 |             filtered_for_project.append(df_for_timepoint)
1401 |         combined_data = pd.concat(filtered_for_project)
1402 |         combined_data.index = range(len(combined_data))
1403 |         assert len(combined_data.drop_duplicates(subset=['id', 'side', 'visit'])) == len(combined_data)
1404 |         assert len(combined_data.drop_duplicates(subset=['barcdbu', 'side'])) == len(combined_data)
1405 | 
1406 | 
1407 |         
1408 | 
1409 |         for timepoint in sorted(list(set(combined_data['visit']))):
1410 |             print(timepoint,
1411 |                 Counter(combined_data.loc[(combined_data['visit'] == timepoint) & (combined_data['side'] == 'left'), 
1412 |                                         'readprj']))
1413 |         self.processed_dataframes[dataset_substring] = combined_data
1414 |         self.clinical_xray_semiquantitative_cols = [a for a in self.processed_dataframes['kxr_sq_bu'] if a[0] == 'x']
1415 | 
1416 |     def load_xray_metadata(self):
1417 |         # Load raw x-ray metadata. Checked. Not being used at present. 
1418 |         print("\n***Loading all x-ray metadata.")
1419 |         self.load_all_text_files_in_directory(self.xray_metadata_dir, datasets_to_skip=[])
1420 |     
1421 |     def load_semiquantitative_mri_data(self):
1422 |         # Load raw semiquantitative MRI data. Checked. Not being used at present. 
1423 |         print("\n***Loading all semi-quantitative MRI data.")
1424 |         self.load_all_text_files_in_directory(self.semiquantitative_mri_dir, datasets_to_skip=[])
1425 | 
1426 |     def load_mri_metadata(self):
1427 |         # Load raw MRI metadata. Checked. Not being used at present. 
1428 |         print("\n***Loading all MRI metadata.")
1429 |         self.load_all_text_files_in_directory(self.mri_metadata_dir, datasets_to_skip=[])
1430 | 
    def map_str_column_to_float(self, dataset_name, column):
        """
        Draft routine for converting a string-coded column of self.original_dataframes[dataset_name]
        to floats, using the text before the first ':' in each value as the number, and recording
        the original-value -> float codebook in self.col_mappings[dataset_name][column].

        Currently disabled: the first statement raises unconditionally, so none of the code after it
        runs. The remainder has been left in place as an unvalidated draft.

        Raises:
            Exception: always.
        """
        raise Exception("If you actually use this you need to check it.")
        # NOTE: everything below is unreachable until the raise above is removed.
        col_dtype = str(self.original_dataframes[dataset_name][column].dtype)
        # Refuse to process columns that are already float-typed.
        if 'float' in col_dtype:
            raise Exception("%s in %s is not a string column, it is a float column" % (column, dataset_name))
        #assert self.original_dataframes[dataset_name][column].dtype is str
        #self.original_dataframes[dataset_name][column] = self.original_dataframes[dataset_name][column].astype(str)
        nan_idxs = pd.isnull(self.original_dataframes[dataset_name][column])
        nan_value = self.missing_data_val
        #self.original_dataframes[dataset_name].loc[nan_idxs, column] = nan_value
        # Recomputed with no intervening change (the fill-in above is commented out), so the
        # assert below requires that the column contains no null values at all.
        nan_idxs = pd.isnull(self.original_dataframes[dataset_name][column])
        assert nan_idxs.sum() == 0

        # Build the codebook: one entry per distinct value in the column.
        unique_vals = sorted(list(set(self.original_dataframes[dataset_name][column])))
        codebook = {}
        for original_val in unique_vals:
            # NOTE(review): this assert runs before the nan_value comparison, so nan_value itself
            # must contain ': ' for a missing-data entry to get past it -- confirm before enabling.
            assert ': ' in original_val
            if original_val == nan_value:
                # The missing-data marker maps to None.
                shortened_val = None
            else:
                # The numeric code is the text before the first ':'.
                shortened_val = float(original_val.split(':')[0])
            codebook[original_val] = shortened_val
        self.original_dataframes[dataset_name][column] = self.original_dataframes[dataset_name][column].map(lambda x:codebook[x])
        p_missing = pd.isnull(self.original_dataframes[dataset_name][column]).mean()
        print("After mapping, column %s in dataset %s has proportion %2.3f missing data" % (column, dataset_name, p_missing))
        # Keep the codebook so the mapping that was applied is recorded.
        self.col_mappings[dataset_name][column] = codebook
1457 | 
1458 |     def validate_ids(self):
1459 |         """
1460 |         Make sure IDs are consistent across datasets they should be consistent in. 
1461 |         """
1462 |         print("\n***Validating that IDs look kosher")
1463 |         self.all_ids = sorted(list(copy.deepcopy(self.original_dataframes['allclinical00']['id'])))
1464 |         assert len(self.all_ids) == TOTAL_PEOPLE
1465 |         assert sorted(self.all_ids) == sorted(get_all_ids())
1466 |         assert len(set(self.all_ids)) == len(self.all_ids)
1467 | 
1468 |         for k in self.original_dataframes:
1469 |             if (('allclinical' in k)
1470 |                 or ('baseclin' in k) 
1471 |                 or ('enrollees' in k) 
1472 |                 or ('enrshort' in k)
1473 |                 or ('outcomes99' in k) 
1474 |                 or ('outshort' in k)):
1475 |                 print("Validating ids in %s" % k)
1476 |                 assert len(self.original_dataframes[k]) == TOTAL_PEOPLE
1477 |                 ids_in_dataframe = sorted(self.original_dataframes[k]['id'])
1478 |                 assert len(set(ids_in_dataframe)) == len(ids_in_dataframe)
1479 |                 assert ids_in_dataframe == self.all_ids
1480 |         
1481 | 


--------------------------------------------------------------------------------