├── AUTHOR
├── LICENSE
├── MANIFEST.in
├── README.rst
├── cantab
│   ├── imagen_cantab_age_at_session_start_time.py
│   └── imagen_cantab_extract_deidentify.py
├── dawba
│   └── imagen_dawba_deidentify.py
├── genomics
│   ├── rna_seq_deidentify_imagen.py
│   └── rna_seq_deidentify_stratify.py
├── geolocation
│   └── geolocation.sh
├── imagen_databank
│   ├── __init__.py
│   ├── additional_data.py
│   ├── behavioral.py
│   ├── cantab.py
│   ├── core.py
│   ├── dicom_utils.py
│   ├── image_data.py
│   ├── sanity
│   │   ├── __init__.py
│   │   ├── cantab.py
│   │   └── imaging.py
│   └── scanning.py
├── mri
│   └── imagen_sample_FU3_mri_deidentify.py
├── onsets
│   ├── imagen_onsets_copy_FU3.sh
│   ├── imagen_onsets_copy_STRATIFY.sh
│   └── imagen_onsets_extract_deidentify.py
├── psc
│   └── imagen_update_dawba_codes_from_tokens.py
├── psytools
│   ├── imagen_psytools_deidentify.py
│   └── imagen_psytools_download.py
├── setup.py
├── sex
│   ├── imagen_sex.py
│   ├── imagen_sex_dataset.py
│   ├── imagen_sex_methylation.py
│   ├── imagen_sex_psytools.py
│   ├── imagen_sex_recruitment.py
│   └── imagen_sex_xnat.py
└── stratify_demographics
    ├── demographics.py
    └── stratify_debug_psytools.py

--------------------------------------------------------------------------------
/AUTHOR:
--------------------------------------------------------------------------------
Dimitri Papadopoulos
David Goyard
Antoine Grigis
Vincent Frouin
Robin Cherbonnier
Thomas Gareau

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include AUTHOR LICENSE MANIFEST.in setup.py README.rst

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
=========================================
Databank operations of the Imagen project
=========================================

Databank operations are mostly documented internally at NeuroSpin.

Basic information is available from the `project wiki`_.

This Python package combines a Python library *imagen_databank* for basic
sanity checks and preprocessing of Imagen data and a set of scripts to
extract, check, anonymize and transform raw Imagen data.

``imagen_databank``
    Read and perform sanity checks on raw datasets.

``cantab``
    Extract age from FU2 Cantab data.

``dawba``
    Remove identifying data and convert PSC1 to PSC2 in Dawba data,
    after manual download from the youthinmind_ server.

``stratify_demographics``
    Cross-check Stratify age and sex with `stratify_debug_psytools.py`.
    Print demographics with `demographics.py`, using recruitment files and
    validated age/sex from the output of the previous script.

``geolocation``
    Merge and convert geolocation data from PSC1 to PSC2.

``mri``
    De-identify some NIfTI files that used to contain the PSC1 code.

``onsets``
    Remove identifying data and convert PSC1 to PSC2 in FU3 onsets files.

``psc``
    Update FU3 Dawba codes from token tables maintained on the Delosis_
    server.

``psytools``
    Download Psytools data as CSV files from the Delosis_ server.
    Remove identifying data and convert PSC1 to PSC2.

``sex``
    Derive reference sex of Imagen subjects from multiple sources.
    There had been errors at baseline.

.. _`project wiki`: https://github.com/imagen2/imagen_databank/wiki
.. _youthinmind: http://youthinmind.com
.. _Delosis: https://www.delosis.com
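A minimal sketch of how the *imagen_databank* library is typically used; the
datasheet filename and PSC1 code below are made up for illustration:

    from imagen_databank import PSC2_FROM_PSC1, read_datasheet

    psc1 = '012345678901'  # hypothetical PSC1 code
    subject_ids, start_times, rows, columns, fields = read_datasheet(
        'datasheet_012345678901FU.csv')  # hypothetical Cantab datasheet
    print(PSC2_FROM_PSC1.get(psc1))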
--------------------------------------------------------------------------------
/cantab/imagen_cantab_age_at_session_start_time.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Extract age in days at Cantab session start time from FU2 datasheets.

==========
Attributes
==========

Input
-----

FU2_MASTER_DIR : str
    Location of FU2 PSC1-encoded data.

Output
------

A CSV table of PSC2 codes and ages in days, printed to standard output.

"""

FU2_MASTER_DIR = '/neurospin/imagen/FU2/RAW/PSC1'

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

import os
import glob
from datetime import date

# import ../imagen_databank
import sys
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
from imagen_databank import PSC2_FROM_PSC1
from imagen_databank import DOB_FROM_PSC1
from imagen_databank import read_datasheet


def main():
    # find datasheet_*.csv files
    logger.info('start globbing datasheet_*.csv files')
    datasheets = glob.glob(os.path.join(FU2_MASTER_DIR,
                                        '*/*/AdditionalData/datasheet_*.csv'))
    logger.info('finished globbing datasheet_*.csv files')

    for datasheet in datasheets:
        subject_ids, session_start_times, dummy_r, dummy_c, dummy_f = read_datasheet(datasheet)
        if len(subject_ids) != 1:
            logger.warning('Proper "Subject ID" not found: %s', datasheet)
            continue
        psc1 = subject_ids.pop()[:12]

        # find age
        if psc1 not in DOB_FROM_PSC1:
            logger.error('unknown age for PSC1 code %s: %s', psc1, datasheet)
            continue
        dob = DOB_FROM_PSC1[psc1]
        session_start_times = set(sst.date() for sst in session_start_times)
        if len(session_start_times) != 1:
            logger.warning('Proper "Session start time" not found: %s',
                           datasheet)
            continue
        session_start_time = session_start_times.pop()
        if session_start_time < date(2007, 1, 1):
            logger.error('Bogus "Session start time" %s: %s',
                         session_start_time, datasheet)
            continue
        age = (session_start_time - dob).days

        # find PSC2
        if psc1 not in PSC2_FROM_PSC1:
            logger.error('unknown PSC1 code %s: %s', psc1, datasheet)
            continue
        psc2 = PSC2_FROM_PSC1[psc1]

        print('{0},{1}'.format(psc2, age))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/dawba/imagen_dawba_deidentify.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Re-encode and anonymize DAWBA files (BL, FU1, FU2 and FU3).

This script replaces the Scito anonymization pipeline which does not
seem to be working anymore for DAWBA files.

==========
Attributes
==========

Input
-----

DAWBA_BL_MASTER_DIR : str
    Location of BL PSC1-encoded files.
DAWBA_FU1_MASTER_DIR : str
    Location of FU1 PSC1-encoded files.
DAWBA_FU2_MASTER_DIR : str
    Location of FU2 PSC1-encoded files.
DAWBA_FU3_MASTER_DIR : str
    Location of FU3 PSC1-encoded files.
DAWBA_SB_MASTER_DIR : str
    Location of Stratify PSC1-encoded files.

Output
------

DAWBA_BL_PSC2_DIR : str
    Location of BL PSC2-encoded files.
DAWBA_FU1_PSC2_DIR : str
    Location of FU1 PSC2-encoded files.
DAWBA_FU2_PSC2_DIR : str
    Location of FU2 PSC2-encoded files.
DAWBA_FU3_PSC2_DIR : str
    Location of FU3 PSC2-encoded files.
DAWBA_SB_PSC2_DIR : str
    Location of Stratify PSC2-encoded files.

"""

DAWBA_BL_MASTER_DIR = '/neurospin/imagen/BL/RAW/PSC1/dawba'
DAWBA_BL_PSC2_DIR = '/neurospin/imagen/BL/RAW/PSC2/dawba'
DAWBA_FU1_MASTER_DIR = '/neurospin/imagen/FU1/RAW/PSC1/dawba'
DAWBA_FU1_PSC2_DIR = '/neurospin/imagen/FU1/RAW/PSC2/dawba'
DAWBA_FU2_MASTER_DIR = '/neurospin/imagen/FU2/RAW/PSC1/dawba'
DAWBA_FU2_PSC2_DIR = '/neurospin/imagen/FU2/RAW/PSC2/dawba'
DAWBA_FU3_MASTER_DIR = '/neurospin/imagen/FU3/RAW/PSC1/dawba'
DAWBA_FU3_PSC2_DIR = '/neurospin/imagen/FU3/RAW/PSC2/dawba'
DAWBA_SB_MASTER_DIR = '/neurospin/imagen/STRATIFY/RAW/PSC1/dawba'
DAWBA_SB_PSC2_DIR = '/neurospin/imagen/STRATIFY/RAW/PSC2/dawba'

WITHDRAWN_DAWBA_CODES = {
    # DAWBA1 codes, missing for some reason - just ignore them...
    '19042',
    '19044',
    '19045',
    '19046',
    '19047',
    '19048',
    '19049',
    '19050',
    '19051',
    '23094',
    '23095',
    '23096',
    '23097',
    '23098',
    '23099',
    '23100',
    '23101',
    '23102',
    '23103',
    '23104',
    '23105',
    '23106',
    '23107',
    '23108',
    '23109',
    '23110',
    '23112',
    '23881',
    '27361',
    '27512',
    '28117',
    '28694',
    '31469',
    '31470',
    '31471',
    '31473',
    '38297',
    '38298',
    '38299',
    '38300',
    '38301',
    # see thread "DAWBA3 codes conversion table" from 2015-05-18
    '127657',
    # see thread "DAWBA3 codes conversion table" from 2015-12-15
    '128847',
    '127658',
    '132983',
    '129716',
    '129500',
    # see thread "Imagen: Dawba data 201490 acquired on 13 September 2015" on 2019-05-27
    '201490',
    # see thread "Imagen FU3 Dawba code 221867" on 2019-05-08
    '221867',
    # see thread "token management in Imagen FU3" on 2019-05-03
    '228686',
    '228691',
    # see thread "token management in Imagen FU3" on 2019-05-03
    '239204',
    '239230',
    # see thread "Imagen FU3 Dawba code 252346" on 2019-05-04
    '252346',
    # see thread "Re: AW:Imagen FU3 token management: 272443 / 272444" on 2019-06-25
    # 244471 and 244513 are the same participant, we were told to keep the former
    '244513',
    # see thread "AW: [ext] Fwd: Pause to production of new teams" on 2019-07-23
    '265683',
    '265684',
    '265685',
    '265686',
    '265687',
    '265689',
    # see thread "IMAGEN FU3, DAWBA-PSC1 clarification" on 2019-09-04
    # 236038 and 254243 are the same participant, we were told to keep the former
    '254243',
}

import os
from datetime import datetime

# import ../imagen_databank
import sys
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
from imagen_databank import PSC1_FROM_DAWBA
from imagen_databank import PSC2_FROM_PSC1
from imagen_databank import DOB_FROM_PSC1

import logging
logging.basicConfig(level=logging.INFO)


def _create_psc2_file(dawba_path, psc2_path):
    """Anonymize and re-encode a DAWBA questionnaire from DAWBA to PSC2.

    DAWBA questionnaire files are CSV files.
    Columns containing a date will be modified and the date will be
    converted to the age of the subject in days, as required by the
    anonymization process.

    Parameters
    ----------
    dawba_path: str
        Input: DAWBA-encoded CSV file.
    psc2_path: str
        Output: PSC2-encoded CSV file.

    """
    with open(dawba_path, 'r') as dawba_file:
        # identify columns to anonymize/remove in header
        header = next(iter(dawba_file))
        items = header.split('\t')
        convert = {i for i, item in enumerate(items)
                   if 'sstartdate' in item or 'p1startdate' in item}
        skip = {i for i, item in enumerate(items)
                if 'ratername' in item or 'ratedate' in item}

        with open(psc2_path, 'w') as psc2_file:
            # write header
            items = [item for i, item in enumerate(items)
                     if i not in skip]
            psc2_file.write('\t'.join(items))
            if not items[-1].endswith('\n'):
                psc2_file.write('\n')

            # write data
            for line in dawba_file:
                items = line.split('\t')
                dawba = items[0]
                if dawba not in PSC1_FROM_DAWBA:
                    if dawba in WITHDRAWN_DAWBA_CODES:
                        logging.info('withdrawn DAWBA code: %s', dawba)
                    else:
                        logging.error('DAWBA code missing from conversion table: %s',
                                      dawba)
                    continue
                psc1 = PSC1_FROM_DAWBA[dawba]
                if psc1 not in PSC2_FROM_PSC1:
                    logging.error('PSC1 code missing from conversion table: %s',
                                  psc1)
                    continue
                psc2 = PSC2_FROM_PSC1[psc1]
                logging.info('converting subject %s from DAWBA to PSC2',
                             psc1)
                items[0] = psc2
                # convert dates to subject age in days
                for i in convert:
                    if items[i] != '':
                        if psc1 in DOB_FROM_PSC1:
                            startdate = datetime.strptime(items[i],
                                                          '%d.%m.%y').date()
                            birthdate = DOB_FROM_PSC1[psc1]
                            age = startdate - birthdate
                            logging.info('age of subject %s: %d',
                                         psc1, age.days)
                            items[i] = str(age.days)
                        else:
                            items[i] = ''
                items = [item for i, item in enumerate(items)
                         if i not in skip]
                psc2_file.write('\t'.join(items))
                if not items[-1].endswith('\n'):
                    psc2_file.write('\n')


def create_psc2_files(master_dir, psc2_dir, prefix=None):
    """Anonymize and re-encode all DAWBA questionnaires within a directory.

    DAWBA-encoded files are read from `master_dir`, anonymized and converted
    from DAWBA codes to PSC2, and the result is written in `psc2_dir`.

    Parameters
    ----------
    master_dir: str
        Input directory with DAWBA-encoded questionnaires.
    psc2_dir: str
        Output directory with PSC2-encoded and anonymized questionnaires.
    prefix: str, optional
        Prefix prepended to the output file names.

    """
    for master_file in os.listdir(master_dir):
        master_path = os.path.join(master_dir, master_file)
        if prefix:
            master_file = prefix + master_file
        psc2_path = os.path.join(psc2_dir, master_file)
        _create_psc2_file(master_path, psc2_path)


def main():
    create_psc2_files(DAWBA_BL_MASTER_DIR, DAWBA_BL_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_FU1_MASTER_DIR, DAWBA_FU1_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_FU2_MASTER_DIR, DAWBA_FU2_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_FU3_MASTER_DIR, DAWBA_FU3_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_SB_MASTER_DIR, DAWBA_SB_PSC2_DIR, prefix='STRATIFY_')


if __name__ == "__main__":
    main()
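The heart of `_create_psc2_file` is the conversion of absolute dates to an age
in days; a self-contained sketch of that computation, with made-up dates:

    from datetime import datetime, date

    startdate = datetime.strptime('15.06.10', '%d.%m.%y').date()  # sstartdate format
    birthdate = date(1994, 3, 1)  # hypothetical DOB_FROM_PSC1 entry
    print((startdate - birthdate).days)  # 5950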
--------------------------------------------------------------------------------
/genomics/rna_seq_deidentify_imagen.py:
--------------------------------------------------------------------------------
import csv
import os
from imagen_databank import PSC2_FROM_PSC1

file_labID_PSC1_conv='/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_metadata.tsv'

# use either the first or the second block, for gene_counts or gene_tpm

input_dir_imagen_PSC1='/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
output_dir_imagen_BL_PSC2='/imagen/BL/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_BL.tsv'
output_dir_imagen_FU2_PSC2='/imagen/FU2/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_FU2.tsv'
output_dir_imagen_FU3_PSC2='/imagen/FU3/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_FU3.tsv'
"""
input_dir_imagen_PSC1="/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv"
output_dir_imagen_BL_PSC2='/imagen/BL/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_BL.tsv'
output_dir_imagen_FU2_PSC2='/imagen/FU2/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_FU2.tsv'
output_dir_imagen_FU3_PSC2='/imagen/FU3/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_FU3.tsv'
"""


def convert_labID_to_PSC2_with_timepoint(labID):
    labID_index = headers.index("Lab_Code")
    psc1_index = headers.index("PSC1")
    timepoint_index = headers.index("TimePoint")
    for line in tab_conv_labID_psc1:
        if line[labID_index] == labID:
            try:
                if len(line[psc1_index]) < 12:
                    # restore the leading zero lost by spreadsheet software
                    psc1 = "0" + line[psc1_index]
                else:
                    psc1 = line[psc1_index]
                psc2 = PSC2_FROM_PSC1[psc1]
                return (psc2, line[timepoint_index])
            except KeyError:
                print("invalid PSC1 code:", line[psc1_index])
                #return ("###", line[timepoint_index])

    print("PSC1 not found for labID: ", labID)

"""
for line in file_labID_PSC1:
    columns = line.strip().split(",")
    #print("check:",columns[labID_index],labID==columns[labID_index])
    if columns[labID_index] == labID:
        #print("deidentified: ",columns[psc1_index], "****", columns[timepoint_index])
        psc2 = PSC2_FROM_PSC1["0"+columns[psc1_index]]
        return(psc2,columns[timepoint_index])
print("PSC1 not found for labID: ", labID)
"""

if __name__ == "__main__":

    with open(file_labID_PSC1_conv, 'r', errors='ignore') as file_labID_PSC1:
        reader = csv.reader(file_labID_PSC1, delimiter=',')
        tab_conv_labID_psc1 = [row for row in reader]
        headers = tab_conv_labID_psc1[0]
        #headers = list(next(reader))
        print(headers)
        print(convert_labID_to_PSC2_with_timepoint("GB97ENVKCLR301518"))

    with open(input_dir_imagen_PSC1, 'r', newline='', errors='ignore') as labID_infile:
        reader_input = csv.reader(labID_infile, delimiter='\t')
        data = [row for row in reader_input]

    #print(data[0])
    # initialize list of lists that will be written in the output file
    data_psc2_BL = [[] for i in range(len(data))]
    data_psc2_FU2 = [[] for i in range(len(data))]
    data_psc2_FU3 = [[] for i in range(len(data))]
    # initialize the two first columns of the three timepoints
    for i in range(len(data)):
        #print(data[i][0]," ***** ", data[i][1], " ***** ", data[i][2])
        #print(data_psc2_BL[i])
        data_psc2_BL[i].append(data[i][0])
        data_psc2_BL[i].append(data[i][1])

        data_psc2_FU2[i].append(data[i][0])
        data_psc2_FU2[i].append(data[i][1])

        data_psc2_FU3[i].append(data[i][0])
        data_psc2_FU3[i].append(data[i][1])

    count_BL = 0
    count_FU2 = 0
    count_FU3 = 0
    # copy the remaining columns to the respective matrix depending on the timepoint
    for col_index in range(2, len(data[0])):
        #print(col_index)
        lab_id = data[0][col_index]
        lab_id = lab_id.strip()  # str.strip() returns a new string
        #print(convert_labID_to_PSC2_with_timepoint(lab_id))
        try:
            (psc2, timepoint) = convert_labID_to_PSC2_with_timepoint(lab_id)
            if timepoint == "BL":
                count_BL = count_BL + 1
                data_psc2_BL[0].append(psc2)
                for i in range(1, len(data)):
                    data_psc2_BL[i].append(data[i][col_index])
            elif timepoint == "FU2":
                count_FU2 = count_FU2 + 1
                data_psc2_FU2[0].append(psc2)
                for i in range(1, len(data)):
                    data_psc2_FU2[i].append(data[i][col_index])
            elif timepoint == "FU3":
                count_FU3 = count_FU3 + 1
                data_psc2_FU3[0].append(psc2)
                for i in range(1, len(data)):
                    data_psc2_FU3[i].append(data[i][col_index])
            else:
                print("invalid timepoint:", timepoint)
        except TypeError:  # conversion failed and returned None
            continue
    print("BL", count_BL)
    print("FU2", count_FU2)
    print("FU3", count_FU3)

    # write the output to the files
    print("writing ...")
    with open(output_dir_imagen_BL_PSC2, 'w', newline='') as PSC2_BL_outfile:
        writer_BL = csv.writer(PSC2_BL_outfile, delimiter='\t')
        writer_BL.writerows(data_psc2_BL)

    with open(output_dir_imagen_FU2_PSC2, 'w', newline='') as PSC2_FU2_outfile:
        writer_FU2 = csv.writer(PSC2_FU2_outfile, delimiter='\t')
        writer_FU2.writerows(data_psc2_FU2)

    with open(output_dir_imagen_FU3_PSC2, 'w', newline='') as PSC2_FU3_outfile:
        writer_FU3 = csv.writer(PSC2_FU3_outfile, delimiter='\t')
        writer_FU3.writerows(data_psc2_FU3)
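convert_labID_to_PSC2_with_timepoint above rescans the whole metadata table
for every lab ID; a sketch of a one-pass alternative that builds the lookup
once, assuming the same Lab_Code / PSC1 / TimePoint metadata columns:

    def build_conversion_table(tab_conv_labID_psc1):
        headers = tab_conv_labID_psc1[0]
        labID_index = headers.index("Lab_Code")
        psc1_index = headers.index("PSC1")
        timepoint_index = headers.index("TimePoint")
        table = {}
        for line in tab_conv_labID_psc1[1:]:
            # zfill restores leading zeros lost by spreadsheet software
            psc1 = line[psc1_index].zfill(12)
            if psc1 in PSC2_FROM_PSC1:
                table[line[labID_index]] = (PSC2_FROM_PSC1[psc1],
                                            line[timepoint_index])
        return table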
--------------------------------------------------------------------------------
/genomics/rna_seq_deidentify_stratify.py:
--------------------------------------------------------------------------------
import csv
import os
from imagen_databank import PSC2_FROM_PSC1

file_labID_PSC1_conv_stratify='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_metadata.tsv'
file_labID_PSC1_conv_estra='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_metadata.tsv'

# use either the first or the second block, for gene_counts or gene_tpm

input_dir_STRATIFY_PSC1_counts='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
input_dir_ESTRA_PSC1_counts='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
output_dir_STRATIFY_PSC2_counts='/imagen/STRATIFY/processed/genetics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2.tsv'
output_dir_ESTRA_PSC2_counts='/imagen/STRATIFY/processed/genetics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2.tsv'


input_dir_STRATIFY_PSC1_tpm='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv'
input_dir_ESTRA_PSC1_tpm='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv'
output_dir_STRATIFY_PSC2_tpm='/imagen/STRATIFY/processed/genetics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2.tsv'
output_dir_ESTRA_PSC2_tpm='/imagen/STRATIFY/processed/genetics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2.tsv'


def convert_labID_to_PSC2_with_timepoint(labID, tab_conv_labID_psc1):
    headers = tab_conv_labID_psc1[0]
    labID_index = headers.index("Lab_Code")
    psc1_index = headers.index("PSC1")
    timepoint_index = headers.index("TimePoint")
    for line in tab_conv_labID_psc1:
        if line[labID_index] == labID:
            try:
                if len(line[psc1_index]) < 12:
                    # restore the leading zero lost by spreadsheet software
                    psc1 = "0" + line[psc1_index]
                else:
                    psc1 = line[psc1_index]
                psc2 = PSC2_FROM_PSC1[psc1]

                return (psc2, line[timepoint_index])
            except KeyError:
                print("invalid PSC1 code:", line[psc1_index])
                #return ("###", line[timepoint_index])

    print("PSC1 not found for labID: ", labID)


def convert_file_to_PSC2(file_labID_PSC1_conv, input_dir_PSC1, output_dir_PSC2, delimiter_metadata):
    print("converting ", input_dir_PSC1, " to PSC2...")
    with open(file_labID_PSC1_conv, 'r', errors='ignore') as file_labID_PSC1:
        reader = csv.reader(file_labID_PSC1, delimiter=delimiter_metadata)
        tab_conv_labID_psc1 = [row for row in reader]
        headers = tab_conv_labID_psc1[0]
        # headers = list(next(reader))
        print(headers)
        # print(convert_labID_to_PSC2_with_timepoint("GB97ENVKCLR301518"))

    with open(input_dir_PSC1, 'r', newline='', errors='ignore') as labID_infile:
        reader_input = csv.reader(labID_infile, delimiter='\t')
        data = [row for row in reader_input]

    #print(data[0])
    # initialize list of lists that will be written in the output file
    data_psc2 = [[] for i in range(len(data))]

    # initialize the two first columns
    for i in range(len(data)):
        data_psc2[i].append(data[i][0])
        data_psc2[i].append(data[i][1])

    count = 0

    # copy the remaining columns
    for col_index in range(2, len(data[0])):
        lab_id = data[0][col_index]
        lab_id = lab_id.strip()  # str.strip() returns a new string
        try:
            (psc2, timepoint) = convert_labID_to_PSC2_with_timepoint(lab_id, tab_conv_labID_psc1)
            count = count + 1
            data_psc2[0].append(psc2)
            for i in range(1, len(data)):
                data_psc2[i].append(data[i][col_index])

        except TypeError:  # conversion failed and returned None
            continue
    print("number of lines in file: ", count)

    # write the output to the files
    print("writing ...")
    with open(output_dir_PSC2, 'w', newline='') as PSC2_outfile:
        writer = csv.writer(PSC2_outfile, delimiter='\t')
        writer.writerows(data_psc2)


if __name__ == "__main__":
    convert_file_to_PSC2(file_labID_PSC1_conv_stratify, input_dir_STRATIFY_PSC1_counts, output_dir_STRATIFY_PSC2_counts, ",")

    convert_file_to_PSC2(file_labID_PSC1_conv_stratify, input_dir_STRATIFY_PSC1_tpm, output_dir_STRATIFY_PSC2_tpm, ",")

    convert_file_to_PSC2(file_labID_PSC1_conv_estra, input_dir_ESTRA_PSC1_counts, output_dir_ESTRA_PSC2_counts, "\t")

    convert_file_to_PSC2(file_labID_PSC1_conv_estra, input_dir_ESTRA_PSC1_tpm, output_dir_ESTRA_PSC2_tpm, "\t")

--------------------------------------------------------------------------------
/geolocation/geolocation.sh:
--------------------------------------------------------------------------------
#!/bin/sh

#
# process geolocation at each time point
#
for timepoint in BL FU1 FU2 FU3
do
    DIR_PSC1="/neurospin/imagen/${timepoint}/RAW/PSC1/geolocation"
    FILE_PSC2="/neurospin/imagen/${timepoint}/processed/geolocation/IMAGEN_geolocation_${timepoint}.csv"

    # print output file header line
    echo "PSC2,latitude,longitude,notes" > "$FILE_PSC2"
    # process each input file
    for file in "${DIR_PSC1}/IMAGEN_geolocation_"*"_${timepoint}.csv"
    do
        # some commands cannot process DOS line endings
        tmpfile=`mktemp -t tmp.geolocation.XXXXXXXXXX`
        dos2unix -n "$file" "$tmpfile" 2>/dev/null
        # some sites lack a "Notes" column
        if head -1 "$tmpfile" | grep -q "Notes"
        then
            ADD_NOTES=0
        else
            ADD_NOTES=1
        fi
        # skip input file header line
        tail -n +2 "$tmpfile" |
        # some sites lack a "Notes" column: append an empty one where needed,
        # otherwise "cat" keeps the pipeline flowing unchanged
        if [ "$ADD_NOTES" -eq 1 ]
        then
            sed 's/$/,/'
        else
            cat
        fi
        # clean up
        rm -f "$tmpfile"
    done | psc2psc.py 2>/dev/null | sort >> "$FILE_PSC2"
    unix2dos -o "$FILE_PSC2" 2>/dev/null
done


#
# process geolocation backdated from BL
#
BACKDATED_PSC1="/neurospin/imagen/FU3/RAW/PSC1/geolocation/IMAGEN_geolocation_ALL_SITES_backdated_Dublin_updated.csv"
BACKDATED_PSC2="/neurospin/imagen/FU3/processed/geolocation/IMAGEN_geolocation_backdated.csv"

# print output file header line
echo "PSC2,year,latitude,longitude" > "$BACKDATED_PSC2"
# skip input file header line
tail -n +2 "$BACKDATED_PSC1" | psc2psc.py 2>/dev/null | sort >> "$BACKDATED_PSC2"
unix2dos -o "$BACKDATED_PSC2" 2>/dev/null

--------------------------------------------------------------------------------
/imagen_databank/__init__.py:
--------------------------------------------------------------------------------
# noqa

# Copyright (c) 2014-2018 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

__all__ = ['additional_data', 'behavioral', 'cantab', 'core', 'dicom_utils',
           'image_data', 'scanning', 'sanity']

from . import core
from .core import (LONDON, NOTTINGHAM, DUBLIN, BERLIN,
                   HAMBURG, MANNHEIM, PARIS, DRESDEN,
                   SOUTHAMPTON, AACHEN)
from .core import CENTER_NAME
from .core import (PSC2_FROM_PSC1, PSC1_FROM_PSC2,
                   PSC1_FROM_DAWBA, PSC2_FROM_DAWBA,  # PSC2_FROM_DAWBA is obsolete
                   DOB_FROM_PSC1, DOB_FROM_PSC2)  # DOB_FROM_PSC2 is obsolete
from .core import (detect_psc1, detect_psc2, guess_psc1)
from .core import Error

from . import additional_data
from .additional_data import (walk_additional_data, report_additional_data)

from . import behavioral
from .behavioral import (MID_CSV, FT_CSV, SS_CSV, RECOG_CSV)
from .behavioral import (read_mid, read_ft, read_ss, read_recog)

from . import cantab
from .cantab import (CANTAB_CCLAR, DETAILED_DATASHEET_CSV, DATASHEET_CSV,
                     REPORT_HTML)
from .cantab import (read_cant, read_datasheet, read_detailed_datasheet,
                     read_report)

from . import dicom_utils
from .dicom_utils import read_metadata

from . import image_data
from .image_data import (SEQUENCE_LOCALIZER_CALIBRATION,
                         SEQUENCE_T2, SEQUENCE_T2_FLAIR,
                         SEQUENCE_ADNI_MPRAGE,
                         SEQUENCE_MID, SEQUENCE_FT, SEQUENCE_SST,
                         SEQUENCE_B0_MAP, SEQUENCE_DTI,
                         SEQUENCE_RESTING_STATE,
                         SEQUENCE_NODDI)
from .image_data import SEQUENCE_NAME
from .image_data import NONSTANDARD_DICOM
from .image_data import series_type_from_description
from .image_data import walk_image_data, report_image_data

from . import scanning
from .scanning import read_scanning
from . import sanity

__author__ = 'Dimitri Papadopoulos'
__copyright__ = 'Copyright (c) 2014-2018 CEA'
__license__ = 'CeCILL'
__version__ = '0.1.0'
__email__ = 'imagendatabase@cea.fr'
__status__ = 'Development'

--------------------------------------------------------------------------------
/imagen_databank/additional_data.py:
--------------------------------------------------------------------------------
# Copyright (c) 2014-2017 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
import os
import re

from .cantab import (CANTAB_CCLAR, DETAILED_DATASHEET_CSV, DATASHEET_CSV,
                     REPORT_HTML,
                     read_cant, read_datasheet, read_detailed_datasheet,
                     read_report)
from .behavioral import (MID_CSV, FT_CSV, SS_CSV, RECOG_CSV,
                         read_mid, read_ft, read_ss, read_recog)

import logging
logger = logging.getLogger(__name__)

__all__ = ['walk_additional_data', 'report_additional_data']


#
# check filenames against these regex'es when exploring Additional Data
#
# in some cases order is important, for example:
# - first match 'detailed_datasheet'
# - then match 'datasheet'
#
_LOOSE_ADDITIONAL_DATA_REGEXES = (
    (re.compile(r'(\w+_)?cant(_\w+)?\.cclar', re.IGNORECASE), CANTAB_CCLAR),
    # Mannheim sends 'detailed datasheet' files (space instead of underscore)
    (re.compile(r'(\w+_)?detailed[_ ]datasheet(_\w+)?\.csv', re.IGNORECASE),
     DETAILED_DATASHEET_CSV),
    (re.compile(r'(\w+_)?datasheet(_\w+)?\.csv', re.IGNORECASE), DATASHEET_CSV),
    (re.compile(r'(\w+_)?report(_\w+)?\.html', re.IGNORECASE), REPORT_HTML),
    (re.compile(r'ft_\w+\.csv', re.IGNORECASE), FT_CSV),
    (re.compile(r'mid_\w+\.csv', re.IGNORECASE), MID_CSV),
    (re.compile(r'recog_\w+\.csv', re.IGNORECASE), RECOG_CSV),
    (re.compile(r'ss_\w+\.csv', re.IGNORECASE), SS_CSV),
)

_EXACT_ADDITIONAL_DATA_REGEXES = (
    (re.compile(r'cant_\d{12}(fu|FU)?\.cclar'), CANTAB_CCLAR),
    (re.compile(r'detailed_datasheet_\d{12}(fu|FU)?\.csv'), DETAILED_DATASHEET_CSV),
    (re.compile(r'datasheet_\d{12}(fu|FU)?\.csv'), DATASHEET_CSV),
    (re.compile(r'report_\d{12}(fu|FU)?\.html'), REPORT_HTML),
    (re.compile(r'ft_\d{12}(fu|FU)?\.csv'), FT_CSV),
    (re.compile(r'mid_\d{12}(fu|FU)?\.csv'), MID_CSV),
    (re.compile(r'recog_\d{12}(fu|FU)?\.csv'), RECOG_CSV),
    (re.compile(r'ss_\d{12}(fu|FU)?\.csv', re.IGNORECASE), SS_CSV),
)


def _match_additional_data_sops(filename, exact=False):
    """Compare filename to filenames defined in Imagen FU2 SOPs.

    Compare the actual filename to the filenames expected for Additional
    Data in the SOPs, either in a strict way or a loose way. This matching
    function is empirical and based on experimentation.

    Parameters
    ----------
    filename : unicode
        The file basename to match.

    exact : bool
        Exact match if True else loose match.

    Returns
    -------
    str
        If the filename loosely matches a file type defined in the SOPs,
        return the file type, else return None.

    """
    if exact:
        regex_list = _EXACT_ADDITIONAL_DATA_REGEXES
    else:
        regex_list = _LOOSE_ADDITIONAL_DATA_REGEXES
    for regex, filetype in regex_list:
        if regex.match(filename):
            logger.debug('assign type "%s" to filename: %s',
                         filetype, filename)
            return filetype
    logger.info('filename does not match any known type: %s', filename)
    return None


def walk_additional_data(path):
    """Generate information on Additional Data files in a directory.

    Parameters
    ----------
    path : unicode
        The directory to look for files into.

    Returns
    -------
    tuple
        Yield a 2-tuple: the name and the path of each file relative to path.

    """
    for root, dummy_dirs, files in os.walk(path):
        for filename in files:
            relpath = os.path.relpath(os.path.join(root, filename), path)
            yield filename, relpath
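
# A quick illustration of the loose vs exact matching above. Filenames are
# made up; return values are the file-type constants imported from .cantab:
#
#     _match_additional_data_sops('datasheet_012345678901FU.csv')   # 'datasheet'
#     _match_additional_data_sops('detailed datasheet_0123.csv')    # 'detailed_datasheet'
#     _match_additional_data_sops('detailed datasheet_0123.csv',
#                                 exact=True)                       # None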

def report_additional_data(path, psc1, exact=False):
    """Find Additional Data files that fit the Imagen FU2 SOPs.

    The Imagen FU2 SOPs define a precise file organization for Additional
    Data. In practice we have found the SOPs are only loosely followed by
    acquisition centres, hence the tolerant optional argument.

    This function scans the directory where we expect to find the Additional
    Data of a dataset and builds a collection of files identified as the
    files described in the SOPs.

    Parameters
    ----------
    path : unicode
        The directory to look for Additional Data into.

    psc1 : str
        PSC1 code of the subject.

    exact : bool
        Exact match if True, else loose match.

    Returns
    -------
    dict
        The key identifies the type of identified files and the value
        lists the relative path of the files.

    """
    additional_files = {}

    for filename, relpath in walk_additional_data(path):
        filetype = _match_additional_data_sops(filename, exact)
        if filetype:
            logger.debug('assign type "%s" to file: %s',
                         filetype, relpath)
            additional_files.setdefault(filetype, []).append(relpath)
        else:
            logger.warning('cannot match any known type: %s', relpath)

    additional_data = {}

    # read cant_*.cclar where available
    if CANTAB_CCLAR in additional_files:
        for f in additional_files[CANTAB_CCLAR]:
            f_path = os.path.join(path, f)
            subject_ids = read_cant(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(CANTAB_CCLAR, {})[f] = subject_ids
    # read datasheet_*.csv where available
    if DATASHEET_CSV in additional_files:
        for f in additional_files[DATASHEET_CSV]:
            f_path = os.path.join(path, f)
            subject_ids, dummy_st, dummy_r, dummy_c, dummy_f = read_datasheet(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(DATASHEET_CSV, {})[f] = subject_ids
    # read detailed_datasheet_*.csv where available
    if DETAILED_DATASHEET_CSV in additional_files:
        for f in additional_files[DETAILED_DATASHEET_CSV]:
            f_path = os.path.join(path, f)
            subject_ids = read_detailed_datasheet(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(DETAILED_DATASHEET_CSV, {})[f] = subject_ids
    # read report_*.html where available
    if REPORT_HTML in additional_files:
        for f in additional_files[REPORT_HTML]:
            f_path = os.path.join(path, f)
            subject_ids = read_report(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(REPORT_HTML, {})[f] = subject_ids
    # read Scanning/ft_*.csv where available
    if FT_CSV in additional_files:
        for f in additional_files[FT_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_ft(f_path)
            if subject_id:
                additional_data.setdefault(FT_CSV, {})[f] = {subject_id}
    # read Scanning/mid_*.csv where available
    if MID_CSV in additional_files:
        for f in additional_files[MID_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_mid(f_path)
            if subject_id:
                additional_data.setdefault(MID_CSV, {})[f] = {subject_id}
    # read Scanning/recog_*.csv where available
    if RECOG_CSV in additional_files:
        for f in additional_files[RECOG_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_recog(f_path)
            if subject_id:
                additional_data.setdefault(RECOG_CSV, {})[f] = {subject_id}
    # read Scanning/ss_*.csv where available
    if SS_CSV in additional_files:
        for f in additional_files[SS_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_ss(f_path)
            if subject_id:
                additional_data.setdefault(SS_CSV, {})[f] = {subject_id}

    return additional_data
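A sketch of how report_additional_data can drive a sanity check; the subject
directory and PSC1 code below are made up for illustration:

    from imagen_databank import report_additional_data

    path = '/neurospin/imagen/FU2/RAW/PSC1/LONDON/012345678901/AdditionalData'  # hypothetical
    for filetype, files in report_additional_data(path, '012345678901').items():
        for relpath, ids in files.items():
            if ids:  # IDs left over (or found) per file
                print(filetype, relpath, sorted(ids))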
--------------------------------------------------------------------------------
/imagen_databank/behavioral.py:
--------------------------------------------------------------------------------
# Copyright (c) 2014-2017 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import csv
from datetime import datetime

from .core import Error

import logging
logger = logging.getLogger(__name__)

__all__ = ['MID_COLUMNS', 'FT_COLUMNS', 'SS_COLUMNS', 'RECOG_COLUMNS',
           'read_mid', 'read_ft', 'read_ss', 'read_recog']

#
# types of files we expect to find under AdditionalData/Scanning
#
FT_CSV = 'ft'
MID_CSV = 'mid'
SS_CSV = 'ss'
RECOG_CSV = 'recog'


def _parse_behavioral_datetime(date_string):
    """Read date in the format found in CSV files.

    * LONDON      01/02/2015 01:02:03
    * NOTTINGHAM  01/02/2015 01:02:03
    * DUBLIN      01/02/2015 01:02:03   2/1/2015 1:02:03 AM
    * BERLIN      01.02.2015 01:02:03
    * HAMBURG     01.02.2015 01:02:03
    * MANNHEIM    01.02.2015 01:02:03
    * PARIS       01/02/2015 01:02:03
    * DRESDEN     01.02.2015 01:02:03

    """
    DATE_FORMATS = (
        '%d.%m.%Y %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%m/%d/%Y %I:%M:%S %p',
    )
    for date_format in DATE_FORMATS:
        try:
            dt = datetime.strptime(date_string, date_format)
            return dt
        except ValueError:
            pass
    return None


def _fix_spurious_quotes(s):
    if s.startswith('"'):
        last = s.rfind('"')
        if last > 0:
            main = s[1:last]
            last += 1
            tail = s[last:]
            if tail.isspace():
                s = main + tail
    return s


def _fix_terminal_tab(s):
    last = s.rfind('\t')
    if last > 0:
        main = s[:last]
        last += 1
        tail = s[last:]
        if tail.isspace():
            s = main + tail
    return s


MID_COLUMNS = (
    'Trial',
    'Trial Category',
    'Trial Start Time (Onset)',
    'Pre-determined Onset',
    'Cue Presented',
    'Anticipation Phase Start Time',
    'Anticipation Phase Duration',
    'Target Phase Start Time',
    'Target Phase Duration',
    'Response Made by Subject',
    'Response time',
    'Feedback Phase Start Time',
    'Outcome',
    'Amount',
    'Fixation Phase Start Time (Lasts until next trial start time)',
    'Success Rate',
    'Scanner Pulse',
)

FT_COLUMNS = (
    'Trial Start Time (Onset)',
    'Video Clip Name',
)

SS_COLUMNS = (
    'Trial',
    'Trial Category',
    'Trial Start Time (Onset)',
    'Pre-determined/randomised onset',
    'Go Stimulus Presentation Time',  # 'Go Stimulus Presentation Time '
    'Stimulus Presented',
    'Delay',
    'Stop Stimulus Presentation Time',
    'Response made by subject',
    'Absolute Response Time',
    'Relative Response Time',
    'Response Outcome',
    'Real Jitter',
    'Pre-determined Jitter',
    'Success Rate of Variable Delay Stop Trials',
    'Scanner Pulse',
)

RECOG_COLUMNS = (
    'TimePassed',
    'UserResponse',
    'ImageFileName',
)

# for each of the 4 tasks we provide a tuple:
# * first word in the behavioral file that identifies the task
# * list of columns in the 2nd line
# * column from which to extract the last ascending numerical sequence
# * True if the numerical sequence is strictly ascending
_TASK_SPECIFICS = {
    MID_CSV: ('MID_TASK', MID_COLUMNS, 0, True),
    FT_CSV: ('FACE_TASK', FT_COLUMNS, 0, True),
    SS_CSV: ('STOP_SIGNAL_TASK', SS_COLUMNS, 0, False),
    RECOG_CSV: ('RECOGNITION_TASK', RECOG_COLUMNS, 0, True),
}


def _read_generic_behavioral(path, task, strict=True):
    """Read behavioral files and return part of the contents and errors.

    Sometimes complete lines are enclosed in quotes. Such quotes
    must be fixed before the contents can be read as CSV.

    Parameters
    ----------
    path : str
        Path to the behavioral file to read from.

    task : str
        Type of task, one of MID_CSV, FT_CSV, SS_CSV or RECOG_CSV.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        Last ascending sequence of trials.
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    psc1 = None
    timestamp = None
    sequence = []
    errors = []

    with open(path, 'r', newline='') as behavioral:
        lines = behavioral.readlines()

    # attempt to handle broken CSV files with fully quoted lines
    reader = csv.reader(lines, delimiter='\t')
    if not strict and max(len(row) for row in reader) < 2:
        lines = [_fix_spurious_quotes(line) for line in lines]

    # remove spurious terminal tab
    lines = [_fix_terminal_tab(line) for line in lines]

    # now re-read file contents
    reader = csv.reader(lines, delimiter='\t')

    # 1st line
    header = next(reader)
    if header:
        header = [x.strip() for x in header]
        if len(header) != 4:
            errors.append(Error(path, 'Line 1 contains {0} columns instead of 4'
                                .format(len(header)), header))
        if len(header) > 3:
            COLUMN = 'Task type: Scanning'
            if header[3] != COLUMN:
                errors.append(Error(path, 'Column 4 of line 1 must be "{0}" '
                                    'instead of "{1}"'
                                    .format(COLUMN, header[3]), header))
        if len(header) > 2:
            COLUMN = 'Subject ID:'
            if header[2].startswith(COLUMN):
                psc1 = header[2][len(COLUMN):].lstrip()
            else:
                errors.append(Error(path, 'Column 3 of line 1 "{0}" must start '
                                    'with "{1}"'
                                    .format(header[2], COLUMN), header))
        if len(header) > 1:
            timestamp = _parse_behavioral_datetime(header[1])
            if not timestamp:
                errors.append(Error(path, 'Column 2 of line 1 "{0}" is not a standard time stamp'
                                    .format(header[1]), header))
        if len(header) > 0:
            COLUMN = '{0} task'.format(_TASK_SPECIFICS[task][0])
            if header[0] != COLUMN:
                errors.append(Error(path, 'Column 1 of line 1 must be "{0}" '
                                    'instead of "{1}"'
                                    .format(COLUMN, header[0]), header))
    else:
        errors.append(Error(path, 'Empty file'))

    # 2nd line
    try:
        header = next(reader)
        header = [x.strip() for x in header]
        COLUMNS = _TASK_SPECIFICS[task][1]
        if len(header) != len(COLUMNS):
            errors.append(Error(path, 'Line 2 contains {0} columns instead of {1}'
                                .format(len(header), len(COLUMNS)),
                                header))
        for i, (h, c) in enumerate(zip(header, COLUMNS)):
            if h != c:
                errors.append(Error(path, 'Column {0} of line 2 must be {1} instead of {2}'
                                    .format(i + 1, c, h), header))
                break
    except StopIteration:
        errors.append(Error(path, 'Missing 2nd line'))

    # data
    last = None
    for n, row in enumerate(reader, 3):
        row = [x.strip() for x in row]
        COLUMNS = _TASK_SPECIFICS[task][1]
        if not any(row):  # get rid of empty rows
            continue
        elif len(row) != len(COLUMNS):
            errors.append(Error(path, 'Line {0} contains {1} columns instead of {2}'
                                .format(n, len(row), len(COLUMNS)),
                                row))
        # column to check for ascending numerical sequence
        current = row[_TASK_SPECIFICS[task][2]].strip()
        try:
            # expect ascending numerical sequences
            current = int(current)
            if last:
                if _TASK_SPECIFICS[task][3]:  # strictly ascending
                    if current <= last:
                        sequence = []  # start new ascending sequence
                else:
                    if current < last:
                        sequence = []  # start new ascending sequence
            sequence.append(current)
            last = current
        except ValueError:
            errors.append(Error(path, 'Column {0} of line {1} "{2}" should contain '
                                'only numbers'
                                .format(_TASK_SPECIFICS[task][2] + 1, n, current), row))
            if last:
                last = None

    return psc1, timestamp, sequence, errors


def read_mid(path, strict=True):
    """Return "Subject ID" and other information extracted from mid_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    mid_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the mid_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        The last ascending sequence of trials ('Trial' column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, MID_CSV, strict)


def read_ft(path, strict=True):
    """Return "Subject ID" and other information extracted from ft_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    ft_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the ft_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        The last ascending sequence of onsets ('Trial Start Time (Onset)'
        column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, FT_CSV, strict)


def read_ss(path, strict=True):
    """Return "Subject ID" and other information extracted from ss_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    ss_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the ss_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        The last ascending sequence of trials ('Trial' column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, SS_CSV, strict)


def read_recog(path, strict=True):
    """Return "Subject ID" and other information extracted from recog_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    recog_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the recog_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    times : array_like
        The last ascending sequence of trials ('TimePassed' column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, RECOG_CSV, strict)


def main():
    import os.path

    ROOT_DIR = '/neurospin/imagen/FU2/RAW/PSC1'
    for center in os.listdir(ROOT_DIR):
        center_path = os.path.join(ROOT_DIR, center)
        for subject in os.listdir(center_path):
            subject_path = os.path.join(center_path, subject)
            behavioral_path = os.path.join(subject_path,
                                           'AdditionalData', 'Scanning')
            if os.path.isdir(behavioral_path):
                #~ mid_files = tuple(os.path.join(behavioral_path, b)
                #~                   for b in os.listdir(behavioral_path)
                #~                   if 'mid_' in b)
                #~ for mid_file in mid_files:
                #~     (psc1, _timestamp, onsets, errors) = read_mid(mid_file, False)
                #~     print('▸ {0} MID {1}'.format(psc1, len(onsets)))
                #~     for error in errors:
                #~         print('  ✗ {0}: {1}'.format(error.message,
                #~               os.path.relpath(error.path, ROOT_DIR)))
                #~ ft_files = tuple(os.path.join(behavioral_path, b)
                #~                  for b in os.listdir(behavioral_path)
                #~                  if 'ft_' in b)
                #~ for ft_file in ft_files:
                #~     (psc1, _timestamp, onsets, errors) = read_ft(ft_file, False)
                #~     print('▸ {0} FT {1}'.format(psc1, len(onsets)))
                #~     for error in errors:
                #~         print('  ✗ {0}: {1}'.format(error.message,
                #~               os.path.relpath(error.path, ROOT_DIR)))
                ss_files = tuple(os.path.join(behavioral_path, b)
                                 for b in os.listdir(behavioral_path)
                                 if 'ss_' in b)
                for ss_file in ss_files:
                    (psc1, timestamp, onsets, errors) = read_ss(ss_file,  # pylint: disable=unused-variable
                                                                False)
                    print('▸ {0} SS {1}'.format(psc1, len(onsets)))
                    for error in errors:
                        print('  ✗ {0}: {1}'.format(error.message,
                              os.path.relpath(error.path, ROOT_DIR)))
                #~ recog_files = tuple(os.path.join(behavioral_path, b)
                #~                     for b in os.listdir(behavioral_path)
                #~                     if 'recog_' in b)
                #~ for recog_file in recog_files:
                #~     (psc1, timestamp, onsets, errors) = read_recog(recog_file, False)
                #~     print('▸ {0} RECOG {1}'.format(psc1, len(onsets)))
                #~     for error in errors:
                #~         print('  ✗ {0}: {1}'.format(error.message,
                #~               os.path.relpath(error.path, ROOT_DIR)))


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/imagen_databank/cantab.py:
--------------------------------------------------------------------------------
# Copyright (c) 2014-2017 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

from zipfile import ZipFile
from lxml import etree
import datetime
import csv
import re
import sys

import logging
logger = logging.getLogger(__name__)

__all__ = ['CANTAB_CCLAR', 'DETAILED_DATASHEET_CSV', 'DATASHEET_CSV',
           'REPORT_HTML',
           'read_cant', 'read_datasheet', 'read_detailed_datasheet',
           'read_report']


#
# types of files we expect to find under AdditionalData
#
CANTAB_CCLAR = 'cantab'
DETAILED_DATASHEET_CSV = 'detailed_datasheet'
DATASHEET_CSV = 'datasheet'
REPORT_HTML = 'report'

_ID_XPATH = ".//{http://www.camcog.com/proteus/entity/xml}attribute[@name='ID']"


def read_cant(path):
    """Return "Subject ID" values found in a cant_*.cclar file.

    Parameters
    ----------
    path : unicode
        Path to the cant_*.cclar file to read from.

    Returns
    -------
    set
        "Subject ID" values found in the file.

    """
    subject_ids = set()
    cantfile = ZipFile(path, 'r')
    for name in cantfile.namelist():
        if name.endswith('index.xml'):
            root = etree.fromstring(cantfile.read(name))
            for element in root.findall(_ID_XPATH):
                subject_ids.add(element.attrib['value'])
    cantfile.close()
    return subject_ids


def _parse_csv_datetime(date_string):
    """Read date in the format found in CSV files.

    * LONDON      01-Feb-2015 12:34:56
    * NOTTINGHAM  01-Feb-2015 12:34:56   01/02/2015 12:34
    * DUBLIN      01-Feb-2015 12:34:56
    * BERLIN      01.02.2015 12:34:56
    * HAMBURG     01.02.2015 12:34:56
    * MANNHEIM    01.02.2015 12:34:56
    * PARIS       01 Feb 2015 12:34:56
    * DRESDEN     12:34:56 01.02.2015

    """
    DATE_FORMATS = (
        '%d-%b-%Y %H:%M:%S',  # 01-Feb-2015 12:34:56
        '%d/%m/%Y %H:%M',     # 01/02/2015 12:34
        '%d.%m.%Y %H:%M:%S',  # 01.02.2015 12:34:56
        '%d %b %Y %H:%M:%S',  # 01 Feb 2015 12:34:56
        '%H:%M:%S %d.%m.%Y',  # 12:34:56 01.02.2015
    )
    for date_format in DATE_FORMATS:
        try:
            dt = datetime.datetime.strptime(date_string, date_format)
            return dt
        except ValueError:
            pass
    return None


def read_datasheet(path):
    """Return "Subject ID" and other information extracted from datasheet_*.csv.
112 | 113 | Parameters 114 | ---------- 115 | path : unicode 116 | Path to the datasheet_*.csv file to read from. 117 | 118 | Returns 119 | ------- 120 | list 121 | * "Subject ID" values found in the file. 122 | * "Session start time" values found in the file. 123 | * number of rows. 124 | * minimal number of columns. 125 | * list of column titles. 126 | 127 | """ 128 | with open(path) as csvfile: 129 | # read header 130 | dialect = csv.Sniffer().sniff(csvfile.read()) 131 | csvfile.seek(0) 132 | reader = csv.reader(csvfile, dialect) 133 | rows = 0 134 | columns_max = columns_min = 0 135 | fields = {} 136 | header = next(reader) 137 | if header: 138 | fields = {v: i for i, v in enumerate(header)} 139 | columns_max = columns_min = len(header) 140 | rows += 1 141 | subject_ids = set() 142 | session_start_times = set() 143 | # read values from the rest of the table 144 | for row in reader: 145 | if len(row) > 0: 146 | if "Subject ID" in fields: 147 | subject_id = row[fields["Subject ID"]] 148 | else: 149 | subject_id = row[0] 150 | subject_ids.add(subject_id) 151 | if "Session start time" in fields: 152 | session_start_time = _parse_csv_datetime(row[fields["Session start time"]]) 153 | if session_start_time is not None: 154 | if session_start_time < datetime.datetime(2007, 1, 1): 155 | logger.warning('"Session start time" for %s anterior to 2007: %s', 156 | subject_id, session_start_time.date()) 157 | session_start_times.add(session_start_time) 158 | columns_min = min(len(row), columns_min) 159 | columns_max = max(len(row), columns_max) 160 | rows += 1 161 | return (subject_ids, session_start_times, rows, columns_min, fields) 162 | 163 | 164 | # 165 | # match lines with "Subject ID" 166 | # 167 | _DETAILED_DATASHEET_REGEX = re.compile(r'"?Subject ID : (\w*)"?') 168 | 169 | 170 | def read_detailed_datasheet(path): 171 | """Return "Subject ID" values found in a detailed_datasheet_*.csv file. 172 | 173 | Parameters 174 | ---------- 175 | path : unicode 176 | Path to the detailed_datasheet_*.csv file to read from. 177 | 178 | Returns 179 | ------- 180 | list 181 | "Subject ID" values found in the file. 182 | 183 | """ 184 | with open(path, encoding='latin1') as f: 185 | subject_ids = set() 186 | for line in f: 187 | match = _DETAILED_DATASHEET_REGEX.match(line) 188 | if match: 189 | subject_ids.add(match.group(1)) 190 | return subject_ids 191 | 192 | 193 | _REPORT_REGEX = re.compile('Subject ID(.*)Gender(.*)') 194 | 195 | 196 | def read_report(path): 197 | """Return "Subject ID" values found in a report_*.html file. 198 | 199 | Parameters 200 | ---------- 201 | path : unicode 202 | Path to the report_*.html to read from. 203 | 204 | Returns 205 | ------- 206 | list 207 | "Subject ID" values found in the file. 208 | 209 | """ 210 | with open(path, encoding='latin-1') as report_html: 211 | subject_ids = set() 212 | for line in report_html: 213 | match = _REPORT_REGEX.match(line) 214 | if match: 215 | subject_ids.add(match.group(1)) 216 | return subject_ids 217 | -------------------------------------------------------------------------------- /imagen_databank/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2019 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. 
You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import re
import datetime

import logging
logger = logging.getLogger(__name__)

__all__ = ['LONDON', 'NOTTINGHAM', 'DUBLIN', 'BERLIN',
           'HAMBURG', 'MANNHEIM', 'PARIS', 'DRESDEN',
           'SOUTHAMPTON', 'AACHEN',
           'CENTER_NAME',
           'PSC2_FROM_PSC1', 'PSC1_FROM_PSC2',
           'PSC1_FROM_DAWBA', 'PSC2_FROM_DAWBA',  # PSC2_FROM_DAWBA is obsolete
           'DOB_FROM_PSC1',
           'detect_psc1', 'detect_psc2', 'guess_psc1',
           'Error']


#
# numerical ID of acquisition centers of Imagen
#
LONDON = 1
NOTTINGHAM = 2
DUBLIN = 3
BERLIN = 4
HAMBURG = 5
MANNHEIM = 6
PARIS = 7
DRESDEN = 8
SOUTHAMPTON = 90  # Stratify
AACHEN = 91  # Stratify

#
# from numerical ID to standard name of acquisition centers of Imagen
#
CENTER_NAME = {
    LONDON: 'LONDON',
    NOTTINGHAM: 'NOTTINGHAM',
    DUBLIN: 'DUBLIN',
    BERLIN: 'BERLIN',
    HAMBURG: 'HAMBURG',
    MANNHEIM: 'MANNHEIM',
    PARIS: 'PARIS',
    DRESDEN: 'DRESDEN',
    SOUTHAMPTON: 'SOUTHAMPTON',  # Stratify
    AACHEN: 'AACHEN',  # Stratify
}

#
# file that maps PSC1 to PSC2 and DAWBA codes to PSC1
#
_PSC2PSC = '/neurospin/imagen/src/scripts/psc_tools/psc2psc.csv'
_PSC2PSC_STRATIFY = '/neurospin/imagen/src/scripts/psc_tools/psc2psc_SB.csv'

#
# file that maps PSC1 codes to date of birth
#
_DOB = '/neurospin/imagen/src/scripts/psc_tools/DOB.csv'
_DOB_STRATIFY = '/neurospin/imagen/src/scripts/psc_tools/DOB_SB.csv'


def _initialize_psc1_dawba_psc2():
    """Return dictionaries to map PSC1 to PSC2 and DAWBA codes to PSC1.

    Mappings are read from the PSC1=DAWBA=PSC2 tables listed in _PSC2PSC
    and _PSC2PSC_STRATIFY.

    Returns
    -------
    tuple
        Pair of PSC1→PSC2 and DAWBA→PSC1 dictionaries.
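
    A minimal sketch of the expected file contents (these particular codes
    are made up for illustration):

        PSC1=DAWBA=PSC2
        010000123456=123456=000000654321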

    """
    psc2_from_psc1 = {}
    psc1_from_dawba = {}
    for psc2psc in (_PSC2PSC, _PSC2PSC_STRATIFY):
        with open(psc2psc) as f:  # universal newlines are the default in Python 3
            for line in f:
                psc1, dawba, psc2 = line.strip('\n').split('=')
                # 1st line is: PSC1=DAWBA=PSC2
                if psc1 == 'PSC1' and dawba == 'DAWBA' and psc2 == 'PSC2':
                    continue
                if psc1 in psc2_from_psc1:  # check duplicate PSC1 entries for consistency
                    if psc2_from_psc1[psc1] != psc2:
                        logger.critical('inconsistent PSC1/PSC2 mapping: %s', psc2psc)
                        raise Exception('inconsistent PSC1/PSC2 mapping')
                else:
                    psc2_from_psc1[psc1] = psc2
                psc1_from_dawba[dawba] = psc1
    return psc2_from_psc1, psc1_from_dawba


_REGEX_DOB = re.compile(r'(\d{4})-(\d{2})-(\d{2})')


def _initialize_dob():
    """Return dictionary to map PSC1 code to date of birth.

    Dates of birth are read from the DOB.csv files left over by the
    initial Imagen team (_DOB and _DOB_STRATIFY).

    Returns
    -------
    dict
        Dictionary mapping PSC1 codes to dates of birth.

    """
    dob_from_psc1 = {}
    for dob_path in (_DOB, _DOB_STRATIFY):  # distinct from the "dob" values read below
        with open(dob_path) as f:
            for line in f:
                psc1, dob, dummy_when = line.strip('\n').split(',')
                match = _REGEX_DOB.match(dob)
                if match:
                    year = int(match.group(1))
                    month = int(match.group(2))
                    day = int(match.group(3))
                    if year > 2012 or year < 1987:
                        raise Exception('unexpected date of birth: {0} ({1}-{2}-{3})'.format(dob, year, month, day))
                    dob_from_psc1[psc1] = datetime.date(year, month, day)
                else:
                    raise Exception('unexpected line in DOB.csv: {0}'.format(line))
    return dob_from_psc1


PSC2_FROM_PSC1, PSC1_FROM_DAWBA = _initialize_psc1_dawba_psc2()
PSC2_FROM_DAWBA = {k: PSC2_FROM_PSC1[v]  # obsolete
                   for k, v in PSC1_FROM_DAWBA.items() if v in PSC2_FROM_PSC1}
PSC1_FROM_PSC2 = {v: k for k, v in PSC2_FROM_PSC1.items()}
DOB_FROM_PSC1 = _initialize_dob()
DOB_FROM_PSC2 = {PSC2_FROM_PSC1[k]: v  # obsolete
                 for k, v in DOB_FROM_PSC1.items() if k in PSC2_FROM_PSC1}


#
# the heuristic to detect a PSC1 code is that:
# - it starts with 0 followed by the digit associated to each center
# - it is a series of 12 digits
#
_PSC1_REGEX = re.compile('(0[' +
                         ''.join([str(c) for c in CENTER_NAME]) +
                         r']\d{10})(?!\d)')  # a PSC1 code is not part of a longer run of digits


def detect_psc1(string):
    """Find potential PSC1 codes in a filename.

    PSC1 codes are sequences of 12 digits starting with 0 followed by a
    different digit for each center, followed by 10 digits.

    Parameters
    ----------
    string : str
        The string to search for PSC1.

    Returns
    -------
    str
        Potential PSC1 code or None.

    """
    match = _PSC1_REGEX.search(string)
    if match:
        return match.group(1)
    else:
        return None


#
# the heuristic to detect a PSC2 code is that:
# - it starts with 0 followed by a different digit for each center
# - it is a series of 12 digits
#
_PSC2_REGEX = re.compile(r'(0\d{11})(?!\d)')


def detect_psc2(string):
    """Find potential PSC2 codes in a filename.

    PSC2 codes are sequences of 12 digits starting with 0.

    Parameters
    ----------
    string : str
        The string to search for PSC2.
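        For example, a made-up name like 'datasheet_012345678901.csv'
        would yield '012345678901'.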
217 | 218 | Returns 219 | ------- 220 | str 221 | Potential PSC2 code or None. 222 | 223 | """ 224 | match = _PSC2_REGEX.search(string) 225 | if match: 226 | return match.group(1) 227 | else: 228 | return None 229 | 230 | 231 | def guess_psc1(subject_id, center): 232 | subject_id = subject_id.split('_')[0] 233 | if subject_id.upper().startswith('FU2'): 234 | subject_id = subject_id[3:] 235 | if subject_id.upper().endswith('FU3'): 236 | subject_id = subject_id[:-3] 237 | elif subject_id.upper().endswith('FU2'): 238 | subject_id = subject_id[:-3] 239 | elif subject_id.upper().endswith('FU'): 240 | subject_id = subject_id[:-2] 241 | # this is very empirical and based on cases seen so far! 242 | if len(subject_id) < 10: 243 | subject_id = '0' + str(center) + subject_id.rjust(10, '0') 244 | elif len(subject_id) < 11: 245 | if len(subject_id) < 10: 246 | subject_id = subject_id.rjust(10, '0') 247 | subject_id = '0' + str(center) + subject_id 248 | elif len(subject_id) < 12: 249 | subject_id = subject_id[0:2] + '0' + subject_id[2:] 250 | # check this is an existing PSC1 code 251 | if subject_id in PSC2_FROM_PSC1: 252 | return subject_id 253 | return None 254 | 255 | 256 | class Error: 257 | """Error while parsing files. 258 | 259 | Returned by functions that parse Cantab and behavioral files. 260 | 261 | Attributes 262 | ---------- 263 | path : str 264 | File name. 265 | message : str 266 | Message explaining the error. 267 | sample : str 268 | Part of the file that generated the error. 269 | 270 | """ 271 | _SAMPLE_LEN = 30 272 | 273 | def __init__(self, path, message, sample=None): 274 | self.path = path 275 | self.message = message 276 | self.sample = sample 277 | 278 | def __str__(self): 279 | if self.path: 280 | if self.sample: 281 | sample = repr(self.sample) 282 | if len(sample) > self._SAMPLE_LEN: 283 | sample = sample[:self._SAMPLE_LEN] + '...' 284 | return '{0}: <{1}>: {2}'.format(self.message, sample, self.path) 285 | else: 286 | return '{0}: {1}'.format(self.message, self.path) 287 | else: 288 | return '{0}'.format(self.message) 289 | -------------------------------------------------------------------------------- /imagen_databank/dicom_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2017 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 14 | # 15 | # In this respect, the user's attention is drawn to the risks associated 16 | # with loading, using, modifying and/or developing or reproducing the 17 | # software by the user in light of its specific status of free software, 18 | # that may mean that it is complicated to manipulate, and that also 19 | # therefore means that it is reserved for developers and experienced 20 | # professionals having in-depth computer knowledge. 
Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import re
import datetime
import dateutil.tz
try:
    import pydicom
    from pydicom.filereader import InvalidDicomError
    from pydicom.filereader import dcmread
except ImportError:  # a bare "except" would also mask unrelated errors
    # pydicom < 1.0 ships as the "dicom" module and names dcmread() read_file()
    import dicom as pydicom
    from dicom.filereader import InvalidDicomError
    from dicom.filereader import read_file as dcmread

import logging
logger = logging.getLogger(__name__)

__all__ = ['read_metadata']


#
# parse DICOM DateTime and Time tags
#
_REGEX_DT = re.compile(r"((\d{4,14})(\.(\d{1,6}))?)([+-]\d{4})?")
_REGEX_TM = re.compile(r"(\d{2,6})(\.(\d{1,6}))?")


def _datetime_from_dt(dt):
    """Convert DICOM DateTime to Python datetime.

    Parameters
    ----------
    dt : str
        DateTime tag from DICOM image.

    Returns
    -------
    datetime

    """
    match = _REGEX_DT.match(dt)
    if match and len(dt) <= 26:
        dt_match = match.group(2)
        year = int(dt_match[0:4])
        if len(dt_match) < 6:
            month = 1
        else:
            month = int(dt_match[4:6])
        if len(dt_match) < 8:
            day = 1
        else:
            day = int(dt_match[6:8])
        if len(dt_match) < 10:
            hour = 0
        else:
            hour = int(dt_match[8:10])
        if len(dt_match) < 12:
            minute = 0
        else:
            minute = int(dt_match[10:12])
        if len(dt_match) < 14:
            second = 0
            microsecond = 0
        else:
            second = int(dt_match[12:14])
            ms_match = match.group(4)
            if ms_match:
                microsecond = int(ms_match.rstrip().ljust(6, '0'))
            else:
                microsecond = 0
        tz_match = match.group(5)
        if tz_match:
            offset = (int(tz_match[1:3]) * 60 + int(tz_match[3:5])) * 60
            if tz_match[0] == '-':
                offset = -offset
            tzinfo = dateutil.tz.tzoffset(tz_match, offset)
        else:
            tzinfo = None
        return datetime.datetime(year, month, day,
                                 hour, minute, second, microsecond,
                                 tzinfo)
    else:
        logger.error('incorrect DICOM DT: %s', dt)
        return None


def _date_from_da(da):
    """Convert DICOM Date to Python date.

    Parameters
    ----------
    da : str
        Date tag from DICOM image.

    Returns
    -------
    date

    """
    if len(da) == 8:
        year = int(da[0:4])
        month = int(da[4:6])
        day = int(da[6:8])
        return datetime.date(year, month, day)
    elif len(da) == 10 and da[4] == '.' and da[7] == '.':
        # ACR-NEMA Standard 300, predecessor to DICOM - for compatibility
        year = int(da[0:4])
        month = int(da[5:7])
        day = int(da[8:10])
        return datetime.date(year, month, day)
    else:
        logger.error('incorrect DICOM DA: %s', da)
        return None


def _time_from_tm(tm):
    """Convert DICOM Time to Python time.

    Parameters
    ----------
    tm : str
        Time tag from DICOM image.
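        For example, a typical value such as '123456.789012' maps to
        datetime.time(12, 34, 56, 789012).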
147 | 148 | Returns 149 | ------- 150 | time 151 | 152 | """ 153 | match = _REGEX_TM.match(tm) 154 | if match and len(tm) <= 16: 155 | tm_match = match.group(1) 156 | hour = int(tm_match[0:2]) 157 | if len(tm_match) < 4: 158 | minute = 0 159 | else: 160 | minute = int(tm_match[2:4]) 161 | if len(tm_match) < 6: 162 | second = 0 163 | microsecond = 0 164 | else: 165 | second = int(tm_match[4:6]) 166 | ms_match = match.group(3) 167 | if ms_match: 168 | microsecond = int(ms_match.rstrip().ljust(6, '0')) 169 | else: 170 | microsecond = 0 171 | return datetime.time(hour, minute, second, microsecond) 172 | else: 173 | logger.error('incorrect DICOM TM: %s', tm) 174 | return None 175 | 176 | 177 | def read_metadata(path, force=False): 178 | """Read select metadata from a DICOM file. 179 | 180 | We always attempt to read the following DICOM tags. An exception is raised 181 | if one of the tags cannot be read: 182 | - SOPClassUID 183 | - SeriesInstanceUID 184 | - SeriesNumber 185 | - SeriesDescription 186 | - SOPInstanceUID 187 | 188 | We also attempt to read the following DICOM tags if they are present: 189 | - ImageType 190 | - AcquisitionDateTime 191 | - AcquisitionDate 192 | - AcquisitionTime 193 | - StationName 194 | - Manufacturer 195 | - ManufacturerModelName 196 | - DeviceSerialNumber 197 | - SoftwareVersions 198 | - PatientID 199 | 200 | Parameters 201 | ---------- 202 | path : str 203 | Path name of the DICOM file. 204 | force : bool 205 | If True read nonstandard files, typically without "Part 10" headers. 206 | 207 | Returns 208 | ------- 209 | dict 210 | 211 | """ 212 | dataset = dcmread(path, force=force) 213 | 214 | # missing compulsory tags will raise exceptions 215 | if 'SeriesDescription' in dataset: 216 | description = dataset.SeriesDescription 217 | elif 'ProtocolName' in dataset: 218 | description = dataset.ProtocolName 219 | else: 220 | description = dataset.SeriesDescription # raise an exception! 
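    # note: when both SeriesDescription and ProtocolName are missing, the
    # last branch above re-reads dataset.SeriesDescription so that pydicom
    # raises AttributeError, which walk_image_data() catches and logs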
221 | 222 | metadata = { 223 | 'SOPClassUID': dataset.SOPClassUID, 224 | 'SOPInstanceUID': dataset.SOPInstanceUID, 225 | 'SeriesInstanceUID': dataset.SeriesInstanceUID, 226 | 'SeriesNumber': dataset.SeriesNumber, 227 | 'SeriesDescription': description, 228 | } 229 | 230 | # optional tags 231 | if 'ImageType' in dataset: 232 | metadata['ImageType'] = dataset.ImageType 233 | if 'AcquisitionDateTime' in dataset: 234 | dt = _datetime_from_dt(dataset.AcquisitionDateTime) 235 | metadata['AcquisitionDate'] = dt.date() 236 | metadata['AcquisitionTime'] = dt.time() 237 | else: 238 | if 'AcquisitionDate' in dataset: 239 | metadata['AcquisitionDate'] = _date_from_da(dataset.AcquisitionDate) 240 | if 'AcquisitionTime' in dataset: 241 | metadata['AcquisitionTime'] = _time_from_tm(dataset.AcquisitionTime) 242 | if 'StationName' in dataset: 243 | metadata['StationName'] = dataset.StationName 244 | if 'Manufacturer' in dataset: 245 | metadata['Manufacturer'] = dataset.Manufacturer 246 | if 'ManufacturerModelName' in dataset: 247 | metadata['ManufacturerModelName'] = dataset.ManufacturerModelName 248 | if 'DeviceSerialNumber' in dataset: 249 | metadata['DeviceSerialNumber'] = dataset.DeviceSerialNumber 250 | if 'SoftwareVersions' in dataset: 251 | if pydicom.dataelem.isMultiValue(dataset.SoftwareVersions): 252 | # usually the last part is the more informative 253 | # for example on Philips scanners: 254 | # ['3.2.1', '3.2.1.1'] → '3.2.1.1' 255 | metadata['SoftwareVersions'] = dataset.SoftwareVersions[-1] 256 | else: 257 | metadata['SoftwareVersions'] = dataset.SoftwareVersions 258 | if 'StudyComments' in dataset: # DUBLIN 259 | metadata['StudyComments'] = dataset.StudyComments 260 | if 'PatientName' in dataset: # BERLIN, NOTTINGHAM 261 | metadata['PatientName'] = dataset.PatientName 262 | if 'ImageComments' in dataset: # HAMBURG, DRESDEN 263 | metadata['ImageComments'] = dataset.ImageComments 264 | if 'StudyDescription' in dataset: # LONDON 265 | metadata['StudyDescription'] = dataset.StudyDescription 266 | if 'PerformedProcedureStepDescription' in dataset: # LONDON 267 | metadata['PerformedProcedureStepDescription'] = dataset.PerformedProcedureStepDescription 268 | if 'PatientID' in dataset: # BERLIN, MANNHEIM, PARIS 269 | metadata['PatientID'] = dataset.PatientID 270 | 271 | return metadata 272 | -------------------------------------------------------------------------------- /imagen_databank/image_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2018 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 
14 | # 15 | # In this respect, the user's attention is drawn to the risks associated 16 | # with loading, using, modifying and/or developing or reproducing the 17 | # software by the user in light of its specific status of free software, 18 | # that may mean that it is complicated to manipulate, and that also 19 | # therefore means that it is reserved for developers and experienced 20 | # professionals having in-depth computer knowledge. Users are therefore 21 | # encouraged to load and test the software's suitability as regards their 22 | # requirements in conditions enabling the security of their systems and/or 23 | # data to be ensured and, more generally, to use and operate it in the 24 | # same conditions as regards security. 25 | # 26 | # The fact that you are presently reading this means that you have had 27 | # knowledge of the CeCILL license and that you accept its terms. 28 | 29 | import os 30 | import re 31 | import time 32 | import datetime 33 | 34 | from .core import (LONDON, NOTTINGHAM, DUBLIN, BERLIN, 35 | HAMBURG, MANNHEIM, PARIS, DRESDEN, 36 | SOUTHAMPTON, AACHEN) 37 | from .dicom_utils import read_metadata 38 | from .dicom_utils import InvalidDicomError 39 | 40 | import logging 41 | logger = logging.getLogger(__name__) 42 | 43 | __all__ = ['SEQUENCE_LOCALIZER_CALIBRATION', 'SEQUENCE_T2', 44 | 'SEQUENCE_T2_FLAIR', 'SEQUENCE_ADNI_MPRAGE', 45 | 'SEQUENCE_MID', 'SEQUENCE_FT', 'SEQUENCE_SST', 46 | 'SEQUENCE_B0_MAP', 'SEQUENCE_DTI', 47 | 'SEQUENCE_RESTING_STATE', 48 | 'SEQUENCE_NODDI', 49 | 'SEQUENCE_NAME', 50 | 'NONSTANDARD_DICOM', 51 | 'series_type_from_description', 52 | 'walk_image_data', 'report_image_data'] 53 | 54 | 55 | # 56 | # information sent by Anna Cattrell to Dimitri on 13 June 2014: 57 | # Standard Operating Procedure IMAGEN Follow-up 2 study 58 | # 59 | # 2.2.1 Overview of Imaging Session: 60 | # 61 | # 2. 3 plane localizer / Parallel imaging calibration 62 | # 3. Axial T2 slices (site specific duration) 63 | # 4. Axial T2 Flair slices (site specific duration) 64 | # 5. 3D Sagittal ADNI MPRAGE (Long) 65 | # 7. Monetary Incentive Delay Task (MID) 66 | # 9. Face task (FT) 67 | # 11. Stop-signal task (SST) 68 | # 12. B0 Map 69 | # 13. DTI (duration is heart-rate dependent at sites with cardiac gating) 70 | # 14. Resting State 71 | # 15. Short MPRAGE (baseline only) 72 | # 16. EPI Global (JBP suggestion followed by a few centres at baseline) 73 | # 17. 
NODDI (optional, added in Follow-up 3) 74 | # 75 | # the following constants attempt to describe each of these sequences 76 | # 77 | SEQUENCE_LOCALIZER_CALIBRATION = 2 78 | SEQUENCE_T2 = 3 79 | SEQUENCE_T2_FLAIR = 4 80 | SEQUENCE_ADNI_MPRAGE = 5 81 | SEQUENCE_MID = 7 82 | SEQUENCE_FT = 9 83 | SEQUENCE_SST = 11 84 | SEQUENCE_B0_MAP = 12 85 | SEQUENCE_DTI = 13 86 | SEQUENCE_RESTING_STATE = 14 87 | SEQUENCE_SHORT_MPRAGE = 15 88 | SEQUENCE_GLOBAL = 16 89 | SEQUENCE_NODDI = 17 90 | 91 | # 92 | # from sequence ID to sequence name 93 | # 94 | SEQUENCE_NAME = { 95 | SEQUENCE_LOCALIZER_CALIBRATION: 'Localizer/Calibration', 96 | SEQUENCE_T2: 'T2', 97 | SEQUENCE_T2_FLAIR: 'T2 Flair', 98 | SEQUENCE_ADNI_MPRAGE: 'ADNI MPRAGE', 99 | SEQUENCE_MID: 'EPI MID', 100 | SEQUENCE_FT: 'EPI Faces', 101 | SEQUENCE_SST: 'EPI Signal Stop', 102 | SEQUENCE_B0_MAP: 'B0 Map', 103 | SEQUENCE_DTI: 'DTI', 104 | SEQUENCE_RESTING_STATE: 'Resting State', 105 | SEQUENCE_SHORT_MPRAGE: 'Short MPRAGE', 106 | SEQUENCE_GLOBAL: 'EPI Global', 107 | SEQUENCE_NODDI: 'NODDI', 108 | } 109 | 110 | # 111 | # check sequence names against these regex'es when trying to identify 112 | # the type of a sequence from its name 113 | # 114 | # in some case order is important, for example: 115 | # - first match 'FLAIR' and 'short MPRAGE' 116 | # - then match 'T2' and 'MPRAGE' 117 | # 118 | _LOOSE_IMAGE_DATA_REGEXES = ( 119 | (re.compile(r'LOCALI[ZS]ER', re.IGNORECASE), SEQUENCE_LOCALIZER_CALIBRATION), 120 | # LONDON calibration 121 | (re.compile(r'ASSET[- ]Cal', re.IGNORECASE), SEQUENCE_LOCALIZER_CALIBRATION), 122 | # NOTTINGHAM 3-plane scout 123 | (re.compile(r'Survey_SHC'), SEQUENCE_LOCALIZER_CALIBRATION), 124 | # LONDON FU3 3-plane Localizer 125 | (re.compile(r'3Plane'), SEQUENCE_LOCALIZER_CALIBRATION), 126 | # first search for "FLAIR" then for "T2" 127 | (re.compile(r'FLAIR', re.IGNORECASE), SEQUENCE_T2_FLAIR), 128 | (re.compile(r'T2', re.IGNORECASE), SEQUENCE_T2), 129 | (re.compile(r'short MPRAGE', re.IGNORECASE), SEQUENCE_SHORT_MPRAGE), 130 | (re.compile(r'MPRAGE', re.IGNORECASE), SEQUENCE_ADNI_MPRAGE), 131 | (re.compile(r'MID', re.IGNORECASE), SEQUENCE_MID), 132 | # "EPI short reward" and "EPI reward short" are the same as "EPI short MID" 133 | (re.compile(r'reward', re.IGNORECASE), SEQUENCE_MID), 134 | (re.compile(r'face', re.IGNORECASE), SEQUENCE_FT), 135 | (re.compile(r'stop[- ]signal', re.IGNORECASE), SEQUENCE_SST), 136 | # LONDON stop signal DICOM files contain "SST" 137 | (re.compile(r'SST', re.IGNORECASE), SEQUENCE_SST), 138 | (re.compile(r'global', re.IGNORECASE), SEQUENCE_GLOBAL), 139 | (re.compile(r'B0'), SEQUENCE_B0_MAP), 140 | # LONDON B0 maps made of 3 DICOM files containing "FIELDMAP" 141 | (re.compile(r'FIELDMAP', re.IGNORECASE), SEQUENCE_B0_MAP), 142 | (re.compile(r'DTI'), SEQUENCE_DTI), 143 | (re.compile(r'REST', re.IGNORECASE), SEQUENCE_RESTING_STATE), 144 | ) 145 | 146 | # 147 | # some acquisition centers may send nonstandard DICOM files 148 | # 149 | # for example Hamburg have sent DICOM files without "PART 10" headers 150 | # 151 | NONSTANDARD_DICOM = { 152 | LONDON: False, 153 | NOTTINGHAM: False, 154 | DUBLIN: False, 155 | BERLIN: False, 156 | HAMBURG: True, 157 | MANNHEIM: False, 158 | PARIS: False, 159 | DRESDEN: False, 160 | SOUTHAMPTON: False, 161 | AACHEN: False, 162 | } 163 | 164 | # 165 | # the SOP Class UIDs we expect to find while scanning DICOM files: 166 | # - those we process 167 | # - those we discard silently 168 | # 169 | # any other SOP Class UID generates a warning 170 | # 171 | _ALLOWED_SOP_CLASS_UIDS 
= { 172 | 'MR Image Storage', 173 | 'Enhanced MR Image Storage', 174 | } 175 | _IGNORED_SOP_CLASS_UIDS = { 176 | 'Grayscale Softcopy Presentation State Storage SOP Class', 177 | 'Raw Data Storage', 178 | 'Enhanced SR Storage', 179 | 'Philips Private Gyroscan MR Serie Data', 180 | 'Private MR Series Data Storage', '1.3.46.670589.11.0.0.12.2', 181 | 'Private MR Examcard Storage', '1.3.46.670589.11.0.0.12.4', 182 | 'Secondary Capture Image Storage', 183 | } 184 | 185 | 186 | def series_type_from_description(series_description): 187 | """Match series description to those listed in Imagen FU2 SOPs. 188 | 189 | This matching function is empirical and based on experimentation. 190 | 191 | Parameters 192 | ---------- 193 | series_description : unicode 194 | The series description to match. 195 | 196 | Returns 197 | ------- 198 | str 199 | If the series description loosely matches a series type listed 200 | in the SOPs, return this series type, else return None. 201 | 202 | """ 203 | for regex, series_type in _LOOSE_IMAGE_DATA_REGEXES: 204 | if regex.search(series_description): 205 | return series_type 206 | return None 207 | 208 | 209 | def walk_image_data(path, force=False): 210 | """Generate information on DICOM files in a directory. 211 | 212 | File that cannot be read are skipped and an error message is logged. 213 | 214 | Parameters 215 | ---------- 216 | path : unicode 217 | Directory to read DICOM files from. 218 | force : bool 219 | Try reading nonstandard DICOM files, typically without "PART 10" headers. 220 | 221 | Yields 222 | ------ 223 | tuple 224 | Yields a pair (metadata, relpath) where metadata is a dictionary 225 | of extracted DICOM metadata. 226 | 227 | """ 228 | n = 0 229 | start = time.time() 230 | 231 | logger.info('start processing files under: %s', path) 232 | 233 | for root, dummy_dirs, files in os.walk(path): 234 | n += len(files) 235 | for filename in files: 236 | abspath = os.path.join(root, filename) 237 | relpath = os.path.normpath(os.path.relpath(abspath, path)) 238 | # skip DICOMDIR since we are going to read all DICOM files anyway 239 | # beware, Nottigham had sent a DICOMDIR2 file! 240 | if filename.startswith('DICOMDIR'): 241 | continue 242 | logger.debug('read file: %s', relpath) 243 | try: 244 | metadata = read_metadata(abspath, force=force) 245 | except OSError as e: 246 | logger.error('cannot read file (%s): %s', str(e), relpath) 247 | except InvalidDicomError as e: 248 | logger.error('cannot read nonstandard DICOM file: %s: %s', str(e), relpath) 249 | except AttributeError as e: 250 | logger.error('missing attribute: %s: %s', str(e), relpath) 251 | else: 252 | yield (metadata, relpath) 253 | 254 | elapsed = time.time() - start 255 | logger.info('processed %d files in %.2f s: %s', n, elapsed, path) 256 | 257 | 258 | def report_image_data(path, force=False): 259 | """Find DICOM files loosely organized according to the Imagen FU2 SOPs. 260 | 261 | The Imagen FU2 SOPs define a precise file organization for Image Data. In 262 | practice we have found the SOPs are only loosely followed. A method to find 263 | DICOM files while adapting to local variations is to read all DICOM files, 264 | then filter and break them down into series based on their contents. 265 | 266 | This function scans the directory where we expect to find the Image Data 267 | of a dataset and reports series of valid DICOM files. 268 | 269 | Parameters 270 | ---------- 271 | path : unicode 272 | Directory to read DICOM files from. 
    force : bool
        Try reading nonstandard DICOM files, typically without "PART 10" headers.

    Returns
    -------
    dict
        The key identifies a series while the value is a pair
        (series_data, images).

    """
    series_dict = {}

    for (image_data, relpath) in walk_image_data(path, force=force):
        if str(image_data['SOPClassUID']) in _IGNORED_SOP_CLASS_UIDS:
            continue
        # extract DICOM tags of interest, throw exceptions if missing tags!
        series_uid = image_data['SeriesInstanceUID']
        image_uid = image_data['SOPInstanceUID']
        series_number = image_data['SeriesNumber']
        series_description = image_data['SeriesDescription']
        image_types = image_data.get('ImageType', [])
        station_name = image_data.get('StationName', None)
        manufacturer = image_data.get('Manufacturer', None)
        manufacturer_model_name = image_data.get('ManufacturerModelName', None)
        software_versions = image_data.get('SoftwareVersions', None)
        device_serial_number = image_data.get('DeviceSerialNumber', None)
        if 'AcquisitionDate' in image_data:
            acquisition_date = image_data['AcquisitionDate']
            if 'AcquisitionTime' in image_data:
                acquisition_time = image_data['AcquisitionTime']
                timestamp = datetime.datetime.combine(acquisition_date,
                                                      acquisition_time)
            else:
                timestamp = datetime.datetime(acquisition_date.year,
                                              acquisition_date.month,
                                              acquisition_date.day)
        else:
            logger.error('missing acquisition time: %s', relpath)
            # skip files without a usable timestamp: falling through here
            # would leave 'timestamp' unset, or silently reuse the value
            # from the previous file
            continue
        # find PSC1 code
        if 'CommentsOnThePerformedProcedureStep' in image_data:  # DUBLIN
            psc1 = image_data['CommentsOnThePerformedProcedureStep']
        elif 'ImageComments' in image_data:  # HAMBURG, DRESDEN
            psc1 = image_data['ImageComments']
        elif 'PatientID' in image_data:  # LONDON, NOTTINGHAM, BERLIN, MANNHEIM, PARIS
            psc1 = image_data['PatientID']
        elif 'PatientName' in image_data:  # LONDON, NOTTINGHAM, BERLIN, MANNHEIM, PARIS
            psc1 = image_data['PatientName']
        else:
            psc1 = None
        # build the dictionary of series using 'SeriesInstanceUID' as a key
        if series_uid not in series_dict:
            series_data = {
                'SeriesNumber': series_number,
                'SeriesDescription': series_description,
                'ImageType': set(image_types),
                'MinAcquisitionDateTime': timestamp,
                'MaxAcquisitionDateTime': timestamp,
            }
            if station_name:
                series_data['StationName'] = station_name
            if manufacturer:
                series_data['Manufacturer'] = manufacturer
            if manufacturer_model_name:
                series_data['ManufacturerModelName'] = manufacturer_model_name
            if software_versions:
                series_data['SoftwareVersions'] = software_versions
            if device_serial_number:
                series_data['DeviceSerialNumber'] = device_serial_number
            if psc1:
                series_data['PSC1'] = psc1
            # populate series with relative path to DICOM files
            series_dict[series_uid] = (series_data, {image_uid: relpath})
        else:
            series_dict[series_uid][0]['ImageType'] |= set(image_types)
            # check consistency within series:
            if series_number != series_dict[series_uid][0]['SeriesNumber']:
                logger.error('inconsistent series number '
                             '"%s" / "%s":\n  %s\n  %s',
                             series_dict[series_uid][0]['SeriesNumber'],
                             series_number,
                             next(iter(series_dict[series_uid][1].values())),
                             relpath)
            elif series_description != 
series_dict[series_uid][0]['SeriesDescription']: 356 | logger.error('inconsistent series description ' 357 | '"%s" / "%s":\n %s\n %s', 358 | series_dict[series_uid][0]['SeriesDescription'], 359 | series_description, 360 | next(iter(series_dict[series_uid][1].values())), 361 | relpath) 362 | if station_name: 363 | if 'StationName' in series_dict[series_uid][0]: 364 | if station_name != series_dict[series_uid][0]['StationName']: 365 | logger.error('inconsistent station name ' 366 | '"%s" / "%s":\n %s\n %s', 367 | series_dict[series_uid][0]['StationName'], 368 | station_name, 369 | next(iter(series_dict[series_uid][1].values())), 370 | relpath) 371 | else: 372 | series_dict[series_uid][0]['StationName'] = station_name 373 | if manufacturer: 374 | if 'Manufacturer' in series_dict[series_uid][0]: 375 | if manufacturer != series_dict[series_uid][0]['Manufacturer']: 376 | logger.error('inconsistent manufacturer ' 377 | '"%s" / "%s":\n %s\n %s', 378 | series_dict[series_uid][0]['Manufacturer'], 379 | manufacturer, 380 | next(iter(series_dict[series_uid][1].values())), 381 | relpath) 382 | else: 383 | series_dict[series_uid][0]['Manufacturer'] = manufacturer 384 | if manufacturer_model_name: 385 | if 'ManufacturerModelName' in series_dict[series_uid][0]: 386 | if manufacturer_model_name != series_dict[series_uid][0]['ManufacturerModelName']: 387 | logger.error('inconsistent manufacturer model name ' 388 | '"%s" / "%s":\n %s\n %s', 389 | series_dict[series_uid][0]['ManufacturerModelName'], 390 | manufacturer_model_name, 391 | next(iter(series_dict[series_uid][1].values())), 392 | relpath) 393 | else: 394 | series_dict[series_uid][0]['ManufacturerModelName'] = manufacturer_model_name 395 | if software_versions: 396 | if 'SoftwareVersions' in series_dict[series_uid][0]: 397 | if software_versions != series_dict[series_uid][0]['SoftwareVersions']: 398 | logger.error('inconsistent software versions ' 399 | '"%s" / "%s":\n %s\n %s', 400 | series_dict[series_uid][0]['SoftwareVersions'], 401 | software_versions, 402 | next(iter(series_dict[series_uid][1].values())), 403 | relpath) 404 | else: 405 | series_dict[series_uid][0]['SoftwareVersions'] = software_versions 406 | if device_serial_number: 407 | if 'DeviceSerialNumber' in series_dict[series_uid][0]: 408 | if device_serial_number != series_dict[series_uid][0]['DeviceSerialNumber']: 409 | logger.error('inconsistent device serial number ' 410 | '"%s" / "%s":\n %s\n %s', 411 | series_dict[series_uid][0]['DeviceSerialNumber'], 412 | device_serial_number, 413 | next(iter(series_dict[series_uid][1].values())), 414 | relpath) 415 | else: 416 | series_dict[series_uid][0]['DeviceSerialNumber'] = device_serial_number 417 | 418 | if psc1: 419 | if 'PSC1' in series_dict[series_uid][0]: 420 | if psc1 != series_dict[series_uid][0]['PSC1']: 421 | logger.error('inconsistent PSC1 code ' 422 | '"%s" / "%s":\n %s\n %s', 423 | series_dict[series_uid][0]['PSC1'], 424 | psc1, 425 | next(iter(series_dict[series_uid][1].values())), 426 | relpath) 427 | else: 428 | series_dict[series_uid][0]['PSC1'] = psc1 429 | # populate series with relative path to DICOM files 430 | if image_uid not in series_dict[series_uid][1]: 431 | series_dict[series_uid][1][image_uid] = relpath 432 | else: 433 | logger.error('duplicate image in series (%s):' 434 | '\n %s\n %s', 435 | series_description, 436 | series_dict[series_uid][1][image_uid], 437 | relpath) 438 | # update acquisition date/time range by series 439 | if timestamp < series_dict[series_uid][0]['MinAcquisitionDateTime']: 440 | 
series_dict[series_uid][0]['MinAcquisitionDateTime'] = timestamp 441 | if timestamp > series_dict[series_uid][0]['MaxAcquisitionDateTime']: 442 | series_dict[series_uid][0]['MaxAcquisitionDateTime'] = timestamp 443 | 444 | return series_dict 445 | -------------------------------------------------------------------------------- /imagen_databank/sanity/__init__.py: -------------------------------------------------------------------------------- 1 | # noqa 2 | 3 | # Copyright (c) 2014-2017 CEA 4 | # 5 | # This software is governed by the CeCILL license under French law and 6 | # abiding by the rules of distribution of free software. You can use, 7 | # modify and/ or redistribute the software under the terms of the CeCILL 8 | # license as circulated by CEA, CNRS and INRIA at the following URL 9 | # "http://www.cecill.info". 10 | # 11 | # As a counterpart to the access to the source code and rights to copy, 12 | # modify and redistribute granted by the license, users are provided only 13 | # with a limited warranty and the software's author, the holder of the 14 | # economic rights, and the successive licensors have only limited 15 | # liability. 16 | # 17 | # In this respect, the user's attention is drawn to the risks associated 18 | # with loading, using, modifying and/or developing or reproducing the 19 | # software by the user in light of its specific status of free software, 20 | # that may mean that it is complicated to manipulate, and that also 21 | # therefore means that it is reserved for developers and experienced 22 | # professionals having in-depth computer knowledge. Users are therefore 23 | # encouraged to load and test the software's suitability as regards their 24 | # requirements in conditions enabling the security of their systems and/or 25 | # data to be ensured and, more generally, to use and operate it in the 26 | # same conditions as regards security. 27 | # 28 | # The fact that you are presently reading this means that you have had 29 | # knowledge of the CeCILL license and that you accept its terms. 30 | 31 | __all__ = ['cantab', 'imaging'] 32 | 33 | 34 | from . import cantab 35 | __all__.extend(cantab.__all__) 36 | from .cantab import check_cant_name 37 | from .cantab import check_datasheet_name 38 | from .cantab import check_detailed_datasheet_name 39 | from .cantab import check_report_name 40 | from .cantab import check_cant_content 41 | from .cantab import check_datasheet_content 42 | from .cantab import check_detailed_datasheet_content 43 | from .cantab import check_report_content 44 | 45 | from . import imaging 46 | __all__.extend(imaging.__all__) 47 | from .imaging import check_zip_name 48 | from .imaging import check_zip_content 49 | from .imaging import ZipTree 50 | -------------------------------------------------------------------------------- /imagen_databank/scanning.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2017 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 
8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 14 | # 15 | # In this respect, the user's attention is drawn to the risks associated 16 | # with loading, using, modifying and/or developing or reproducing the 17 | # software by the user in light of its specific status of free software, 18 | # that may mean that it is complicated to manipulate, and that also 19 | # therefore means that it is reserved for developers and experienced 20 | # professionals having in-depth computer knowledge. Users are therefore 21 | # encouraged to load and test the software's suitability as regards their 22 | # requirements in conditions enabling the security of their systems and/or 23 | # data to be ensured and, more generally, to use and operate it in the 24 | # same conditions as regards security. 25 | # 26 | # The fact that you are presently reading this means that you have had 27 | # knowledge of the CeCILL license and that you accept its terms. 28 | 29 | import re 30 | 31 | from . core import detect_psc1 32 | 33 | import logging 34 | logger = logging.getLogger(__name__) 35 | 36 | 37 | _SUBJECT_ID_REGEX = re.compile('\d{2}[/\.]\d{2}[/\.]\d{4} \d{2}:\d{2}:\d{2}\tSubject ID: (\w+)') 38 | 39 | 40 | def read_scanning(path): 41 | """Return "Subject ID" values found in a Scanning/*.csv file. 42 | 43 | Parameters 44 | ---------- 45 | path : unicode 46 | Path to the Scanning/*.csv to read from. 47 | 48 | Returns 49 | ------- 50 | str 51 | "Subject ID" value found in the file. 52 | 53 | """ 54 | 55 | with open(path) as scanning: 56 | subject_ids = set() 57 | for line in scanning: 58 | match = _SUBJECT_ID_REGEX.match(line) 59 | if match: 60 | subject_id = detect_psc1(match.group(1)) 61 | if subject_id is None: 62 | subject_id = match.group(1) 63 | subject_ids.add(subject_id) 64 | return subject_ids 65 | -------------------------------------------------------------------------------- /mri/imagen_sample_FU3_mri_deidentify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) 2010-2019 CEA 4 | # 5 | # This software is governed by the CeCILL license under French law and 6 | # abiding by the rules of distribution of free software. You can use, 7 | # modify and/ or redistribute the software under the terms of the CeCILL 8 | # license as circulated by CEA, CNRS and INRIA at the following URL 9 | # "http://www.cecill.info". 10 | # 11 | # As a counterpart to the access to the source code and rights to copy, 12 | # modify and redistribute granted by the license, users are provided only 13 | # with a limited warranty and the software's author, the holder of the 14 | # economic rights, and the successive licensors have only limited 15 | # liability. 16 | # 17 | # In this respect, the user's attention is drawn to the risks associated 18 | # with loading, using, modifying and/or developing or reproducing the 19 | # software by the user in light of its specific status of free software, 20 | # that may mean that it is complicated to manipulate, and that also 21 | # therefore means that it is reserved for developers and experienced 22 | # professionals having in-depth computer knowledge. 
Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import zipfile
import zlib
import tempfile
from datetime import datetime
import shutil
import subprocess
from imagen_databank import PSC2_FROM_PSC1
import json
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


QUARANTINE_PATH = '/imagen/FU3/RAW/QUARANTINE'
BIDS_PATH = '/neurospin/tmp/imagen/dcm2niix'
SKIP_PATH = '/imagen/mri_skip.json'


def quarantine_filename_semantics(filename):
    root, ext = os.path.splitext(filename)

    if ext != '.zip':
        logger.debug('%s: filename without ".zip" extension', filename)

    increment, suffix = root.split('_data_')
    increment = int(increment)

    psc1 = suffix[:-6]  # last 6 characters added by the upload portal
    if len(psc1) > 12:
        timepoint = psc1[12:]
        psc1 = psc1[:12]
    else:
        timepoint = None  # callers must handle file names without a timepoint
        logger.error('%s: missing timepoint', psc1)

    return increment, psc1, timepoint


def timestamps(top, include_dirs=True):
    min_timestamp = datetime.max
    max_timestamp = datetime.min

    for root, dirs, files in os.walk(top):
        if include_dirs:
            for dirname in dirs:
                path = os.path.join(root, dirname)
                timestamp = datetime.fromtimestamp(os.path.getmtime(path))
                min_timestamp = min(timestamp, min_timestamp)
                max_timestamp = max(timestamp, max_timestamp)
        for filename in files:
            path = os.path.join(root, filename)
            timestamp = datetime.fromtimestamp(os.path.getmtime(path))
            min_timestamp = min(timestamp, min_timestamp)
            max_timestamp = max(timestamp, max_timestamp)

    return (min_timestamp, max_timestamp)


def list_datasets(path):
    datasets = {}

    for zip_file in os.listdir(path):
        zip_path = os.path.join(path, zip_file)
        root, ext = os.path.splitext(zip_file)

        if ext != '.zip':
            logger.debug('%s: this is not a ZIP file', zip_file)
            continue
        elif not zipfile.is_zipfile(zip_path):
            logger.warning('%s: skip invalid ZIP file', zip_file)
            continue

        # Unix timestamp of the ZIP file
        timestamp = os.path.getmtime(zip_path)

        # semantics of ZIP file name
        increment, psc1, timepoint = quarantine_filename_semantics(zip_file)
        if timepoint is None:
            continue  # skip malformed file names instead of failing later

        # compare increment/timestamp of ZIP files, keep most recent
        timepoint_datasets = datasets.setdefault(timepoint, {})
        if psc1 in timepoint_datasets:
            old_zip_path, old_increment, old_timestamp = timepoint_datasets[psc1]
            if increment <= old_increment or timestamp <= old_timestamp:
                if increment >= old_increment or timestamp >= old_timestamp:
                    logger.error('%s: inconsistent timestamps', zip_file)
                continue
        timepoint_datasets[psc1] = (zip_path, increment, timestamp)

    return datasets


def dcm2nii(src, dst, comment):
    status = 0

    logger.info('%s: running dcm2niix: %s', src, dst)

    dcm2niix = ['dcm2niix',
                '-z', 'y', '-9',  # note the comma: '-9' and '-c' are separate options
                '-c', comment,
                '-o', dst,
                src]
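    # dcm2niix options, as understood here: "-z y" writes compressed .nii.gz
    # output, "-9" picks the highest gzip compression level, "-c" stores the
    # comment (PSC2 code and timepoint) in the NIfTI header and "-o" sets the
    # output directory - treat this reading as a sketch and check `dcm2niix -h`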
    completed = subprocess.run(dcm2niix,
                               capture_output=True)
    if completed.returncode:
        logger.error('%s: dcm2niix failed: %s',
                     src, completed.stdout)
        status = completed.returncode

    return status


def deidentify(timepoint, psc1, zip_path, bids_path):
    logger.info('%s/%s: deidentify', psc1, timepoint)

    psc2 = PSC2_FROM_PSC1[psc1]
    out_sub_path = os.path.join(bids_path, 'sub-' + psc2)
    out_ses_path = os.path.join(out_sub_path, 'ses-' + timepoint)

    # skip ZIP files that have already been processed
    if os.path.isdir(out_ses_path):
        zip_timestamp = datetime.fromtimestamp(os.path.getmtime(zip_path))
        min_timestamp, max_timestamp = timestamps(out_ses_path)
        if min_timestamp > zip_timestamp:
            return
        else:
            shutil.rmtree(out_ses_path)
    # out_ses_path is created below, once the ZIP file has been successfully
    # unpacked - creating it here would leave an empty session directory
    # behind whenever the ZIP file turns out to be corrupt

    status = 0
    prefix = 'imagen-mri-' + psc1
    with tempfile.TemporaryDirectory(prefix=prefix) as tempdir:
        # unpack ZIP file into temporary directory
        try:
            with zipfile.ZipFile(zip_path) as zip_file:
                zip_file.extractall(tempdir)
        except (zipfile.BadZipFile, OSError, EOFError, zlib.error) as e:
            logger.error('%s/%s: corrupt ZIP file: %s',
                         psc1, timepoint, str(e))
            return

        os.makedirs(out_ses_path)
        status = dcm2nii(tempdir, out_ses_path,
                         psc2 + '/' + timepoint)

    if status:
        shutil.rmtree(out_ses_path)
        if not os.listdir(out_sub_path):  # empty directory
            os.rmdir(out_sub_path)

    return status


def main():
    datasets = list_datasets(QUARANTINE_PATH)

    # read the skip list once instead of once per dataset
    with open(SKIP_PATH) as skip_file:
        skip = json.load(skip_file)

    for timepoint, timepoint_datasets in datasets.items():
        for psc1, (zip_path, increment, timestamp) in timepoint_datasets.items():
            if timepoint in skip and psc1 in skip[timepoint]:
                continue
            deidentify(timepoint, psc1, zip_path, BIDS_PATH)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/onsets/imagen_onsets_copy_FU3.sh:
--------------------------------------------------------------------------------
#!/bin/sh

SOURCE='/neurospin/imagen/FU3/RAW/PSC2/onsets'
TARGET='/neurospin/imagen/FU3/processed/nifti'

for f in "${SOURCE}/"*.csv
do
    basename=`basename "$f" '.csv'`
    psc2=`echo "$basename" | sed -e 's/^.*_//; s/FU3$//'`
    if [ -d "${TARGET}/${psc2}" ]
    then
        mkdir -p "${TARGET}/${psc2}/BehaviouralData"
        cp -p "${SOURCE}/${basename}.csv" "${TARGET}/${psc2}/BehaviouralData/"
    else
        >&2 echo "ERROR: $psc2: missing folder!"
    fi
done
--------------------------------------------------------------------------------
/onsets/imagen_onsets_copy_STRATIFY.sh:
--------------------------------------------------------------------------------
#!/bin/sh

SOURCE='/neurospin/imagen/STRATIFY/RAW/PSC2/onsets'
TARGET='/neurospin/imagen/STRATIFY/processed/nifti'

for f in "${SOURCE}/"*.csv
do
    basename=`basename "$f" '.csv'`
    psc2=`echo "$basename" | sed -e 's/^.*_//; s/SB$//'`
    if [ -d "${TARGET}/${psc2}" ]
    then
        mkdir -p "${TARGET}/${psc2}/BehaviouralData"
        cp -p "${SOURCE}/${basename}.csv" "${TARGET}/${psc2}/BehaviouralData/"
    else
        >&2 echo "ERROR: $psc2: missing folder!"
16 | fi 17 | done 18 | -------------------------------------------------------------------------------- /onsets/imagen_onsets_extract_deidentify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import zipfile 5 | from datetime import datetime 6 | from tempfile import TemporaryDirectory 7 | from multiprocessing import Pool 8 | from imagen_databank import PSC2_FROM_PSC1, DOB_FROM_PSC1 9 | import logging 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | WORKER_PROCESSES = 8 14 | 15 | DATASETS_FU3_SB = '/neurospin/imagen/FU3/RAW/QUARANTINE' 16 | ONSETS = { 17 | 'FU3': '/neurospin/imagen/FU3/RAW/PSC2/onsets', 18 | 'SB': '/neurospin/imagen/STRATIFY/RAW/PSC2/onsets', 19 | } 20 | 21 | 22 | def _parse_onsets_datetime(date_string): 23 | """Read date in the format found in CSV files. 24 | 25 | """ 26 | DATE_FORMATS = ( 27 | '%d.%m.%Y %H:%M:%S', 28 | '%d/%m/%Y %H:%M:%S', 29 | ) 30 | for date_format in DATE_FORMATS: 31 | try: 32 | dt = datetime.strptime(date_string, date_format) 33 | return dt 34 | except ValueError: 35 | pass 36 | return None 37 | 38 | 39 | def _extract_psc1_timestamp(path): 40 | """Extract time stamp from FU3 / Stratify zip files in QUARANTINE. 41 | 42 | Parameters 43 | ---------- 44 | path : unicode 45 | Zip file name. 46 | 47 | Returns 48 | ------- 49 | tuple (str, int) 50 | PSC1 code and database increment number from tarball file name. 51 | 52 | """ 53 | path = os.path.basename(path) 54 | root, ext = os.path.splitext(path) 55 | 56 | # extract database increment number and PSC1 57 | increment, data, psc1 = root.split('_', 2) 58 | assert(increment.isdigit()) 59 | increment = int(increment) 60 | while not psc1[:12].isdigit(): 61 | split = psc1.split('_', 1) 62 | if len(split) > 1: 63 | psc1 = split[-1] 64 | else: 65 | psc1 = None 66 | break 67 | else: 68 | psc1 = psc1[:12] 69 | 70 | return psc1, increment 71 | 72 | 73 | def process_behavioural(path, timepoint, prefix, psc1, psc2): 74 | logging.info('%s: processing behavioural file...', path) 75 | 76 | with open(path, encoding='latin-1', newline='') as content: 77 | output_path = ONSETS[timepoint] 78 | output = os.path.join(output_path, prefix + '_' + psc2 + timepoint + '.csv') 79 | with open(output, 'w') as output: 80 | # de-identify 1st line 81 | line = next(iter(content)) 82 | column = line.split('\t') 83 | if psc1 in DOB_FROM_PSC1: 84 | column[1] = str((_parse_onsets_datetime(column[1]).date() - 85 | DOB_FROM_PSC1[psc1]).days) 86 | else: 87 | column[1] = '' 88 | column[2] = column[2].replace(psc1, psc2) 89 | line = '\t'.join(column) 90 | # write to target file 91 | output.write(line) 92 | for line in content: 93 | output.write(line) 94 | 95 | 96 | def process_dataset(arguments): 97 | (timepoint, psc1, psc2, dataset_path) = arguments # unpack multiple arguments 98 | 99 | logging.info('%s: processing zipped %s dataset...', psc1, timepoint) 100 | 101 | with TemporaryDirectory(prefix='imagen_behavioural_') as tmp: 102 | with zipfile.ZipFile(dataset_path) as dataset_zipfile: 103 | members = dataset_zipfile.infolist() 104 | 105 | for prefix in ('ft', 'mid', 'recog', 'ss'): 106 | for member in members: 107 | if member.filename == (psc1 + timepoint + '/AdditionalData/Scanning/' + 108 | prefix + '_' + psc1 + timepoint + '.csv'): 109 | dataset_zipfile.extract(member, path=tmp) 110 | behavioural_path = os.path.join(tmp, member.filename) 111 | process_behavioural(behavioural_path, timepoint, prefix, psc1, psc2) 112 | break 113 | else: 114 | 
logging.warning('%s: missing %s_*.csv file', psc1, prefix) 115 | 116 | logging.info('%s: processed zipped %s dataset', psc1, timepoint) 117 | 118 | 119 | def list_datasets(path, timepoint): 120 | # list zip files to process 121 | # for subjects with multiple zip files, keep the most recent one 122 | datasets = {} 123 | for dataset in os.listdir(path): 124 | root, ext = os.path.splitext(dataset) 125 | if ext != '.zip': 126 | continue 127 | increment, data, psc1 = root.split('_', 2) 128 | assert(increment.isdigit() and data == 'data' and 129 | psc1[:12].isdigit()) 130 | if psc1[12:12+len(timepoint)] != timepoint: 131 | continue 132 | 133 | psc1, timestamp = _extract_psc1_timestamp(dataset) 134 | dataset_path = os.path.join(path, dataset) 135 | datasets.setdefault(psc1, {})[timestamp] = dataset_path 136 | 137 | logging.info('found %d zipped %s datasets', len(datasets), timepoint) 138 | 139 | return[(psc1, timestamps[max(timestamps.keys())]) # keep latest dataset 140 | for (psc1, timestamps) in datasets.items()] 141 | 142 | 143 | def process_datasets(path, timepoint): 144 | todo_list = list(list_datasets(path, timepoint)) 145 | todo_list = [(timepoint, psc1, PSC2_FROM_PSC1[psc1], path) for (psc1, path) in todo_list] 146 | 147 | pool = Pool(WORKER_PROCESSES) 148 | results = pool.map(process_dataset, todo_list) 149 | pool.close() 150 | pool.join() 151 | return results 152 | 153 | 154 | def main(): 155 | for timepoint in ('FU3', 'SB'): 156 | results = process_datasets(DATASETS_FU3_SB, timepoint) 157 | 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /psc/imagen_update_dawba_codes_from_tokens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Download Dawba codes for Imagen FU3 and Stratify and update conversion table. 3 | 4 | ========== 5 | Attributes 6 | ========== 7 | 8 | Output 9 | ------ 10 | 11 | PSC2PSC : str 12 | Table of conversion between participant codes (PSC1, Dawba, PSC2). 13 | 14 | """ 15 | 16 | import os 17 | import requests 18 | import json 19 | import base64 20 | from urllib.parse import urlparse 21 | import datetime 22 | import logging 23 | from imagen_databank import PSC2_FROM_PSC1 24 | logging.basicConfig(level=logging.INFO) 25 | 26 | # The LSRC2 service at Delosis. 27 | LSRC2_BASE_URL = 'https://www.delosis.com/qs/index.php/admin/remotecontrol' 28 | # Since credentials are different between the legacy and the LSRC2 service, 29 | # and ~/.netrc allows only a single set of credentials per server, store 30 | # LSRC2 credentials in an alternate file. 
31 | LSRC2_NETRC_FILE = '~/.lsrc2' 32 | # The PSC1, Dawba, PSC2 conversion table 33 | PSC2PSC = '/neurospin/imagen/src/scripts/psc_tools/psc2psc.csv' 34 | PSC2PSC_SB = '/neurospin/imagen/src/scripts/psc_tools/psc2psc_SB.csv' 35 | 36 | 37 | class LimeSurveyError(Exception): 38 | def __init__(self, message, code): 39 | super().__init__(message) 40 | self.code = code 41 | 42 | 43 | def error2exception(func): 44 | def wrapper(*args, **kwargs): 45 | response, error = func(*args, **kwargs) 46 | if error: 47 | try: 48 | code = error['code'] 49 | message = error['message'] 50 | except (TypeError, KeyError): 51 | code = -32603 # internal JSON-RPC error 52 | message = 'Unexpected JSON-RPC error type' 53 | raise LimeSurveyError(message, code) 54 | return response 55 | return wrapper 56 | 57 | 58 | class LimeSurveySession: 59 | """LimeSurvey JSON-RPC LSRC2 session 60 | 61 | Documented here: 62 | https://www.delosis.com/qs/index.php/admin/remotecontrol 63 | https://manual.limesurvey.org/RemoteControl_2_API 64 | 65 | """ 66 | __request_id = 0 67 | 68 | def __init__(self, url, username, password): 69 | self.url = url 70 | # start a Requests session 71 | self.session = requests.Session() 72 | # Keep-alive is 100% automatic in Requests, thanks to urllib3 73 | self.session.headers.update({'content-type': 'application/json'}) 74 | # start a LimeSurvey RemoteControl 2 session 75 | self.key = self._get_session_key(username, password) 76 | 77 | def __enter__(self): 78 | return self 79 | 80 | def __exit__(self, type, value, traceback): 81 | self.close() 82 | return False # re-raises the exception 83 | 84 | def close(self): 85 | """Release LimeSurvey session key, then close Requests session""" 86 | self._release_session_key(self.key) 87 | self.key = None 88 | self.session.close() 89 | 90 | @staticmethod 91 | def _generate_request_id(): 92 | LimeSurveySession.__request_id += 1 93 | return LimeSurveySession.__request_id 94 | 95 | @staticmethod 96 | def _request(method, params): 97 | return { 98 | 'jsonrpc': '2.0', 99 | 'id': LimeSurveySession._generate_request_id(), 100 | 'method': method, 101 | 'params': params, 102 | } 103 | 104 | def _post(self, request): 105 | logging.debug('JSON-RPC request: %s', request) 106 | assert 'method' in request and 'params' in request and 'id' in request 107 | response = self.session.post(self.url, data=json.dumps(request)) 108 | response = response.json() 109 | logging.debug('JSON-RPC response: %s', response) 110 | assert response['id'] == request['id'] 111 | result = response['result'] 112 | error = response['error'] 113 | if error: 114 | logging.error('JSON-RPC error: %s', error) 115 | return result, error 116 | 117 | def _get_session_key(self, username, password): 118 | request = self._request('get_session_key', [username, password]) 119 | response, error = self._post(request) 120 | 121 | # fix non-sensical LSRC2 error handling 122 | # completely at odds with JSON-RPC error handling 123 | try: 124 | status = response['status'] 125 | except (TypeError, KeyError): 126 | if error is not None: 127 | logging.error('LSRC2 failed to create a session key') 128 | response = None 129 | else: 130 | logging.info('LSRC2 new session key: %s', response) 131 | else: 132 | logging.error(status) 133 | error = { 134 | 'code': -32099, # implementation-defined error in JSON-RPC 135 | 'message': status, 136 | } 137 | response = None 138 | 139 | return response 140 | 141 | def _release_session_key(self, key): 142 | request = self._request('release_session_key', [key]) 143 | logging.info('LSRC2 release 
session key: %s', key) 144 | dummy_response, dummy_error = self._post(request) # returns ('OK', None) even if bogus key 145 | 146 | @error2exception 147 | def surveys(self): 148 | request = self._request('list_surveys', [self.key]) 149 | return self._post(request) 150 | 151 | @error2exception 152 | def participants(self, survey, attributes=False): 153 | request = self._request('list_participants', 154 | [self.key, survey, 0, 5000, False, attributes]) 155 | responses, error = self._post(request) 156 | 157 | # fix non-sensical LSRC2 error handling 158 | # completely at odds with JSON-RPC error handling 159 | try: 160 | status = responses['status'] 161 | except (TypeError, KeyError): 162 | pass 163 | else: 164 | # LSRC2 returns errors as a dict with a 'status' attribute 165 | if status == 'No Tokens found': 166 | # When a survey is empty, LSRC2 also returns a dict: 167 | # {"status": "No Tokens found"} 168 | if error is not None: 169 | logging.error('JSON-RPC error report does not match "status"') 170 | error = None 171 | else: 172 | error = { 173 | 'code': -32099, # implementation-defined error in JSON-RPC 174 | 'message': status, 175 | } 176 | responses = [] 177 | 178 | return responses, error 179 | 180 | @error2exception 181 | def participant_properties(self, survey, participant, attributes): 182 | request = self._request('get_participant_properties', 183 | [self.key, survey, participant, attributes]) 184 | return self._post(request) 185 | 186 | @error2exception 187 | def responses(self, survey, status='all'): 188 | request = self._request('export_responses', 189 | [self.key, survey, 'csv', None, status]) 190 | responses, error = self._post(request) 191 | 192 | try: 193 | responses = base64.b64decode(responses).decode('utf_8').split('\n') 194 | except TypeError: 195 | # fix non-sensical LSRC2 error handling 196 | # completely at odds with JSON-RPC error handling 197 | try: 198 | status = responses['status'] 199 | except (TypeError, KeyError): 200 | message = 'JSON-RPC function "export_responses" expected a Base64-encoded string' 201 | logging.error(message) 202 | error = { 203 | 'code': -32099, # implementation-defined error in JSON-RPC 204 | 'message': message, 205 | } 206 | else: 207 | # LSRC2 returns errors as a dict with a 'status' attribute 208 | if status == 'No Data, could not get max id.': 209 | # When a survey is empty, LSRC2 also returns a dict: 210 | # {"status": "No Data, could not get max id."} 211 | if error is not None: 212 | logging.error('JSON-RPC error report does not match "status"') 213 | error = None 214 | else: 215 | error = { 216 | 'code': -32099, # implementation-defined error in JSON-RPC 217 | 'message': status, 218 | } 219 | responses = [] 220 | 221 | return responses, error 222 | 223 | 224 | def _get_netrc_auth(url): 225 | try: 226 | netrc_path = os.path.expanduser(LSRC2_NETRC_FILE) 227 | except KeyError: 228 | import warnings 229 | warnings.warn('Unable to find home directory') 230 | return 231 | if not os.path.exists(netrc_path): 232 | return 233 | 234 | netloc = urlparse(url).netloc 235 | 236 | try: 237 | from netrc import netrc, NetrcParseError 238 | try: 239 | authenticators = netrc(netrc_path).authenticators(netloc) 240 | except (NetrcParseError, OSError): 241 | return 242 | if authenticators: 243 | return (authenticators[0], authenticators[2]) 244 | except (ImportError): 245 | return 246 | 247 | 248 | def download_lsrc2_tokens(base_url, startswith=None): 249 | """JSON RPC calls to LSRC2 service to retrieve tokens. 
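Parameters
----------
base_url : str
URL of the LSRC2 JSON-RPC endpoint.
startswith : str
Process only surveys whose title starts with this prefix.

Returns
-------
dict
Dawba code, indexed by PSC1 code. Subjects associated with multiple
inconsistent Dawba codes are left out.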
250 | 251 | """ 252 | username, password = _get_netrc_auth(base_url) 253 | with LimeSurveySession(base_url, username, password) as session: 254 | dawba_from_psc1 = {} 255 | 256 | surveys = session.surveys() 257 | for survey in surveys: 258 | title = survey['surveyls_title'] 259 | sid = survey['sid'] 260 | active = survey['active'] 261 | 262 | if title.startswith(startswith): 263 | if active == 'N': 264 | logging.info('skip inactive survey: %s', title) 265 | continue 266 | else: 267 | logging.info('read survey: %s', title) 268 | else: 269 | logging.info('skip survey: %s', title) 270 | continue 271 | 272 | # subjects in surveys are identified by "sid" and "token" 273 | # retrieve correlation between "token" and PSC1 and Dawba codes 274 | psc1_from_token = {} 275 | dawba_from_token = {} 276 | participants = session.participants(sid, ['completed', 'reminded', 'attribute_1', 'attribute_2']) 277 | 278 | for participant in participants: 279 | token = participant['token'] 280 | if ('reminded' in participant and participant['reminded'] == 'Duplicate' or 281 | 'completed' in participant and participant['completed'] == 'N'): 282 | continue 283 | # PSC1 284 | if 'attribute_1' in participant: 285 | psc1 = participant['attribute_1'].strip() 286 | if psc1.endswith('SB'): 287 | psc1 = psc1[:-2] 288 | if psc1.endswith('FU3'): 289 | psc1 = psc1[:-3] 290 | if psc1.isdigit(): 291 | if token in psc1_from_token: 292 | if psc1 != psc1_from_token[token]: 293 | logging.error('survey: %s: duplicate token has inconsistent PSC1 codes: %s / %s', 294 | title, psc1_from_token[token], psc1) 295 | else: 296 | logging.warning('survey: %s: duplicate token for PSC1 code: %s', 297 | title, psc1) 298 | else: 299 | psc1_from_token[token] = psc1 300 | else: 301 | logging.info('survey: %s: skipping invalid PSC1 code: %s', 302 | title, psc1) 303 | else: 304 | logging.error('survey: %s: token %s lacks a PSC1 code', 305 | title, token) 306 | # Dawba 307 | if 'attribute_2' in participant: 308 | dawba = participant['attribute_2'] 309 | if dawba: 310 | dawba = dawba.strip() 311 | if dawba.isdigit(): 312 | if token in dawba_from_token: 313 | if dawba != dawba_from_token[token]: 314 | logging.error('survey: %s: duplicate token has inconsistent Dawba codes: %s / %s', 315 | title, dawba_from_token[token], dawba) 316 | else: 317 | logging.warning('survey: %s: duplicate token for Dawba code: %s', 318 | title, dawba) 319 | else: 320 | dawba_from_token[token] = dawba 321 | elif dawba == '-': 322 | logging.warning("survey: %s: %s: skipping empty Dawba code '-'", 323 | title, psc1) 324 | else: 325 | logging.info('survey: %s: %s: skipping invalid Dawba code: %s', 326 | title, psc1, dawba) 327 | else: 328 | logging.info('survey: %s: %s: skipping empty Dawba code', 329 | title, psc1) 330 | else: 331 | logging.error('survey: %s: token %s lacks a Dawba code', 332 | title, token) 333 | 334 | for token in psc1_from_token.keys() & dawba_from_token.keys(): 335 | psc1 = psc1_from_token[token] 336 | dawba = dawba_from_token[token] 337 | dawba_from_psc1.setdefault(psc1, {}).setdefault(dawba, set()) 338 | dawba_from_psc1[psc1][dawba].add(title) 339 | 340 | for psc1, codes in dawba_from_psc1.items(): 341 | if len(codes) > 1: 342 | message_details = '' 343 | for dawba, titles in codes.items(): 344 | message_details += '\t{}:\n\t\t{}\n'.format(dawba, '\n\t\t'.join(title for title in titles)) 345 | logging.error('%s: multiple Dawba codes:\n%s', 346 | psc1, message_details) 347 | dawba_from_psc1[psc1] = None 348 | else: 349 | dawba_from_psc1[psc1] = 
next(iter(dawba_from_psc1[psc1].keys())) 350 | dawba_from_psc1 = {psc1: dawba for psc1, dawba in dawba_from_psc1.items() 351 | if dawba} 352 | 353 | return dawba_from_psc1 354 | 355 | 356 | def main(): 357 | projects = ( 358 | (PSC2PSC, 'Imagen FUIII - Core'), 359 | (PSC2PSC_SB, 'STRATIFY Core'), 360 | ) 361 | 362 | for psc2psc, startswith in projects: 363 | dawba_from_psc1 = download_lsrc2_tokens(LSRC2_BASE_URL, startswith) 364 | 365 | root, ext = os.path.splitext(psc2psc) 366 | output = root + '_' + datetime.date.today().isoformat() + ext 367 | with open(psc2psc, 'r') as p, open(output, 'w') as o: 368 | # skip header line 369 | line = next(p).strip('\n') 370 | print(line, file=o) 371 | 372 | done = set() 373 | for line in p: 374 | line = line.strip('\n') 375 | psc1, dawba, psc2 = line.split('=') 376 | if (int(dawba) > 200000 or # process only FU3 and Stratify 377 | dawba == '000000'): 378 | if psc1 in dawba_from_psc1: 379 | if dawba != dawba_from_psc1[psc1]: 380 | if dawba == '000000': 381 | logging.info('%s: Dawba code initialized from %s to %s', 382 | psc1, dawba, dawba_from_psc1[psc1]) 383 | else: 384 | logging.error('%s: Dawba code changed from %s to %s', 385 | psc1, dawba, dawba_from_psc1[psc1]) 386 | dawba = dawba_from_psc1[psc1] 387 | line = '='.join((psc1, dawba, psc2)) 388 | done.add(psc1) 389 | print(line, file=o) 390 | 391 | for psc1 in (dawba_from_psc1.keys() - done): 392 | dawba = dawba_from_psc1[psc1] 393 | psc2 = PSC2_FROM_PSC1[psc1] 394 | line = '='.join((psc1, dawba, psc2)) 395 | print(line, file=o) 396 | 397 | 398 | if __name__ == "__main__": 399 | main() 400 | -------------------------------------------------------------------------------- /psytools/imagen_psytools_deidentify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Re-encode and pseudonymize Psytools CSV files (BL, FU1, FU2, FU3 and Stratify). 3 | 4 | This script replaces the Scito pseudonymization pipeline. 5 | 6 | ========== 7 | Attributes 8 | ========== 9 | 10 | Input 11 | ----- 12 | 13 | PSYTOOLS_BL_DERIVED_DIR : str 14 | Location of BL PSC1-encoded files. 15 | PSYTOOLS_FU1_DERIVED_DIR : str 16 | Location of FU1 PSC1-encoded files. 17 | PSYTOOLS_FU2_DERIVED_DIR : str 18 | Location of FU2 PSC1-encoded files. 19 | PSYTOOLS_FU3_DERIVED_DIR : str 20 | Location of FU3 PSC1-encoded files. 21 | PSYTOOLS_STRATIFY_DERIVED_DIR : str 22 | Location of Stratify PSC1-encoded files. 23 | 24 | Output 25 | ------ 26 | 27 | PSYTOOLS_BL_PSC2_DIR : str 28 | Location of BL PSC2-encoded files. 29 | PSYTOOLS_FU1_PSC2_DIR : str 30 | Location of FU1 PSC2-encoded files. 31 | PSYTOOLS_FU2_PSC2_DIR : str 32 | Location of FU2 PSC2-encoded files. 33 | PSYTOOLS_FU3_PSC2_DIR : str 34 | Location of FU3 PSC2-encoded files. 35 | PSYTOOLS_STRATIFY_PSC2_DIR : str 36 | Location of Stratify PSC2-encoded files. 
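PSYTOOLS_STRATIFY_FU_PSC2_DIR : str
Location of Stratify follow-up PSC2-encoded files.
PSYTOOLS_IMACOV19_BL_PSC2_DIR and related constants : str
Location of PSC2-encoded files of the IMACOV19 and STRATICO19
Covid-19 questionnaires, one constant per timepoint (see below).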
37 | 38 | """ 39 | 40 | PSYTOOLS_BL_DERIVED_DIR = '/tmp/imagen/BL/processed/psytools' 41 | PSYTOOLS_BL_PSC2_DIR = '/neurospin/imagen/BL/processed/psytools' 42 | PSYTOOLS_FU1_DERIVED_DIR = '/tmp/imagen/FU1/processed/psytools' 43 | PSYTOOLS_FU1_PSC2_DIR = '/neurospin/imagen/FU1/processed/psytools' 44 | PSYTOOLS_FU2_DERIVED_DIR = '/tmp/imagen/FU2/processed/psytools' 45 | PSYTOOLS_FU2_PSC2_DIR = '/neurospin/imagen/FU2/processed/psytools' 46 | PSYTOOLS_FU3_DERIVED_DIR = '/tmp/imagen/FU3/processed/psytools' 47 | PSYTOOLS_FU3_PSC2_DIR = '/neurospin/imagen/FU3/processed/psytools' 48 | PSYTOOLS_STRATIFY_DERIVED_DIR = '/tmp/imagen/STRATIFY/processed/psytools' 49 | PSYTOOLS_STRATIFY_PSC2_DIR = '/neurospin/imagen/STRATIFY/processed/psytools' 50 | PSYTOOLS_STRATIFY_FU_DERIVED_DIR = '/tmp/imagen/STRATIFY_FU/processed/psytools' 51 | PSYTOOLS_STRATIFY_FU_PSC2_DIR = '/neurospin/imagen/STRATIFY_FU/processed/psytools' 52 | PSYTOOLS_IMACOV19_BL_DERIVED_DIR = '/tmp/imagen/IMACOV19_BL/processed/psytools' 53 | PSYTOOLS_IMACOV19_BL_PSC2_DIR = '/neurospin/imagen/IMACOV19_BL/processed/psytools' 54 | PSYTOOLS_IMACOV19_FU_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU/processed/psytools' 55 | PSYTOOLS_IMACOV19_FU_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU/processed/psytools' 56 | PSYTOOLS_IMACOV19_FU2_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU2/processed/psytools' 57 | PSYTOOLS_IMACOV19_FU2_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU2/processed/psytools' 58 | PSYTOOLS_IMACOV19_FU3_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU3/processed/psytools' 59 | PSYTOOLS_IMACOV19_FU3_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU3/processed/psytools' 60 | PSYTOOLS_STRATICO19_BL_DERIVED_DIR = '/tmp/imagen/STRATICO19_BL/processed/psytools' 61 | PSYTOOLS_STRATICO19_BL_PSC2_DIR = '/neurospin/imagen/STRATICO19_BL/processed/psytools' 62 | PSYTOOLS_STRATICO19_FU_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU/processed/psytools' 63 | PSYTOOLS_STRATICO19_FU_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU/processed/psytools' 64 | PSYTOOLS_STRATICO19_FU2_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU2/processed/psytools' 65 | PSYTOOLS_STRATICO19_FU2_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU2/processed/psytools' 66 | PSYTOOLS_STRATICO19_FU3_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU3/processed/psytools' 67 | PSYTOOLS_STRATICO19_FU3_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU3/processed/psytools' 68 | 69 | 70 | import os 71 | from csv import DictReader 72 | from csv import DictWriter 73 | from datetime import datetime 74 | import logging 75 | logging.basicConfig(level=logging.INFO) 76 | 77 | # import ../imagen_databank 78 | import sys 79 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) 80 | from imagen_databank import PSC2_FROM_PSC1 81 | from imagen_databank import DOB_FROM_PSC1 82 | 83 | 84 | def _deidentify_legacy(psc2_from_psc1, psytools_path, psc2_path): 85 | """Anonymize and re-encode a legacy Psytools questionnaire from PSC1 to PSC2. 86 | 87 | Legacy questionnaires are in long format. 88 | 89 | Parameters 90 | ---------- 91 | psc2_from_psc1: map 92 | Conversion table, from PSC1 to PSC2. 93 | psytools_path: str 94 | Input: PSC1-encoded Psytools file. 95 | psc2_path: str 96 | Output: PSC2-encoded Psytools file. 
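Notes
-----
Rows are keyed on the 'User code' column: the -C/-P/-I suffixes of
Imagen subject identifiers are kept, 'FU'/'SU' follow-up participants
are discarded, and trailing 'FU3'/'SB' suffixes are stripped.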
97 | 
98 | """
99 | with open(psytools_path, 'r') as psc1_file:
100 | psc1_reader = DictReader(psc1_file, dialect='excel')
101 | 
102 | # de-identify columns with timestamps
103 | ANONYMIZED_COLUMNS = {
104 | 'Completed Timestamp': ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'),
105 | 'Processed Timestamp': ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'),
106 | }
107 | convert = [fieldname for fieldname in psc1_reader.fieldnames
108 | if fieldname in ANONYMIZED_COLUMNS]
109 | 
110 | # discard other columns with dates
111 | DISCARDED_COLUMNS = {
112 | 'id_check_dob', 'id_check_gender', 'id_check_relation',
113 | # FU3 / NI DATA
114 | 'DATE_BIRTH_1', 'DATE_BIRTH_2', 'DATE_BIRTH_3',
115 | 'TEST_DATE_1', 'TEST_DATE_2', 'TEST_DATE_3'
116 | }
117 | 
118 | # read/process each row and save for later writing
119 | rows = {}
120 | for row in psc1_reader:
121 | psc1, suffix = row['User code'][:12], row['User code'][12:]
122 | if psc1 in psc2_from_psc1:
123 | psc2 = psc2_from_psc1[psc1]
124 | if suffix in {'-C', '-P', '-I'}:
125 | # keep the suffix of Imagen subject IDs
126 | # -C Child
127 | # -P Parent
128 | # -I Institute
129 | row['User code'] = psc2 + suffix
130 | else:
131 | if suffix in {'FU', 'SU'}:
132 | # short-term decision: discard "FU" and "SU" follow-up participants
133 | # from Stratify and LimeSurvey-derived files
134 | logging.info('discarding STRATIFY follow-up participant %s!',
135 | row['User code'])
136 | continue
137 | elif suffix not in {'FU3', 'SB'}: # 'SU' (ESTRA follow-up) removed temporarily to generate the Stratify Psytools files
138 | # remove "FU3" and "SB" suffixes
139 | # in Stratify and LimeSurvey-derived files
140 | logging.error('unknown suffix %s in user code %s',
141 | suffix, row['User code'])
142 | row['User code'] = psc2
143 | else:
144 | logging.error('unknown PSC1 code %s in user code %s',
145 | psc1, row['User code'])
146 | continue
147 | 
148 | # de-identify columns with timestamps
149 | for fieldname in convert:
150 | if psc1 in DOB_FROM_PSC1:
151 | birth = DOB_FROM_PSC1[psc1]
152 | for timestamp_format in ANONYMIZED_COLUMNS[fieldname]:
153 | try:
154 | timestamp = datetime.strptime(row[fieldname],
155 | timestamp_format).date()
156 | except ValueError:
157 | continue
158 | else:
159 | age = timestamp - birth
160 | row[fieldname] = str(age.days)
161 | break
162 | else:
163 | logging.error('%s: invalid "%s": %s',
164 | psc1, fieldname, row[fieldname])
165 | row[fieldname] = None
166 | else:
167 | row[fieldname] = None
168 | 
169 | # convert dates to age in days at that date - a date of birth should yield 0 if correct!
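# (for example, for a hypothetical subject born on 01-01-2000, an
# 'education_end' value of '15-06-2010' becomes '3818', the age in
# days at that date)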
170 | # FU2 / ESPAD CHILD 171 | # FU2 / NI DATA 172 | for column in ('education_end', 'ni_period', 'ni_date'): 173 | if column in psc1_reader.fieldnames: 174 | if psc1 in DOB_FROM_PSC1: 175 | birth = DOB_FROM_PSC1[psc1] 176 | try: 177 | d = datetime.strptime(row[column], 178 | '%d-%m-%Y').date() 179 | except ValueError: 180 | row[column] = None 181 | else: 182 | age = d - birth 183 | row[column] = str(age.days) 184 | else: 185 | row[column] = None 186 | 187 | # convert to age of parents in days at assessment 188 | # BL/FU1 / PBQ 189 | for column in ('pbq_01', 'pbq_02'): 190 | if column in psc1_reader.fieldnames: 191 | try: 192 | birth = datetime.strptime(row[column], 193 | '%d-%m-%Y').date() 194 | except ValueError: 195 | row[column] = None 196 | else: 197 | # last 'timestamp' ought to be 'Processed timestamp' 198 | age = timestamp - birth 199 | row[column] = str(age.days) 200 | 201 | # discard other columns with dates 202 | for column in DISCARDED_COLUMNS: 203 | if column in psc1_reader.fieldnames: 204 | del row[column] 205 | 206 | rows.setdefault(psc2, []).append(row) 207 | 208 | # save rows into output file, sort by PSC2 209 | with open(psc2_path, 'w') as psc2_file: 210 | fieldnames = [fieldname for fieldname in psc1_reader.fieldnames 211 | if fieldname not in DISCARDED_COLUMNS] 212 | psc2_writer = DictWriter(psc2_file, fieldnames, dialect='excel') 213 | psc2_writer.writeheader() 214 | for psc2 in sorted(rows): 215 | for row in rows[psc2]: 216 | psc2_writer.writerow(row) 217 | 218 | 219 | def _psc1(psc1, psc2_from_psc1): 220 | if 'TEST' in psc1.upper(): 221 | # skip test subjects 222 | logging.debug('skipping test subject "%s"', psc1) 223 | else: 224 | # find and skip subjects with invalid identifier 225 | if psc1[-3:] in {'FU2', 'FU3'}: 226 | psc1 = psc1[:-3] 227 | elif psc1[-2:] in {'SB'}: #removing SU to skip followup acquisitions in SB psytools 228 | psc1 = psc1[:-2] 229 | if psc1 in psc2_from_psc1: 230 | return psc1 231 | elif psc1 in {'0x0000xxxxxx'}: 232 | logging.info('skipping known invalid subject identifier "%s"', 233 | psc1) 234 | else: 235 | logging.error('invalid subject identifier "%s"', psc1) 236 | return None 237 | 238 | 239 | def _deidentify_lsrc2(psc2_from_psc1, psytools_path, psc2_path): 240 | """Anonymize and re-encode an LSRC2 Psytools questionnaire from PSC1 to PSC2. 241 | 242 | LSRC2 questionnaires are in wide format. 243 | 244 | Parameters 245 | ---------- 246 | psc2_from_psc1: map 247 | Conversion table, from PSC1 to PSC2. 248 | psytools_path: str 249 | Input: PSC1-encoded Psytools file. 250 | psc2_path: str 251 | Output: PSC2-encoded Psytools file. 
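Notes
-----
Identifying columns such as 'token', 'ipaddr' or 'IdCheckDob' are
dropped entirely, while the 'startdate', 'datestamp' and 'submitdate'
timestamps are replaced by the age of the subject, in days, at that
date.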
252 | 
253 | """
254 | COLUMNS_TO_REMOVE = {
255 | 'token',
256 | 'ipaddr',
257 | 'IdCheckGender',
258 | 'IdCheckDob',
259 | 'geoLoc_search', # Covid-19 questionnaires
260 | }
261 | COLUMNS_WITH_DATE = {
262 | 'startdate',
263 | 'datestamp',
264 | 'submitdate',
265 | }
266 | 
267 | with open(psytools_path, 'r') as psc1_file:
268 | psc1_reader = DictReader(psc1_file, dialect='excel')
269 | # columns to remove entirely
270 | fieldnames = [x for x in psc1_reader.fieldnames
271 | if x not in COLUMNS_TO_REMOVE]
272 | with open(psc2_path, 'w') as psc2_file:
273 | psc2_writer = DictWriter(psc2_file, fieldnames, dialect='excel')
274 | psc2_writer.writeheader()
275 | for row in psc1_reader:
276 | # skip test and invalid subjects
277 | psc1 = _psc1(row['id'], psc2_from_psc1)
278 | if psc1:
279 | psc2 = psc2_from_psc1[psc1]
280 | # columns to remove entirely
281 | for x in COLUMNS_TO_REMOVE:
282 | if x in row:
283 | del row[x]
284 | # columns to de-identify
285 | row['id'] = psc2
286 | for x in COLUMNS_WITH_DATE:
287 | if x in row and row[x]:
288 | date = datetime.strptime(row[x],
289 | '%Y-%m-%d %H:%M:%S').date()
290 | if psc1 in DOB_FROM_PSC1:
291 | birth = DOB_FROM_PSC1[psc1]
292 | age = date - birth
293 | row[x] = age.days
294 | else:
295 | logging.error('unknown date of birth: "%s"',
296 | psc1)
297 | row[x] = None
298 | psc2_writer.writerow(row)
299 | 
300 | 
301 | def deidentify(psc2_from_psc1, master_dir, psc2_dir):
302 | """Anonymize and re-encode Psytools questionnaires within a directory.
303 | 
304 | PSC1-encoded files are read from `master_dir`, anonymized and converted
305 | from PSC1 codes to PSC2, and the result is written in `psc2_dir`.
306 | 
307 | Parameters
308 | ----------
309 | psc2_from_psc1: map
310 | Conversion table, from PSC1 to PSC2.
311 | master_dir: str
312 | Input directory with PSC1-encoded questionnaires.
313 | psc2_dir: str
314 | Output directory with PSC2-encoded and anonymized questionnaires.
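Notes
-----
Files are routed on their name: legacy Delosis exports (prefixes
'IMAGEN-', 'STRATIFY-', 'IMACOV19-', 'STRATICO19-') go through
_deidentify_legacy() and LSRC2 exports (prefixes 'Imagen_',
'STRATIFY_') through _deidentify_lsrc2().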
315 | 316 | """ 317 | CURRENTLY_NOT_PROPERLY_DEIDENTIFIED = { 318 | 'IMAGEN-IMGN_RELIABILITY_PI_FU2-BASIC_DIGEST.csv', 319 | 'IMAGEN-IMGN_RELIABILITY_FU3-BASIC_DIGEST.csv', 320 | 'STRATIFY_screening_(London).csv', 321 | 'STRATIFY_screening_(Southampton).csv', 322 | 'STRATIFY_screening_(ED).csv', 323 | } 324 | 325 | for filename in os.listdir(master_dir): 326 | if filename in CURRENTLY_NOT_PROPERLY_DEIDENTIFIED: 327 | continue 328 | master_path = os.path.join(master_dir, filename) 329 | psc2_path = os.path.join(psc2_dir, filename) 330 | if filename.startswith('IMAGEN-') or filename.startswith('STRATIFY-') or filename.startswith('IMACOV19-') or filename.startswith('STRATICO19-'): 331 | _deidentify_legacy(psc2_from_psc1, master_path, psc2_path) 332 | elif filename.startswith('Imagen_') or filename.startswith('STRATIFY_'): 333 | _deidentify_lsrc2(psc2_from_psc1, master_path, psc2_path) 334 | else: 335 | logging.error('skipping unknown file: %s', filename) 336 | 337 | 338 | def main(): 339 | # IMAGEN 340 | deidentify(PSC2_FROM_PSC1, 341 | PSYTOOLS_BL_DERIVED_DIR, PSYTOOLS_BL_PSC2_DIR) 342 | deidentify(PSC2_FROM_PSC1, 343 | PSYTOOLS_FU1_DERIVED_DIR, PSYTOOLS_FU1_PSC2_DIR) 344 | deidentify(PSC2_FROM_PSC1, 345 | PSYTOOLS_FU2_DERIVED_DIR, PSYTOOLS_FU2_PSC2_DIR) 346 | deidentify(PSC2_FROM_PSC1, 347 | PSYTOOLS_FU3_DERIVED_DIR, PSYTOOLS_FU3_PSC2_DIR) 348 | # STRATIFY/ESTRA 349 | deidentify(PSC2_FROM_PSC1, 350 | PSYTOOLS_STRATIFY_DERIVED_DIR, PSYTOOLS_STRATIFY_PSC2_DIR) 351 | #deidentify(PSC2_FROM_PSC1, 352 | # PSYTOOLS_STRATIFY_FU_DERIVED_DIR, PSYTOOLS_STRATIFY_FU_PSC2_DIR) 353 | # IMACOV 354 | deidentify(PSC2_FROM_PSC1, 355 | PSYTOOLS_IMACOV19_BL_DERIVED_DIR, PSYTOOLS_IMACOV19_BL_PSC2_DIR) 356 | deidentify(PSC2_FROM_PSC1, 357 | PSYTOOLS_IMACOV19_FU_DERIVED_DIR, PSYTOOLS_IMACOV19_FU_PSC2_DIR) 358 | deidentify(PSC2_FROM_PSC1, 359 | PSYTOOLS_IMACOV19_FU2_DERIVED_DIR, PSYTOOLS_IMACOV19_FU2_PSC2_DIR) 360 | deidentify(PSC2_FROM_PSC1, 361 | PSYTOOLS_IMACOV19_FU3_DERIVED_DIR, PSYTOOLS_IMACOV19_FU3_PSC2_DIR) 362 | # STRATICO 363 | deidentify(PSC2_FROM_PSC1, 364 | PSYTOOLS_STRATICO19_BL_DERIVED_DIR, PSYTOOLS_STRATICO19_BL_PSC2_DIR) 365 | deidentify(PSC2_FROM_PSC1, 366 | PSYTOOLS_STRATICO19_FU_DERIVED_DIR, PSYTOOLS_STRATICO19_FU_PSC2_DIR) 367 | deidentify(PSC2_FROM_PSC1, 368 | PSYTOOLS_STRATICO19_FU2_DERIVED_DIR, PSYTOOLS_STRATICO19_FU2_PSC2_DIR) 369 | deidentify(PSC2_FROM_PSC1, 370 | PSYTOOLS_STRATICO19_FU3_DERIVED_DIR, PSYTOOLS_STRATICO19_FU3_PSC2_DIR) 371 | 372 | 373 | if __name__ == "__main__": 374 | main() 375 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2017 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 | 
29 | from setuptools import setup
30 | from imagen_databank import __version__
31 | from imagen_databank import __author__
32 | from imagen_databank import __email__
33 | from imagen_databank import __license__
34 | 
35 | 
36 | def readme():
37 | with open('README.rst') as f:
38 | return f.read()
39 | 
40 | 
41 | def license():
42 | with open('LICENSE') as f:
43 | return f.read()
44 | 
45 | 
46 | setup(
47 | name='imagen_databank',
48 | version=__version__,
49 | author=__author__,
50 | author_email=__email__,
51 | description='Imagen project databank software',
52 | long_description=readme(),
53 | license=__license__,
54 | url='https://github.com/imagen2/imagen_databank',
55 | packages=['imagen_databank'],
56 | scripts=[
57 | 'cantab/imagen_cantab_age_at_session_start_time.py',
58 | 'dawba/imagen_dawba_deidentify.py',
59 | 'psytools/imagen_psytools_download.py',
60 | 'psytools/imagen_psytools_deidentify.py',
61 | ],
62 | classifiers=[
63 | "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)",
64 | "Intended Audience :: Developers",
65 | "Intended Audience :: Science/Research",
66 | "Environment :: Console",
67 | "Development Status :: 4 - Beta",
68 | "Programming Language :: Python",
69 | "Programming Language :: Python :: 2.7",
70 | "Programming Language :: Python :: 3",
71 | "Operating System :: OS Independent",
72 | "Topic :: Scientific/Engineering :: Medical Science Apps.",
73 | "Topic :: Utilities",
74 | ],
75 | install_requires=[
76 | 'pydicom',
77 | ],
78 | )
79 | 
--------------------------------------------------------------------------------
/sex/imagen_sex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | import csv
5 | import logging
6 | from imagen_databank import PSC2_FROM_PSC1
7 | 
8 | logging.basicConfig(level=logging.INFO)
9 | 
10 | WORKER_PROCESSES = 8
11 | 
12 | FU3_VALIDATION = '/neurospin/imagen/FU3/RAW/PSC1/meta_data/sex_validation_2018.csv'
13 | 
14 | FEMALE = 'F'
15 | MALE = 'M'
16 | 
17 | 
18 | def validation_FU3(path):
19 | result = {}
20 | 
21 | with open(path, newline='') as csvfile:
22 | reader = csv.reader(csvfile, delimiter=',')
23 | next(reader) # skip header
24 | for row in reader:
25 | psc1 = row[0]
26 | sex = row[1]
27 | result[psc1] = sex
28 | 
29 | return result
30 | 
31 | 
32 | def main():
33 | # read different sources
34 | with open('imagen_sex_recruitment.csv', 'r') as f:
35 | reader = csv.DictReader(f, dialect='excel')
36 | recruitment = {row['PSC1']:
row['Recruitment'] 37 | for row in reader} 38 | 39 | with open('imagen_sex_dataset.csv', 'r') as f: 40 | reader = csv.DictReader(f, dialect='excel') 41 | dataset = {row['PSC1']: 42 | (row['QualityReport.txt'] if 'QualityReport.txt' in row else None, 43 | row['BL MRI'] if 'BL MRI' in row else None, 44 | row['BL Cantab'] if 'BL Cantab' in row else None, 45 | row['FU2 MRI'] if 'FU2 MRI' in row else None, 46 | row['FU2 Cantab'] if 'FU2 Cantab' in row else None, 47 | row['FU3 MRI'] if 'FU3 MRI' in row else None, 48 | row['FU3 Cantab'] if 'FU3 Cantab' in row else None) 49 | for row in reader} 50 | 51 | with open('imagen_sex_psytools.csv', 'r') as f: 52 | reader = csv.DictReader(f, dialect='excel') 53 | psytools = {row['PSC1']: 54 | (row['Psytools BL'] if 'Psytools BL' in row else None, 55 | row['Psytools FU1'] if 'Psytools FU1' in row else None, 56 | row['Psytools FU2'] if 'Psytools FU2' in row else None, 57 | row['Psytools FU3'] if 'Psytools FU3' in row else None) 58 | for row in reader} 59 | 60 | with open('imagen_sex_xnat.csv', 'r') as f: 61 | reader = csv.DictReader(f, dialect='excel') 62 | xnat = {row['PSC1']: row['XNAT gender'] if 'XNAT gender' in row else None 63 | for row in reader} 64 | 65 | with open('imagen_sex_methylation.csv', 'r') as f: 66 | reader = csv.DictReader(f, dialect='excel') 67 | methylation = {row['PSC1']: 68 | (row['Methylation BL'] if 'Methylation BL' in row else None, 69 | row['Methylation FU'] if 'Methylation FU' in row else None) 70 | for row in reader} 71 | 72 | validation = validation_FU3(FU3_VALIDATION) 73 | 74 | # merge sources 75 | psc1s = set() 76 | for source in (recruitment, psytools, xnat, validation, methylation): 77 | psc1s = psc1s.union(set(source.keys())) 78 | psc1s = psc1s.intersection(set(PSC2_FROM_PSC1.keys())) # LONDON recruitment file 79 | 80 | with open('imagen_sex.csv', 'w', newline='') as csvfile: 81 | fieldnames = ['PSC1', 82 | 'Recruitment', 83 | 'QualityReport.txt', 'MRI BL', 'Cantab BL', 'MRI FU2', 'Cantab FU2', 'MRI FU3', 'Cantab FU3', 84 | 'Psytools BL', 'Psytools FU1', 'Psytools FU2', 'Psytools FU3', 85 | 'XNAT gender', 86 | '2018 validation', 87 | 'Reference', 88 | 'Methylation BL', 'Methylation FU'] 89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 90 | writer.writeheader() 91 | 92 | for psc1 in sorted(psc1s): 93 | row = {} 94 | if psc1 in recruitment: 95 | row['Recruitment'] = recruitment[psc1] 96 | if psc1 in dataset: 97 | if dataset[psc1][0]: 98 | row['QualityReport.txt'] = dataset[psc1][0] 99 | if dataset[psc1][1]: 100 | row['MRI BL'] = dataset[psc1][1] 101 | if dataset[psc1][2]: 102 | row['Cantab BL'] = dataset[psc1][2] 103 | if dataset[psc1][3]: 104 | row['MRI FU2'] = dataset[psc1][3] 105 | if dataset[psc1][4]: 106 | row['Cantab FU2'] = dataset[psc1][4] 107 | if dataset[psc1][5]: 108 | row['MRI FU3'] = dataset[psc1][5] 109 | if dataset[psc1][6]: 110 | row['Cantab FU3'] = dataset[psc1][6] 111 | if psc1 in psytools: 112 | if psytools[psc1][0]: 113 | row['Psytools BL'] = psytools[psc1][0] 114 | if psytools[psc1][1]: 115 | row['Psytools FU1'] = psytools[psc1][1] 116 | if psytools[psc1][2]: 117 | row['Psytools FU2'] = psytools[psc1][2] 118 | if psytools[psc1][3]: 119 | row['Psytools FU3'] = psytools[psc1][3] 120 | if psc1 in xnat: 121 | row['XNAT gender'] = xnat[psc1] 122 | if psc1 in validation: 123 | row['2018 validation'] = validation[psc1] 124 | 125 | if psc1 in xnat and psc1 in validation: 126 | if xnat[psc1] != validation[psc1]: 127 | logging.warning('%s: changed XNAT %s into %s', 128 | psc1, xnat[psc1], 
validation[psc1])
129 | 
130 | values = set(row.values())
131 | if len(values) > 1:
132 | if psc1 in validation:
133 | row['Reference'] = validation[psc1]
134 | elif psc1 in xnat:
135 | row['Reference'] = xnat[psc1]
136 | else:
137 | logging.warning('%s: cannot derive a reference value for sex',
138 | psc1)
139 | else:
140 | row['Reference'] = next(iter(values))
141 | 
142 | if psc1 in methylation:
143 | if methylation[psc1][0]:
144 | row['Methylation BL'] = methylation[psc1][0]
145 | if methylation[psc1][1]:
146 | row['Methylation FU'] = methylation[psc1][1]
147 | 
148 | row['PSC1'] = psc1
149 | writer.writerow(row)
150 | 
151 | 
152 | if __name__ == "__main__":
153 | main()
154 | 
--------------------------------------------------------------------------------
/sex/imagen_sex_methylation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | import csv
5 | import logging
6 | 
7 | logging.basicConfig(level=logging.INFO)
8 | 
9 | WORKER_PROCESSES = 8
10 | 
11 | METHYLATION = '/neurospin/imagen/TODO/predicted_gender.csv'
12 | PSC1_FROM_CHIP = '/neurospin/imagen/TODO/PSC1/Associated PSC1 codes.csv'
13 | 
14 | FEMALE = 'F'
15 | MALE = 'M'
16 | 
17 | 
18 | def psc1_from_chip(path):
19 | result = {}
20 | 
21 | with open(path, newline='') as csvfile:
22 | reader = csv.reader(csvfile, delimiter=',')
23 | next(reader) # skip header
24 | for row in reader:
25 | chip = row[0]
26 | psc1 = row[1]
27 | if psc1.endswith('FU'):
28 | psc1 = psc1[:-len('FU')]
29 | timepoint = 'FU2'
30 | else:
31 | timepoint = 'BL'
32 | result[chip] = (psc1, timepoint)
33 | 
34 | return result
35 | 
36 | 
37 | def methylation_process(path, psc1_from_chip):
38 | result_BL = {}
39 | result_FU2 = {}
40 | 
41 | with open(path, newline='') as csvfile:
42 | reader = csv.reader(csvfile, delimiter=',')
43 | next(reader) # skip header
44 | for row in reader:
45 | chip = row[0]
46 | sex = row[1]
47 | if sex == '1':
48 | sex = FEMALE
49 | elif sex == '2':
50 | sex = MALE
51 | else:
52 | logging.error('%s: incorrect sex (%s) in prediction CSV file: %s',
53 | chip, sex, path)
54 | continue
55 | if chip in psc1_from_chip:
56 | psc1, timepoint = psc1_from_chip[chip]
57 | if timepoint == 'FU2':
58 | result = result_FU2
59 | elif timepoint == 'BL':
60 | result = result_BL
61 | else:
62 | logging.error('%s: incorrect conversion table', chip)
63 | continue
64 | if psc1 in result:
65 | if result[psc1] != sex:
66 | logging.error('%s: inconsistent sex from methylation', psc1)
67 | result[psc1] = '?'
68 | else: 69 | result[psc1] = sex 70 | 71 | return result_BL, result_FU2 72 | 73 | 74 | def main(): 75 | psc1_from_chip_table = psc1_from_chip(PSC1_FROM_CHIP) 76 | methylation_BL, methylation_FU2 = methylation_process(METHYLATION, psc1_from_chip_table) 77 | methylation = (methylation_BL, methylation_FU2) 78 | 79 | with open('imagen_sex_methylation.csv', 'w', newline='') as csvfile: 80 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) 81 | sex.writerow(['PSC1', 82 | 'Methylation BL', 'Methylation FU']) 83 | psc1s = set() 84 | for timepoint in methylation: 85 | psc1s = psc1s.union(set(timepoint.keys())) 86 | for psc1 in sorted(psc1s): 87 | row = [psc1] 88 | for timepoint in methylation: 89 | if psc1 in timepoint: 90 | row.append(timepoint[psc1]) 91 | else: 92 | row.append(None) 93 | sex.writerow(row) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /sex/imagen_sex_psytools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from multiprocessing import Pool 5 | import csv 6 | from collections import Counter 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | BL_PSYTOOLS = '/neurospin/imagen/BL/RAW/PSC1/psytools' 12 | FU1_PSYTOOLS = '/neurospin/imagen/FU1/RAW/PSC1/psytools' 13 | FU2_PSYTOOLS = '/neurospin/imagen/FU2/RAW/PSC1/psytools' 14 | FU3_PSYTOOLS = '/neurospin/imagen/FU3/RAW/PSC1/psytools' 15 | 16 | WORKER_PROCESSES = 24 17 | 18 | 19 | FEMALE = 'F' 20 | MALE = 'M' 21 | 22 | _CSV_ID_CHECK_GENDER_MAPPING = { 23 | '1': MALE, 24 | '2': FEMALE, 25 | 'female': FEMALE, 26 | 'male': MALE, 27 | } 28 | 29 | _LSRC2_ID_CHECK_GENDER_MAPPING = { 30 | 'F': FEMALE, 31 | 'M': MALE, 32 | } 33 | 34 | 35 | def _psytools_choice(psc1, counter): 36 | female = counter[FEMALE] 37 | male = counter[MALE] 38 | total = female + male 39 | if female and male: 40 | logging.error('%s: inconsistent information about sex', psc1) 41 | return None 42 | elif female: 43 | return FEMALE 44 | elif male: 45 | return MALE 46 | else: 47 | logging.error('%s: cannot find information about sex', psc1) 48 | sex = None 49 | 50 | 51 | def list_psytools_timepoint(path): 52 | """List Psytools CSV files exported from Delosis. 53 | 54 | Parameters 55 | ---------- 56 | path : str 57 | Directory to read Psytools CSV files from. 58 | 59 | Yields 60 | ------ 61 | str 62 | Path to Psytools CSV file. 
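Each path is yielded as a pair (lsrc2, path) where the boolean
lsrc2 tells LSRC2-style exports from legacy Delosis exports.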
63 | 64 | """ 65 | CSV_PREFIX = ('IMAGEN-IMGN_', 'IMAGEN-cVEDA_') 66 | LSRC2_PREFIX = ('Imagen_', 'STRATIFY_') 67 | 68 | for f in os.listdir(path): 69 | root, ext = os.path.splitext(f) 70 | if ext == '.csv': 71 | if any(root.startswith(prefix) for prefix in CSV_PREFIX): 72 | yield (False, os.path.join(path, f)) 73 | elif any(root.startswith(prefix) for prefix in LSRC2_PREFIX): 74 | yield (True, os.path.join(path, f)) 75 | else: 76 | logging.error('skipping unknown CSV file: %s', f) 77 | 78 | 79 | def process_psytools_timepoint(arguments): 80 | (lsrc2, path) = arguments # unpack multiple arguments 81 | 82 | result = {} 83 | 84 | with open(path, 'r') as f: 85 | reader = csv.DictReader(f, dialect='excel') 86 | for row in reader: 87 | if lsrc2: 88 | psc1 = row['id'] 89 | if psc1.endswith('FU3'): 90 | psc1 = psc1[:-len('FU3')] 91 | elif psc1.endswith('FU2'): # Parent questionnaires 92 | psc1 = psc1[:-len('FU2')] 93 | if psc1.isdigit() and len(psc1) == 12: 94 | if 'IdCheckGender' in row: 95 | id_check_gender = row['IdCheckGender'] 96 | if id_check_gender in _LSRC2_ID_CHECK_GENDER_MAPPING: 97 | sex = _LSRC2_ID_CHECK_GENDER_MAPPING[id_check_gender] 98 | result.setdefault(psc1, []).append(sex) 99 | else: 100 | logging.error("%s: invalid 'IdCheckGender': %s", 101 | psc1, id_check_gender) 102 | else: 103 | logging.info('%s: cannot interpret as PSC1 code', psc1) 104 | else: 105 | completed = row['Completed'] 106 | trial = row['Trial'] 107 | if completed == 't' and trial == "id_check_gender": 108 | psc1_suffix = row['User code'].rsplit('-', 1) 109 | psc1 = psc1_suffix[0] 110 | if psc1.isdigit() and len(psc1) == 12: 111 | trial_result = row['Trial result'] 112 | if trial_result in _CSV_ID_CHECK_GENDER_MAPPING: 113 | sex = _CSV_ID_CHECK_GENDER_MAPPING[trial_result] 114 | result.setdefault(psc1, []).append(sex) 115 | else: 116 | logging.error("%s: invalid 'id_check_gender': %s", 117 | psc1, trial_result) 118 | else: 119 | logging.info('%s: cannot interpret as PSC1 code', psc1) 120 | 121 | return result 122 | 123 | 124 | def _decide_from_counter(counter): 125 | female = counter[FEMALE] 126 | male = counter[MALE] 127 | total = sum(counter.values()) 128 | if total: 129 | if female > male: 130 | sex = FEMALE 131 | percentage = ((200 * female) // total + 1) // 2 # closest integer percentage 132 | elif male > female: 133 | sex = MALE 134 | percentage = ((200 * male) // total + 1) // 2 # closest integer percentage 135 | else: 136 | sex = None 137 | percentage = 50 138 | else: 139 | sex = None 140 | percentage = None 141 | 142 | return sex, percentage 143 | 144 | 145 | def psytools_timepoint(path): 146 | todo_list = list(list_psytools_timepoint(path)) 147 | 148 | pool = Pool(WORKER_PROCESSES) 149 | results = pool.map(process_psytools_timepoint, todo_list) 150 | pool.close() 151 | pool.join() 152 | 153 | sex_counter = {} 154 | for result in results: 155 | for psc1, sex in result.items(): 156 | sex_counter.setdefault(psc1, Counter()).update(sex) 157 | 158 | return {psc1: _decide_from_counter(counter) 159 | for psc1, counter in sex_counter.items()} 160 | 161 | 162 | def main(): 163 | psytools_BL = psytools_timepoint(BL_PSYTOOLS) 164 | psytools_FU1 = psytools_timepoint(FU1_PSYTOOLS) 165 | psytools_FU2 = psytools_timepoint(FU2_PSYTOOLS) 166 | psytools_FU3 = psytools_timepoint(FU3_PSYTOOLS) 167 | psytools = (psytools_BL, psytools_FU1, psytools_FU2, psytools_FU3) 168 | 169 | with open('imagen_sex_psytools.csv', 'w', newline='') as csvfile: 170 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) 171 | 
sex.writerow(['PSC1',
172 | 'Psytools BL', 'Psytools FU1',
173 | 'Psytools FU2', 'Psytools FU3'])
174 | psc1s = set()
175 | for timepoint in psytools:
176 | psc1s = psc1s.union(set(timepoint.keys()))
177 | for psc1 in sorted(psc1s):
178 | row = [psc1]
179 | for timepoint in psytools:
180 | if psc1 in timepoint:
181 | row.append(timepoint[psc1][0])
182 | else:
183 | row.append(None)
184 | sex.writerow(row)
185 | 
186 | if any(psc1 in timepoint and timepoint[psc1][1] != 100
187 | for timepoint in psytools):
188 | s = '%s: inconsistent sex:'
189 | if psc1 in psytools_BL:
190 | s += '\n\tBL: {} {}%%'.format(psytools_BL[psc1][0], psytools_BL[psc1][1])
191 | if psc1 in psytools_FU1:
192 | s += '\n\tFU1: {} {}%%'.format(psytools_FU1[psc1][0], psytools_FU1[psc1][1])
193 | if psc1 in psytools_FU2:
194 | s += '\n\tFU2: {} {}%%'.format(psytools_FU2[psc1][0], psytools_FU2[psc1][1])
195 | if psc1 in psytools_FU3:
196 | s += '\n\tFU3: {} {}%%'.format(psytools_FU3[psc1][0], psytools_FU3[psc1][1])
197 | logging.warning(s, psc1)
198 | 
199 | 
200 | if __name__ == "__main__":
201 | main()
202 | 
--------------------------------------------------------------------------------
/sex/imagen_sex_recruitment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | from multiprocessing import Pool
5 | import csv
6 | from datetime import datetime
7 | import logging
8 | from collections import Counter
9 | logging.basicConfig(level=logging.INFO)
10 | 
11 | BL_RECRUITMENT_INFO = os.path.join('/neurospin/imagen/BL/RAW/PSC1/recruitment')
12 | 
13 | WORKER_PROCESSES = 16
14 | 
15 | 
16 | FEMALE = 'F'
17 | MALE = 'M'
18 | 
19 | _RECRUITMENT_SEX_MAPPING = {
20 | 'f': FEMALE,
21 | 'F': FEMALE,
22 | 'm': MALE,
23 | 'M': MALE,
24 | 'w': FEMALE,
25 | }
26 | 
27 | _RECRUITMENT_SEX_VOID = {
28 | '',
29 | '0',
30 | '0.0',
31 | 'Test',
32 | 'not known',
33 | }
34 | 
35 | 
36 | def _recruitment_center(s):
37 | s = s.strip()
38 | 
39 | if set(s).issubset('12345678.0'):
40 | if '.' in s:
41 | try:
42 | s = float(s)
43 | except ValueError:
44 | logging.info('%s: cannot interpret as center code', s)
45 | return None
46 | else:
47 | s = str(int(s // 1)) # integral part
48 | if len(s) == 1:
49 | return s
50 | else:
51 | logging.error('%s: incorrect center code', s)
52 | else:
53 | logging.debug('%s: skipping center code', s)
54 | 
55 | return None
56 | 
57 | 
58 | def _recruitment_psc1(s, center):
59 | s = s.strip()
60 | 
61 | if s.isdigit():
62 | if len(s) < 7:
63 | s = '0' + center + s.zfill(10)
64 | if len(s) == 12:
65 | return s
66 | else:
67 | logging.error('%s: incorrect PSC1 code', s)
68 | elif s:
69 | logging.warning('%s: cannot interpret as PSC1 code', s)
70 | else:
71 | logging.debug('empty PSC1 code')
72 | 
73 | return None
74 | 
75 | 
76 | def _recruitment_choice(psc1, timestamps):
77 | # use data with most recent time stamp
78 | counter = Counter(timestamps[max(timestamps.keys())])
79 | 
80 | female = counter[FEMALE]
81 | male = counter[MALE]
82 | if female and male:
83 | logging.error('%s: inconsistent information about sex', psc1)
84 | return None
85 | elif female:
86 | return FEMALE
87 | elif male:
88 | return MALE
89 | else:
90 | logging.error('%s: cannot find information about sex', psc1)
91 | return None
92 | 
93 | 
94 | def list_recruitment_BL(path):
95 | """List recruitment CSV files sent by recruitment centres.
96 | 
97 | Parameters
98 | ----------
99 | path : str
100 | Directory to read CSV recruitment files from.
101 | 
102 | Yields
103 | ------
104 | str
105 | Path to CSV file.
106 | 107 | """ 108 | for f in os.listdir(path): 109 | root, ext = os.path.splitext(f) 110 | if ext == '.csv': 111 | yield os.path.join(path, f) 112 | 113 | 114 | def process_recruitment_BL(path): 115 | timestamp = os.path.getmtime(path) 116 | 117 | recruitment_sex = {} 118 | 119 | with open(path, encoding='latin1', newline='') as csvfile: 120 | recruitment = csv.reader(csvfile, delimiter=',') 121 | for row in recruitment: 122 | center = _recruitment_center(row[0]) 123 | if center: 124 | psc1 = _recruitment_psc1(row[1], center) 125 | if psc1: 126 | gender = row[2].strip() 127 | if gender in _RECRUITMENT_SEX_MAPPING: 128 | sex = _RECRUITMENT_SEX_MAPPING[gender] 129 | if psc1 in recruitment_sex: 130 | if recruitment_sex[psc1] != sex: 131 | logging.error('%s: inconsistent duplicate line', 132 | psc1) 133 | else: 134 | logging.error('%s: duplicate line', 135 | psc1) 136 | else: 137 | recruitment_sex[psc1] = sex 138 | elif gender not in _RECRUITMENT_SEX_VOID: 139 | logging.error("%s: incorrect 'gender': %s", 140 | psc1, gender) 141 | 142 | return timestamp, recruitment_sex 143 | 144 | 145 | def recruitment_BL(path): 146 | """Process CSV recruitment files sent by recruitment centres at baseline. 147 | 148 | First list the files to process, then read these files in parallel. 149 | 150 | Parameters 151 | ---------- 152 | path : str 153 | Directory to read CSV recruitment files from. 154 | 155 | Returns 156 | ------- 157 | dict 158 | Key is PSC1 and value a pair (xnat_sex, xnat_experiment_sex). 159 | 160 | """ 161 | todo_list = list(list_recruitment_BL(path)) 162 | 163 | pool = Pool(WORKER_PROCESSES) 164 | results = pool.map(process_recruitment_BL, todo_list) 165 | pool.close() 166 | pool.join() 167 | 168 | sex_by_timestamp = {} 169 | for timestamp, result in results: 170 | for psc1, sex in result.items(): 171 | sex_by_timestamp.setdefault(psc1, {})[timestamp] = result[psc1] 172 | 173 | recruitment_sex = {} 174 | for psc1, timestamps in sex_by_timestamp.items(): 175 | max_timestamp = max(timestamps) 176 | sex = timestamps[max_timestamp] 177 | for k, v in timestamps.items(): 178 | if v != sex: 179 | logging.error("%s: inconsistent 'gender' across time stamps\n" 180 | '\t%s: %s\n' 181 | '\t%s: %s', 182 | psc1, 183 | datetime.fromtimestamp(k).date(), v, 184 | datetime.fromtimestamp(max_timestamp).date(), sex) 185 | recruitment_sex[psc1] = sex 186 | 187 | return recruitment_sex 188 | 189 | 190 | def main(): 191 | recruitment = recruitment_BL(BL_RECRUITMENT_INFO) 192 | 193 | with open('imagen_sex_recruitment.csv', 'w', newline='') as csvfile: 194 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) 195 | sex.writerow(['PSC1', 'Recruitment']) 196 | for psc1 in sorted(recruitment): 197 | row = [psc1] 198 | row.append(recruitment[psc1]) 199 | sex.writerow(row) 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | -------------------------------------------------------------------------------- /sex/imagen_sex_xnat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from multiprocessing import Pool 5 | from xml.etree import ElementTree 6 | from imagen_databank import PSC1_FROM_PSC2 7 | import csv 8 | import logging 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | 12 | BL_XNAT = '/neurospin/imagen/export/xml' 13 | 14 | WORKER_PROCESSES = 16 15 | 16 | 17 | FEMALE = 'F' 18 | MALE = 'M' 19 | 20 | _XNAT_GENDER_MAPPING = { 21 | 'female': FEMALE, 22 | 'male': MALE, 23 | } 24 | 25 | 
_XNAT_EXPERIMENT_GENDER_MAPPING = {
26 | 'f': FEMALE,
27 | 'F': FEMALE,
28 | 'm': MALE,
29 | 'M': MALE,
30 | 'w': FEMALE,
31 | 'female': FEMALE, # single occurrence!
32 | }
33 | 
34 | _XNAT_EXPERIMENT_GENDER_VOID = {
35 | '0',
36 | 'Test',
37 | 'not known',
38 | }
39 | 
40 | 
41 | def list_xnat_BL(path):
42 | """List XML files exported from XNAT.
43 | 
44 | Yields only files with standard names:
45 | IMAGEN_<PSC2>.xml
46 | 
47 | Parameters
48 | ----------
49 | path : str
50 | Directory to read XML files from.
51 | 
52 | Yields
53 | ------
54 | tuple of str
55 | Yields a pair (psc2, path).
56 | 
57 | """
58 | for f in os.listdir(path):
59 | root, ext = os.path.splitext(f)
60 | if ext == '.xml':
61 | PREFIX = 'IMAGEN_'
62 | if root.startswith(PREFIX):
63 | psc2 = root[len(PREFIX):]
64 | logging.debug('%s: found XML file: %s', psc2, f)
65 | assert(psc2.isdigit() and len(psc2) == 12)
66 | yield (psc2, os.path.join(path, f))
67 | else:
68 | logging.error('unexpected XML file: %s', f)
69 | else:
70 | logging.debug('skipping non-XML file: %s', f)
71 | 
72 | 
73 | def process_xnat_BL(arguments):
74 | """Read subject sex from XML file exported from XNAT.
75 | 
76 | Looks for this information in two distinct places.
77 | 
78 | Parameters
79 | ----------
80 | arguments : tuple of str
81 | Expects a pair (psc2, path).
82 | 
83 | Returns
84 | -------
85 | tuple of str
86 | A pair (xnat_sex, xnat_experiment_sex).
87 | 
88 | """
89 | (psc2, path) = arguments # unpack multiple arguments
90 | 
91 | tree = ElementTree.parse(path)
92 | root = tree.getroot()
93 | 
94 | xnat_sex = None
95 | xnat_gender = root.find('.//{http://nrg.wustl.edu/xnat}gender')
96 | if xnat_gender is None:
97 | logging.warning("%s: missing 'gender' in XML file", psc2)
98 | else:
99 | xnat_gender = xnat_gender.text
100 | if xnat_gender in _XNAT_GENDER_MAPPING:
101 | xnat_sex = _XNAT_GENDER_MAPPING[xnat_gender]
102 | else:
103 | logging.error("%s: incorrect 'gender' (%s) in XML file",
104 | psc2, xnat_gender)
105 | 
106 | xnat_experiment_sex = None
107 | xnat_experiment_gender = root.find('.//{http://nrg.wustl.edu/xnat}experiment[@gender]')
108 | if xnat_experiment_gender is None:
109 | logging.warning("%s: missing 'experiment[@gender]' in XML file", psc2)
110 | else:
111 | xnat_experiment_gender = xnat_experiment_gender.attrib['gender']
112 | xnat_experiment_gender = xnat_experiment_gender.strip()
113 | if xnat_experiment_gender in _XNAT_EXPERIMENT_GENDER_MAPPING:
114 | xnat_experiment_sex = _XNAT_EXPERIMENT_GENDER_MAPPING[xnat_experiment_gender]
115 | elif xnat_experiment_gender not in _XNAT_EXPERIMENT_GENDER_VOID:
116 | logging.error("%s: incorrect 'experiment[@gender]' (%s) in XML file",
117 | psc2, xnat_experiment_gender)
118 | 
119 | return xnat_sex, xnat_experiment_sex
120 | 
121 | 
122 | def xnat_BL(path):
123 | """Process XML files exported from XNAT.
124 | 
125 | First list the files to process, then read these files in parallel.
126 | 
127 | Parameters
128 | ----------
129 | path : str
130 | Directory to read XML files from.
131 | 
132 | Returns
133 | -------
134 | dict
135 | Key is PSC2 and value a pair (xnat_sex, xnat_experiment_sex).
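Either element of the pair may be None when the corresponding
information is missing from the XML file.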
136 | 
137 | """
138 | todo_list = list(list_xnat_BL(path))
139 | 
140 | pool = Pool(WORKER_PROCESSES)
141 | results = pool.map(process_xnat_BL, todo_list)
142 | pool.close()
143 | pool.join()
144 | 
145 | psc2s, dummy_paths = zip(*todo_list)
146 | return dict(zip(psc2s, results))
147 | 
148 | 
149 | def main():
150 | xnat = xnat_BL(BL_XNAT)
151 | 
152 | xnat = {PSC1_FROM_PSC2[psc2]: v for psc2, v in xnat.items()}
153 | 
154 | with open('imagen_sex_xnat.csv', 'w', newline='') as csvfile:
155 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
156 | sex.writerow(['PSC1',
157 | 'XNAT gender'])
158 | for psc1 in sorted(xnat):
159 | row = [psc1]
160 | if xnat[psc1][0] and xnat[psc1][1]:
161 | if xnat[psc1][0] != xnat[psc1][1]:
162 | logging.error("%s: inconsistent 'gender' (%s) / 'experiment@gender' (%s)",
163 | psc1, xnat[psc1][0], xnat[psc1][1])
164 | row.append('?')
165 | else:
166 | row.append(xnat[psc1][0])
167 | elif xnat[psc1][0]:
168 | row.append(xnat[psc1][0])
169 | elif xnat[psc1][1]:
170 | row.append(xnat[psc1][1])
171 | else:
172 | row.append(None)
173 | sex.writerow(row)
174 | 
175 | 
176 | if __name__ == "__main__":
177 | main()
178 | 
--------------------------------------------------------------------------------
/stratify_demographics/demographics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | 
4 | import os
5 | from csv import reader
6 | from csv import DictWriter
7 | import xlrd
8 | from imagen_databank import PSC2_FROM_PSC1, CENTER_NAME
9 | 
10 | import logging
11 | logging.basicConfig(level=logging.ERROR)
12 | 
13 | 
14 | _DEBUG_PSYTOOLS_SEX = '/imagen/STRATIFY/RAW/PSC1/meta_data/STRATIFY_SEX_2024-10-17.txt'
15 | 
16 | _DEMOGRAPHIC_RECORDS_DIR = '/imagen/STRATIFY/RAW/PSC1/meta_data'
17 | _DEMOGRAPHIC_RECORDS = [
18 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_SOUTHAMPTON_2024-10-16.xlsx'),
19 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_LONDON_2024-03-14.xlsx'),
20 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'ESTRA_recruitment_file_LONDON_2024-08-16.xlsx'),
21 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'ESTRA_recruitment_file_LONDON_CONTROLS_2023-07-24.xlsx'),
22 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_BERLIN_2024-10-16.xlsx'),
23 | ]
24 | 
25 | _FINAL_COLUMNS = (
26 | 'PSC2',
27 | 'sex',
28 | 'recruitment site',
29 | 'scanning site',
30 | 'patient group',
31 | 'complete',
32 | #'missing data',
33 | )
34 | 
35 | _DEMOGRAPHIC_COLUMNS = {
36 | # handle separately 'PSC1 Code'
37 | # Stratify
38 | 'Sex': _FINAL_COLUMNS[1],
39 | 'Acquisition Centre (and Scanning Site)': _FINAL_COLUMNS[3],
40 | 'Acquisition Centre': _FINAL_COLUMNS[3],
41 | 'Patient Group': _FINAL_COLUMNS[4],
42 | 'Fully Complete?
Y/N': _FINAL_COLUMNS[5], 43 | #'Missing Data (Please Specify)': _FINAL_COLUMNS[6], 44 | # ESTRA 45 | # (skip 'Recruitment Centre') 46 | 'Scanning Site': _FINAL_COLUMNS[3], 47 | 'Gender ': _FINAL_COLUMNS[1], 48 | 'Diagnosis ': _FINAL_COLUMNS[4], 49 | 'Diagnosis': _FINAL_COLUMNS[4], 50 | # Stratify 20 additional controls 51 | 'Site': _FINAL_COLUMNS[3], 52 | 'Group': _FINAL_COLUMNS[4], 53 | 'Gender': _FINAL_COLUMNS[1], 54 | # LONDON CONTROLS 55 | 56 | # BERLIN 57 | 'sex': _FINAL_COLUMNS[1], 58 | 'scanning site': _FINAL_COLUMNS[3], 59 | 'patient group': _FINAL_COLUMNS[4], 60 | 'complete': _FINAL_COLUMNS[5], 61 | #'missing data': _FINAL_COLUMNS[6], 62 | } 63 | 64 | _CONTROL_GROUP = 'Control' 65 | _CONTROL_GROUP_ESTRA = 'Control_ESTRA' 66 | _ADHD_GROUP = 'ADHD' 67 | _AUD_GROUP = 'AUD' 68 | _AN_GROUP = 'AN' 69 | _RECAN_GROUP = 'recAN' 70 | _BN_GROUP = 'BN' 71 | _RECBN_GROUP = 'recBN' 72 | _MDD_GROUP = 'MDD' 73 | _PSYCHOSIS_GROUP = 'Psychosis' 74 | _BED_GROUP= 'BED' 75 | 76 | _PATIENT_GROUPS = { 77 | _CONTROL_GROUP, 78 | _CONTROL_GROUP_ESTRA, 79 | _ADHD_GROUP, 80 | _AUD_GROUP, 81 | _AN_GROUP, 82 | _RECAN_GROUP, 83 | _BN_GROUP, 84 | _RECBN_GROUP, 85 | _MDD_GROUP, 86 | _PSYCHOSIS_GROUP, 87 | _BED_GROUP, 88 | } 89 | 90 | 91 | def normalize_patient_group(s): 92 | table = { 93 | 'control': _CONTROL_GROUP, 94 | 'Control_ESTRA': _CONTROL_GROUP_ESTRA, 95 | 'depression': _MDD_GROUP, 96 | 'psychosis': _PSYCHOSIS_GROUP, 97 | 'Alcohol Use Disorder': _AUD_GROUP, 98 | 'Major Depressive Disorder': _MDD_GROUP, 99 | 'Healthy Control': _CONTROL_GROUP, 100 | 101 | } 102 | if s in table: 103 | s = table[s] 104 | 105 | return s 106 | 107 | 108 | def normalize_scanning_site(s): 109 | table = { 110 | # LONDON: 'CNS' or 'Invicro' 111 | 'KCL': 'CNS', 112 | 'Denmark Hill': 'CNS', 113 | # SOUTHAMPTON 114 | 'Southampton': None, 115 | # BERLIN 116 | 'BERLIN': None, 117 | } 118 | if s in table: 119 | s = table[s] 120 | 121 | return s 122 | 123 | 124 | def normalize_sex(s): 125 | s = s.upper() 126 | 127 | table = { 128 | 'FEMALE': 'F', 129 | 'MALE': 'M', 130 | } 131 | if s in table: 132 | s = table[s] 133 | 134 | return s 135 | 136 | 137 | def strip_cell(s): 138 | try: 139 | s = s.strip() 140 | except AttributeError: # floats and other types 141 | pass 142 | return s 143 | 144 | 145 | def read_demographic_record(path): 146 | demographics = {} 147 | 148 | with xlrd.open_workbook(path) as workbook: 149 | worksheet = workbook.sheet_by_index(0) 150 | 151 | # read header 152 | psc1_index = None 153 | index = {} 154 | row = [strip_cell(x) for x in worksheet.row_values(0)] 155 | print(path) 156 | for i, value in enumerate(row): 157 | if value in _DEMOGRAPHIC_COLUMNS: 158 | index[_DEMOGRAPHIC_COLUMNS[value]] = i 159 | print(i, value, '→', _DEMOGRAPHIC_COLUMNS[value]) 160 | elif value == 'PSC1 Code' or value == 'PSC1': 161 | psc1_index = i 162 | else: 163 | print(i, value, '→', '?????') 164 | 165 | if psc1_index is None: 166 | logging.error('%s: cannot find PSC1 code', path) 167 | return demographics 168 | 169 | # read data 170 | for i in range(1, worksheet.nrows): 171 | row = [strip_cell(x) for x in worksheet.row_values(i)] 172 | 173 | psc1 = row[psc1_index] 174 | psc1 = psc1[:12] # remove trailing FU3 or SB 175 | if psc1 not in PSC2_FROM_PSC1: 176 | logging.error('%s: invalid PSC1 code', psc1) 177 | continue 178 | 179 | demographics[psc1] = {} 180 | 181 | for name, i in index.items(): 182 | value = row[i] 183 | if name == 'sex': 184 | value = normalize_sex(value) 185 | if value not in {'F', 'M'}: 186 | logging.error('%s: invalid sex: 
                        continue
                elif name == 'patient group':
                    value = normalize_patient_group(value)
                    if value not in _PATIENT_GROUPS:
                        logging.error('%s: invalid patient group: %s',
                                      psc1, value)
                        continue
                elif name == 'scanning site':
                    value = normalize_scanning_site(value)
                elif name == 'complete':
                    if value not in {'Y', 'N', ''}:
                        logging.error('%s: invalid completeness: %s',
                                      psc1, value)
                        continue
                elif name == 'missing data':
                    value = value.rstrip(',.')
                    if value.lower() == 'none':
                        value = None
                demographics[psc1][name] = value

    return demographics


def read_demographic_records(paths):
    """Merge the demographic records read from a list of Excel files."""
    demographic_records = {}

    for path in paths:
        demographic_records.update(read_demographic_record(path))

    return demographic_records


def main():
    demographics = read_demographic_records(_DEMOGRAPHIC_RECORDS)

    with open(_DEBUG_PSYTOOLS_SEX, 'r') as sex_file:
        sex_reader = reader(sex_file, dialect='excel')

        with open('STRATIFY_participants.csv', 'w') as demographics_file:
            demographics_writer = DictWriter(demographics_file,
                                             _FINAL_COLUMNS,
                                             dialect='excel')
            demographics_writer.writeheader()
            for row in sex_reader:
                psc1 = row[0]
                psc2 = PSC2_FROM_PSC1[psc1]
                center = int(psc1[1])
                if center > 8:  # center codes greater than 8 span two digits
                    center = int(psc1[1:3])
                center = CENTER_NAME[center]
                sex = row[1]
                if psc1 in demographics:
                    data = demographics[psc1]
                    data['PSC2'] = psc2
                    data['recruitment site'] = center
                    if 'sex' in data:
                        if data['sex'] != sex:
                            logging.error('%s: inconsistent sex between '
                                          'Psytools and recruitment file',
                                          psc1)
                    data['sex'] = sex
                else:
                    data = {
                        'PSC2': psc2,
                        'sex': sex,
                        'recruitment site': center,
                    }
                row = {x: data.get(x) for x in _FINAL_COLUMNS}
                demographics_writer.writerow(row)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/stratify_demographics/stratify_debug_psytools.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
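"""Cross-check sex and date of birth of STRATIFY subjects.

Tally the sex and date of birth values reported for each PSC1 code in
Psytools CSV exports and Cantab datasheets, log conflicting values for
manual validation, and write consolidated ``STRATIFY_SEX_<date>.txt``
and ``STRATIFY_DOB_<date>.txt`` files.
"""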

import os
from multiprocessing import Pool
import csv
from datetime import datetime, date
from collections import Counter
import logging

logging.basicConfig(level=logging.INFO)

STRATIFY_PSYTOOLS = '/neurospin/imagen/STRATIFY/RAW/PSC1/psytools'
STRATIFY_DOB = '/neurospin/imagen/STRATIFY/RAW/PSC1/meta_data/dob_validation.csv'
STRATIFY_SEX = '/neurospin/imagen/STRATIFY/RAW/PSC1/meta_data/sex_validation.csv'

WORKER_PROCESSES = 24


FEMALE = 'F'
MALE = 'M'

_CSV_ID_CHECK_GENDER_MAPPING = {
    '1': MALE,
    '2': FEMALE,
    'female': FEMALE,
    'male': MALE,
}

_LSRC2_ID_CHECK_GENDER_MAPPING = {
    'F': FEMALE,
    'M': MALE,
}

_CANTAB_GENDER_MAPPING = {
    'Female': FEMALE,
    'Male': MALE,
}


def list_psytools_timepoint(path):
    """List Psytools CSV files exported from Delosis.

    Parameters
    ----------
    path : str
        Directory to read Psytools CSV files from.

    Yields
    ------
    tuple
        3-tuple: True for LSRC2 files and False for legacy CSV files,
        path to the Psytools CSV file, base name of the file.

    """
    CSV_PREFIX = ('IMAGEN-', 'STRATIFY-')
    LSRC2_PREFIX = ('Imagen_', 'STRATIFY_Core')  # exclude STRATIFY_Screening

    for f in os.listdir(path):
        root, ext = os.path.splitext(f)
        if ext == '.csv':
            if any(root.startswith(prefix) for prefix in CSV_PREFIX):
                yield (False, os.path.join(path, f), root)
            elif any(root.startswith(prefix) for prefix in LSRC2_PREFIX):
                yield (True, os.path.join(path, f), root)
            else:
                logging.error('skipping unknown CSV file: %s', f)


def process_psytools_timepoint(arguments):
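    """Tally sex and date of birth values found in a single Psytools file.

    Returns a ``(sex_counter, dob_counter)`` pair of nested dicts, keyed
    by PSC1 code, then value, counting the variables reporting each value.
    """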
    (lsrc2, path, name) = arguments  # unpack multiple arguments

    sex_counter = {}
    dob_counter = {}

    with open(path, 'r') as f:
        reader = csv.DictReader(f, dialect='excel')
        for row in reader:
            if lsrc2:
                psc1 = row['id']
                if psc1.endswith('SB'):
                    psc1 = psc1[:-len('SB')]
                if psc1.endswith('FU'):
                    psc1 = psc1[:-len('FU')]
                if psc1.isdigit() and len(psc1) == 12:
                    if 'IdCheckGender' in row:
                        id_check_gender = row['IdCheckGender']
                        if id_check_gender in _LSRC2_ID_CHECK_GENDER_MAPPING:
                            id_check_gender = _LSRC2_ID_CHECK_GENDER_MAPPING[id_check_gender]
                            sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update(('IdCheckGender',))
                        elif id_check_gender:
                            logging.error("%s: %s: invalid 'IdCheckGender': %s",
                                          name, psc1, id_check_gender)
                        else:
                            logging.debug("%s: %s: empty 'IdCheckGender': %s",
                                          name, psc1, id_check_gender)
                    if 'IdCheckDob' in row:
                        id_check_dob = row['IdCheckDob']
                        try:
                            id_check_dob = datetime.strptime(id_check_dob, '%Y-%m-%d %H:%M:%S')
                        except ValueError:
                            if id_check_dob:
                                logging.error("%s: %s: invalid 'IdCheckDob': %s",
                                              name, psc1, id_check_dob)
                            else:
                                logging.debug("%s: %s: empty 'IdCheckDob': %s",
                                              name, psc1, id_check_dob)
                        else:
                            id_check_dob = id_check_dob.date()
                            if id_check_dob.year > 2012 or id_check_dob.year < 1990:
                                logging.error("%s: %s: skip 'IdCheckDob': %d",
                                              name, psc1, id_check_dob.year)
                            else:
                                dob_counter.setdefault(psc1, {}).setdefault(id_check_dob, Counter()).update(('IdCheckDob',))
                else:
                    logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
            else:
                psc1_suffix = row['User code'].rsplit('-', 1)
                psc1 = psc1_suffix[0]
                if psc1.endswith('SB'):
                    psc1 = psc1[:-len('SB')]
                completed = row['Completed']
                if completed == 't':
                    trial = row['Trial']
                    if trial == 'id_check_gender':
                        if psc1.isdigit() and len(psc1) == 12:
                            trial_result = row['Trial result']
                            if trial_result in _CSV_ID_CHECK_GENDER_MAPPING:
                                id_check_gender = _CSV_ID_CHECK_GENDER_MAPPING[trial_result]
                                sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update((trial,))
                            else:
                                logging.error("%s: %s: invalid 'id_check_gender': %s",
                                              name, psc1, trial_result)
                        else:
                            logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
                    elif trial == 'ni_gender':
                        if psc1.isdigit() and len(psc1) == 12:
                            trial_result = row['Trial result']
                            if trial_result in _LSRC2_ID_CHECK_GENDER_MAPPING:
                                id_check_gender = _LSRC2_ID_CHECK_GENDER_MAPPING[trial_result]
                                sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update((trial,))
                            else:
                                logging.error("%s: %s: invalid 'ni_gender': %s",
                                              name, psc1, trial_result)
                        else:
                            logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
                    elif trial == 'id_check_dob':
                        if psc1.isdigit() and len(psc1) == 12:
                            trial_result = row['Trial result']
                            try:
                                month, year = trial_result.rsplit('_')
                                month = int(month)
                                year = int(year)
                            except ValueError:
                                logging.error("%s: %s: invalid 'id_check_dob': %s",
                                              name, psc1, trial_result)
                            else:
                                if year > 2012 or year < 1990:
                                    logging.error("%s: %s: skip 'id_check_dob': %d",
                                                  name, psc1, year)
                                else:
                                    dob_counter.setdefault(psc1, {}).setdefault((year, month), Counter()).update((trial,))
                        else:
                            logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)

    return sex_counter, dob_counter


def psytools_timepoint(path):
    """Tally sex and date of birth values across all Psytools files.

    Exact dates of birth absorb matching (year, month) values, so partial
    dates survive only when no exact date confirms them.
    """
    todo_list = list(list_psytools_timepoint(path))

    pool = Pool(WORKER_PROCESSES)
    results = pool.map(process_psytools_timepoint, todo_list)
    pool.close()
    pool.join()

    sex = {}
    dob = {}
    for (sex_counter, dob_counter), (lsrc2, path, name) in zip(results, todo_list):
        for psc1, values in sex_counter.items():
            for value, variables in values.items():
                for variable, count in variables.items():
                    sex.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update({name: count})
        for psc1, values in dob_counter.items():
            for value, variables in values.items():
                for variable, count in variables.items():
                    dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update({name: count})

    clean_dob = {}
    for psc1, values in dob.items():
        exact_dates = set()
        for value, variables in values.items():
            if isinstance(value, date):
                for variable, counter in variables.items():
                    exact_dates.add(value)
                    clean_dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update(counter)
        for value, variables in values.items():
            if isinstance(value, tuple):  # partial (year, month) date
                year, month = value
                for variable, counter in variables.items():
                    for d in exact_dates:
                        if d.year == year and d.month == month:
                            clean_dob.setdefault(psc1, {}).setdefault(d, {}).setdefault(variable, Counter()).update(counter)
                            break
                    else:
                        clean_dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update(counter)

    return sex, clean_dob


def cantab_timepoint(path):
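    """Collect sex from Cantab ``datasheet_<PSC1>SB.csv`` files.

    Returns a dict mapping PSC1 code to 'F' or 'M'.
    """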
    sex = {}
    for center in os.listdir(path):
        center_path = os.path.join(path, center)
        if os.path.isdir(center_path):
            for psc1 in os.listdir(center_path):
                psc1_path = os.path.join(center_path, psc1)
                if os.path.isdir(psc1_path):
                    if psc1.isdigit() and len(psc1) == 12:
                        additional_data_path = os.path.join(psc1_path, 'AdditionalData')
                        for f in os.listdir(additional_data_path):
                            if f.startswith('datasheet_'):
                                if f == ('datasheet_' + psc1 + 'SB.csv'):
                                    f_path = os.path.join(additional_data_path, f)
                                    with open(f_path, newline='') as csvfile:
                                        reader = csv.DictReader(csvfile)
                                        try:
                                            if 'Gender' not in reader.fieldnames:
                                                # retry with semicolon delimiter
                                                csvfile.seek(0)
                                                reader = csv.DictReader(csvfile, delimiter=';')
                                                if 'Gender' not in reader.fieldnames:
                                                    reader = None
                                        except (TypeError, csv.Error):  # fieldnames is None for empty files
                                            logging.error('bad cantab datasheet for %s', psc1)
                                            reader = None
                                        if reader is None:
                                            continue
                                        for row in reader:
                                            if 'Gender' in row:
                                                if row['Gender']:
                                                    sex[psc1] = _CANTAB_GENDER_MAPPING[row['Gender']]
                                                else:
                                                    logging.warning('%s: missing Gender value: %s', psc1, f)
                                            else:
                                                logging.warning('%s: missing Gender column (%s): %s', psc1, reader.fieldnames, f)
                                else:
                                    logging.error('%s: incorrect file name: %s', psc1, f)
                    else:
                        logging.debug('%s: not a PSC1 code', psc1)
                else:
                    logging.info('%s: not a directory', psc1)

    return sex


def main():
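    """Cross-check sex and date of birth and write consolidated files.

    Manually validated values take precedence; otherwise a unique value is
    written out and conflicting values are logged for manual validation.
    """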
    sex, dob = psytools_timepoint(STRATIFY_PSYTOOLS)
    cantab_sex = cantab_timepoint('/neurospin/imagen/STRATIFY/RAW/PSC1')

    validated_dob = {}
    with open(STRATIFY_DOB, 'r') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            validated_dob[row[0]] = datetime.strptime(row[1], '%Y-%m-%d').date()

    validated_sex = {}
    with open(STRATIFY_SEX, 'r') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            validated_sex[row[0]] = row[1]

    for psc1 in cantab_sex:
        if psc1 in sex:
            sex[psc1].setdefault(cantab_sex[psc1], {}).setdefault('Gender', Counter()).update({'datasheet_' + psc1 + 'SB': 1})
        else:
            logging.error('%s: found in Cantab but missing from Psytools', psc1)

    today = datetime.today()

    with open('STRATIFY_SEX_' + today.strftime('%Y-%m-%d') + '.txt', 'w') as f:
        for psc1, values in sex.items():
            if psc1 in validated_sex:
                print(','.join((psc1, validated_sex[psc1])), file=f)
            elif len(values) > 1:
                message = '{}: multiple sex values:\n'.format(psc1)
                for value, variables in values.items():
                    count_value = 0
                    message_variable = ''
                    for variable, counters in variables.items():
                        count_variable = 0
                        message_name = ''
                        for name, count in counters.items():
                            message_name += '\t\t\t{} ({})\n'.format(name, count)
                            count_variable += count
                        message_variable += '\t\t{} ({})\n'.format(variable, count_variable) + message_name
                        count_value += count_variable
                    message_value = '\t{} ({})\n'.format(value, count_value) + message_variable
                    message += message_value
                logging.error(message)
            else:
                value = next(iter(values.keys()))
                print(','.join((psc1, value)), file=f)

    with open('STRATIFY_DOB_' + today.strftime('%Y-%m-%d') + '.txt', 'w') as f:
        for psc1, values in dob.items():
            if psc1 in validated_dob:
                print(','.join((psc1, validated_dob[psc1].strftime('%Y-%m-%d'),
                                today.strftime('%Y-%m-%d_%H:%M:%S.0'))),
                      file=f)
            elif len(values) > 1:
                message = '{}: multiple date of birth values:\n'.format(psc1)
                for value, variables in values.items():
                    count_value = 0
                    message_variable = ''
                    for variable, counters in variables.items():
                        count_variable = 0
                        message_name = ''
                        for name, count in counters.items():
                            message_name += '\t\t\t{} ({})\n'.format(name, count)
                            count_variable += count
                        message_variable += '\t\t{} ({})\n'.format(variable, count_variable) + message_name
                        count_value += count_variable
                    message_value = '\t{} ({})\n'.format(value, count_value) + message_variable
                    message += message_value
                logging.error(message)
            else:
                value = next(iter(values.keys()))
                if isinstance(value, date):
                    value = value.strftime('%Y-%m-%d')
                    print(','.join((psc1, value,
                                    today.strftime('%Y-%m-%d_%H:%M:%S.0'))),
                          file=f)
                else:
                    logging.error('%s: skipping incomplete date: %s', psc1,
                                  str(value))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------