├── AUTHOR
├── LICENSE
├── MANIFEST.in
├── README.rst
├── cantab
│   ├── imagen_cantab_age_at_session_start_time.py
│   └── imagen_cantab_extract_deidentify.py
├── dawba
│   └── imagen_dawba_deidentify.py
├── genomics
│   ├── rna_seq_deidentify_imagen.py
│   └── rna_seq_deidentify_stratify.py
├── geolocation
│   └── geolocation.sh
├── imagen_databank
│   ├── __init__.py
│   ├── additional_data.py
│   ├── behavioral.py
│   ├── cantab.py
│   ├── core.py
│   ├── dicom_utils.py
│   ├── image_data.py
│   ├── sanity
│   │   ├── __init__.py
│   │   ├── cantab.py
│   │   └── imaging.py
│   └── scanning.py
├── mri
│   └── imagen_sample_FU3_mri_deidentify.py
├── onsets
│   ├── imagen_onsets_copy_FU3.sh
│   ├── imagen_onsets_copy_STRATIFY.sh
│   └── imagen_onsets_extract_deidentify.py
├── psc
│   └── imagen_update_dawba_codes_from_tokens.py
├── psytools
│   ├── imagen_psytools_deidentify.py
│   └── imagen_psytools_download.py
├── setup.py
├── sex
│   ├── imagen_sex.py
│   ├── imagen_sex_dataset.py
│   ├── imagen_sex_methylation.py
│   ├── imagen_sex_psytools.py
│   ├── imagen_sex_recruitment.py
│   └── imagen_sex_xnat.py
└── stratify_demographics
    ├── demographics.py
    └── stratify_debug_psytools.py
/AUTHOR:
--------------------------------------------------------------------------------
1 | Dimitri Papadopoulos
2 | David Goyard
3 | Antoine Grigis
4 | Vincent Frouin
5 | Robin Cherbonnier
6 | Thomas Gareau
7 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHOR LICENSE MANIFEST.in setup.py README.rst
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | =========================================
2 | Databank operations of the Imagen project
3 | =========================================
4 |
5 | Databank operations are mostly documented internally at NeuroSpin.
6 |
7 | Basic information is available from the `project wiki`_.
8 |
9 | This Python package combines a Python library *imagen_databank* for basic
10 | sanity check and preprocessing of Imagen data and a set of scripts to
11 | extract, check, anonymize and transform raw Imagen data.
12 |
13 | ``imagen_databank``
14 | Read and perform sanity checks on raw datasets.
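
A minimal usage sketch of the library (the datasheet filename is
hypothetical; see ``imagen_databank/cantab.py`` for the return values)::

    from imagen_databank import PSC2_FROM_PSC1, read_datasheet

    subject_ids, start_times, rows, columns, titles = read_datasheet(
        'datasheet_000012345678FU.csv')
    for psc1 in subject_ids:
        print(PSC2_FROM_PSC1[psc1[:12]])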
15 |
16 | ``cantab``
17 | Extract age from FU2 Cantab data.
18 |
19 | ``dawba``
20 | Remove identifying data and convert PSC1 to PSC2 in Dawba data,
21 | after manual download from the youthinmind_ server.
22 |
23 | ``stratify_demographics``
24 | Cross-check Stratify age and sex with ``stratify_debug_psytools.py``.
25 | Print demographics with ``demographics.py``, using recruitment files and
26 | the validated age/sex from the output of the previous script.
27 |
28 | ``geolocation``
29 | Merge and convert geolocation data from PSC1 to PSC2.
30 |
31 | ``mri``
32 | De-identify some NIfTI files that used to contain the PSC1 code.
33 |
34 | ``onsets``
35 | Remove identifying data and convert PSC1 to PSC2 in FU3 onsets files.
36 |
37 | ``psc``
38 | Update FU3 Dawba codes from token tables maintained on the Delosis_ server.
39 |
40 | ``psytools``
41 | Download Psytools data as CSV files from the Delosis_ server.
42 | Remove identifying data and convert PSC1 to PSC2.
43 |
44 | ``sex``
45 | Derive reference sex of Imagen subjects from multiple sources.
46 | The sex recorded at baseline contained errors.
47 |
48 | .. _`project wiki`: https://github.com/imagen2/imagen_databank/wiki
49 | .. _youthinmind: http://youthinmind.com
50 | .. _Delosis: https://www.delosis.com
51 |
--------------------------------------------------------------------------------
/cantab/imagen_cantab_age_at_session_start_time.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """...
3 |
4 | ==========
5 | Attributes
6 | ==========
7 |
8 | Input
9 | -----
10 |
11 | FU2_MASTER_DIR : str
12 | Location of FU2 PSC1-encoded data.
13 |
14 | Output
15 | ------
16 |
17 | Standard output: one "PSC2,age" line per datasheet, with age in days.
18 |
19 | """
20 |
21 | FU2_MASTER_DIR = '/neurospin/imagen/FU2/RAW/PSC1'
22 |
23 | import logging
24 | logger = logging.getLogger(__name__)
25 | logging.basicConfig(level=logging.INFO)
26 |
27 | import os
28 | import glob
29 | from datetime import date
30 |
31 | # import ../imagen_databank
32 | import sys
33 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
34 | from imagen_databank import PSC2_FROM_PSC1
35 | from imagen_databank import DOB_FROM_PSC1
36 | from imagen_databank import read_datasheet
37 |
38 |
39 | def main():
40 | # find datasheet_*.csv files
41 |     logger.info('start globbing datasheet_*.csv files')
42 | datasheets = glob.glob(os.path.join(FU2_MASTER_DIR,
43 | '*/*/AdditionalData/datasheet_*.csv'))
44 |     logger.info('finished globbing datasheet_*.csv files')
45 |
46 | for datasheet in datasheets:
47 | subject_ids, session_start_times, dummy_r, dummy_c, dummy_f = read_datasheet(datasheet)
48 | if len(subject_ids) != 1:
49 | logger.warning('Proper "Subject ID" not found: %s', datasheet)
50 | continue
51 | psc1 = subject_ids.pop()[:12]
52 |
53 | # find age
54 | if psc1 not in DOB_FROM_PSC1:
55 | logger.error('unknown age for PSC1 code %s: %s', psc1, datasheet)
56 | continue
57 | dob = DOB_FROM_PSC1[psc1]
58 | session_start_times = set(sst.date() for sst in session_start_times)
59 | if len(session_start_times) != 1:
60 | logger.warning('Proper "Session start time" not found: %s',
61 | datasheet)
62 | continue
63 | session_start_time = session_start_times.pop()
64 | if session_start_time < date(2007, 1, 1):
65 | logger.error('Bogus "Session start time" %s: %s',
66 | session_start_time, datasheet)
67 | continue
68 | age = (session_start_time - dob).days
69 |
70 | # find PSC2
71 | if psc1 not in PSC2_FROM_PSC1:
72 | logger.error('unknown PSC1 code %s: %s', psc1, datasheet)
73 | continue
74 | psc2 = PSC2_FROM_PSC1[psc1]
75 |
76 | print('{0},{1}'.format(psc2, age))
77 |
78 |
79 | if __name__ == "__main__":
80 | main()
81 |
--------------------------------------------------------------------------------
/dawba/imagen_dawba_deidentify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Re-encode and anonymize DAWBA files (BL, FU1, FU2 and FU3).
3 |
4 | This script replaces the Scito anonymization pipeline, which no longer
5 | seems to work for DAWBA files.
6 |
7 | ==========
8 | Attributes
9 | ==========
10 |
11 | Input
12 | -----
13 |
14 | DAWBA_BL_MASTER_DIR : str
15 | Location of BL PSC1-encoded files.
16 | DAWBA_FU1_MASTER_DIR : str
17 | Location of FU1 PSC1-encoded files.
18 | DAWBA_FU2_MASTER_DIR : str
19 | Location of FU2 PSC1-encoded files.
20 | DAWBA_FU3_MASTER_DIR : str
21 | Location of FU3 PSC1-encoded files.
22 | DAWBA_SB_MASTER_DIR : str
23 | Location of Stratify PSC1-encoded files.
24 |
25 | Output
26 | ------
27 |
28 | DAWBA_BL_PSC2_DIR : str
29 | Location of BL PSC2-encoded files.
30 | DAWBA_FU1_PSC2_DIR : str
31 | Location of FU1 PSC2-encoded files.
32 | DAWBA_FU2_PSC2_DIR : str
33 | Location of FU2 PSC2-encoded files.
34 | DAWBA_FU3_PSC2_DIR : str
35 | Location of FU3 PSC2-encoded files.
36 | DAWBA_SB_PSC2_DIR : str
37 | Location of Stratify PSC2-encoded files.
38 |
39 | """
40 |
41 | DAWBA_BL_MASTER_DIR = '/neurospin/imagen/BL/RAW/PSC1/dawba'
42 | DAWBA_BL_PSC2_DIR = '/neurospin/imagen/BL/RAW/PSC2/dawba'
43 | DAWBA_FU1_MASTER_DIR = '/neurospin/imagen/FU1/RAW/PSC1/dawba'
44 | DAWBA_FU1_PSC2_DIR = '/neurospin/imagen/FU1/RAW/PSC2/dawba'
45 | DAWBA_FU2_MASTER_DIR = '/neurospin/imagen/FU2/RAW/PSC1/dawba'
46 | DAWBA_FU2_PSC2_DIR = '/neurospin/imagen/FU2/RAW/PSC2/dawba'
47 | DAWBA_FU3_MASTER_DIR = '/neurospin/imagen/FU3/RAW/PSC1/dawba'
48 | DAWBA_FU3_PSC2_DIR = '/neurospin/imagen/FU3/RAW/PSC2/dawba'
49 | DAWBA_SB_MASTER_DIR = '/neurospin/imagen/STRATIFY/RAW/PSC1/dawba'
50 | DAWBA_SB_PSC2_DIR = '/neurospin/imagen/STRATIFY/RAW/PSC2/dawba'
51 |
52 | WITHDRAWN_DAWBA_CODES = {
53 | # DAWBA1 codes, missing for some reason - just ignore them...
54 | '19042',
55 | '19044',
56 | '19045',
57 | '19046',
58 | '19047',
59 | '19048',
60 | '19049',
61 | '19050',
62 | '19051',
63 | '23094',
64 | '23095',
65 | '23096',
66 | '23097',
67 | '23098',
68 | '23099',
69 | '23100',
70 | '23101',
71 | '23102',
72 | '23103',
73 | '23104',
74 | '23105',
75 | '23106',
76 | '23107',
77 | '23108',
78 | '23109',
79 | '23110',
80 | '23112',
81 | '23881',
82 | '27361',
83 | '27512',
84 | '28117',
85 | '28694',
86 | '31469',
87 | '31470',
88 | '31471',
89 | '31473',
90 | '38297',
91 | '38298',
92 | '38299',
93 | '38300',
94 | '38301',
95 | # see thread "DAWBA3 codes conversion table" from 2015-05-18
96 | '127657',
97 | # see thread "DAWBA3 codes conversion table" from 2015-12-15
98 | '128847',
99 | '127658',
100 | '132983',
101 | '129716',
102 | '129500',
103 | # see thread "Imagen: Dawba data 201490 acquired on 13 September 2015" on 2019-05-27
104 | '201490',
105 | # see thread "Imagen FU3 Dawba code 221867" on 2019-05-08
106 | '221867',
107 | # see thread "token management in Imagen FU3" on 2019-05-03
108 | '228686',
109 | '228691',
110 | # see thread "token management in Imagen FU3" on 2019-05-03
111 | '239204',
112 | '239230',
113 | # see thread "Imagen FU3 Dawba code 252346" on 2019-05-04
114 | '252346',
115 | # see thread "Re: AW:Imagen FU3 token management: 272443 / 272444" on 2019-06-25
116 | # 244471 and 244513 are the same participant, we were told to keep the former
117 | '244513',
118 | # see thread "AW: [ext] Fwd: Pause to production of new teams" on 2019-07-23
119 | '265683',
120 | '265684',
121 | '265685',
122 | '265686',
123 | '265687',
124 | '265689',
125 | # see thread "IMAGEN FU3, DAWBA-PSC1 clarification" on 2019-09-04
126 | # 236038 and 254243 are the same participant, we were told to keep the former
127 | '254243',
128 | }
129 |
130 | import os
131 | from datetime import datetime
132 |
133 | # import ../imagen_databank
134 | import sys
135 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
136 | from imagen_databank import PSC1_FROM_DAWBA
137 | from imagen_databank import PSC2_FROM_PSC1
138 | from imagen_databank import DOB_FROM_PSC1
139 |
140 | import logging
141 | logging.basicConfig(level=logging.INFO)
142 |
143 |
144 | def _create_psc2_file(dawba_path, psc2_path):
145 | """Anonymize and re-encode a DAWBA questionnaire from DAWBA to PSC2.
146 |
147 | DAWBA questionnaire files are CSV files.
148 |
149 |     Columns containing a date will be modified: the date will be converted to
150 |     the age of the subject in days, as required by the anonymization process.
151 |
152 | Parameters
153 | ----------
156 | dawba_path: str
157 | Input: DAWBA-encoded CSV file.
158 | psc2_path: str
159 | Output: PSC2-encoded CSV file.
160 |
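    Notes
    -----
    For instance (illustrative values): an ``sstartdate`` of ``15.06.10``
    for a subject born on 1994-06-15 becomes
    ``str((date(2010, 6, 15) - date(1994, 6, 15)).days)``, i.e. ``'5844'``.
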
161 | """
162 | with open(dawba_path, 'r') as dawba_file:
163 | # identify columns to anonymize/remove in header
164 | header = next(iter(dawba_file))
165 | items = header.split('\t')
166 | convert = {i for i, item in enumerate(items)
167 | if 'sstartdate' in item or 'p1startdate' in item}
168 | skip = {i for i, item in enumerate(items)
169 | if 'ratername' in item or 'ratedate' in item}
170 |
171 | with open(psc2_path, 'w') as psc2_file:
172 | # write header
173 | items = [item for i, item in enumerate(items)
174 | if i not in skip]
175 | psc2_file.write('\t'.join(items))
176 | if not items[-1].endswith('\n'):
177 | psc2_file.write('\n')
178 |
179 | # write data
180 | for line in dawba_file:
181 | items = line.split('\t')
182 | dawba = items[0]
183 | if dawba not in PSC1_FROM_DAWBA:
184 | if dawba in WITHDRAWN_DAWBA_CODES:
185 | logging.info('withdrawn DAWBA code: %s', dawba)
186 | else:
187 | logging.error('DAWBA code missing from conversion table: %s',
188 | dawba)
189 | continue
190 | psc1 = PSC1_FROM_DAWBA[dawba]
191 | if psc1 not in PSC2_FROM_PSC1:
192 | logging.error('PSC1 code missing from conversion table: %s',
193 | psc1)
194 | continue
195 | psc2 = PSC2_FROM_PSC1[psc1]
196 | logging.info('converting subject %s from DAWBA to PSC2',
197 | psc1)
198 | items[0] = psc2
199 | # convert dates to subject age in days
200 | for i in convert:
201 | if items[i] != '':
202 | if psc1 in DOB_FROM_PSC1:
203 | startdate = datetime.strptime(items[i],
204 | '%d.%m.%y').date()
205 | birthdate = DOB_FROM_PSC1[psc1]
206 | age = startdate - birthdate
207 | logging.info('age of subject %s: %d',
208 | psc1, age.days)
209 | items[i] = str(age.days)
210 | else:
211 | items[i] = ''
212 | items = [item for i, item in enumerate(items)
213 | if i not in skip]
214 | psc2_file.write('\t'.join(items))
215 | if not items[-1].endswith('\n'):
216 | psc2_file.write('\n')
217 |
218 |
219 | def create_psc2_files(master_dir, psc2_dir, prefix=None):
220 | """Anonymize and re-encode all DAWBA questionnaires within a directory.
221 |
222 |     DAWBA-encoded files are read from `master_dir`, anonymized and converted
223 | from DAWBA codes to PSC2, and the result is written in `psc2_dir`.
224 |
225 | Parameters
226 | ----------
227 | master_dir: str
228 | Input directory with DAWBA-encoded questionnaires.
229 | psc2_dir: str
230 | Output directory with PSC2-encoded and anonymized questionnaires.
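    prefix: str, optional
        Prefix prepended to output file names (e.g. 'IMAGEN_' or 'STRATIFY_').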
231 |
232 | """
233 | for master_file in os.listdir(master_dir):
234 | master_path = os.path.join(master_dir, master_file)
235 | if prefix:
236 | master_file = prefix + master_file
237 | psc2_path = os.path.join(psc2_dir, master_file)
238 | _create_psc2_file(master_path, psc2_path)
239 |
240 |
241 | def main():
242 | create_psc2_files(DAWBA_BL_MASTER_DIR, DAWBA_BL_PSC2_DIR, prefix='IMAGEN_')
243 | create_psc2_files(DAWBA_FU1_MASTER_DIR, DAWBA_FU1_PSC2_DIR, prefix='IMAGEN_')
244 | create_psc2_files(DAWBA_FU2_MASTER_DIR, DAWBA_FU2_PSC2_DIR, prefix='IMAGEN_')
245 | create_psc2_files(DAWBA_FU3_MASTER_DIR, DAWBA_FU3_PSC2_DIR, prefix='IMAGEN_')
246 | create_psc2_files(DAWBA_SB_MASTER_DIR, DAWBA_SB_PSC2_DIR, prefix='STRATIFY_')
247 |
248 |
249 | if __name__ == "__main__":
250 | main()
251 |
--------------------------------------------------------------------------------
/genomics/rna_seq_deidentify_imagen.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from imagen_databank import PSC2_FROM_PSC1
4 |
5 | file_labID_PSC1_conv='/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_metadata.tsv'
6 |
7 | # use either the first or the second block, for gene_counts or gene_tpm
8 |
9 | input_dir_imagen_PSC1='/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
10 | output_dir_imagen_BL_PSC2='/imagen/BL/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_BL.tsv'
11 | output_dir_imagen_FU2_PSC2='/imagen/FU2/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_FU2.tsv'
12 | output_dir_imagen_FU3_PSC2='/imagen/FU3/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_FU3.tsv'
13 | """
14 | input_dir_imagen_PSC1="/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv"
15 | output_dir_imagen_BL_PSC2='/imagen/BL/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_BL.tsv'
16 | output_dir_imagen_FU2_PSC2='/imagen/FU2/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_FU2.tsv'
17 | output_dir_imagen_FU3_PSC2='/imagen/FU3/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_FU3.tsv'
18 | """
19 |
20 |
21 |
22 | def convert_labID_to_PSC2_with_timepoint(labID):
23 | labID_index = headers.index("Lab_Code")
24 | psc1_index = headers.index("PSC1")
25 | timepoint_index=headers.index("TimePoint")
26 | for line in tab_conv_labID_psc1:
27 | if line[labID_index]==labID:
28 | try:
29 |                 if len(line[psc1_index]) < 12:
30 |                     psc1 = "0" + line[psc1_index]  # zero-pad short PSC1 codes
31 |                 else:
32 |                     psc1 = line[psc1_index]
33 |                 psc2 = PSC2_FROM_PSC1[psc1]
34 |                 return (psc2, line[timepoint_index])
35 |             except KeyError:
36 | print("invalid PSC1 code:", line[psc1_index])
37 | #return ("###", line[timepoint_index])
38 |
39 | print("PSC1 not found for labID: ", labID)
40 |
41 | """
42 | for line in file_labID_PSC1:
43 | columns = line.strip().split(",")
44 | #print("check:",columns[labID_index],labID==columns[labID_index])
45 | if columns[labID_index] == labID:
46 | #print("deidentified: ",columns[psc1_index], "****", columns[timepoint_index])
47 | psc2 = PSC2_FROM_PSC1["0"+columns[psc1_index]]
48 | return(psc2,columns[timepoint_index])
49 | print("PSC1 not found for labID: ", labID)
50 | """
51 |
52 | if __name__ == "__main__":
53 | with open(file_labID_PSC1_conv, 'r', errors='ignore') as file_labID_PSC1:
54 | reader = csv.reader(file_labID_PSC1, delimiter=',')
55 | tab_conv_labID_psc1 = [row for row in reader]
56 | headers=tab_conv_labID_psc1[0]
57 | #headers = list(next(reader))
58 | print(headers)
59 | print(convert_labID_to_PSC2_with_timepoint("GB97ENVKCLR301518"))
60 |
61 | with open(input_dir_imagen_PSC1, 'r', newline='',errors='ignore') as labID_infile:
62 | reader_input = csv.reader(labID_infile, delimiter='\t')
63 |
64 | data = [row for row in reader_input]
65 |
66 | #print(data[0])
67 |     # initialize the lists of lists that will be written to the output files
68 | data_psc2_BL=[[] for i in range(len(data))]
69 | data_psc2_FU2 = [[] for i in range(len(data))]
70 | data_psc2_FU3 = [[] for i in range(len(data))]
71 |     # initialize the first two columns for the three timepoints
72 | for i in range(len(data)):
73 | #print(data[i][0]," ***** ", data[i][1], " ***** ", data[i][2])
74 | #print(data_psc2_BL[i])
75 | data_psc2_BL[i].append(data[i][0])
76 | data_psc2_BL[i].append(data[i][1])
77 |
78 | data_psc2_FU2[i].append(data[i][0])
79 | data_psc2_FU2[i].append(data[i][1])
80 |
81 | data_psc2_FU3[i].append(data[i][0])
82 | data_psc2_FU3[i].append(data[i][1])
83 |
84 |
85 | count_BL=0
86 | count_FU2=0
87 | count_FU3 = 0
88 |     # copy the remaining columns to the respective matrix depending on the timepoint
89 | for col_index in range(2,len(data[0])):
90 | #print(col_index)
91 |         lab_id=data[0][col_index]
92 |         lab_id = lab_id.strip()  # strip() returns a new string
93 | #print(convert_labID_to_PSC2_with_timepoint(lab_id))
94 | try:
95 | (psc2, timepoint)= convert_labID_to_PSC2_with_timepoint(lab_id)
96 | if timepoint == "BL":
97 |
98 | count_BL=count_BL+1
99 | data_psc2_BL[0].append(psc2)
100 | for i in range(1,len(data)):
101 | data_psc2_BL[i].append(data[i][col_index])
102 | elif timepoint == "FU2":
103 |
104 | count_FU2=count_FU2+1
105 | data_psc2_FU2[0].append(psc2)
106 | for i in range(1,len(data)):
107 | data_psc2_FU2[i].append(data[i][col_index])
108 | elif timepoint == "FU3":
109 |
110 | count_FU3=count_FU3+1
111 | data_psc2_FU3[0].append(psc2)
112 | for i in range(1,len(data)):
113 | data_psc2_FU3[i].append(data[i][col_index])
114 | else:
115 | print("invalid timepoint:",timepoint)
116 |         except TypeError:  # lab ID not found, conversion returned None
117 | continue
118 | print("BL", count_BL)
119 | print("FU2", count_FU2)
120 | print("FU3", count_FU3)
121 |
122 | #write the output to the files
123 | print("writing ...")
124 | with open(output_dir_imagen_BL_PSC2, 'w', newline='') as PSC2_BL_outfile:
125 | writer_BL = csv.writer(PSC2_BL_outfile, delimiter='\t')
126 | writer_BL.writerows(data_psc2_BL)
127 |
128 | with open(output_dir_imagen_FU2_PSC2, 'w', newline='') as PSC2_FU2_outfile:
129 | writer_FU2 = csv.writer(PSC2_FU2_outfile, delimiter='\t')
130 | writer_FU2.writerows(data_psc2_FU2)
131 |
132 | with open(output_dir_imagen_FU3_PSC2, 'w', newline='') as PSC2_FU3_outfile:
133 | writer_FU3 = csv.writer(PSC2_FU3_outfile, delimiter='\t')
134 | writer_FU3.writerows(data_psc2_FU3)
135 |
--------------------------------------------------------------------------------
/genomics/rna_seq_deidentify_stratify.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | from imagen_databank import PSC2_FROM_PSC1
4 |
5 | file_labID_PSC1_conv_stratify='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_metadata.tsv'
6 | file_labID_PSC1_conv_estra='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_metadata.tsv'
7 |
8 | # use either the first or the second block, for gene_counts or gene_tpm
9 |
10 | input_dir_STRATIFY_PSC1_counts='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
11 | input_dir_ESTRA_PSC1_counts='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
12 | output_dir_STRATIFY_PSC2_counts='/imagen/STRATIFY/processed/genetics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2.tsv'
13 | output_dir_ESTRA_PSC2_counts='/imagen/STRATIFY/processed/genetics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2.tsv'
14 |
15 |
16 | input_dir_STRATIFY_PSC1_tpm='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv'
17 | input_dir_ESTRA_PSC1_tpm='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv'
18 | output_dir_STRATIFY_PSC2_tpm='/imagen/STRATIFY/processed/genetics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2.tsv'
19 | output_dir_ESTRA_PSC2_tpm='/imagen/STRATIFY/processed/genetics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2.tsv'
20 |
21 |
22 |
23 |
24 | def convert_labID_to_PSC2_with_timepoint(labID,tab_conv_labID_psc1):
25 | headers = tab_conv_labID_psc1[0]
26 | labID_index = headers.index("Lab_Code")
27 | psc1_index = headers.index("PSC1")
28 | timepoint_index=headers.index("TimePoint")
29 | for line in tab_conv_labID_psc1:
30 | if line[labID_index]==labID:
31 | try:
32 |                 if len(line[psc1_index]) < 12:
33 |                     psc1 = "0" + line[psc1_index]  # zero-pad short PSC1 codes
34 |                 else:
35 |                     psc1 = line[psc1_index]
36 |                 psc2 = PSC2_FROM_PSC1[psc1]
37 |
38 |                 return (psc2, line[timepoint_index])
39 |             except KeyError:
40 | print("invalid PSC1 code:", line[psc1_index])
41 | #return ("###", line[timepoint_index])
42 |
43 | print("PSC1 not found for labID: ", labID)
44 |
45 |
46 | def convert_file_to_PSC2(file_labID_PSC1_conv, input_dir_PSC1, output_dir_PSC2, delimiter_metadata):
47 | print("converting ", input_dir_PSC1, " to PSC2...")
48 | with open(file_labID_PSC1_conv, 'r', errors='ignore') as file_labID_PSC1:
49 | reader = csv.reader(file_labID_PSC1, delimiter=delimiter_metadata)
50 | tab_conv_labID_psc1 = [row for row in reader]
51 | headers = tab_conv_labID_psc1[0]
52 | # headers = list(next(reader))
53 | print(headers)
54 | # print(convert_labID_to_PSC2_with_timepoint("GB97ENVKCLR301518"))
55 |
56 | with open(input_dir_PSC1, 'r', newline='',errors='ignore') as labID_infile:
57 | reader_input = csv.reader(labID_infile, delimiter='\t')
58 |
59 | data = [row for row in reader_input]
60 |
61 | #print(data[0])
62 |         # initialize the list of lists that will be written to the output file
63 | data_psc2=[[] for i in range(len(data))]
64 |
65 |         # initialize the first two columns of the output
66 | for i in range(len(data)):
67 | #print(data[i][0]," ***** ", data[i][1], " ***** ", data[i][2])
68 | #print(data_psc2_BL[i])
69 | data_psc2[i].append(data[i][0])
70 | data_psc2[i].append(data[i][1])
71 |
72 | count=0
73 |
74 |         # copy the remaining columns, re-encoding the header lab ID to PSC2
75 | for col_index in range(2,len(data[0])):
76 | #print(col_index)
77 |             lab_id=data[0][col_index]
78 |             lab_id = lab_id.strip()  # strip() returns a new string
79 | #print(convert_labID_to_PSC2_with_timepoint(lab_id))
80 | try:
81 | (psc2, timepoint)= convert_labID_to_PSC2_with_timepoint(lab_id,tab_conv_labID_psc1)
82 | count=count+1
83 | data_psc2[0].append(psc2)
84 | for i in range(1,len(data)):
85 | data_psc2[i].append(data[i][col_index])
86 |
87 |             except TypeError:  # lab ID not found, conversion returned None
88 | continue
89 |         print("number of converted columns:", count)
90 |
91 |
92 | #write the output to the files
93 | print("writing ...")
94 | with open(output_dir_PSC2, 'w', newline='') as PSC2_outfile:
95 | writer = csv.writer(PSC2_outfile, delimiter='\t')
96 | writer.writerows(data_psc2)
97 |
98 |
99 |
100 | if __name__ == "__main__":
101 | convert_file_to_PSC2(file_labID_PSC1_conv_stratify, input_dir_STRATIFY_PSC1_counts, output_dir_STRATIFY_PSC2_counts,",")
102 |
103 | convert_file_to_PSC2(file_labID_PSC1_conv_stratify, input_dir_STRATIFY_PSC1_tpm, output_dir_STRATIFY_PSC2_tpm, ",")
104 |
105 | convert_file_to_PSC2(file_labID_PSC1_conv_estra, input_dir_ESTRA_PSC1_counts, output_dir_ESTRA_PSC2_counts, "\t")
106 |
107 | convert_file_to_PSC2(file_labID_PSC1_conv_estra, input_dir_ESTRA_PSC1_tpm, output_dir_ESTRA_PSC2_tpm, "\t")
108 |
--------------------------------------------------------------------------------
/geolocation/geolocation.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | #
4 | # process geolocation at each time point
5 | #
6 | for timepoint in BL FU1 FU2 FU3
7 | do
8 | DIR_PSC1="/neurospin/imagen/${timepoint}/RAW/PSC1/geolocation"
9 | FILE_PSC2="/neurospin/imagen/${timepoint}/processed/geolocation/IMAGEN_geolocation_${timepoint}.csv"
10 |
11 | # print output file header line
12 | echo "PSC2,latitude,longitude,notes" > "$FILE_PSC2"
13 | # process each input file
14 | for file in "${DIR_PSC1}/IMAGEN_geolocation_"*"_${timepoint}.csv"
15 | do
16 | # some commands cannot process DOS line endings
17 | tmpfile=`mktemp -t tmp.geolocation.XXXXXXXXXX`
18 | dos2unix -n "$file" "$tmpfile" 2>/dev/null
19 | # some sites lack a "Notes" column
20 | if head -1 "$tmpfile" | grep -q "Notes"
21 | then
22 | ADD_NOTES=0
23 | else
24 | ADD_NOTES=1
25 | fi
26 | # skip input file header line
27 | tail -n +2 "$tmpfile" |
28 | # some sites lack a "Notes" column
29 |     if [ "$ADD_NOTES" -eq 1 ]
30 |     then sed 's/$/,/'    # append an empty "Notes" field
31 |     else cat             # otherwise pass lines through unchanged
32 |     fi
33 | # clean up
34 | rm -f "$tmpfile"
35 | done | psc2psc.py 2>/dev/null | sort >> "$FILE_PSC2"
36 | unix2dos -o "$FILE_PSC2" 2>/dev/null
37 | done
38 |
39 |
40 | #
41 | # process geolocation backdated from BL
42 | #
43 | BACKDATED_PSC1="/neurospin/imagen/FU3/RAW/PSC1/geolocation/IMAGEN_geolocation_ALL_SITES_backdated_Dublin_updated.csv"
44 | BACKDATED_PSC2="/neurospin/imagen/FU3/processed/geolocation/IMAGEN_geolocation_backdated.csv"
45 |
46 | # print output file header line
47 | echo "PSC2,year,latitude,longitude" > "$BACKDATED_PSC2"
48 | # skip input file header line
49 | tail -n +2 "$BACKDATED_PSC1" | psc2psc.py 2>/dev/null | sort >> "$BACKDATED_PSC2"
50 | unix2dos -o "$BACKDATED_PSC2" 2>/dev/null
51 |
--------------------------------------------------------------------------------
/imagen_databank/__init__.py:
--------------------------------------------------------------------------------
1 | # noqa
2 |
3 | # Copyright (c) 2014-2018 CEA
4 | #
5 | # This software is governed by the CeCILL license under French law and
6 | # abiding by the rules of distribution of free software. You can use,
7 | # modify and/ or redistribute the software under the terms of the CeCILL
8 | # license as circulated by CEA, CNRS and INRIA at the following URL
9 | # "http://www.cecill.info".
10 | #
11 | # As a counterpart to the access to the source code and rights to copy,
12 | # modify and redistribute granted by the license, users are provided only
13 | # with a limited warranty and the software's author, the holder of the
14 | # economic rights, and the successive licensors have only limited
15 | # liability.
16 | #
17 | # In this respect, the user's attention is drawn to the risks associated
18 | # with loading, using, modifying and/or developing or reproducing the
19 | # software by the user in light of its specific status of free software,
20 | # that may mean that it is complicated to manipulate, and that also
21 | # therefore means that it is reserved for developers and experienced
22 | # professionals having in-depth computer knowledge. Users are therefore
23 | # encouraged to load and test the software's suitability as regards their
24 | # requirements in conditions enabling the security of their systems and/or
25 | # data to be ensured and, more generally, to use and operate it in the
26 | # same conditions as regards security.
27 | #
28 | # The fact that you are presently reading this means that you have had
29 | # knowledge of the CeCILL license and that you accept its terms.
30 |
31 | __all__ = ['additional_data', 'behavioral', 'cantab', 'core', 'dicom_utils',
32 | 'image_data', 'scanning', 'sanity']
33 |
34 | from . import core
35 | from .core import (LONDON, NOTTINGHAM, DUBLIN, BERLIN,
36 | HAMBURG, MANNHEIM, PARIS, DRESDEN,
37 | SOUTHAMPTON, AACHEN)
38 | from .core import CENTER_NAME
39 | from .core import (PSC2_FROM_PSC1, PSC1_FROM_PSC2,
40 | PSC1_FROM_DAWBA, PSC2_FROM_DAWBA, # PSC2_FROM_DAWBA is obsolete
41 | DOB_FROM_PSC1, DOB_FROM_PSC2) # DOB_FROM_PSC2 is obsolete
42 | from .core import (detect_psc1, detect_psc2, guess_psc1)
43 | from .core import Error
44 |
45 | from . import additional_data
46 | from .additional_data import (walk_additional_data, report_additional_data)
47 |
48 | from . import behavioral
49 | from .behavioral import (MID_CSV, FT_CSV, SS_CSV, RECOG_CSV)
50 | from .behavioral import (read_mid, read_ft, read_ss, read_recog)
51 |
52 | from . import cantab
53 | from .cantab import (CANTAB_CCLAR, DETAILED_DATASHEET_CSV, DATASHEET_CSV,
54 | REPORT_HTML)
55 | from .cantab import (read_cant, read_datasheet, read_detailed_datasheet,
56 | read_report)
57 |
58 | from . import dicom_utils
59 | from .dicom_utils import read_metadata
60 |
61 | from . import image_data
62 | from .image_data import (SEQUENCE_LOCALIZER_CALIBRATION,
63 | SEQUENCE_T2, SEQUENCE_T2_FLAIR,
64 | SEQUENCE_ADNI_MPRAGE,
65 | SEQUENCE_MID, SEQUENCE_FT, SEQUENCE_SST,
66 | SEQUENCE_B0_MAP, SEQUENCE_DTI,
67 | SEQUENCE_RESTING_STATE,
68 | SEQUENCE_NODDI)
69 | from .image_data import SEQUENCE_NAME
70 | from .image_data import NONSTANDARD_DICOM
71 | from .image_data import series_type_from_description
72 | from .image_data import walk_image_data, report_image_data
73 |
74 | from . import scanning
75 | from .scanning import read_scanning
76 |
77 | from . import sanity
78 |
79 | __author__ = 'Dimitri Papadopoulos'
80 | __copyright__ = 'Copyright (c) 2014-2018 CEA'
81 | __license__ = 'CeCILL'
82 | __version__ = '0.1.0'
83 | __email__ = 'imagendatabase@cea.fr'
84 | __status__ = 'Development'
85 |
--------------------------------------------------------------------------------
/imagen_databank/additional_data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2017 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | import os
30 | import re
31 |
32 | from .cantab import (CANTAB_CCLAR, DETAILED_DATASHEET_CSV, DATASHEET_CSV,
33 | REPORT_HTML,
34 | read_cant, read_datasheet, read_detailed_datasheet,
35 | read_report)
36 | from .behavioral import (MID_CSV, FT_CSV, SS_CSV, RECOG_CSV,
37 | read_mid, read_ft, read_ss, read_recog)
38 |
39 | import logging
40 | logger = logging.getLogger(__name__)
41 |
42 | __all__ = ['walk_additional_data', 'report_additional_data']
43 |
44 |
45 | #
46 | # check filenames against these regex'es when exploring Additional Data
47 | #
48 | # in some case order is important, for example:
49 | # - first match 'detailed_datasheet'
50 | # - then match 'datasheet'
51 | #
52 | _LOOSE_ADDITIONAL_DATA_REGEXES = (
53 | (re.compile(r'(\w+_)?cant(_\w+)?\.cclar', re.IGNORECASE), CANTAB_CCLAR),
54 |     # Mannheim sends 'detailed datasheet' files (space instead of underscore)
55 | (re.compile(r'(\w+_)?detailed[_ ]datasheet(_\w+)?\.csv', re.IGNORECASE),
56 | DETAILED_DATASHEET_CSV),
57 | (re.compile(r'(\w+_)?datasheet(_\w+)?\.csv', re.IGNORECASE), DATASHEET_CSV),
58 | (re.compile(r'(\w+_)?report(_\w+)?\.html', re.IGNORECASE), REPORT_HTML),
59 | (re.compile(r'ft_\w+\.csv', re.IGNORECASE), FT_CSV),
60 | (re.compile(r'mid_\w+\.csv', re.IGNORECASE), MID_CSV),
61 | (re.compile(r'recog_\w+\.csv', re.IGNORECASE), RECOG_CSV),
62 | (re.compile(r'ss_\w+\.csv', re.IGNORECASE), SS_CSV),
63 | )
64 |
65 | _EXACT_ADDITIONAL_DATA_REGEXES = (
66 | (re.compile(r'cant_\d{12}(fu|FU)?\.cclar'), CANTAB_CCLAR),
67 | (re.compile(r'detailed_datasheet_\d{12}(fu|FU)?\.csv'), DETAILED_DATASHEET_CSV),
68 | (re.compile(r'datasheet_\d{12}(fu|FU)?\.csv'), DATASHEET_CSV),
69 | (re.compile(r'report_\d{12}(fu|FU)?\.html'), REPORT_HTML),
70 | (re.compile(r'ft_\d{12}(fu|FU)?\.csv'), FT_CSV),
71 | (re.compile(r'mid_\d{12}(fu|FU)?\.csv'), MID_CSV),
72 | (re.compile(r'recog_\d{12}(fu|FU)?\.csv'), RECOG_CSV),
73 | (re.compile(r'ss_\d{12}(fu|FU)?\.csv', re.IGNORECASE), SS_CSV),
74 | )
75 |
76 |
77 | def _match_additional_data_sops(filename, exact=False):
78 | """Compare filename to filenames defined in Imagen FU2 SOPs.
79 |
80 |     Compare the actual filename to the filenames expected for Additional
81 |     Data in the SOPs, either in a strict way or a loose way. This matching
82 |     function is empirical and based on experimentation.
83 |
84 | Parameters
85 | ----------
86 | filename : unicode
87 | The file basename to match.
88 |
89 | exact : bool
90 | Exact match if True else loose match.
91 |
92 | Returns
93 | -------
94 | str
95 |         If the filename matches a file type defined in the SOPs,
96 |         return the file type, else return None.
97 |
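    Examples
    --------
    Hypothetical filenames, checked against the patterns above::

        _match_additional_data_sops('datasheet_000012345678FU.csv',
                                    exact=True)   # returns 'datasheet'
        _match_additional_data_sops('Detailed Datasheet_0001.csv')
        # loose match: returns 'detailed_datasheet'
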
98 | """
99 | if exact:
100 | regex_list = _EXACT_ADDITIONAL_DATA_REGEXES
101 | else:
102 | regex_list = _LOOSE_ADDITIONAL_DATA_REGEXES
103 | for regex, filetype in regex_list:
104 | if regex.match(filename):
105 | logger.debug('assign type "%s" to filename: %s',
106 | filetype, filename)
107 | return filetype
108 | logger.info('filename does not match any known type: %s', filename)
109 | return None
110 |
111 |
112 | def walk_additional_data(path):
113 | """Generate information on Additional Data files in a directory.
114 |
115 | Parameters
116 | ----------
117 | path : unicode
118 | The directory to look for files into.
119 |
120 |     Yields
121 |     ------
122 |     tuple
123 |         A 2-tuple: the name and the path of each file, relative to `path`.
124 |
125 | """
126 |
127 | for root, dummy_dirs, files in os.walk(path):
128 | for filename in files:
129 | relpath = os.path.relpath(os.path.join(root, filename), path)
130 | yield filename, relpath
131 |
132 |
133 | def report_additional_data(path, psc1, exact=False):
134 | """Find Additional Data files that fit the Imagen FU2 SOPs.
135 |
136 | The Imagen FU2 SOPs define a precise file organization for Additional
137 |     Data. In practice we have found the SOPs are only loosely followed by
138 |     acquisition centres, hence the optional `exact` argument.
139 |
140 | This function scans the directory where we expect to find the Additional
141 | Data of a dataset and builds a collection of files identified as the
142 | files described in the SOPs.
143 |
144 | Parameters
145 | ----------
146 | path : unicode
147 | The directory to look for Additional Data into.
148 |
149 | psc1 : str
150 | PSC1 code of the subject.
151 |
152 | exact : bool
153 | Exact match if True, else loose match.
154 |
155 | Returns
156 | -------
157 | dict
158 | The key identifies the type of identified files and the value
159 | lists the relative path of the files.
160 |
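
    Examples
    --------
    A hypothetical call (path and PSC1 code invented)::

        report = report_additional_data(
            '/neurospin/imagen/FU2/RAW/PSC1/LONDON/000012345678/AdditionalData',
            '000012345678')
        # maps each datasheet to the extraneous subject IDs found in it
        extraneous_ids = report.get(DATASHEET_CSV, {})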
161 | """
162 | additional_files = {}
163 |
164 | for filename, relpath in walk_additional_data(path):
165 | filetype = _match_additional_data_sops(filename, exact)
166 | if filetype:
167 | logger.debug('assign type "%s" to file: %s',
168 | filetype, relpath)
169 | additional_files.setdefault(filetype, []).append(relpath)
170 | else:
171 | logger.warning('cannot match any known type: %s', relpath)
172 |
173 | additional_data = {}
174 |
175 | # read cant_*.cclar where available
176 | if CANTAB_CCLAR in additional_files:
177 | for f in additional_files[CANTAB_CCLAR]:
178 | f_path = os.path.join(path, f)
179 | subject_ids = read_cant(f_path)
180 | if psc1 in subject_ids:
181 | subject_ids.remove(psc1)
182 | additional_data.setdefault(CANTAB_CCLAR, {})[f] = subject_ids
183 | # read datasheet_*.csv where available
184 | if DATASHEET_CSV in additional_files:
185 | for f in additional_files[DATASHEET_CSV]:
186 | f_path = os.path.join(path, f)
187 | subject_ids, dummy_st, dummy_r, dummy_c, dummy_f = read_datasheet(f_path)
188 | if psc1 in subject_ids:
189 | subject_ids.remove(psc1)
190 | additional_data.setdefault(DATASHEET_CSV, {})[f] = subject_ids
191 | # read detailed_datasheet_*.csv where available
192 | if DETAILED_DATASHEET_CSV in additional_files:
193 | for f in additional_files[DETAILED_DATASHEET_CSV]:
194 | f_path = os.path.join(path, f)
195 | subject_ids = read_detailed_datasheet(f_path)
196 | if psc1 in subject_ids:
197 | subject_ids.remove(psc1)
198 | additional_data.setdefault(DETAILED_DATASHEET_CSV, {})[f] = subject_ids
199 | # read report_*.html where available
200 | if REPORT_HTML in additional_files:
201 | for f in additional_files[REPORT_HTML]:
202 | f_path = os.path.join(path, f)
203 | subject_ids = read_report(f_path)
204 | if psc1 in subject_ids:
205 | subject_ids.remove(psc1)
206 | additional_data.setdefault(REPORT_HTML, {})[f] = subject_ids
207 | # read Scanning/ft_*.csv where available
208 | if FT_CSV in additional_files:
209 | for f in additional_files[FT_CSV]:
210 | f_path = os.path.join(path, f)
211 | subject_id, _, _, _ = read_ft(f_path)
212 | if subject_id:
213 |                 additional_data.setdefault(FT_CSV, {})[f] = {subject_id}
214 | # read Scanning/mid_*.csv where available
215 | if MID_CSV in additional_files:
216 | for f in additional_files[MID_CSV]:
217 | f_path = os.path.join(path, f)
218 | subject_id, _, _, _ = read_mid(f_path)
219 | if subject_id:
220 |                 additional_data.setdefault(MID_CSV, {})[f] = {subject_id}
221 | # read Scanning/recog_*.csv where available
222 | if RECOG_CSV in additional_files:
223 | for f in additional_files[RECOG_CSV]:
224 | f_path = os.path.join(path, f)
225 | subject_id, _, _, _ = read_recog(f_path)
226 | if subject_id:
227 |                 additional_data.setdefault(RECOG_CSV, {})[f] = {subject_id}
228 | # read Scanning/ss_*.csv where available
229 | if SS_CSV in additional_files:
230 | for f in additional_files[SS_CSV]:
231 | f_path = os.path.join(path, f)
232 | subject_id, _, _, _ = read_ss(f_path)
233 | if subject_id:
234 |                 additional_data.setdefault(SS_CSV, {})[f] = {subject_id}
235 |
236 | return additional_data
237 |
--------------------------------------------------------------------------------
/imagen_databank/behavioral.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2017 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | import csv
30 | from datetime import datetime
31 |
32 | from .core import Error
33 |
34 | import logging
35 | logger = logging.getLogger(__name__)
36 |
37 | __all__ = ['MID_COLUMNS', 'FT_COLUMNS', 'SS_COLUMNS', 'RECOG_COLUMNS',
38 | 'read_mid', 'read_ft', 'read_ss', 'read_recog']
39 |
40 | #
41 | # types of files we expect to find under AdditionalData/Scanning
42 | #
43 | FT_CSV = 'ft'
44 | MID_CSV = 'mid'
45 | SS_CSV = 'ss'
46 | RECOG_CSV = 'recog'
47 |
48 |
49 | def _parse_behavioral_datetime(date_string):
50 | """Read date in the format found in CSV files.
51 |
52 | * LONDON 01/02/2015 01:02:03
53 | * NOTTINGHAM 01/02/2015 01:02:03
54 | * DUBLIN 01/02/2015 01:02:03 2/1/2015 1:02:03 AM
55 | * BERLIN 01.02.2015 01:02:03
56 | * HAMBURG 01.02.2015 01:02:03
57 | * MANNHEIM 01.02.2015 01:02:03
58 | * PARIS 01/02/2015 01:02:03
59 | * DRESDEN 01.02.2015 01:02:03
60 |
61 | """
62 | DATE_FORMATS = (
63 | '%d.%m.%Y %H:%M:%S',
64 | '%d/%m/%Y %H:%M:%S',
65 | '%m/%d/%Y %I:%M:%S %p',
66 | )
67 | for date_format in DATE_FORMATS:
68 | try:
69 | dt = datetime.strptime(date_string, date_format)
70 | return dt
71 | except ValueError:
72 | pass
73 | return None
74 |
75 |
76 | def _fix_spurious_quotes(s):
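    """Strip a pair of quotes enclosing a whole line, keeping the line ending."""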
77 | if s.startswith('"'):
78 | last = s.rfind('"')
79 | if last > 0:
80 | main = s[1:last]
81 | last += 1
82 | tail = s[last:]
83 | if tail.isspace():
84 | s = main + tail
85 | return s
86 |
87 |
88 | def _fix_terminal_tab(s):
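    """Remove a terminal tab that is followed only by trailing whitespace."""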
89 | last = s.rfind('\t')
90 | if last > 0:
91 | main = s[:last]
92 | last += 1
93 | tail = s[last:]
94 | if tail.isspace():
95 | s = main + tail
96 | return s
97 |
98 |
99 | MID_COLUMNS = (
100 | 'Trial',
101 | 'Trial Category',
102 | 'Trial Start Time (Onset)',
103 | 'Pre-determined Onset',
104 | 'Cue Presented',
105 | 'Anticipation Phase Start Time',
106 | 'Anticipation Phase Duration',
107 | 'Target Phase Start Time',
108 | 'Target Phase Duration',
109 | 'Response Made by Subject',
110 | 'Response time',
111 | 'Feedback Phase Start Time',
112 | 'Outcome',
113 | 'Amount',
114 | 'Fixation Phase Start Time (Lasts until next trial start time)',
115 | 'Success Rate',
116 | 'Scanner Pulse',
117 | )
118 |
119 | FT_COLUMNS = (
120 | 'Trial Start Time (Onset)',
121 | 'Video Clip Name',
122 | )
123 |
124 | SS_COLUMNS = (
125 | 'Trial',
126 | 'Trial Category',
127 | 'Trial Start Time (Onset)',
128 | 'Pre-determined/randomised onset',
129 | 'Go Stimulus Presentation Time', # 'Go Stimulus Presentation Time '
130 | 'Stimulus Presented',
131 | 'Delay',
132 | 'Stop Stimulus Presentation Time',
133 | 'Response made by subject',
134 | 'Absolute Response Time',
135 | 'Relative Response Time',
136 | 'Response Outcome',
137 | 'Real Jitter',
138 | 'Pre-determined Jitter',
139 | 'Success Rate of Variable Delay Stop Trials',
140 | 'Scanner Pulse',
141 | )
142 |
143 | RECOG_COLUMNS = (
144 | 'TimePassed',
145 | 'UserResponse',
146 | 'ImageFileName',
147 | )
148 |
149 | # for each of the 4 tasks we provide a tuple:
150 | # * first word in the behavioral file that identifies the task
151 | # * list of columns in the 2nd line
152 | # * column from which to extract the last ascending numerical sequence
153 | # * True if the numerical sequence is strictly ascending
154 | _TASK_SPECIFICS = {
155 | MID_CSV: ('MID_TASK', MID_COLUMNS, 0, True),
156 | FT_CSV: ('FACE_TASK', FT_COLUMNS, 0, True),
157 | SS_CSV: ('STOP_SIGNAL_TASK', SS_COLUMNS, 0, False),
158 | RECOG_CSV: ('RECOGNITION_TASK', RECOG_COLUMNS, 0, True),
159 | }
160 |
161 |
162 | def _read_generic_behavioral(path, task, strict=True):
163 | """Read behavioral files and return part of the contents and errors.
164 |
165 | Sometimes complete lines are enclosed in quotes. Such quotes
166 | must be fixed before the contents can be read as CSV.
167 |
168 | Parameters
169 | ----------
170 | path : str
171 | Path to the behavioral file to read from.
172 |
173 |     task : str
174 |         Type of task: one of MID_CSV, FT_CSV, SS_CSV or RECOG_CSV.
175 |
176 | strict : bool
177 |         If False, be more lenient and let wholly quoted lines through;
178 |         if True, report the error.
179 |
180 | Returns
181 | -------
182 | psc1 : str
183 | PSC1 code.
184 | timestamp : datetime
185 | Time stamp extracted from the header.
186 | trials : array_like
187 | Last ascending sequence of trials.
188 | errors : array_like
189 | List of Error.
190 |
191 | Raises
192 | ------
193 | FileNotFoundError
194 | If path does not exist.
195 |
196 | """
197 | psc1 = None
198 | timestamp = None
199 | sequence = []
200 | errors = []
201 |
202 | with open(path, 'r') as behavioral: # add newline='' in Python 3
203 | lines = behavioral.readlines()
204 |
205 | # attempt to handle broken CSV files with fully quoted lines
206 | reader = csv.reader(lines, delimiter='\t')
207 | if not strict and max(len(row) for row in reader) < 2:
208 | lines = [_fix_spurious_quotes(line) for line in lines]
209 |
210 | # remove spurious terminal tab
211 | lines = [_fix_terminal_tab(line) for line in lines]
212 |
213 | # now re-read file contents
214 | reader = csv.reader(lines, delimiter='\t')
215 |
216 | # 1st line
217 | header = next(reader)
218 | if header:
219 | header = [x.strip() for x in header]
220 | if len(header) != 4:
221 | errors.append(Error(path, 'Line 1 contains {0} columns instead of 4'
222 | .format(len(header)), header))
223 | if len(header) > 3:
224 | COLUMN = 'Task type: Scanning'
225 | if header[3] != COLUMN:
226 | errors.append(Error(path, 'Column 4 of line 1 must be "{0}" '
227 | 'instead of "{1}"'
228 | .format(COLUMN, header[3]), header))
229 | if len(header) > 2:
230 | COLUMN = 'Subject ID:'
231 | if header[2].startswith(COLUMN):
232 | psc1 = header[2][len(COLUMN):].lstrip()
233 | else:
234 | errors.append(Error(path, 'Column 3 of line 1 "{0}" must start '
235 | 'with "{1}"'
236 | .format(header[2], COLUMN), header))
237 | if len(header) > 1:
238 | timestamp = _parse_behavioral_datetime(header[1])
239 | if not timestamp:
240 | errors.append(Error(path, 'Column 2 of line 1 "{0}" is not a standard time stamp'
241 | .format(header[1]), header))
242 | if len(header) > 0:
243 | COLUMN = '{0} task'.format(_TASK_SPECIFICS[task][0])
244 | if header[0] != COLUMN:
245 | errors.append(Error(path, 'Column 1 of line 1 must be "{0}" '
246 | 'instead of "{1}"'
247 | .format(COLUMN, header[0]), header))
248 | else:
249 | errors.append(Error(path, 'Empty file'))
250 |
251 | # 2nd line
252 | try:
253 | header = next(reader)
254 | header = [x.strip() for x in header]
255 | COLUMNS = _TASK_SPECIFICS[task][1]
256 | if len(header) != len(COLUMNS):
257 | errors.append(Error(path, 'Line 2 contains {0} columns instead of {1}'
258 | .format(len(header), len(COLUMNS)),
259 | header))
260 | for i, (h, c) in enumerate(zip(header, COLUMNS)):
261 | if h != c:
262 | errors.append(Error(path, 'Column {0} of line 2 must be {1} instead of {2}'
263 | .format(i + 1, c, h), header))
264 | break
265 | except StopIteration:
266 | errors.append(Error(path, 'Missing 2nd line'))
267 |
268 | # data
269 | last = None
270 | for n, row in enumerate(reader, 3):
271 | row = [x.strip() for x in row]
272 | COLUMNS = _TASK_SPECIFICS[task][1]
273 | if not any(row): # get rid of empty rows
274 | continue
275 | elif (len(row) != len(COLUMNS)):
276 | errors.append(Error(path, 'Line {0} contains {1} columns instead of {2}'
277 | .format(n, len(row), len(COLUMNS)),
278 | row))
279 | # column to check for ascending numerical sequence
280 | current = row[_TASK_SPECIFICS[task][2]].strip()
281 | try:
282 | # expect ascending numerical sequences
283 | current = int(current)
284 | if last:
285 | if _TASK_SPECIFICS[task][3]: # strictly ascending
286 | if current <= last:
287 | sequence = [] # start new ascending sequence
288 | else:
289 | if current < last:
290 | sequence = [] # start new ascending sequence
291 | sequence.append(current)
292 | last = current
293 | except ValueError:
294 | errors.append(Error(path, 'Column {0} of line {1} "{2}" should contain '
295 | 'only numbers'
296 | .format(_TASK_SPECIFICS[task][2] + 1, n, current), row))
297 | if last:
298 | last = None
299 |
300 | return psc1, timestamp, sequence, errors
301 |
302 |
303 | def read_mid(path, strict=True):
304 | """Return "Subject ID" and other information extracted from mid_*.csv.
305 |
306 | Sometimes complete lines are enclosed in quotes. In that case
307 | mid_*.csv content must be fixed before it can be read as CSV.
308 |
309 | Parameters
310 | ----------
311 | path : unicode
312 | Path to the mid_*.csv file to read from.
313 |
314 | strict : bool
315 |         If False, be more lenient and let wholly quoted lines through;
316 |         if True, report the error.
317 |
318 | Returns
319 | -------
320 | psc1 : str
321 | PSC1 code.
322 | timestamp : datetime
323 | Time stamp extracted from the header.
324 | trials : array_like
325 |         The last ascending sequence of trials (the 'Trial' column).
326 | errors : array_like
327 | List of Error.
328 |
329 | Raises
330 | ------
331 | FileNotFoundError
332 | If path does not exist.
333 |
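    Examples
    --------
    A hypothetical call (filename invented)::

        psc1, timestamp, trials, errors = read_mid('mid_000012345678FU.csv',
                                                   strict=False)
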
334 | """
335 | return _read_generic_behavioral(path, MID_CSV, strict)
336 |
337 |
338 | def read_ft(path, strict=True):
339 | """Return "Subject ID" and other information extracted from ft_*.csv.
340 |
341 | Sometimes complete lines are enclosed in quotes. In that case
342 | ft_*.csv content must be fixed before it can be read as CSV.
343 |
344 | Parameters
345 | ----------
346 | path : unicode
347 | Path to the ft_*.csv file to read from.
348 |
349 | strict : bool
350 |         If False, be more lenient and let wholly quoted lines through;
351 |         if True, report the error.
352 |
353 | Returns
354 | -------
355 | psc1 : str
356 | PSC1 code.
357 | timestamp : datetime
358 | Time stamp extracted from the header.
359 | trials : array_like
360 |         The last ascending sequence of values from the 'Trial Start Time (Onset)' column.
361 | errors : array_like
362 | List of Error.
363 |
364 | Raises
365 | ------
366 | FileNotFoundError
367 | If path does not exist.
368 |
369 | """
370 | return _read_generic_behavioral(path, FT_CSV, strict)
371 |
372 |
373 | def read_ss(path, strict=True):
374 | """Return "Subject ID" and other information extracted from ss_*.csv.
375 |
376 | Sometimes complete lines are enclosed in quotes. In that case
377 | ss_*.csv content must be fixed before it can be read as CSV.
378 |
379 | Parameters
380 | ----------
381 | path : unicode
382 | Path to the ss_*.csv file to read from.
383 |
384 | strict : bool
385 |         If False, be more lenient and let wholly quoted lines through;
386 |         if True, report the error.
387 |
388 | Returns
389 | -------
390 | psc1 : str
391 | PSC1 code.
392 | timestamp : datetime
393 | Time stamp extracted from the header.
394 | trials : array_like
395 |         The last ascending sequence of trials (the 'Trial' column).
396 | errors : array_like
397 | List of Error.
398 |
399 | Raises
400 | ------
401 | FileNotFoundError
402 | If path does not exist.
403 |
404 | """
405 | return _read_generic_behavioral(path, SS_CSV, strict)
406 |
407 |
408 | def read_recog(path, strict=True):
409 | """Return "Subject ID" and other information extracted from recog_*.csv.
410 |
411 | Sometimes complete lines are enclosed in quotes. In that case
412 | recog_*.csv content must be fixed before it can be read as CSV.
413 |
414 | Parameters
415 | ----------
416 | path : unicode
417 | Path to the recog_*.csv file to read from.
418 |
419 | strict : bool
420 |         If False, be more lenient and let wholly quoted lines through;
421 |         if True, report the error.
422 |
423 | Returns
424 | -------
425 | psc1 : str
426 | PSC1 code.
427 | timestamp : datetime
428 | Time stamp extracted from the header.
429 | times : array_like
430 |         The last ascending sequence of times (the 'TimePassed' column).
431 | errors : array_like
432 | List of Error.
433 |
434 | Raises
435 | ------
436 | FileNotFoundError
437 | If path does not exist.
438 |
439 | """
440 | return _read_generic_behavioral(path, RECOG_CSV, strict)
441 |
442 |
443 | def main():
444 | import os.path
445 |
446 | ROOT_DIR = '/neurospin/imagen/FU2/RAW/PSC1'
447 | for center in os.listdir(ROOT_DIR):
448 | center_path = os.path.join(ROOT_DIR, center)
449 | for subject in os.listdir(center_path):
450 | subject_path = os.path.join(center_path, subject)
451 | behavioral_path = os.path.join(subject_path,
452 | 'AdditionalData', 'Scanning')
453 | if os.path.isdir(behavioral_path):
454 | #~ mid_files = tuple(os.path.join(behavioral_path, b)
455 | #~ for b in os.listdir(behavioral_path)
456 | #~ if 'mid_' in b)
457 | #~ for mid_file in mid_files:
458 | #~ (psc1, _timestamp, onsets, errors) = read_mid(mid_file, False)
459 | #~ print('▸ {0} MID {1}'.format(psc1, len(onsets)))
460 | #~ for error in errors:
461 | #~ print(' ✗ {0}: {1}'.format(error.message,
462 | #~ os.path.relpath(error.path, ROOT_DIR)))
463 | #~ ft_files = tuple(os.path.join(behavioral_path, b)
464 | #~ for b in os.listdir(behavioral_path)
465 | #~ if 'ft_' in b)
466 | #~ for ft_file in ft_files:
467 | #~ (psc1, _timestamp, onsets, errors) = read_ft(ft_file, False)
468 | #~ print('▸ {0} FT {1}'.format(psc1, len(onsets)))
469 | #~ for error in errors:
470 | #~ print(' ✗ {0}: {1}'.format(error.message,
471 | #~ os.path.relpath(error.path, ROOT_DIR)))
472 | ss_files = tuple(os.path.join(behavioral_path, b)
473 | for b in os.listdir(behavioral_path)
474 | if 'ss_' in b)
475 | for ss_file in ss_files:
476 | (psc1, timestamp, onsets, errors) = read_ss(ss_file, # pylint: disable=unused-variable
477 | False)
478 | print('▸ {0} SS {1}'.format(psc1, len(onsets)))
479 | for error in errors:
480 | print(' ✗ {0}: {1}'.format(error.message,
481 | os.path.relpath(error.path, ROOT_DIR)))
482 | #~ recog_files = tuple(os.path.join(behavioral_path, b)
483 | #~ for b in os.listdir(behavioral_path)
484 | #~ if 'recog_' in b)
485 | #~ for recog_file in recog_files:
486 | #~ (psc1, timestamp, onsets, errors) = read_recog(recog_file, False)
487 | #~ print('▸ {0} RECOG {1}'.format(psc1, len(onsets)))
488 | #~ for error in errors:
489 | #~ print(' ✗ {0}: {1}'.format(error.message,
490 | #~ os.path.relpath(error.path, ROOT_DIR)))
491 |
492 |
493 | if __name__ == '__main__':
494 | main()
495 |
--------------------------------------------------------------------------------
/imagen_databank/cantab.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2017 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | from zipfile import ZipFile
30 | from lxml import etree
31 | import datetime
32 | import csv
33 | import re
34 | import sys
35 |
36 | import logging
37 | logger = logging.getLogger(__name__)
38 |
39 | __all__ = ['CANTAB_CCLAR', 'DETAILED_DATASHEET_CSV', 'DATASHEET_CSV',
40 | 'REPORT_HTML',
41 | 'read_cant', 'read_datasheet', 'read_detailed_datasheet',
42 | 'read_report']
43 |
44 |
45 | #
46 | # types of files we expect to find under AdditionalData
47 | #
48 | CANTAB_CCLAR = 'cantab'
49 | DETAILED_DATASHEET_CSV = 'detailed_datasheet'
50 | DATASHEET_CSV = 'datasheet'
51 | REPORT_HTML = 'report'
52 |
53 | _ID_XPATH = ".//{http://www.camcog.com/proteus/entity/xml}attribute[@name='ID']"
54 |
55 |
56 | def read_cant(path):
57 | """Return "Subject ID" values found in a cant_*.cclar file.
58 |
59 | Parameters
60 | ----------
61 | path : unicode
62 | Path to the cant_*.cclar file to read from.
63 |
64 | Returns
65 | -------
66 |     set
67 | "Subject ID" values found in the file.
68 |
69 | """
70 | subject_ids = set()
71 |     with ZipFile(path, 'r') as cantfile:
72 |         for name in cantfile.namelist():
73 |             if name.endswith('index.xml'):
74 |                 root = etree.fromstring(cantfile.read(name))
75 |                 for element in root.findall(_ID_XPATH):
76 |                     subject_ids.add(element.attrib['value'])
77 |
78 |     return subject_ids
79 |
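A minimal usage sketch (editorial addition; 'cant_example.cclar' is a made-up
file name):

    subject_ids = read_cant('cant_example.cclar')
    print(sorted(subject_ids))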
80 |
81 | def _parse_csv_datetime(date_string):
82 | """Read date in the format found in CSV files.
83 |
84 | * LONDON 01-Feb-2015 12:34:56
85 | * NOTTINGHAM 01-Feb-2015 12:34:56 01/02/2015 12:34
86 | * DUBLIN 01-Feb-2015 12:34:56
87 | * BERLIN 01.02.2015 12:34:56
88 | * HAMBURG 01.02.2015 12:34:56
89 | * MANNHEIM 01.02.2015 12:34:56
90 | * PARIS 01 Feb 2015 12:34:56
91 | * DRESDEN 12:34:56 01.02.2015
92 |
93 | """
94 | DATE_FORMATS = (
95 | '%d-%b-%Y %H:%M:%S', # 01-Feb-2015 12:34:56
96 | '%d/%m/%Y %H:%M', # 01/02/2015 12:34
97 | '%d.%m.%Y %H:%M:%S', # 01.02.2015 12:34:56
98 | '%d %b %Y %H:%M:%S', # 01 Feb 2015 12:34:56
99 | '%H:%M:%S %d.%m.%Y', # 12:34:56 01.02.2015
100 | )
101 | for date_format in DATE_FORMATS:
102 | try:
103 | dt = datetime.datetime.strptime(date_string, date_format)
104 | return dt
105 | except ValueError:
106 | pass
107 | return None
108 |
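An editorial sketch of the accepted formats (assuming an English locale,
since '%b' parses abbreviated month names):

    assert _parse_csv_datetime('01-Feb-2015 12:34:56').year == 2015  # LONDON
    assert _parse_csv_datetime('12:34:56 01.02.2015').month == 2     # DRESDEN
    assert _parse_csv_datetime('not a date') is None                 # unparsable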
109 |
110 | def read_datasheet(path):
111 | """Return "Subject ID" and other information extracted from datasheet_*.csv.
112 |
113 | Parameters
114 | ----------
115 | path : unicode
116 | Path to the datasheet_*.csv file to read from.
117 |
118 | Returns
119 | -------
120 |     tuple
121 |         * set of "Subject ID" values found in the file.
122 |         * set of "Session start time" values found in the file.
123 |         * number of rows.
124 |         * minimal number of columns.
125 |         * dict mapping each column title to its index.
126 |
127 | """
128 | with open(path) as csvfile:
129 | # read header
130 | dialect = csv.Sniffer().sniff(csvfile.read())
131 | csvfile.seek(0)
132 | reader = csv.reader(csvfile, dialect)
133 | rows = 0
134 | columns_max = columns_min = 0
135 | fields = {}
136 | header = next(reader)
137 | if header:
138 | fields = {v: i for i, v in enumerate(header)}
139 | columns_max = columns_min = len(header)
140 | rows += 1
141 | subject_ids = set()
142 | session_start_times = set()
143 | # read values from the rest of the table
144 | for row in reader:
145 | if len(row) > 0:
146 | if "Subject ID" in fields:
147 | subject_id = row[fields["Subject ID"]]
148 | else:
149 | subject_id = row[0]
150 | subject_ids.add(subject_id)
151 | if "Session start time" in fields:
152 | session_start_time = _parse_csv_datetime(row[fields["Session start time"]])
153 | if session_start_time is not None:
154 | if session_start_time < datetime.datetime(2007, 1, 1):
155 | logger.warning('"Session start time" for %s anterior to 2007: %s',
156 | subject_id, session_start_time.date())
157 | session_start_times.add(session_start_time)
158 | columns_min = min(len(row), columns_min)
159 | columns_max = max(len(row), columns_max)
160 | rows += 1
161 | return (subject_ids, session_start_times, rows, columns_min, fields)
162 |
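A usage sketch (editorial addition; the CSV path is made up) showing how the
returned 5-tuple unpacks:

    (subject_ids, start_times, rows,
     columns_min, fields) = read_datasheet('datasheet_example.csv')
    print(len(subject_ids), 'subjects in', rows, 'rows')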
163 |
164 | #
165 | # match lines with "Subject ID"
166 | #
167 | _DETAILED_DATASHEET_REGEX = re.compile(r'"?Subject ID : (\w*)"?')
168 |
169 |
170 | def read_detailed_datasheet(path):
171 | """Return "Subject ID" values found in a detailed_datasheet_*.csv file.
172 |
173 | Parameters
174 | ----------
175 | path : unicode
176 | Path to the detailed_datasheet_*.csv file to read from.
177 |
178 | Returns
179 | -------
180 |     set
181 | "Subject ID" values found in the file.
182 |
183 | """
184 | with open(path, encoding='latin1') as f:
185 | subject_ids = set()
186 | for line in f:
187 | match = _DETAILED_DATASHEET_REGEX.match(line)
188 | if match:
189 | subject_ids.add(match.group(1))
190 | return subject_ids
191 |
192 |
193 | _REPORT_REGEX = re.compile('<tr><td>Subject ID</td><td>(.*)</td><td>Gender</td><td>(.*)</td>')
194 |
195 |
196 | def read_report(path):
197 | """Return "Subject ID" values found in a report_*.html file.
198 |
199 | Parameters
200 | ----------
201 | path : unicode
202 | Path to the report_*.html to read from.
203 |
204 | Returns
205 | -------
206 |     set
207 | "Subject ID" values found in the file.
208 |
209 | """
210 | with open(path, encoding='latin-1') as report_html:
211 | subject_ids = set()
212 | for line in report_html:
213 | match = _REPORT_REGEX.match(line)
214 | if match:
215 | subject_ids.add(match.group(1))
216 | return subject_ids
217 |
--------------------------------------------------------------------------------
/imagen_databank/core.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2019 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | import re
30 | import datetime
31 |
32 | import logging
33 | logger = logging.getLogger(__name__)
34 |
35 | __all__ = ['LONDON', 'NOTTINGHAM', 'DUBLIN', 'BERLIN',
36 | 'HAMBURG', 'MANNHEIM', 'PARIS', 'DRESDEN',
37 | 'SOUTHAMPTON', 'AACHEN',
38 | 'CENTER_NAME',
39 | 'PSC2_FROM_PSC1', 'PSC1_FROM_PSC2',
40 | 'PSC1_FROM_DAWBA', 'PSC2_FROM_DAWBA', # PSC2_FROM_DAWBA is obsolete
41 | 'DOB_FROM_PSC1',
42 | 'detect_psc1', 'detect_psc2', 'guess_psc1',
43 | 'Error']
44 |
45 |
46 | #
47 | # numerical ID of acquisition centers of Imagen
48 | #
49 | LONDON = 1
50 | NOTTINGHAM = 2
51 | DUBLIN = 3
52 | BERLIN = 4
53 | HAMBURG = 5
54 | MANNHEIM = 6
55 | PARIS = 7
56 | DRESDEN = 8
57 | SOUTHAMPTON = 90 # Stratify
58 | AACHEN = 91 # Stratify
59 |
60 | #
61 | # from numerical ID to standard name of acquisition centers of Imagen
62 | #
63 | CENTER_NAME = {
64 | LONDON: 'LONDON',
65 | NOTTINGHAM: 'NOTTINGHAM',
66 | DUBLIN: 'DUBLIN',
67 | BERLIN: 'BERLIN',
68 | HAMBURG: 'HAMBURG',
69 | MANNHEIM: 'MANNHEIM',
70 | PARIS: 'PARIS',
71 | DRESDEN: 'DRESDEN',
72 | SOUTHAMPTON: 'SOUTHAMPTON', # Stratify
73 | AACHEN: 'AACHEN', # Stratify
74 | }
75 |
76 | #
77 | # file that maps PSC1 to PSC2 and DAWBA codes to PSC1
78 | #
79 | _PSC2PSC = '/neurospin/imagen/src/scripts/psc_tools/psc2psc.csv'
80 | _PSC2PSC_STRATIFY = '/neurospin/imagen/src/scripts/psc_tools/psc2psc_SB.csv'
81 |
82 | #
83 | # file that maps PSC1 codes to date of birth
84 | #
85 | _DOB = '/neurospin/imagen/src/scripts/psc_tools/DOB.csv'
86 | _DOB_STRATIFY = '/neurospin/imagen/src/scripts/psc_tools/DOB_SB.csv'
87 |
88 |
89 | def _initialize_psc1_dawba_psc2():
90 | """Returns dictionaries to map PSC1 to PSC2 and DAWBA codes to PSC1.
91 |
92 |     This function takes no parameters: it reads the files listed in
93 |     the module-level constants _PSC2PSC and _PSC2PSC_STRATIFY.
94 |     Each line of these files, after the header line, contains
95 |     a PSC1=DAWBA=PSC2 mapping.
96 |
97 | Returns
98 | -------
99 | tuple
100 | Pair of PSC1→PSC2 and DAWBA→PSC1 dictionaries.
101 |
102 | """
103 | psc2_from_psc1 = {}
104 | psc1_from_dawba = {}
105 | for psc2psc in (_PSC2PSC, _PSC2PSC_STRATIFY):
106 |         with open(psc2psc) as f:
107 | for line in f:
108 | psc1, dawba, psc2 = line.strip('\n').split('=')
109 | # 1st line is: PSC1=DAWBA=PSC2
110 | if psc1 == 'PSC1' and dawba == 'DAWBA' and psc2 == 'PSC2':
111 | continue
112 |                 if psc1 in psc2_from_psc1:
113 |                     if psc2_from_psc1[psc1] != psc2:
114 |                         logger.critical('inconsistent PSC1/PSC2 mapping: %s', psc2psc)
115 | raise Exception('inconsistent PSC1/PSC2 mapping')
116 | else:
117 | psc2_from_psc1[psc1] = psc2
118 | psc1_from_dawba[dawba] = psc1
119 | return psc2_from_psc1, psc1_from_dawba
120 |
121 |
122 | _REGEX_DOB = re.compile(r'(\d{4})-(\d{2})-(\d{2})')
123 |
124 |
125 | def _initialize_dob():
126 | """Returns dictionary to map PSC1 code to date of birth.
127 |
128 |     This function takes no parameters: it reads the DOB.csv files
129 |     left over by the initial Imagen team and listed in the
130 |     module-level constants _DOB and _DOB_STRATIFY. Implausible
131 |     dates of birth (outside 1987-2012) raise an exception.
132 |
133 | Returns
134 | -------
135 | dict
136 |         Dictionary mapping PSC1 codes to dates of birth.
137 |
138 | """
139 | dob_from_psc1 = {}
140 |     for dob_path in (_DOB, _DOB_STRATIFY):
141 |         with open(dob_path) as f:
142 | for line in f:
143 | psc1, dob, dummy_when = line.strip('\n').split(',')
144 | match = _REGEX_DOB.match(dob)
145 | if match:
146 | year = int(match.group(1))
147 | month = int(match.group(2))
148 | day = int(match.group(3))
149 | if year > 2012 or year < 1987:
150 | raise Exception('unexpected date of birth: {0} ({1}-{2}-{3})'.format(dob, year, month, day))
151 | dob_from_psc1[psc1] = datetime.date(year, month, day)
152 | else:
153 | raise Exception('unexpected line in DOB.csv: {0}'.format(line))
154 | return dob_from_psc1
155 |
156 |
157 | PSC2_FROM_PSC1, PSC1_FROM_DAWBA = _initialize_psc1_dawba_psc2()
158 | PSC2_FROM_DAWBA = {k: PSC2_FROM_PSC1[v] # obsolete
159 | for k, v in PSC1_FROM_DAWBA.items() if v in PSC2_FROM_PSC1}
160 | PSC1_FROM_PSC2 = {v: k for k, v in PSC2_FROM_PSC1.items()}
161 | DOB_FROM_PSC1 = _initialize_dob()
162 | DOB_FROM_PSC2 = {PSC2_FROM_PSC1[k]: v # obsolete
163 | for k, v in DOB_FROM_PSC1.items() if k in PSC2_FROM_PSC1}
164 |
165 |
166 | #
167 | # the heuristic to detect a PSC1 code is that:
168 | # - it starts with 0 followed by the digit associated to each center
169 | # - it is a series of 12 digits
170 | #
171 | _PSC1_REGEX = re.compile(r'(0[' +
172 |                          ''.join([str(c) for c in CENTER_NAME]) +
173 |                          r']\d{10})[^\d]?')
174 |
175 |
176 | def detect_psc1(string):
177 | """Find potential PSC1 codes in a filename.
178 |
179 | PSC1 codes are sequences of 12 digits starting with 0 followed by a
180 | different digit for each center, followed by 10 digits.
181 |
182 | Parameters
183 | ----------
184 |     string : str
185 | The string to search for PSC1.
186 |
187 | Returns
188 | -------
189 | str
190 | Potential PSC1 code or None.
191 |
192 | """
193 | match = _PSC1_REGEX.search(string)
194 | if match:
195 | return match.group(1)
196 | else:
197 | return None
198 |
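An editorial sketch with a made-up code; after the leading 0 and the center
digit, 10 more digits follow, not themselves followed by another digit:

    assert detect_psc1('cant_012345678901.cclar') == '012345678901'
    assert detect_psc1('no_code_here.csv') is None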
199 |
200 | #
201 | # the heuristic to detect a PSC2 code is that:
202 | # - it starts with 0
203 | # - it is a series of 12 digits
204 | #
205 | _PSC2_REGEX = re.compile(r'(0\d{11})[^\d]?')
206 |
207 |
208 | def detect_psc2(string):
209 | """Find potential PSC2 codes in a filename.
210 |
211 | PSC2 codes are sequences of 12 digits starting with 0.
212 |
213 | Parameters
214 | ----------
215 |     string : str
216 | The string to search for PSC2.
217 |
218 | Returns
219 | -------
220 | str
221 | Potential PSC2 code or None.
222 |
223 | """
224 | match = _PSC2_REGEX.search(string)
225 | if match:
226 | return match.group(1)
227 | else:
228 | return None
229 |
230 |
231 | def guess_psc1(subject_id, center):
232 | subject_id = subject_id.split('_')[0]
233 | if subject_id.upper().startswith('FU2'):
234 | subject_id = subject_id[3:]
235 | if subject_id.upper().endswith('FU3'):
236 | subject_id = subject_id[:-3]
237 | elif subject_id.upper().endswith('FU2'):
238 | subject_id = subject_id[:-3]
239 | elif subject_id.upper().endswith('FU'):
240 | subject_id = subject_id[:-2]
241 | # this is very empirical and based on cases seen so far!
242 | if len(subject_id) < 10:
243 | subject_id = '0' + str(center) + subject_id.rjust(10, '0')
244 |     elif len(subject_id) < 11:
245 |         # here len(subject_id) == 10: shorter identifiers were
246 |         # already handled by the previous branch
247 |         subject_id = '0' + str(center) + subject_id
248 | elif len(subject_id) < 12:
249 | subject_id = subject_id[0:2] + '0' + subject_id[2:]
250 | # check this is an existing PSC1 code
251 | if subject_id in PSC2_FROM_PSC1:
252 | return subject_id
253 | return None
254 |
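An editorial walk-through on a made-up identifier; the candidate is only
returned if it exists in the PSC1 table, so unknown codes yield None:

    # 'FU2000123FU' → strip 'FU2' prefix and 'FU' suffix → '000123'
    # len < 10 → '0' + str(DUBLIN) + '000123'.rjust(10, '0') = '030000000123'
    print(guess_psc1('FU2000123FU', DUBLIN))  # '030000000123' if known, else None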
255 |
256 | class Error:
257 | """Error while parsing files.
258 |
259 | Returned by functions that parse Cantab and behavioral files.
260 |
261 | Attributes
262 | ----------
263 | path : str
264 | File name.
265 | message : str
266 | Message explaining the error.
267 | sample : str
268 | Part of the file that generated the error.
269 |
270 | """
271 | _SAMPLE_LEN = 30
272 |
273 | def __init__(self, path, message, sample=None):
274 | self.path = path
275 | self.message = message
276 | self.sample = sample
277 |
278 | def __str__(self):
279 | if self.path:
280 | if self.sample:
281 | sample = repr(self.sample)
282 | if len(sample) > self._SAMPLE_LEN:
283 | sample = sample[:self._SAMPLE_LEN] + '...'
284 | return '{0}: <{1}>: {2}'.format(self.message, sample, self.path)
285 | else:
286 | return '{0}: {1}'.format(self.message, self.path)
287 | else:
288 | return '{0}'.format(self.message)
289 |
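A small editorial sketch of how errors render, with an optional sample:

    error = Error('datasheet_example.csv', 'unexpected header', 'A,B,C')
    print(error)  # unexpected header: <'A,B,C'>: datasheet_example.csv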
--------------------------------------------------------------------------------
/imagen_databank/dicom_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2017 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | import re
30 | import datetime
31 | import dateutil.tz
32 | try:
33 |     import pydicom
34 |     from pydicom.filereader import InvalidDicomError, dcmread
35 | except ImportError:  # fall back to old releases installed as 'dicom'
36 |     import dicom as pydicom
37 |     from dicom.filereader import InvalidDicomError, read_file as dcmread
38 |
39 | import logging
40 | logger = logging.getLogger(__name__)
41 |
42 | __all__ = ['read_metadata']
43 |
44 |
45 | #
46 | # parse DICOM DateTime and Time tags
47 | #
48 | _REGEX_DT = re.compile(r"((\d{4,14})(\.(\d{1,6}))?)([+-]\d{4})?")
49 | _REGEX_TM = re.compile(r"(\d{2,6})(\.(\d{1,6}))?")
50 |
51 |
52 | def _datetime_from_dt(dt):
53 | """Convert DICOM DateTime to Python datetime.
54 |
55 | Parameters
56 | ----------
57 | dt : str
58 | DateTime tag from DICOM image.
59 |
60 | Returns
61 | -------
62 | datetime
63 |
64 | """
65 | match = _REGEX_DT.match(dt)
66 | if match and len(dt) <= 26:
67 | dt_match = match.group(2)
68 | year = int(dt_match[0:4])
69 | if len(dt_match) < 6:
70 | month = 1
71 | else:
72 | month = int(dt_match[4:6])
73 | if len(dt_match) < 8:
74 | day = 1
75 | else:
76 | day = int(dt_match[6:8])
77 | if len(dt_match) < 10:
78 | hour = 0
79 | else:
80 | hour = int(dt_match[8:10])
81 | if len(dt_match) < 12:
82 | minute = 0
83 | else:
84 | minute = int(dt_match[10:12])
85 | if len(dt_match) < 14:
86 | second = 0
87 | microsecond = 0
88 | else:
89 | second = int(dt_match[12:14])
90 | ms_match = match.group(4)
91 | if ms_match:
92 | microsecond = int(ms_match.rstrip().ljust(6, '0'))
93 | else:
94 | microsecond = 0
95 | tz_match = match.group(5)
96 | if tz_match:
97 | offset = (int(tz_match[1:3]) * 60 + int(tz_match[3:5])) * 60
98 | if tz_match[0] == '-':
99 | offset = -offset
100 | tzinfo = dateutil.tz.tzoffset(tz_match, offset)
101 | else:
102 | tzinfo = None
103 | return datetime.datetime(year, month, day,
104 | hour, minute, second, microsecond,
105 | tzinfo)
106 | else:
107 | logger.error('incorrect DICOM DT: %s', dt)
108 | return None
109 |
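An editorial sketch of DT parsing, with and without a UTC offset:

    print(_datetime_from_dt('20150201123456'))       # 2015-02-01 12:34:56
    print(_datetime_from_dt('20150201123456+0100'))  # 2015-02-01 12:34:56+01:00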
110 |
111 | def _date_from_da(da):
112 | """Convert DICOM Date to Python date.
113 |
114 | Parameters
115 | ----------
116 | da : str
117 | Date tag from DICOM image.
118 |
119 | Returns
120 | -------
121 | date
122 |
123 | """
124 | if len(da) == 8:
125 | year = int(da[0:4])
126 | month = int(da[4:6])
127 | day = int(da[6:8])
128 | return datetime.date(year, month, day)
129 | elif len(da) == 10 and da[4] == '.' and da[7] == '.':
130 | # ACR-NEMA Standard 300, predecessor to DICOM - for compatibility
131 | year = int(da[0:4])
132 | month = int(da[5:7])
133 | day = int(da[8:10])
134 | return datetime.date(year, month, day)
135 | else:
136 | logger.error('incorrect DICOM DA: %s', da)
137 | return None
138 |
139 |
140 | def _time_from_tm(tm):
141 | """Convert DICOM Time to Python time.
142 |
143 | Parameters
144 | ----------
145 | tm : str
146 | Time tag from DICOM image.
147 |
148 | Returns
149 | -------
150 | time
151 |
152 | """
153 | match = _REGEX_TM.match(tm)
154 | if match and len(tm) <= 16:
155 | tm_match = match.group(1)
156 | hour = int(tm_match[0:2])
157 | if len(tm_match) < 4:
158 | minute = 0
159 | else:
160 | minute = int(tm_match[2:4])
161 | if len(tm_match) < 6:
162 | second = 0
163 | microsecond = 0
164 | else:
165 | second = int(tm_match[4:6])
166 | ms_match = match.group(3)
167 | if ms_match:
168 | microsecond = int(ms_match.rstrip().ljust(6, '0'))
169 | else:
170 | microsecond = 0
171 | return datetime.time(hour, minute, second, microsecond)
172 | else:
173 | logger.error('incorrect DICOM TM: %s', tm)
174 | return None
175 |
176 |
177 | def read_metadata(path, force=False):
178 | """Read select metadata from a DICOM file.
179 |
180 | We always attempt to read the following DICOM tags. An exception is raised
181 | if one of the tags cannot be read:
182 | - SOPClassUID
183 | - SeriesInstanceUID
184 | - SeriesNumber
185 | - SeriesDescription
186 | - SOPInstanceUID
187 |
188 | We also attempt to read the following DICOM tags if they are present:
189 | - ImageType
190 | - AcquisitionDateTime
191 | - AcquisitionDate
192 | - AcquisitionTime
193 | - StationName
194 | - Manufacturer
195 | - ManufacturerModelName
196 | - DeviceSerialNumber
197 | - SoftwareVersions
198 | - PatientID
199 |
200 | Parameters
201 | ----------
202 | path : str
203 | Path name of the DICOM file.
204 | force : bool
205 | If True read nonstandard files, typically without "Part 10" headers.
206 |
207 | Returns
208 | -------
209 | dict
210 |
211 | """
212 | dataset = dcmread(path, force=force)
213 |
214 | # missing compulsory tags will raise exceptions
215 | if 'SeriesDescription' in dataset:
216 | description = dataset.SeriesDescription
217 | elif 'ProtocolName' in dataset:
218 | description = dataset.ProtocolName
219 | else:
220 | description = dataset.SeriesDescription # raise an exception!
221 |
222 | metadata = {
223 | 'SOPClassUID': dataset.SOPClassUID,
224 | 'SOPInstanceUID': dataset.SOPInstanceUID,
225 | 'SeriesInstanceUID': dataset.SeriesInstanceUID,
226 | 'SeriesNumber': dataset.SeriesNumber,
227 | 'SeriesDescription': description,
228 | }
229 |
230 | # optional tags
231 | if 'ImageType' in dataset:
232 | metadata['ImageType'] = dataset.ImageType
233 | if 'AcquisitionDateTime' in dataset:
234 | dt = _datetime_from_dt(dataset.AcquisitionDateTime)
235 | metadata['AcquisitionDate'] = dt.date()
236 | metadata['AcquisitionTime'] = dt.time()
237 | else:
238 | if 'AcquisitionDate' in dataset:
239 | metadata['AcquisitionDate'] = _date_from_da(dataset.AcquisitionDate)
240 | if 'AcquisitionTime' in dataset:
241 | metadata['AcquisitionTime'] = _time_from_tm(dataset.AcquisitionTime)
242 | if 'StationName' in dataset:
243 | metadata['StationName'] = dataset.StationName
244 | if 'Manufacturer' in dataset:
245 | metadata['Manufacturer'] = dataset.Manufacturer
246 | if 'ManufacturerModelName' in dataset:
247 | metadata['ManufacturerModelName'] = dataset.ManufacturerModelName
248 | if 'DeviceSerialNumber' in dataset:
249 | metadata['DeviceSerialNumber'] = dataset.DeviceSerialNumber
250 | if 'SoftwareVersions' in dataset:
251 | if pydicom.dataelem.isMultiValue(dataset.SoftwareVersions):
252 | # usually the last part is the more informative
253 | # for example on Philips scanners:
254 | # ['3.2.1', '3.2.1.1'] → '3.2.1.1'
255 | metadata['SoftwareVersions'] = dataset.SoftwareVersions[-1]
256 | else:
257 | metadata['SoftwareVersions'] = dataset.SoftwareVersions
258 | if 'StudyComments' in dataset: # DUBLIN
259 | metadata['StudyComments'] = dataset.StudyComments
260 | if 'PatientName' in dataset: # BERLIN, NOTTINGHAM
261 | metadata['PatientName'] = dataset.PatientName
262 | if 'ImageComments' in dataset: # HAMBURG, DRESDEN
263 | metadata['ImageComments'] = dataset.ImageComments
264 | if 'StudyDescription' in dataset: # LONDON
265 | metadata['StudyDescription'] = dataset.StudyDescription
266 | if 'PerformedProcedureStepDescription' in dataset: # LONDON
267 | metadata['PerformedProcedureStepDescription'] = dataset.PerformedProcedureStepDescription
268 | if 'PatientID' in dataset: # BERLIN, MANNHEIM, PARIS
269 | metadata['PatientID'] = dataset.PatientID
270 |
271 | return metadata
272 |
--------------------------------------------------------------------------------
/imagen_databank/image_data.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2018 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | import os
30 | import re
31 | import time
32 | import datetime
33 |
34 | from .core import (LONDON, NOTTINGHAM, DUBLIN, BERLIN,
35 | HAMBURG, MANNHEIM, PARIS, DRESDEN,
36 | SOUTHAMPTON, AACHEN)
37 | from .dicom_utils import read_metadata
38 | from .dicom_utils import InvalidDicomError
39 |
40 | import logging
41 | logger = logging.getLogger(__name__)
42 |
43 | __all__ = ['SEQUENCE_LOCALIZER_CALIBRATION', 'SEQUENCE_T2',
44 | 'SEQUENCE_T2_FLAIR', 'SEQUENCE_ADNI_MPRAGE',
45 | 'SEQUENCE_MID', 'SEQUENCE_FT', 'SEQUENCE_SST',
46 | 'SEQUENCE_B0_MAP', 'SEQUENCE_DTI',
47 | 'SEQUENCE_RESTING_STATE',
48 | 'SEQUENCE_NODDI',
49 | 'SEQUENCE_NAME',
50 | 'NONSTANDARD_DICOM',
51 | 'series_type_from_description',
52 | 'walk_image_data', 'report_image_data']
53 |
54 |
55 | #
56 | # information sent by Anna Cattrell to Dimitri on 13 June 2014:
57 | # Standard Operating Procedure IMAGEN Follow-up 2 study
58 | #
59 | # 2.2.1 Overview of Imaging Session:
60 | #
61 | # 2. 3 plane localizer / Parallel imaging calibration
62 | # 3. Axial T2 slices (site specific duration)
63 | # 4. Axial T2 Flair slices (site specific duration)
64 | # 5. 3D Sagittal ADNI MPRAGE (Long)
65 | # 7. Monetary Incentive Delay Task (MID)
66 | # 9. Face task (FT)
67 | # 11. Stop-signal task (SST)
68 | # 12. B0 Map
69 | # 13. DTI (duration is heart-rate dependent at sites with cardiac gating)
70 | # 14. Resting State
71 | # 15. Short MPRAGE (baseline only)
72 | # 16. EPI Global (JBP suggestion followed by a few centres at baseline)
73 | # 17. NODDI (optional, added in Follow-up 3)
74 | #
75 | # the following constants attempt to describe each of these sequences
76 | #
77 | SEQUENCE_LOCALIZER_CALIBRATION = 2
78 | SEQUENCE_T2 = 3
79 | SEQUENCE_T2_FLAIR = 4
80 | SEQUENCE_ADNI_MPRAGE = 5
81 | SEQUENCE_MID = 7
82 | SEQUENCE_FT = 9
83 | SEQUENCE_SST = 11
84 | SEQUENCE_B0_MAP = 12
85 | SEQUENCE_DTI = 13
86 | SEQUENCE_RESTING_STATE = 14
87 | SEQUENCE_SHORT_MPRAGE = 15
88 | SEQUENCE_GLOBAL = 16
89 | SEQUENCE_NODDI = 17
90 |
91 | #
92 | # from sequence ID to sequence name
93 | #
94 | SEQUENCE_NAME = {
95 | SEQUENCE_LOCALIZER_CALIBRATION: 'Localizer/Calibration',
96 | SEQUENCE_T2: 'T2',
97 | SEQUENCE_T2_FLAIR: 'T2 Flair',
98 | SEQUENCE_ADNI_MPRAGE: 'ADNI MPRAGE',
99 | SEQUENCE_MID: 'EPI MID',
100 | SEQUENCE_FT: 'EPI Faces',
101 | SEQUENCE_SST: 'EPI Signal Stop',
102 | SEQUENCE_B0_MAP: 'B0 Map',
103 | SEQUENCE_DTI: 'DTI',
104 | SEQUENCE_RESTING_STATE: 'Resting State',
105 | SEQUENCE_SHORT_MPRAGE: 'Short MPRAGE',
106 | SEQUENCE_GLOBAL: 'EPI Global',
107 | SEQUENCE_NODDI: 'NODDI',
108 | }
109 |
110 | #
111 | # check sequence names against these regex'es when trying to identify
112 | # the type of a sequence from its name
113 | #
114 | # in some case order is important, for example:
115 | # - first match 'FLAIR' and 'short MPRAGE'
116 | # - then match 'T2' and 'MPRAGE'
117 | #
118 | _LOOSE_IMAGE_DATA_REGEXES = (
119 | (re.compile(r'LOCALI[ZS]ER', re.IGNORECASE), SEQUENCE_LOCALIZER_CALIBRATION),
120 | # LONDON calibration
121 | (re.compile(r'ASSET[- ]Cal', re.IGNORECASE), SEQUENCE_LOCALIZER_CALIBRATION),
122 | # NOTTINGHAM 3-plane scout
123 | (re.compile(r'Survey_SHC'), SEQUENCE_LOCALIZER_CALIBRATION),
124 | # LONDON FU3 3-plane Localizer
125 | (re.compile(r'3Plane'), SEQUENCE_LOCALIZER_CALIBRATION),
126 | # first search for "FLAIR" then for "T2"
127 | (re.compile(r'FLAIR', re.IGNORECASE), SEQUENCE_T2_FLAIR),
128 | (re.compile(r'T2', re.IGNORECASE), SEQUENCE_T2),
129 | (re.compile(r'short MPRAGE', re.IGNORECASE), SEQUENCE_SHORT_MPRAGE),
130 | (re.compile(r'MPRAGE', re.IGNORECASE), SEQUENCE_ADNI_MPRAGE),
131 | (re.compile(r'MID', re.IGNORECASE), SEQUENCE_MID),
132 | # "EPI short reward" and "EPI reward short" are the same as "EPI short MID"
133 | (re.compile(r'reward', re.IGNORECASE), SEQUENCE_MID),
134 | (re.compile(r'face', re.IGNORECASE), SEQUENCE_FT),
135 | (re.compile(r'stop[- ]signal', re.IGNORECASE), SEQUENCE_SST),
136 | # LONDON stop signal DICOM files contain "SST"
137 | (re.compile(r'SST', re.IGNORECASE), SEQUENCE_SST),
138 | (re.compile(r'global', re.IGNORECASE), SEQUENCE_GLOBAL),
139 | (re.compile(r'B0'), SEQUENCE_B0_MAP),
140 | # LONDON B0 maps made of 3 DICOM files containing "FIELDMAP"
141 | (re.compile(r'FIELDMAP', re.IGNORECASE), SEQUENCE_B0_MAP),
142 | (re.compile(r'DTI'), SEQUENCE_DTI),
143 | (re.compile(r'REST', re.IGNORECASE), SEQUENCE_RESTING_STATE),
144 | )
145 |
146 | #
147 | # some acquisition centers may send nonstandard DICOM files
148 | #
149 | # for example Hamburg have sent DICOM files without "PART 10" headers
150 | #
151 | NONSTANDARD_DICOM = {
152 | LONDON: False,
153 | NOTTINGHAM: False,
154 | DUBLIN: False,
155 | BERLIN: False,
156 | HAMBURG: True,
157 | MANNHEIM: False,
158 | PARIS: False,
159 | DRESDEN: False,
160 | SOUTHAMPTON: False,
161 | AACHEN: False,
162 | }
163 |
164 | #
165 | # the SOP Class UIDs we expect to find while scanning DICOM files:
166 | # - those we process
167 | # - those we discard silently
168 | #
169 | # any other SOP Class UID generates a warning
170 | #
171 | _ALLOWED_SOP_CLASS_UIDS = {
172 | 'MR Image Storage',
173 | 'Enhanced MR Image Storage',
174 | }
175 | _IGNORED_SOP_CLASS_UIDS = {
176 | 'Grayscale Softcopy Presentation State Storage SOP Class',
177 | 'Raw Data Storage',
178 | 'Enhanced SR Storage',
179 | 'Philips Private Gyroscan MR Serie Data',
180 | 'Private MR Series Data Storage', '1.3.46.670589.11.0.0.12.2',
181 | 'Private MR Examcard Storage', '1.3.46.670589.11.0.0.12.4',
182 | 'Secondary Capture Image Storage',
183 | }
184 |
185 |
186 | def series_type_from_description(series_description):
187 | """Match series description to those listed in Imagen FU2 SOPs.
188 |
189 | This matching function is empirical and based on experimentation.
190 |
191 | Parameters
192 | ----------
193 | series_description : unicode
194 | The series description to match.
195 |
196 | Returns
197 | -------
198 | str
199 | If the series description loosely matches a series type listed
200 | in the SOPs, return this series type, else return None.
201 |
202 | """
203 | for regex, series_type in _LOOSE_IMAGE_DATA_REGEXES:
204 | if regex.search(series_description):
205 | return series_type
206 | return None
207 |
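An editorial sketch; the FLAIR pattern is tried before the T2 pattern, so a
FLAIR description is not mistaken for plain T2:

    assert series_type_from_description('Axial T2 FLAIR') == SEQUENCE_T2_FLAIR
    assert series_type_from_description('Axial T2') == SEQUENCE_T2
    assert series_type_from_description('unknown series') is None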
208 |
209 | def walk_image_data(path, force=False):
210 | """Generate information on DICOM files in a directory.
211 |
212 |     Files that cannot be read are skipped and an error message is logged.
213 |
214 | Parameters
215 | ----------
216 | path : unicode
217 | Directory to read DICOM files from.
218 | force : bool
219 | Try reading nonstandard DICOM files, typically without "PART 10" headers.
220 |
221 | Yields
222 | ------
223 | tuple
224 | Yields a pair (metadata, relpath) where metadata is a dictionary
225 | of extracted DICOM metadata.
226 |
227 | """
228 | n = 0
229 | start = time.time()
230 |
231 | logger.info('start processing files under: %s', path)
232 |
233 | for root, dummy_dirs, files in os.walk(path):
234 | n += len(files)
235 | for filename in files:
236 | abspath = os.path.join(root, filename)
237 | relpath = os.path.normpath(os.path.relpath(abspath, path))
238 | # skip DICOMDIR since we are going to read all DICOM files anyway
239 |             # beware, Nottingham had sent a DICOMDIR2 file!
240 | if filename.startswith('DICOMDIR'):
241 | continue
242 | logger.debug('read file: %s', relpath)
243 | try:
244 | metadata = read_metadata(abspath, force=force)
245 | except OSError as e:
246 | logger.error('cannot read file (%s): %s', str(e), relpath)
247 | except InvalidDicomError as e:
248 | logger.error('cannot read nonstandard DICOM file: %s: %s', str(e), relpath)
249 | except AttributeError as e:
250 | logger.error('missing attribute: %s: %s', str(e), relpath)
251 | else:
252 | yield (metadata, relpath)
253 |
254 | elapsed = time.time() - start
255 | logger.info('processed %d files in %.2f s: %s', n, elapsed, path)
256 |
257 |
258 | def report_image_data(path, force=False):
259 | """Find DICOM files loosely organized according to the Imagen FU2 SOPs.
260 |
261 | The Imagen FU2 SOPs define a precise file organization for Image Data. In
262 | practice we have found the SOPs are only loosely followed. A method to find
263 | DICOM files while adapting to local variations is to read all DICOM files,
264 | then filter and break them down into series based on their contents.
265 |
266 | This function scans the directory where we expect to find the Image Data
267 | of a dataset and reports series of valid DICOM files.
268 |
269 | Parameters
270 | ----------
271 | path : unicode
272 | Directory to read DICOM files from.
273 | force : bool
274 | Try reading nonstandard DICOM files, typically without "PART 10" headers.
275 |
276 | Returns
277 | -------
278 | dict
279 | The key identifies a series while the value is a pair
280 | (series_data, images).
281 |
282 | """
283 | series_dict = {}
284 |
285 | for (image_data, relpath) in walk_image_data(path, force=force):
286 | if str(image_data['SOPClassUID']) in _IGNORED_SOP_CLASS_UIDS:
287 | continue
288 | # extract DICOM tags of interest, throw exceptions if missing tags!
289 | series_uid = image_data['SeriesInstanceUID']
290 | image_uid = image_data['SOPInstanceUID']
291 | series_number = image_data['SeriesNumber']
292 | series_description = image_data['SeriesDescription']
293 | image_types = image_data.get('ImageType', [])
294 | station_name = image_data.get('StationName', None)
295 | manufacturer = image_data.get('Manufacturer', None)
296 | manufacturer_model_name = image_data.get('ManufacturerModelName', None)
297 | software_versions = image_data.get('SoftwareVersions', None)
298 | device_serial_number = image_data.get('DeviceSerialNumber', None)
299 | if 'AcquisitionDate' in image_data:
300 | acquisition_date = image_data['AcquisitionDate']
301 | if 'AcquisitionTime' in image_data:
302 | acquisition_time = image_data['AcquisitionTime']
303 | timestamp = datetime.datetime.combine(acquisition_date,
304 | acquisition_time)
305 | else:
306 | timestamp = datetime.datetime(acquisition_date.year,
307 | acquisition_date.month,
308 | acquisition_date.day)
309 | else:
310 | logger.error('missing acquisition time: %s', relpath)
311 |         # FIXME: this is obviously wrong, a stale timestamp may be reused!  # find PSC1 code
312 | if 'CommentsOnThePerformedProcedureStep' in image_data: # DUBLIN
313 | psc1 = image_data['CommentsOnThePerformedProcedureStep']
314 | elif 'ImageComments' in image_data: # HAMBURG, DRESDEN
315 | psc1 = image_data['ImageComments']
316 | elif 'PatientID' in image_data: # LONDON, NOTTINGHAM, BERLIN, MANNHEIM, PARIS
317 | psc1 = image_data['PatientID']
318 | elif 'PatientName' in image_data: # LONDON, NOTTINGHAM, BERLIN, MANNHEIM, PARIS
319 | psc1 = image_data['PatientName']
320 | else:
321 | psc1 = None
322 | # build the dictionary of series using 'SeriesInstanceUID' as a key
323 | if series_uid not in series_dict:
324 | series_data = {
325 | 'SeriesNumber': series_number,
326 | 'SeriesDescription': series_description,
327 | 'ImageType': set(image_types),
328 | 'MinAcquisitionDateTime': timestamp,
329 | 'MaxAcquisitionDateTime': timestamp,
330 | }
331 | if station_name:
332 | series_data['StationName'] = station_name
333 | if manufacturer:
334 | series_data['Manufacturer'] = manufacturer
335 | if manufacturer_model_name:
336 | series_data['ManufacturerModelName'] = manufacturer_model_name
337 | if software_versions:
338 | series_data['SoftwareVersions'] = software_versions
339 | if device_serial_number:
340 | series_data['DeviceSerialNumber'] = device_serial_number
341 | if psc1:
342 | series_data['PSC1'] = psc1
343 | # populate series with relative path to DICOM files
344 | series_dict[series_uid] = (series_data, {image_uid: relpath})
345 | else:
346 | series_dict[series_uid][0]['ImageType'] |= set(image_types)
347 | # check consistency within series:
348 | if series_number != series_dict[series_uid][0]['SeriesNumber']:
349 | logger.error('inconsistent series number '
350 | '"%s" / "%s":\n %s\n %s',
351 | series_dict[series_uid][0]['SeriesNumber'],
352 | series_number,
353 | next(iter(series_dict[series_uid][1].values())),
354 | relpath)
355 | elif series_description != series_dict[series_uid][0]['SeriesDescription']:
356 | logger.error('inconsistent series description '
357 | '"%s" / "%s":\n %s\n %s',
358 | series_dict[series_uid][0]['SeriesDescription'],
359 | series_description,
360 | next(iter(series_dict[series_uid][1].values())),
361 | relpath)
362 | if station_name:
363 | if 'StationName' in series_dict[series_uid][0]:
364 | if station_name != series_dict[series_uid][0]['StationName']:
365 | logger.error('inconsistent station name '
366 | '"%s" / "%s":\n %s\n %s',
367 | series_dict[series_uid][0]['StationName'],
368 | station_name,
369 | next(iter(series_dict[series_uid][1].values())),
370 | relpath)
371 | else:
372 | series_dict[series_uid][0]['StationName'] = station_name
373 | if manufacturer:
374 | if 'Manufacturer' in series_dict[series_uid][0]:
375 | if manufacturer != series_dict[series_uid][0]['Manufacturer']:
376 | logger.error('inconsistent manufacturer '
377 | '"%s" / "%s":\n %s\n %s',
378 | series_dict[series_uid][0]['Manufacturer'],
379 | manufacturer,
380 | next(iter(series_dict[series_uid][1].values())),
381 | relpath)
382 | else:
383 | series_dict[series_uid][0]['Manufacturer'] = manufacturer
384 | if manufacturer_model_name:
385 | if 'ManufacturerModelName' in series_dict[series_uid][0]:
386 | if manufacturer_model_name != series_dict[series_uid][0]['ManufacturerModelName']:
387 | logger.error('inconsistent manufacturer model name '
388 | '"%s" / "%s":\n %s\n %s',
389 | series_dict[series_uid][0]['ManufacturerModelName'],
390 | manufacturer_model_name,
391 | next(iter(series_dict[series_uid][1].values())),
392 | relpath)
393 | else:
394 | series_dict[series_uid][0]['ManufacturerModelName'] = manufacturer_model_name
395 | if software_versions:
396 | if 'SoftwareVersions' in series_dict[series_uid][0]:
397 | if software_versions != series_dict[series_uid][0]['SoftwareVersions']:
398 | logger.error('inconsistent software versions '
399 | '"%s" / "%s":\n %s\n %s',
400 | series_dict[series_uid][0]['SoftwareVersions'],
401 | software_versions,
402 | next(iter(series_dict[series_uid][1].values())),
403 | relpath)
404 | else:
405 | series_dict[series_uid][0]['SoftwareVersions'] = software_versions
406 | if device_serial_number:
407 | if 'DeviceSerialNumber' in series_dict[series_uid][0]:
408 | if device_serial_number != series_dict[series_uid][0]['DeviceSerialNumber']:
409 | logger.error('inconsistent device serial number '
410 | '"%s" / "%s":\n %s\n %s',
411 | series_dict[series_uid][0]['DeviceSerialNumber'],
412 | device_serial_number,
413 | next(iter(series_dict[series_uid][1].values())),
414 | relpath)
415 | else:
416 | series_dict[series_uid][0]['DeviceSerialNumber'] = device_serial_number
417 |
418 | if psc1:
419 | if 'PSC1' in series_dict[series_uid][0]:
420 | if psc1 != series_dict[series_uid][0]['PSC1']:
421 | logger.error('inconsistent PSC1 code '
422 | '"%s" / "%s":\n %s\n %s',
423 | series_dict[series_uid][0]['PSC1'],
424 | psc1,
425 | next(iter(series_dict[series_uid][1].values())),
426 | relpath)
427 | else:
428 | series_dict[series_uid][0]['PSC1'] = psc1
429 | # populate series with relative path to DICOM files
430 | if image_uid not in series_dict[series_uid][1]:
431 | series_dict[series_uid][1][image_uid] = relpath
432 | else:
433 | logger.error('duplicate image in series (%s):'
434 | '\n %s\n %s',
435 | series_description,
436 | series_dict[series_uid][1][image_uid],
437 | relpath)
438 | # update acquisition date/time range by series
439 | if timestamp < series_dict[series_uid][0]['MinAcquisitionDateTime']:
440 | series_dict[series_uid][0]['MinAcquisitionDateTime'] = timestamp
441 | if timestamp > series_dict[series_uid][0]['MaxAcquisitionDateTime']:
442 | series_dict[series_uid][0]['MaxAcquisitionDateTime'] = timestamp
443 |
444 | return series_dict
445 |
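A usage sketch (editorial addition; the directory path is hypothetical):

    series = report_image_data('/path/to/ImageData')
    for uid, (series_data, images) in series.items():
        print(series_data['SeriesDescription'], len(images), 'DICOM files')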
--------------------------------------------------------------------------------
/imagen_databank/sanity/__init__.py:
--------------------------------------------------------------------------------
1 | # noqa
2 |
3 | # Copyright (c) 2014-2017 CEA
4 | #
5 | # This software is governed by the CeCILL license under French law and
6 | # abiding by the rules of distribution of free software. You can use,
7 | # modify and/ or redistribute the software under the terms of the CeCILL
8 | # license as circulated by CEA, CNRS and INRIA at the following URL
9 | # "http://www.cecill.info".
10 | #
11 | # As a counterpart to the access to the source code and rights to copy,
12 | # modify and redistribute granted by the license, users are provided only
13 | # with a limited warranty and the software's author, the holder of the
14 | # economic rights, and the successive licensors have only limited
15 | # liability.
16 | #
17 | # In this respect, the user's attention is drawn to the risks associated
18 | # with loading, using, modifying and/or developing or reproducing the
19 | # software by the user in light of its specific status of free software,
20 | # that may mean that it is complicated to manipulate, and that also
21 | # therefore means that it is reserved for developers and experienced
22 | # professionals having in-depth computer knowledge. Users are therefore
23 | # encouraged to load and test the software's suitability as regards their
24 | # requirements in conditions enabling the security of their systems and/or
25 | # data to be ensured and, more generally, to use and operate it in the
26 | # same conditions as regards security.
27 | #
28 | # The fact that you are presently reading this means that you have had
29 | # knowledge of the CeCILL license and that you accept its terms.
30 |
31 | __all__ = ['cantab', 'imaging']
32 |
33 |
34 | from . import cantab
35 | __all__.extend(cantab.__all__)
36 | from .cantab import check_cant_name
37 | from .cantab import check_datasheet_name
38 | from .cantab import check_detailed_datasheet_name
39 | from .cantab import check_report_name
40 | from .cantab import check_cant_content
41 | from .cantab import check_datasheet_content
42 | from .cantab import check_detailed_datasheet_content
43 | from .cantab import check_report_content
44 |
45 | from . import imaging
46 | __all__.extend(imaging.__all__)
47 | from .imaging import check_zip_name
48 | from .imaging import check_zip_content
49 | from .imaging import ZipTree
50 |
--------------------------------------------------------------------------------
/imagen_databank/scanning.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014-2017 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | import re
30 |
31 | from . core import detect_psc1
32 |
33 | import logging
34 | logger = logging.getLogger(__name__)
35 |
36 |
37 | _SUBJECT_ID_REGEX = re.compile(r'\d{2}[/\.]\d{2}[/\.]\d{4} \d{2}:\d{2}:\d{2}\tSubject ID: (\w+)')
38 |
39 |
40 | def read_scanning(path):
41 | """Return "Subject ID" values found in a Scanning/*.csv file.
42 |
43 | Parameters
44 | ----------
45 | path : unicode
46 | Path to the Scanning/*.csv to read from.
47 |
48 | Returns
49 | -------
50 |     set
51 |         "Subject ID" values found in the file.
52 |
53 | """
54 |
55 | with open(path) as scanning:
56 | subject_ids = set()
57 | for line in scanning:
58 | match = _SUBJECT_ID_REGEX.match(line)
59 | if match:
60 | subject_id = detect_psc1(match.group(1))
61 | if subject_id is None:
62 | subject_id = match.group(1)
63 | subject_ids.add(subject_id)
64 | return subject_ids
65 |
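An editorial sketch of the first-line format this regex expects, on a
made-up line:

    line = '01.02.2015 12:34:56\tSubject ID: 012345678901'
    print(_SUBJECT_ID_REGEX.match(line).group(1))  # 012345678901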
--------------------------------------------------------------------------------
/mri/imagen_sample_FU3_mri_deidentify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (c) 2010-2019 CEA
4 | #
5 | # This software is governed by the CeCILL license under French law and
6 | # abiding by the rules of distribution of free software. You can use,
7 | # modify and/ or redistribute the software under the terms of the CeCILL
8 | # license as circulated by CEA, CNRS and INRIA at the following URL
9 | # "http://www.cecill.info".
10 | #
11 | # As a counterpart to the access to the source code and rights to copy,
12 | # modify and redistribute granted by the license, users are provided only
13 | # with a limited warranty and the software's author, the holder of the
14 | # economic rights, and the successive licensors have only limited
15 | # liability.
16 | #
17 | # In this respect, the user's attention is drawn to the risks associated
18 | # with loading, using, modifying and/or developing or reproducing the
19 | # software by the user in light of its specific status of free software,
20 | # that may mean that it is complicated to manipulate, and that also
21 | # therefore means that it is reserved for developers and experienced
22 | # professionals having in-depth computer knowledge. Users are therefore
23 | # encouraged to load and test the software's suitability as regards their
24 | # requirements in conditions enabling the security of their systems and/or
25 | # data to be ensured and, more generally, to use and operate it in the
26 | # same conditions as regards security.
27 | #
28 | # The fact that you are presently reading this means that you have had
29 | # knowledge of the CeCILL license and that you accept its terms.
30 |
31 | import os
32 | import zipfile
33 | import zlib
34 | import tempfile
35 | from datetime import datetime
36 | import shutil
37 | import subprocess
38 | from imagen_databank import PSC2_FROM_PSC1
39 | import json
40 | import logging
41 | logger = logging.getLogger(__name__)
42 | logging.basicConfig(level=logging.INFO)
43 |
44 |
45 | QUARANTINE_PATH = '/imagen/FU3/RAW/QUARANTINE'
46 | BIDS_PATH = '/neurospin/tmp/imagen/dcm2niix'
47 | SKIP_PATH = '/imagen/mri_skip.json'
48 |
49 |
50 | def quarantine_filename_semantics(filename):
51 | root, ext = os.path.splitext(filename)
52 |
53 | if (ext != '.zip'):
54 | logger.debug('%s: filename without ".zip" extension', filename)
55 |
56 | increment, suffix = root.split('_data_')
57 | increment = int(increment)
58 |
59 | psc1 = suffix[:-6] # last 6 characters added by the upload portal
60 |     timepoint = None  # stays None if the file name lacks a timepoint
61 |     if len(psc1) > 12:
62 |         timepoint = psc1[12:]
63 |         psc1 = psc1[:12]
64 |     else:
65 |         logger.error('%s: missing timepoint', psc1)
66 |     return increment, psc1, timepoint
67 |
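An editorial sketch on a made-up QUARANTINE file name; the 6 characters
appended by the upload portal are stripped before splitting off the
timepoint:

    print(quarantine_filename_semantics('000123_data_000000012345FU3abcdef.zip'))
    # (123, '000000012345', 'FU3')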
68 |
69 | def timestamps(top, include_dirs=True):
70 | min_timestamp = datetime.max
71 | max_timestamp = datetime.min
72 |
73 | for root, dirs, files in os.walk(top):
74 | if include_dirs:
75 | for dirname in dirs:
76 | path = os.path.join(root, dirname)
77 | timestamp = datetime.fromtimestamp(os.path.getmtime(path))
78 | min_timestamp = min(timestamp, min_timestamp)
79 | max_timestamp = max(timestamp, max_timestamp)
80 | for filename in files:
81 | path = os.path.join(root, filename)
82 | timestamp = datetime.fromtimestamp(os.path.getmtime(path))
83 | min_timestamp = min(timestamp, min_timestamp)
84 | max_timestamp = max(timestamp, max_timestamp)
85 |
86 | return (min_timestamp, max_timestamp)
87 |
88 |
89 | def list_datasets(path):
90 | datasets = {}
91 |
92 | for zip_file in os.listdir(path):
93 | zip_path = os.path.join(path, zip_file)
94 | root, ext = os.path.splitext(zip_file)
95 |
96 | if (ext != '.zip'):
97 | logger.debug('%s: this is not a ZIP file ', zip_file)
98 | continue
99 | elif not zipfile.is_zipfile(zip_path):
100 |             logger.warning('%s: skip invalid ZIP file', zip_file)
101 | continue
102 |
103 | # Unix timestamp of the ZIP file
104 | timestamp = os.path.getmtime(zip_path)
105 |
106 | # semantics of ZIP file name
107 | increment, psc1, timepoint = quarantine_filename_semantics(zip_file)
108 |
109 | # compare increment/timestamp of ZIP files, keep most recent
110 |         timepoint_datasets = datasets.setdefault(timepoint, {})
111 |         if psc1 in timepoint_datasets:
112 |             old_zip_path, old_increment, old_timestamp = timepoint_datasets[psc1]
113 |             if (increment <= old_increment or timestamp <= old_timestamp):
114 |                 if (increment >= old_increment or timestamp >= old_timestamp):
115 |                     logger.error('%s: inconsistent timestamps', zip_file)
116 |                 continue
117 |         timepoint_datasets[psc1] = (zip_path, increment, timestamp)
118 |
119 | return datasets
120 |
121 |
122 | def dcm2nii(src, dst, comment):
123 | status = 0
124 |
125 | logger.info('%s: running dcm2niix: %s', src, dst)
126 |
127 | dcm2niix = ['dcm2niix',
128 |                 '-z', 'y', '-9',
129 | '-c', comment,
130 | '-o', dst,
131 | src]
132 | completed = subprocess.run(dcm2niix,
133 | capture_output=True)
134 | if completed.returncode:
135 | logger.error('%s: dcm2niix failed: %s',
136 | src, completed.stdout)
137 | status = completed.returncode
138 |
139 | return status
140 |
141 |
142 | def deidentify(timepoint, psc1, zip_path, bids_path):
143 | logger.info('%s/%s: deidentify', psc1, timepoint)
144 |
145 | psc2 = PSC2_FROM_PSC1[psc1]
146 | out_sub_path = os.path.join(bids_path, 'sub-' + psc2)
147 | out_ses_path = os.path.join(out_sub_path, 'ses-' + timepoint)
148 |
149 | # skip ZIP files that have already been processed
150 | if os.path.isdir(out_ses_path):
151 | zip_timestamp = datetime.fromtimestamp(os.path.getmtime(zip_path))
152 | min_timestamp, max_timestamp = timestamps(out_ses_path)
153 | if min_timestamp > zip_timestamp:
154 | return
155 | else:
156 | shutil.rmtree(out_ses_path)
157 | os.makedirs(out_ses_path)
158 |
159 | status = 0
160 |     prefix = 'imagen-mri-' + psc1
161 | with tempfile.TemporaryDirectory(prefix=prefix) as tempdir:
162 | # unpack ZIP file into temporary directory
163 | zip_file = zipfile.ZipFile(zip_path)
164 | try:
165 | zip_file.extractall(tempdir)
166 | except (zipfile.BadZipFile, OSError, EOFError, zlib.error) as e:
167 | logger.error('%s/%s: corrupt ZIP file: %s',
168 | psc1, timepoint, str(e))
169 | return
170 |
171 |         os.makedirs(out_ses_path, exist_ok=True)  # may exist from the check above
172 | status = dcm2nii(tempdir, out_ses_path,
173 | psc2 + '/' + timepoint)
174 |
175 | if status:
176 | shutil.rmtree(out_ses_path)
177 | if not os.listdir(out_sub_path): # empty directory
178 | os.rmdir(out_sub_path)
179 |
180 | return status
181 |
182 |
183 | def main():
184 |     datasets = list_datasets(QUARANTINE_PATH)
185 |     with open(SKIP_PATH) as skip_file:
186 |         skip = json.load(skip_file)  # read the skip list once, not per dataset
187 |
188 |     for timepoint, timepoint_datasets in datasets.items():
189 |         for psc1, (zip_path, increment, timestamp) in timepoint_datasets.items():
190 |             if timepoint in skip and psc1 in skip[timepoint]:
191 |                 continue
192 |             deidentify(timepoint, psc1, zip_path, BIDS_PATH)
193 |
194 |
195 | if __name__ == "__main__":
196 | main()
197 |
--------------------------------------------------------------------------------
/onsets/imagen_onsets_copy_FU3.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | SOURCE='/neurospin/imagen/FU3/RAW/PSC2/onsets'
4 | TARGET='/neurospin/imagen/FU3/processed/nifti'
5 |
6 | for f in "${SOURCE}/"*.csv
7 | do
8 | basename=`basename "$f" '.csv'`
9 | psc2=`echo "$basename" | sed -e 's/^.*_//; s/FU3$//'`
10 | if [ -d "${TARGET}/${psc2}" ]
11 | then
12 | mkdir -p "${TARGET}/${psc2}/BehaviouralData"
13 | cp -p "${SOURCE}/${basename}.csv" "${TARGET}/${psc2}/BehaviouralData/"
14 | else
15 | >&2 echo "ERROR: $psc2: missing folder!"
16 | fi
17 | done
18 |
--------------------------------------------------------------------------------
/onsets/imagen_onsets_copy_STRATIFY.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | SOURCE='/neurospin/imagen/STRATIFY/RAW/PSC2/onsets'
4 | TARGET='/neurospin/imagen/STRATIFY/processed/nifti'
5 |
6 | for f in "${SOURCE}/"*.csv
7 | do
8 | basename=`basename "$f" '.csv'`
9 | psc2=`echo "$basename" | sed -e 's/^.*_//; s/SB$//'`
10 | if [ -d "${TARGET}/${psc2}" ]
11 | then
12 | mkdir -p "${TARGET}/${psc2}/BehaviouralData"
13 | cp -p "${SOURCE}/${basename}.csv" "${TARGET}/${psc2}/BehaviouralData/"
14 | else
15 | >&2 echo "ERROR: $psc2: missing folder!"
16 | fi
17 | done
18 |
--------------------------------------------------------------------------------
/onsets/imagen_onsets_extract_deidentify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import zipfile
5 | from datetime import datetime
6 | from tempfile import TemporaryDirectory
7 | from multiprocessing import Pool
8 | from imagen_databank import PSC2_FROM_PSC1, DOB_FROM_PSC1
9 | import logging
10 |
11 | logging.basicConfig(level=logging.INFO)
12 |
13 | WORKER_PROCESSES = 8
14 |
15 | DATASETS_FU3_SB = '/neurospin/imagen/FU3/RAW/QUARANTINE'
16 | ONSETS = {
17 | 'FU3': '/neurospin/imagen/FU3/RAW/PSC2/onsets',
18 | 'SB': '/neurospin/imagen/STRATIFY/RAW/PSC2/onsets',
19 | }
20 |
21 |
22 | def _parse_onsets_datetime(date_string):
23 | """Read date in the format found in CSV files.
24 |
25 | """
26 | DATE_FORMATS = (
27 | '%d.%m.%Y %H:%M:%S',
28 | '%d/%m/%Y %H:%M:%S',
29 | )
30 | for date_format in DATE_FORMATS:
31 | try:
32 | dt = datetime.strptime(date_string, date_format)
33 | return dt
34 | except ValueError:
35 | pass
36 | return None
37 |
38 |
39 | def _extract_psc1_timestamp(path):
40 |     """Extract PSC1 code and increment from FU3 / Stratify ZIP files in QUARANTINE.
41 |
42 | Parameters
43 | ----------
44 | path : unicode
45 | Zip file name.
46 |
47 | Returns
48 | -------
49 | tuple (str, int)
50 |         PSC1 code and database increment number from the ZIP file name.
51 |
52 | """
53 | path = os.path.basename(path)
54 | root, ext = os.path.splitext(path)
55 |
56 | # extract database increment number and PSC1
57 | increment, data, psc1 = root.split('_', 2)
58 | assert(increment.isdigit())
59 | increment = int(increment)
60 | while not psc1[:12].isdigit():
61 | split = psc1.split('_', 1)
62 | if len(split) > 1:
63 | psc1 = split[-1]
64 | else:
65 | psc1 = None
66 | break
67 | else:
68 | psc1 = psc1[:12]
69 |
70 | return psc1, increment
71 |
72 |
73 | def process_behavioural(path, timepoint, prefix, psc1, psc2):
74 | logging.info('%s: processing behavioural file...', path)
75 |
76 | with open(path, encoding='latin-1', newline='') as content:
77 |         output_dir = ONSETS[timepoint]
78 |         output_path = os.path.join(output_dir, prefix + '_' + psc2 + timepoint + '.csv')
79 |         with open(output_path, 'w') as output:
80 | # de-identify 1st line
81 | line = next(iter(content))
82 | column = line.split('\t')
83 | if psc1 in DOB_FROM_PSC1:
84 | column[1] = str((_parse_onsets_datetime(column[1]).date() -
85 | DOB_FROM_PSC1[psc1]).days)
86 | else:
87 | column[1] = ''
88 | column[2] = column[2].replace(psc1, psc2)
89 | line = '\t'.join(column)
90 | # write to target file
91 | output.write(line)
92 | for line in content:
93 | output.write(line)
94 |
95 |
96 | def process_dataset(arguments):
97 | (timepoint, psc1, psc2, dataset_path) = arguments # unpack multiple arguments
98 |
99 | logging.info('%s: processing zipped %s dataset...', psc1, timepoint)
100 |
101 | with TemporaryDirectory(prefix='imagen_behavioural_') as tmp:
102 | with zipfile.ZipFile(dataset_path) as dataset_zipfile:
103 | members = dataset_zipfile.infolist()
104 |
105 | for prefix in ('ft', 'mid', 'recog', 'ss'):
106 | for member in members:
107 | if member.filename == (psc1 + timepoint + '/AdditionalData/Scanning/' +
108 | prefix + '_' + psc1 + timepoint + '.csv'):
109 | dataset_zipfile.extract(member, path=tmp)
110 | behavioural_path = os.path.join(tmp, member.filename)
111 | process_behavioural(behavioural_path, timepoint, prefix, psc1, psc2)
112 | break
113 | else:
114 | logging.warning('%s: missing %s_*.csv file', psc1, prefix)
115 |
116 | logging.info('%s: processed zipped %s dataset', psc1, timepoint)
117 |
118 |
119 | def list_datasets(path, timepoint):
120 | # list zip files to process
121 | # for subjects with multiple zip files, keep the most recent one
122 | datasets = {}
123 | for dataset in os.listdir(path):
124 | root, ext = os.path.splitext(dataset)
125 | if ext != '.zip':
126 | continue
127 | increment, data, psc1 = root.split('_', 2)
128 | assert(increment.isdigit() and data == 'data' and
129 | psc1[:12].isdigit())
130 | if psc1[12:12+len(timepoint)] != timepoint:
131 | continue
132 |
133 | psc1, timestamp = _extract_psc1_timestamp(dataset)
134 | dataset_path = os.path.join(path, dataset)
135 | datasets.setdefault(psc1, {})[timestamp] = dataset_path
136 |
137 | logging.info('found %d zipped %s datasets', len(datasets), timepoint)
138 |
139 |     return [(psc1, timestamps[max(timestamps.keys())])  # keep latest dataset
140 |             for (psc1, timestamps) in datasets.items()]
141 |
142 |
143 | def process_datasets(path, timepoint):
144 | todo_list = list(list_datasets(path, timepoint))
145 | todo_list = [(timepoint, psc1, PSC2_FROM_PSC1[psc1], path) for (psc1, path) in todo_list]
146 |
147 | pool = Pool(WORKER_PROCESSES)
148 | results = pool.map(process_dataset, todo_list)
149 | pool.close()
150 | pool.join()
151 | return results
152 |
153 |
154 | def main():
155 | for timepoint in ('FU3', 'SB'):
156 |         process_datasets(DATASETS_FU3_SB, timepoint)
157 |
158 |
159 | if __name__ == "__main__":
160 | main()
161 |
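162 | # Note: QUARANTINE zip files are expected to be named
163 | # <increment>_data_<PSC1><timepoint>.zip, so a (hypothetical) name such as
164 | # '1234_data_000000000001FU3.zip' makes _extract_psc1_timestamp() return
165 | # ('000000000001', 1234). process_behavioural() then replaces the session
166 | # date on the first CSV line with the age in days at that date, computed
167 | # from DOB_FROM_PSC1.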
--------------------------------------------------------------------------------
/psc/imagen_update_dawba_codes_from_tokens.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Download Dawba codes for Imagen FU3 and Stratify and update conversion table.
3 |
4 | ==========
5 | Attributes
6 | ==========
7 |
8 | Output
9 | ------
10 |
11 | PSC2PSC : str
12 | Table of conversion between participant codes (PSC1, Dawba, PSC2).
13 |
14 | """
15 |
16 | import os
17 | import requests
18 | import json
19 | import base64
20 | from urllib.parse import urlparse
21 | import datetime
22 | import logging
23 | from imagen_databank import PSC2_FROM_PSC1
24 | logging.basicConfig(level=logging.INFO)
25 |
26 | # The LSRC2 service at Delosis.
27 | LSRC2_BASE_URL = 'https://www.delosis.com/qs/index.php/admin/remotecontrol'
28 | # Since credentials are different between the legacy and the LSRC2 service,
29 | # and ~/.netrc allows only a single set of credentials per server, store
30 | # LSRC2 credentials in an alternate file.
31 | LSRC2_NETRC_FILE = '~/.lsrc2'
32 | # The PSC1, Dawba, PSC2 conversion table
33 | PSC2PSC = '/neurospin/imagen/src/scripts/psc_tools/psc2psc.csv'
34 | PSC2PSC_SB = '/neurospin/imagen/src/scripts/psc_tools/psc2psc_SB.csv'
35 |
36 |
37 | class LimeSurveyError(Exception):
38 | def __init__(self, message, code):
39 | super().__init__(message)
40 | self.code = code
41 |
42 |
43 | def error2exception(func):
44 | def wrapper(*args, **kwargs):
45 | response, error = func(*args, **kwargs)
46 | if error:
47 | try:
48 | code = error['code']
49 | message = error['message']
50 | except (TypeError, KeyError):
51 | code = -32603 # internal JSON-RPC error
52 | message = 'Unexpected JSON-RPC error type'
53 | raise LimeSurveyError(message, code)
54 | return response
55 | return wrapper
56 |
57 |
58 | class LimeSurveySession:
59 | """LimeSurvey JSON-RPC LSRC2 session
60 |
61 | Documented here:
62 | https://www.delosis.com/qs/index.php/admin/remotecontrol
63 | https://manual.limesurvey.org/RemoteControl_2_API
64 |
65 | """
66 | __request_id = 0
67 |
68 | def __init__(self, url, username, password):
69 | self.url = url
70 | # start a Requests session
71 | self.session = requests.Session()
72 | # Keep-alive is 100% automatic in Requests, thanks to urllib3
73 | self.session.headers.update({'content-type': 'application/json'})
74 | # start a LimeSurvey RemoteControl 2 session
75 | self.key = self._get_session_key(username, password)
76 |
77 | def __enter__(self):
78 | return self
79 |
80 | def __exit__(self, type, value, traceback):
81 | self.close()
82 |         return False  # do not suppress exceptions
83 |
84 | def close(self):
85 | """Release LimeSurvey session key, then close Requests session"""
86 | self._release_session_key(self.key)
87 | self.key = None
88 | self.session.close()
89 |
90 | @staticmethod
91 | def _generate_request_id():
92 | LimeSurveySession.__request_id += 1
93 | return LimeSurveySession.__request_id
94 |
95 | @staticmethod
96 | def _request(method, params):
97 | return {
98 | 'jsonrpc': '2.0',
99 | 'id': LimeSurveySession._generate_request_id(),
100 | 'method': method,
101 | 'params': params,
102 | }
103 |
104 | def _post(self, request):
105 | logging.debug('JSON-RPC request: %s', request)
106 | assert 'method' in request and 'params' in request and 'id' in request
107 | response = self.session.post(self.url, data=json.dumps(request))
108 | response = response.json()
109 | logging.debug('JSON-RPC response: %s', response)
110 | assert response['id'] == request['id']
111 | result = response['result']
112 | error = response['error']
113 | if error:
114 | logging.error('JSON-RPC error: %s', error)
115 | return result, error
116 |
117 | def _get_session_key(self, username, password):
118 | request = self._request('get_session_key', [username, password])
119 | response, error = self._post(request)
120 |
121 |         # fix nonsensical LSRC2 error handling
122 | # completely at odds with JSON-RPC error handling
123 | try:
124 | status = response['status']
125 | except (TypeError, KeyError):
126 | if error is not None:
127 | logging.error('LSRC2 failed to create a session key')
128 | response = None
129 | else:
130 | logging.info('LSRC2 new session key: %s', response)
131 | else:
132 | logging.error(status)
133 | error = {
134 | 'code': -32099, # implementation-defined error in JSON-RPC
135 | 'message': status,
136 | }
137 | response = None
138 |
139 | return response
140 |
141 | def _release_session_key(self, key):
142 | request = self._request('release_session_key', [key])
143 | logging.info('LSRC2 release session key: %s', key)
144 | dummy_response, dummy_error = self._post(request) # returns ('OK', None) even if bogus key
145 |
146 | @error2exception
147 | def surveys(self):
148 | request = self._request('list_surveys', [self.key])
149 | return self._post(request)
150 |
151 | @error2exception
152 | def participants(self, survey, attributes=False):
153 | request = self._request('list_participants',
154 | [self.key, survey, 0, 5000, False, attributes])
155 | responses, error = self._post(request)
156 |
157 |         # fix nonsensical LSRC2 error handling
158 | # completely at odds with JSON-RPC error handling
159 | try:
160 | status = responses['status']
161 | except (TypeError, KeyError):
162 | pass
163 | else:
164 | # LSRC2 returns errors as a dict with a 'status' attribute
165 | if status == 'No Tokens found':
166 | # When a survey is empty, LSRC2 also returns a dict:
167 | # {"status": "No Tokens found"}
168 | if error is not None:
169 | logging.error('JSON-RPC error report does not match "status"')
170 | error = None
171 | else:
172 | error = {
173 | 'code': -32099, # implementation-defined error in JSON-RPC
174 | 'message': status,
175 | }
176 | responses = []
177 |
178 | return responses, error
179 |
180 | @error2exception
181 | def participant_properties(self, survey, participant, attributes):
182 | request = self._request('get_participant_properties',
183 | [self.key, survey, participant, attributes])
184 | return self._post(request)
185 |
186 | @error2exception
187 | def responses(self, survey, status='all'):
188 | request = self._request('export_responses',
189 | [self.key, survey, 'csv', None, status])
190 | responses, error = self._post(request)
191 |
192 | try:
193 | responses = base64.b64decode(responses).decode('utf_8').split('\n')
194 | except TypeError:
195 |             # fix nonsensical LSRC2 error handling
196 | # completely at odds with JSON-RPC error handling
197 | try:
198 | status = responses['status']
199 | except (TypeError, KeyError):
200 | message = 'JSON-RPC function "export_responses" expected a Base64-encoded string'
201 | logging.error(message)
202 | error = {
203 | 'code': -32099, # implementation-defined error in JSON-RPC
204 | 'message': message,
205 | }
206 | else:
207 | # LSRC2 returns errors as a dict with a 'status' attribute
208 | if status == 'No Data, could not get max id.':
209 | # When a survey is empty, LSRC2 also returns a dict:
210 | # {"status": "No Data, could not get max id."}
211 | if error is not None:
212 | logging.error('JSON-RPC error report does not match "status"')
213 | error = None
214 | else:
215 | error = {
216 | 'code': -32099, # implementation-defined error in JSON-RPC
217 | 'message': status,
218 | }
219 | responses = []
220 |
221 | return responses, error
222 |
223 |
224 | def _get_netrc_auth(url):
225 | try:
226 | netrc_path = os.path.expanduser(LSRC2_NETRC_FILE)
227 | except KeyError:
228 | import warnings
229 | warnings.warn('Unable to find home directory')
230 | return
231 | if not os.path.exists(netrc_path):
232 | return
233 |
234 | netloc = urlparse(url).netloc
235 |
236 | try:
237 | from netrc import netrc, NetrcParseError
238 | try:
239 | authenticators = netrc(netrc_path).authenticators(netloc)
240 | except (NetrcParseError, OSError):
241 | return
242 | if authenticators:
243 | return (authenticators[0], authenticators[2])
244 |     except ImportError:
245 | return
246 |
247 |
248 | def download_lsrc2_tokens(base_url, startswith=None):
249 | """JSON RPC calls to LSRC2 service to retrieve tokens.
250 |
251 | """
252 | username, password = _get_netrc_auth(base_url)
253 | with LimeSurveySession(base_url, username, password) as session:
254 | dawba_from_psc1 = {}
255 |
256 | surveys = session.surveys()
257 | for survey in surveys:
258 | title = survey['surveyls_title']
259 | sid = survey['sid']
260 | active = survey['active']
261 |
262 | if title.startswith(startswith):
263 | if active == 'N':
264 | logging.info('skip inactive survey: %s', title)
265 | continue
266 | else:
267 | logging.info('read survey: %s', title)
268 | else:
269 | logging.info('skip survey: %s', title)
270 | continue
271 |
272 | # subjects in surveys are identified by "sid" and "token"
273 | # retrieve correlation between "token" and PSC1 and Dawba codes
274 | psc1_from_token = {}
275 | dawba_from_token = {}
276 | participants = session.participants(sid, ['completed', 'reminded', 'attribute_1', 'attribute_2'])
277 |
278 | for participant in participants:
279 | token = participant['token']
280 | if ('reminded' in participant and participant['reminded'] == 'Duplicate' or
281 | 'completed' in participant and participant['completed'] == 'N'):
282 | continue
283 | # PSC1
284 | if 'attribute_1' in participant:
285 | psc1 = participant['attribute_1'].strip()
286 | if psc1.endswith('SB'):
287 | psc1 = psc1[:-2]
288 | if psc1.endswith('FU3'):
289 | psc1 = psc1[:-3]
290 | if psc1.isdigit():
291 | if token in psc1_from_token:
292 | if psc1 != psc1_from_token[token]:
293 | logging.error('survey: %s: duplicate token has inconsistent PSC1 codes: %s / %s',
294 | title, psc1_from_token[token], psc1)
295 | else:
296 | logging.warning('survey: %s: duplicate token for PSC1 code: %s',
297 | title, psc1)
298 | else:
299 | psc1_from_token[token] = psc1
300 | else:
301 | logging.info('survey: %s: skipping invalid PSC1 code: %s',
302 | title, psc1)
303 | else:
304 | logging.error('survey: %s: token %s lacks a PSC1 code',
305 | title, token)
306 | # Dawba
307 | if 'attribute_2' in participant:
308 | dawba = participant['attribute_2']
309 | if dawba:
310 | dawba = dawba.strip()
311 | if dawba.isdigit():
312 | if token in dawba_from_token:
313 | if dawba != dawba_from_token[token]:
314 | logging.error('survey: %s: duplicate token has inconsistent Dawba codes: %s / %s',
315 | title, dawba_from_token[token], dawba)
316 | else:
317 | logging.warning('survey: %s: duplicate token for Dawba code: %s',
318 | title, dawba)
319 | else:
320 | dawba_from_token[token] = dawba
321 | elif dawba == '-':
322 | logging.warning("survey: %s: %s: skipping empty Dawba code '-'",
323 | title, psc1)
324 | else:
325 | logging.info('survey: %s: %s: skipping invalid Dawba code: %s',
326 | title, psc1, dawba)
327 | else:
328 | logging.info('survey: %s: %s: skipping empty Dawba code',
329 | title, psc1)
330 | else:
331 | logging.error('survey: %s: token %s lacks a Dawba code',
332 | title, token)
333 |
334 | for token in psc1_from_token.keys() & dawba_from_token.keys():
335 | psc1 = psc1_from_token[token]
336 | dawba = dawba_from_token[token]
337 | dawba_from_psc1.setdefault(psc1, {}).setdefault(dawba, set())
338 | dawba_from_psc1[psc1][dawba].add(title)
339 |
340 | for psc1, codes in dawba_from_psc1.items():
341 | if len(codes) > 1:
342 | message_details = ''
343 | for dawba, titles in codes.items():
344 | message_details += '\t{}:\n\t\t{}\n'.format(dawba, '\n\t\t'.join(title for title in titles))
345 | logging.error('%s: multiple Dawba codes:\n%s',
346 | psc1, message_details)
347 | dawba_from_psc1[psc1] = None
348 | else:
349 | dawba_from_psc1[psc1] = next(iter(dawba_from_psc1[psc1].keys()))
350 | dawba_from_psc1 = {psc1: dawba for psc1, dawba in dawba_from_psc1.items()
351 | if dawba}
352 |
353 | return dawba_from_psc1
354 |
355 |
356 | def main():
357 | projects = (
358 | (PSC2PSC, 'Imagen FUIII - Core'),
359 | (PSC2PSC_SB, 'STRATIFY Core'),
360 | )
361 |
362 | for psc2psc, startswith in projects:
363 | dawba_from_psc1 = download_lsrc2_tokens(LSRC2_BASE_URL, startswith)
364 |
365 | root, ext = os.path.splitext(psc2psc)
366 | output = root + '_' + datetime.date.today().isoformat() + ext
367 | with open(psc2psc, 'r') as p, open(output, 'w') as o:
368 | # skip header line
369 | line = next(p).strip('\n')
370 | print(line, file=o)
371 |
372 | done = set()
373 | for line in p:
374 | line = line.strip('\n')
375 | psc1, dawba, psc2 = line.split('=')
376 | if (int(dawba) > 200000 or # process only FU3 and Stratify
377 | dawba == '000000'):
378 | if psc1 in dawba_from_psc1:
379 | if dawba != dawba_from_psc1[psc1]:
380 | if dawba == '000000':
381 | logging.info('%s: Dawba code initialized from %s to %s',
382 | psc1, dawba, dawba_from_psc1[psc1])
383 | else:
384 | logging.error('%s: Dawba code changed from %s to %s',
385 | psc1, dawba, dawba_from_psc1[psc1])
386 | dawba = dawba_from_psc1[psc1]
387 | line = '='.join((psc1, dawba, psc2))
388 | done.add(psc1)
389 | print(line, file=o)
390 |
391 | for psc1 in (dawba_from_psc1.keys() - done):
392 | dawba = dawba_from_psc1[psc1]
393 | psc2 = PSC2_FROM_PSC1[psc1]
394 | line = '='.join((psc1, dawba, psc2))
395 | print(line, file=o)
396 |
397 |
398 | if __name__ == "__main__":
399 | main()
400 |
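401 | # Note: ~/.lsrc2 follows the standard netrc format, e.g. (hypothetical
402 | # credentials):
403 | #     machine www.delosis.com login jdoe password s3cret
404 | # The payloads built by _request() are plain JSON-RPC 2.0, for instance:
405 | #     {"jsonrpc": "2.0", "id": 1,
406 | #      "method": "get_session_key", "params": ["jdoe", "s3cret"]}
407 | # The psc2psc conversion tables are '='-separated, one PSC1=DAWBA=PSC2
408 | # triplet per line, as read by line.split('=') in main().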
--------------------------------------------------------------------------------
/psytools/imagen_psytools_deidentify.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """Re-encode and pseudonymize Psytools CSV files (BL, FU1, FU2, FU3 and Stratify).
3 |
4 | This script replaces the Scito pseudonymization pipeline.
5 |
6 | ==========
7 | Attributes
8 | ==========
9 |
10 | Input
11 | -----
12 |
13 | PSYTOOLS_BL_DERIVED_DIR : str
14 | Location of BL PSC1-encoded files.
15 | PSYTOOLS_FU1_DERIVED_DIR : str
16 | Location of FU1 PSC1-encoded files.
17 | PSYTOOLS_FU2_DERIVED_DIR : str
18 | Location of FU2 PSC1-encoded files.
19 | PSYTOOLS_FU3_DERIVED_DIR : str
20 | Location of FU3 PSC1-encoded files.
21 | PSYTOOLS_STRATIFY_DERIVED_DIR : str
22 | Location of Stratify PSC1-encoded files.
23 |
24 | Output
25 | ------
26 |
27 | PSYTOOLS_BL_PSC2_DIR : str
28 | Location of BL PSC2-encoded files.
29 | PSYTOOLS_FU1_PSC2_DIR : str
30 | Location of FU1 PSC2-encoded files.
31 | PSYTOOLS_FU2_PSC2_DIR : str
32 | Location of FU2 PSC2-encoded files.
33 | PSYTOOLS_FU3_PSC2_DIR : str
34 | Location of FU3 PSC2-encoded files.
35 | PSYTOOLS_STRATIFY_PSC2_DIR : str
36 | Location of Stratify PSC2-encoded files.
37 |
38 | """
39 |
40 | PSYTOOLS_BL_DERIVED_DIR = '/tmp/imagen/BL/processed/psytools'
41 | PSYTOOLS_BL_PSC2_DIR = '/neurospin/imagen/BL/processed/psytools'
42 | PSYTOOLS_FU1_DERIVED_DIR = '/tmp/imagen/FU1/processed/psytools'
43 | PSYTOOLS_FU1_PSC2_DIR = '/neurospin/imagen/FU1/processed/psytools'
44 | PSYTOOLS_FU2_DERIVED_DIR = '/tmp/imagen/FU2/processed/psytools'
45 | PSYTOOLS_FU2_PSC2_DIR = '/neurospin/imagen/FU2/processed/psytools'
46 | PSYTOOLS_FU3_DERIVED_DIR = '/tmp/imagen/FU3/processed/psytools'
47 | PSYTOOLS_FU3_PSC2_DIR = '/neurospin/imagen/FU3/processed/psytools'
48 | PSYTOOLS_STRATIFY_DERIVED_DIR = '/tmp/imagen/STRATIFY/processed/psytools'
49 | PSYTOOLS_STRATIFY_PSC2_DIR = '/neurospin/imagen/STRATIFY/processed/psytools'
50 | PSYTOOLS_STRATIFY_FU_DERIVED_DIR = '/tmp/imagen/STRATIFY_FU/processed/psytools'
51 | PSYTOOLS_STRATIFY_FU_PSC2_DIR = '/neurospin/imagen/STRATIFY_FU/processed/psytools'
52 | PSYTOOLS_IMACOV19_BL_DERIVED_DIR = '/tmp/imagen/IMACOV19_BL/processed/psytools'
53 | PSYTOOLS_IMACOV19_BL_PSC2_DIR = '/neurospin/imagen/IMACOV19_BL/processed/psytools'
54 | PSYTOOLS_IMACOV19_FU_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU/processed/psytools'
55 | PSYTOOLS_IMACOV19_FU_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU/processed/psytools'
56 | PSYTOOLS_IMACOV19_FU2_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU2/processed/psytools'
57 | PSYTOOLS_IMACOV19_FU2_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU2/processed/psytools'
58 | PSYTOOLS_IMACOV19_FU3_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU3/processed/psytools'
59 | PSYTOOLS_IMACOV19_FU3_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU3/processed/psytools'
60 | PSYTOOLS_STRATICO19_BL_DERIVED_DIR = '/tmp/imagen/STRATICO19_BL/processed/psytools'
61 | PSYTOOLS_STRATICO19_BL_PSC2_DIR = '/neurospin/imagen/STRATICO19_BL/processed/psytools'
62 | PSYTOOLS_STRATICO19_FU_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU/processed/psytools'
63 | PSYTOOLS_STRATICO19_FU_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU/processed/psytools'
64 | PSYTOOLS_STRATICO19_FU2_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU2/processed/psytools'
65 | PSYTOOLS_STRATICO19_FU2_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU2/processed/psytools'
66 | PSYTOOLS_STRATICO19_FU3_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU3/processed/psytools'
67 | PSYTOOLS_STRATICO19_FU3_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU3/processed/psytools'
68 |
69 |
70 | import os
71 | from csv import DictReader
72 | from csv import DictWriter
73 | from datetime import datetime
74 | import logging
75 | logging.basicConfig(level=logging.INFO)
76 |
77 | # import ../imagen_databank
78 | import sys
79 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
80 | from imagen_databank import PSC2_FROM_PSC1
81 | from imagen_databank import DOB_FROM_PSC1
82 |
83 |
84 | def _deidentify_legacy(psc2_from_psc1, psytools_path, psc2_path):
85 | """Anonymize and re-encode a legacy Psytools questionnaire from PSC1 to PSC2.
86 |
87 | Legacy questionnaires are in long format.
88 |
89 | Parameters
90 | ----------
91 | psc2_from_psc1: map
92 | Conversion table, from PSC1 to PSC2.
93 | psytools_path: str
94 | Input: PSC1-encoded Psytools file.
95 | psc2_path: str
96 | Output: PSC2-encoded Psytools file.
97 |
98 | """
99 | with open(psytools_path, 'r') as psc1_file:
100 | psc1_reader = DictReader(psc1_file, dialect='excel')
101 |
102 | # de-identify columns with timestamps
103 | ANONYMIZED_COLUMNS = {
104 | 'Completed Timestamp': ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'),
105 | 'Processed Timestamp': ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'),
106 | }
107 | convert = [fieldname for fieldname in psc1_reader.fieldnames
108 | if fieldname in ANONYMIZED_COLUMNS]
109 |
110 | # discard other columns with dates
111 | DISCARDED_COLUMNS = {
112 | 'id_check_dob', 'id_check_gender', 'id_check_relation',
113 | # FU3 / NI DATA
114 | 'DATE_BIRTH_1', 'DATE_BIRTH_2', 'DATE_BIRTH_3',
115 | 'TEST_DATE_1', 'TEST_DATE_2', 'TEST_DATE_3'
116 | }
117 |
118 | # read/process each row and save for later writing
119 | rows = {}
120 | for row in psc1_reader:
121 | psc1, suffix = row['User code'][:12], row['User code'][12:]
122 |         if psc1 in psc2_from_psc1:
123 |             psc2 = psc2_from_psc1[psc1]
124 | if suffix in {'-C', '-P', '-I'}:
125 | # keep the suffix of Imagen subject IDs
126 | # -C Child
127 | # -P Parent
128 | # -I Institute
129 | row['User code'] = psc2 + suffix
130 | else:
131 |                 if suffix in {'FU', 'SU'}:
132 |                     # short-term decision: discard "FU" and "SU" ("SU" is the
133 |                     # ESTRA follow-up) participants from Stratify and LimeSurvey-derived files
134 |                     logging.info('discarding STRATIFY follow-up participant %s!',
135 |                                  row['User code'])
136 |                     continue
137 |                 elif suffix not in {'FU3', 'SB'}:
138 |                     # "SU" (ESTRA follow-up) is discarded above; "FU3" and
139 |                     # "SB" are the expected suffixes and are stripped below
140 |                     logging.error('unknown suffix %s in user code %s',
141 |                                   suffix, row['User code'])
142 | row['User code'] = psc2
143 | else:
144 | logging.error('unknown PSC1 code %s in user code %s',
145 | psc1, row['User code'])
146 | continue
147 |
148 | # de-identify columns with timestamps
149 | for fieldname in convert:
150 | if psc1 in DOB_FROM_PSC1:
151 | birth = DOB_FROM_PSC1[psc1]
152 | for timestamp_format in ANONYMIZED_COLUMNS[fieldname]:
153 | try:
154 | timestamp = datetime.strptime(row[fieldname],
155 | timestamp_format).date()
156 | except ValueError:
157 | continue
158 | else:
159 | age = timestamp - birth
160 | row[fieldname] = str(age.days)
161 | break
162 | else:
163 | logging.error('%s: invalid "%s": %s',
164 | psc1, fieldname, row[fieldname])
165 | row[fieldname] = None
166 | else:
167 | row[fieldname] = None
168 |
169 |             # convert date columns to age in days at that date
170 | # FU2 / ESPAD CHILD
171 | # FU2 / NI DATA
172 | for column in ('education_end', 'ni_period', 'ni_date'):
173 | if column in psc1_reader.fieldnames:
174 | if psc1 in DOB_FROM_PSC1:
175 | birth = DOB_FROM_PSC1[psc1]
176 | try:
177 | d = datetime.strptime(row[column],
178 | '%d-%m-%Y').date()
179 | except ValueError:
180 | row[column] = None
181 | else:
182 | age = d - birth
183 | row[column] = str(age.days)
184 | else:
185 | row[column] = None
186 |
187 | # convert to age of parents in days at assessment
188 | # BL/FU1 / PBQ
189 | for column in ('pbq_01', 'pbq_02'):
190 | if column in psc1_reader.fieldnames:
191 | try:
192 | birth = datetime.strptime(row[column],
193 | '%d-%m-%Y').date()
194 | except ValueError:
195 | row[column] = None
196 | else:
197 |                     # last 'timestamp' ought to be the 'Processed Timestamp'
198 | age = timestamp - birth
199 | row[column] = str(age.days)
200 |
201 | # discard other columns with dates
202 | for column in DISCARDED_COLUMNS:
203 | if column in psc1_reader.fieldnames:
204 | del row[column]
205 |
206 | rows.setdefault(psc2, []).append(row)
207 |
208 | # save rows into output file, sort by PSC2
209 | with open(psc2_path, 'w') as psc2_file:
210 | fieldnames = [fieldname for fieldname in psc1_reader.fieldnames
211 | if fieldname not in DISCARDED_COLUMNS]
212 | psc2_writer = DictWriter(psc2_file, fieldnames, dialect='excel')
213 | psc2_writer.writeheader()
214 | for psc2 in sorted(rows):
215 | for row in rows[psc2]:
216 | psc2_writer.writerow(row)
217 |
218 |
219 | def _psc1(psc1, psc2_from_psc1):
220 | if 'TEST' in psc1.upper():
221 | # skip test subjects
222 | logging.debug('skipping test subject "%s"', psc1)
223 | else:
224 |         # strip the timepoint suffix, then validate the identifier
225 |         if psc1[-3:] in {'FU2', 'FU3'}:
226 |             psc1 = psc1[:-3]
227 |         elif psc1[-2:] in {'SB'}:  # 'SU' excluded, so ESTRA follow-up acquisitions are skipped
228 | psc1 = psc1[:-2]
229 | if psc1 in psc2_from_psc1:
230 | return psc1
231 | elif psc1 in {'0x0000xxxxxx'}:
232 | logging.info('skipping known invalid subject identifier "%s"',
233 | psc1)
234 | else:
235 | logging.error('invalid subject identifier "%s"', psc1)
236 | return None
237 |
238 |
239 | def _deidentify_lsrc2(psc2_from_psc1, psytools_path, psc2_path):
240 | """Anonymize and re-encode an LSRC2 Psytools questionnaire from PSC1 to PSC2.
241 |
242 | LSRC2 questionnaires are in wide format.
243 |
244 | Parameters
245 | ----------
246 | psc2_from_psc1: map
247 | Conversion table, from PSC1 to PSC2.
248 | psytools_path: str
249 | Input: PSC1-encoded Psytools file.
250 | psc2_path: str
251 | Output: PSC2-encoded Psytools file.
252 |
253 | """
254 | COLUMNS_TO_REMOVE = {
255 | 'token',
256 | 'ipaddr',
257 | 'IdCheckGender',
258 | 'IdCheckDob',
259 | 'geoLoc_search', # Covid-19 questionnaires
260 | }
261 | COLUMNS_WITH_DATE = {
262 | 'startdate',
263 | 'datestamp',
264 | 'submitdate',
265 | }
266 |
267 | with open(psytools_path, 'r') as psc1_file:
268 | psc1_reader = DictReader(psc1_file, dialect='excel')
269 | # columns to remove entirely
270 | fieldnames = [x for x in psc1_reader.fieldnames
271 | if x not in COLUMNS_TO_REMOVE]
272 | with open(psc2_path, 'w') as psc2_file:
273 | psc2_writer = DictWriter(psc2_file, fieldnames, dialect='excel')
274 | psc2_writer.writeheader()
275 | for row in psc1_reader:
276 | # skip test and invalid subjects
277 | psc1 = _psc1(row['id'], psc2_from_psc1)
278 | if psc1:
279 | psc2 = psc2_from_psc1[psc1]
280 | # columns to remove entirely
281 | for x in COLUMNS_TO_REMOVE:
282 | if x in row:
283 | del row[x]
284 | # columns to de-identify
285 | row['id'] = psc2
286 | for x in COLUMNS_WITH_DATE:
287 | if x in row and row[x]:
288 | date = datetime.strptime(row[x],
289 | '%Y-%m-%d %H:%M:%S').date()
290 | if psc1 in DOB_FROM_PSC1:
291 | birth = DOB_FROM_PSC1[psc1]
292 | age = date - birth
293 | row[x] = age.days
294 | else:
295 | logging.error('unknown date of birth: "%s"',
296 | psc1)
297 | row[x] = None
298 | psc2_writer.writerow(row)
299 |
300 |
301 | def deidentify(psc2_from_psc1, master_dir, psc2_dir):
302 | """Anonymize and re-encode Psytools questionnaires within a directory.
303 |
304 |     PSC1-encoded files are read from `master_dir`, anonymized and converted
305 |     from PSC1 to PSC2 codes, and the result is written to `psc2_dir`.
306 |
307 | Parameters
308 | ----------
309 | psc2_from_psc1: map
310 | Conversion table, from PSC1 to PSC2.
311 | master_dir: str
312 | Input directory with PSC1-encoded questionnaires.
313 | psc2_dir: str
314 | Output directory with PSC2-encoded and anonymized questionnaires.
315 |
316 | """
317 | CURRENTLY_NOT_PROPERLY_DEIDENTIFIED = {
318 | 'IMAGEN-IMGN_RELIABILITY_PI_FU2-BASIC_DIGEST.csv',
319 | 'IMAGEN-IMGN_RELIABILITY_FU3-BASIC_DIGEST.csv',
320 | 'STRATIFY_screening_(London).csv',
321 | 'STRATIFY_screening_(Southampton).csv',
322 | 'STRATIFY_screening_(ED).csv',
323 | }
324 |
325 | for filename in os.listdir(master_dir):
326 | if filename in CURRENTLY_NOT_PROPERLY_DEIDENTIFIED:
327 | continue
328 | master_path = os.path.join(master_dir, filename)
329 | psc2_path = os.path.join(psc2_dir, filename)
330 |         if filename.startswith(('IMAGEN-', 'STRATIFY-', 'IMACOV19-', 'STRATICO19-')):
331 |             _deidentify_legacy(psc2_from_psc1, master_path, psc2_path)
332 |         elif filename.startswith(('Imagen_', 'STRATIFY_')):
333 |             _deidentify_lsrc2(psc2_from_psc1, master_path, psc2_path)
334 | else:
335 | logging.error('skipping unknown file: %s', filename)
336 |
337 |
338 | def main():
339 | # IMAGEN
340 | deidentify(PSC2_FROM_PSC1,
341 | PSYTOOLS_BL_DERIVED_DIR, PSYTOOLS_BL_PSC2_DIR)
342 | deidentify(PSC2_FROM_PSC1,
343 | PSYTOOLS_FU1_DERIVED_DIR, PSYTOOLS_FU1_PSC2_DIR)
344 | deidentify(PSC2_FROM_PSC1,
345 | PSYTOOLS_FU2_DERIVED_DIR, PSYTOOLS_FU2_PSC2_DIR)
346 | deidentify(PSC2_FROM_PSC1,
347 | PSYTOOLS_FU3_DERIVED_DIR, PSYTOOLS_FU3_PSC2_DIR)
348 | # STRATIFY/ESTRA
349 | deidentify(PSC2_FROM_PSC1,
350 | PSYTOOLS_STRATIFY_DERIVED_DIR, PSYTOOLS_STRATIFY_PSC2_DIR)
351 | #deidentify(PSC2_FROM_PSC1,
352 | # PSYTOOLS_STRATIFY_FU_DERIVED_DIR, PSYTOOLS_STRATIFY_FU_PSC2_DIR)
353 | # IMACOV
354 | deidentify(PSC2_FROM_PSC1,
355 | PSYTOOLS_IMACOV19_BL_DERIVED_DIR, PSYTOOLS_IMACOV19_BL_PSC2_DIR)
356 | deidentify(PSC2_FROM_PSC1,
357 | PSYTOOLS_IMACOV19_FU_DERIVED_DIR, PSYTOOLS_IMACOV19_FU_PSC2_DIR)
358 | deidentify(PSC2_FROM_PSC1,
359 | PSYTOOLS_IMACOV19_FU2_DERIVED_DIR, PSYTOOLS_IMACOV19_FU2_PSC2_DIR)
360 | deidentify(PSC2_FROM_PSC1,
361 | PSYTOOLS_IMACOV19_FU3_DERIVED_DIR, PSYTOOLS_IMACOV19_FU3_PSC2_DIR)
362 | # STRATICO
363 | deidentify(PSC2_FROM_PSC1,
364 | PSYTOOLS_STRATICO19_BL_DERIVED_DIR, PSYTOOLS_STRATICO19_BL_PSC2_DIR)
365 | deidentify(PSC2_FROM_PSC1,
366 | PSYTOOLS_STRATICO19_FU_DERIVED_DIR, PSYTOOLS_STRATICO19_FU_PSC2_DIR)
367 | deidentify(PSC2_FROM_PSC1,
368 | PSYTOOLS_STRATICO19_FU2_DERIVED_DIR, PSYTOOLS_STRATICO19_FU2_PSC2_DIR)
369 | deidentify(PSC2_FROM_PSC1,
370 | PSYTOOLS_STRATICO19_FU3_DERIVED_DIR, PSYTOOLS_STRATICO19_FU3_PSC2_DIR)
371 |
372 |
373 | if __name__ == "__main__":
374 | main()
375 |
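376 | # Note: de-identification replaces absolute dates with ages in days, e.g.
377 | # with (hypothetical values) a 'Completed Timestamp' of '2016-05-01 10:00:00'
378 | # and a date of birth of 2000-01-01, the value written out is
379 | # str((date(2016, 5, 1) - date(2000, 1, 1)).days), i.e. '5965'.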
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2016-2017 CEA
2 | #
3 | # This software is governed by the CeCILL license under French law and
4 | # abiding by the rules of distribution of free software. You can use,
5 | # modify and/ or redistribute the software under the terms of the CeCILL
6 | # license as circulated by CEA, CNRS and INRIA at the following URL
7 | # "http://www.cecill.info".
8 | #
9 | # As a counterpart to the access to the source code and rights to copy,
10 | # modify and redistribute granted by the license, users are provided only
11 | # with a limited warranty and the software's author, the holder of the
12 | # economic rights, and the successive licensors have only limited
13 | # liability.
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 |
29 | from setuptools import setup
30 | from imagen_databank import __version__
31 | from imagen_databank import __author__
32 | from imagen_databank import __email__
33 | from imagen_databank import __license__
34 |
35 |
36 | def readme():
37 | with open('README.rst') as f:
38 | return f.read()
39 |
40 |
41 | def license():
42 | with open('LICENSE') as f:
43 | return f.read()
44 |
45 |
46 | setup(
47 | name='imagen_databank',
48 | version=__version__,
49 | author=__author__,
50 | author_email=__email__,
51 | description='Imagen project databank software',
52 | long_description=readme(),
53 | license=__license__,
54 | url='https://github.com/imagen2/imagen_databank',
55 | packages=['imagen_databank'],
56 | scripts=[
57 | 'cantab/imagen_cantab_age_at_session_start_time.py',
58 | 'dawba/imagen_dawba_process.py',
59 | 'psytools/imagen_psytools_download_csv.py',
60 | 'psytools/imagen_psytools_download_json.py',
61 | 'psytools/imagen_psytools_deidentify_csv.py',
62 | 'psytools/imagen_psytools_deidentify_json.py',
63 | ],
64 | classifiers=[
65 | "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)",
66 | "Intended Audience :: Developers",
67 | "Intended Audience :: Science/Research",
68 | "Environment :: Console",
69 | "Development Status :: 4 - Beta",
70 | "Programming Language :: Python",
71 | "Programming Language :: Python :: 2.7",
72 | "Programming Language :: Python :: 3",
73 | "Operating System :: OS Independent",
74 | "Topic :: Scientific/Engineering :: Medical Science Apps.",
75 | "Topic :: Utilities",
76 | ],
77 | install_requires=[
78 | 'pydicom',
79 | ],
80 | )
81 |
--------------------------------------------------------------------------------
/sex/imagen_sex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import csv
5 | import logging
6 | from imagen_databank import PSC2_FROM_PSC1
7 |
8 | logging.basicConfig(level=logging.INFO)
9 |
10 | WORKER_PROCESSES = 8
11 |
12 | FU3_VALIDATION = '/neurospin/imagen/FU3/RAW/PSC1/meta_data/sex_validation_2018.csv'
13 |
14 | FEMALE = 'F'
15 | MALE = 'M'
16 |
17 |
18 | def validation_FU3(path):
19 | result = {}
20 |
21 | with open(path, newline='') as csvfile:
22 | reader = csv.reader(csvfile, delimiter=',')
23 | next(reader) # skip header
24 | for row in reader:
25 | psc1 = row[0]
26 | sex = row[1]
27 | result[psc1] = sex
28 |
29 | return result
30 |
31 |
32 | def main():
33 | # read different sources
34 | with open('imagen_sex_recruitment.csv', 'r') as f:
35 | reader = csv.DictReader(f, dialect='excel')
36 | recruitment = {row['PSC1']: row['Recruitment']
37 | for row in reader}
38 |
39 | with open('imagen_sex_dataset.csv', 'r') as f:
40 | reader = csv.DictReader(f, dialect='excel')
41 |         dataset = {row['PSC1']:
42 |                    (row.get('QualityReport.txt'),
43 |                     row.get('BL MRI'),
44 |                     row.get('BL Cantab'),
45 |                     row.get('FU2 MRI'),
46 |                     row.get('FU2 Cantab'),
47 |                     row.get('FU3 MRI'),
48 |                     row.get('FU3 Cantab'))
49 |                    for row in reader}
50 |
51 | with open('imagen_sex_psytools.csv', 'r') as f:
52 | reader = csv.DictReader(f, dialect='excel')
53 |         psytools = {row['PSC1']:
54 |                     (row.get('Psytools BL'),
55 |                      row.get('Psytools FU1'),
56 |                      row.get('Psytools FU2'),
57 |                      row.get('Psytools FU3'))
58 |                     for row in reader}
59 |
60 | with open('imagen_sex_xnat.csv', 'r') as f:
61 | reader = csv.DictReader(f, dialect='excel')
62 |         xnat = {row['PSC1']: row.get('XNAT gender')
63 |                 for row in reader}
64 |
65 | with open('imagen_sex_methylation.csv', 'r') as f:
66 | reader = csv.DictReader(f, dialect='excel')
67 |         methylation = {row['PSC1']:
68 |                        (row.get('Methylation BL'),
69 |                         row.get('Methylation FU'))
70 |                        for row in reader}
71 |
72 | validation = validation_FU3(FU3_VALIDATION)
73 |
74 | # merge sources
75 | psc1s = set()
76 | for source in (recruitment, psytools, xnat, validation, methylation):
77 | psc1s = psc1s.union(set(source.keys()))
78 |     psc1s = psc1s.intersection(set(PSC2_FROM_PSC1.keys()))  # drop codes missing from the PSC1/PSC2 table (e.g. extra rows in the LONDON recruitment file)
79 |
80 | with open('imagen_sex.csv', 'w', newline='') as csvfile:
81 | fieldnames = ['PSC1',
82 | 'Recruitment',
83 | 'QualityReport.txt', 'MRI BL', 'Cantab BL', 'MRI FU2', 'Cantab FU2', 'MRI FU3', 'Cantab FU3',
84 | 'Psytools BL', 'Psytools FU1', 'Psytools FU2', 'Psytools FU3',
85 | 'XNAT gender',
86 | '2018 validation',
87 | 'Reference',
88 | 'Methylation BL', 'Methylation FU']
89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
90 | writer.writeheader()
91 |
92 | for psc1 in sorted(psc1s):
93 | row = {}
94 | if psc1 in recruitment:
95 | row['Recruitment'] = recruitment[psc1]
96 | if psc1 in dataset:
97 | if dataset[psc1][0]:
98 | row['QualityReport.txt'] = dataset[psc1][0]
99 | if dataset[psc1][1]:
100 | row['MRI BL'] = dataset[psc1][1]
101 | if dataset[psc1][2]:
102 | row['Cantab BL'] = dataset[psc1][2]
103 | if dataset[psc1][3]:
104 | row['MRI FU2'] = dataset[psc1][3]
105 | if dataset[psc1][4]:
106 | row['Cantab FU2'] = dataset[psc1][4]
107 | if dataset[psc1][5]:
108 | row['MRI FU3'] = dataset[psc1][5]
109 | if dataset[psc1][6]:
110 | row['Cantab FU3'] = dataset[psc1][6]
111 | if psc1 in psytools:
112 | if psytools[psc1][0]:
113 | row['Psytools BL'] = psytools[psc1][0]
114 | if psytools[psc1][1]:
115 | row['Psytools FU1'] = psytools[psc1][1]
116 | if psytools[psc1][2]:
117 | row['Psytools FU2'] = psytools[psc1][2]
118 | if psytools[psc1][3]:
119 | row['Psytools FU3'] = psytools[psc1][3]
120 | if psc1 in xnat:
121 | row['XNAT gender'] = xnat[psc1]
122 | if psc1 in validation:
123 | row['2018 validation'] = validation[psc1]
124 |
125 | if psc1 in xnat and psc1 in validation:
126 | if xnat[psc1] != validation[psc1]:
127 | logging.warning('%s: changed XNAT %s into %s',
128 | psc1, xnat[psc1], validation[psc1])
129 |
130 | values = set(row.values())
131 | if len(values) > 1:
132 | if psc1 in validation:
133 | row['Reference'] = validation[psc1]
134 | elif psc1 in xnat:
135 | row['Reference'] = xnat[psc1]
136 | else:
137 | logging.warning('%s: cannot derive a reference value for sex',
138 | psc1)
139 | else:
140 | row['Reference'] = next(iter(values))
141 |
142 | if psc1 in methylation:
143 | if methylation[psc1][0]:
144 | row['Methylation BL'] = methylation[psc1][0]
145 | if methylation[psc1][1]:
146 | row['Methylation FU'] = methylation[psc1][1]
147 |
148 | row['PSC1'] = psc1
149 | writer.writerow(row)
150 |
151 |
152 | if __name__ == "__main__":
153 | main()
154 |
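155 | # Note: when the sources disagree for a subject, the 'Reference' column falls
156 | # back in order of trust: the 2018 manual validation first, then the XNAT
157 | # value; otherwise no reference sex is derived and a warning is logged.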
--------------------------------------------------------------------------------
/sex/imagen_sex_methylation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | import csv
5 | import logging
6 |
7 | logging.basicConfig(level=logging.INFO)
8 |
9 | WORKER_PROCESSES = 8
10 |
11 | METHYLATION = '/neurospin/imagen/TODO/predicted_gender.csv'
12 | PSC1_FROM_CHIP = '/neurospin/imagen/TODO/PSC1/Associated PSC1 codes.csv'
13 |
14 | FEMALE = 'F'
15 | MALE = 'M'
16 |
17 |
18 | def psc1_from_chip(path):
19 | result = {}
20 |
21 | with open(path, newline='') as csvfile:
22 | reader = csv.reader(csvfile, delimiter=',')
23 | next(reader) # skip header
24 | for row in reader:
25 | chip = row[0]
26 | psc1 = row[1]
27 | if psc1.endswith('FU'):
28 | psc1 = psc1[:-len('FU')]
29 | timepoint = 'FU2'
30 | else:
31 | timepoint = 'BL'
32 | result[chip] = (psc1, timepoint)
33 |
34 | return result
35 |
36 |
37 | def methylation_process(path, psc1_from_chip):
38 | result_BL = {}
39 | result_FU2 = {}
40 |
41 | with open(path, newline='') as csvfile:
42 | reader = csv.reader(csvfile, delimiter=',')
43 | next(reader) # skip header
44 | for row in reader:
45 | chip = row[0]
46 | sex = row[1]
47 | if sex == '1':
48 | sex = 'F'
49 | elif sex == '2':
50 | sex = 'M'
51 | else:
52 | logging.error('%s: incorrect sex (%s) in prediction CSV file: %s',
53 |                               chip, sex, path)
54 | continue
55 | if chip in psc1_from_chip:
56 | psc1, timepoint = psc1_from_chip[chip]
57 | if timepoint == 'FU2':
58 | result = result_FU2
59 | elif timepoint == 'BL':
60 | result = result_BL
61 | else:
62 |                 logging.error('%s: incorrect conversion table', chip)
63 | continue
64 | if psc1 in result:
65 | if result[psc1] != sex:
66 | logging.error('%s: inconsistent sex from methylation', psc1)
67 | result[psc1] = '?'
68 | else:
69 | result[psc1] = sex
70 |
71 | return result_BL, result_FU2
72 |
73 |
74 | def main():
75 | psc1_from_chip_table = psc1_from_chip(PSC1_FROM_CHIP)
76 | methylation_BL, methylation_FU2 = methylation_process(METHYLATION, psc1_from_chip_table)
77 | methylation = (methylation_BL, methylation_FU2)
78 |
79 | with open('imagen_sex_methylation.csv', 'w', newline='') as csvfile:
80 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
81 | sex.writerow(['PSC1',
82 | 'Methylation BL', 'Methylation FU'])
83 | psc1s = set()
84 | for timepoint in methylation:
85 | psc1s = psc1s.union(set(timepoint.keys()))
86 | for psc1 in sorted(psc1s):
87 | row = [psc1]
88 | for timepoint in methylation:
89 | if psc1 in timepoint:
90 | row.append(timepoint[psc1])
91 | else:
92 | row.append(None)
93 | sex.writerow(row)
94 |
95 |
96 | if __name__ == "__main__":
97 | main()
98 |
--------------------------------------------------------------------------------
/sex/imagen_sex_psytools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | from multiprocessing import Pool
5 | import csv
6 | from collections import Counter
7 | import logging
8 |
9 | logging.basicConfig(level=logging.INFO)
10 |
11 | BL_PSYTOOLS = '/neurospin/imagen/BL/RAW/PSC1/psytools'
12 | FU1_PSYTOOLS = '/neurospin/imagen/FU1/RAW/PSC1/psytools'
13 | FU2_PSYTOOLS = '/neurospin/imagen/FU2/RAW/PSC1/psytools'
14 | FU3_PSYTOOLS = '/neurospin/imagen/FU3/RAW/PSC1/psytools'
15 |
16 | WORKER_PROCESSES = 24
17 |
18 |
19 | FEMALE = 'F'
20 | MALE = 'M'
21 |
22 | _CSV_ID_CHECK_GENDER_MAPPING = {
23 | '1': MALE,
24 | '2': FEMALE,
25 | 'female': FEMALE,
26 | 'male': MALE,
27 | }
28 |
29 | _LSRC2_ID_CHECK_GENDER_MAPPING = {
30 | 'F': FEMALE,
31 | 'M': MALE,
32 | }
33 |
34 |
35 | def _psytools_choice(psc1, counter):
36 | female = counter[FEMALE]
37 | male = counter[MALE]
38 | total = female + male
39 | if female and male:
40 | logging.error('%s: inconsistent information about sex', psc1)
41 | return None
42 | elif female:
43 | return FEMALE
44 | elif male:
45 | return MALE
46 | else:
47 | logging.error('%s: cannot find information about sex', psc1)
48 |         return None
49 |
50 |
51 | def list_psytools_timepoint(path):
52 | """List Psytools CSV files exported from Delosis.
53 |
54 | Parameters
55 | ----------
56 | path : str
57 | Directory to read Psytools CSV files from.
58 |
59 | Yields
60 | ------
61 | str
62 | Path to Psytools CSV file.
63 |
64 | """
65 | CSV_PREFIX = ('IMAGEN-IMGN_', 'IMAGEN-cVEDA_')
66 | LSRC2_PREFIX = ('Imagen_', 'STRATIFY_')
67 |
68 | for f in os.listdir(path):
69 | root, ext = os.path.splitext(f)
70 | if ext == '.csv':
71 | if any(root.startswith(prefix) for prefix in CSV_PREFIX):
72 | yield (False, os.path.join(path, f))
73 | elif any(root.startswith(prefix) for prefix in LSRC2_PREFIX):
74 | yield (True, os.path.join(path, f))
75 | else:
76 | logging.error('skipping unknown CSV file: %s', f)
77 |
78 |
79 | def process_psytools_timepoint(arguments):
80 | (lsrc2, path) = arguments # unpack multiple arguments
81 |
82 | result = {}
83 |
84 | with open(path, 'r') as f:
85 | reader = csv.DictReader(f, dialect='excel')
86 | for row in reader:
87 | if lsrc2:
88 | psc1 = row['id']
89 | if psc1.endswith('FU3'):
90 | psc1 = psc1[:-len('FU3')]
91 | elif psc1.endswith('FU2'): # Parent questionnaires
92 | psc1 = psc1[:-len('FU2')]
93 | if psc1.isdigit() and len(psc1) == 12:
94 | if 'IdCheckGender' in row:
95 | id_check_gender = row['IdCheckGender']
96 | if id_check_gender in _LSRC2_ID_CHECK_GENDER_MAPPING:
97 | sex = _LSRC2_ID_CHECK_GENDER_MAPPING[id_check_gender]
98 | result.setdefault(psc1, []).append(sex)
99 | else:
100 | logging.error("%s: invalid 'IdCheckGender': %s",
101 | psc1, id_check_gender)
102 | else:
103 | logging.info('%s: cannot interpret as PSC1 code', psc1)
104 | else:
105 | completed = row['Completed']
106 | trial = row['Trial']
107 | if completed == 't' and trial == "id_check_gender":
108 | psc1_suffix = row['User code'].rsplit('-', 1)
109 | psc1 = psc1_suffix[0]
110 | if psc1.isdigit() and len(psc1) == 12:
111 | trial_result = row['Trial result']
112 | if trial_result in _CSV_ID_CHECK_GENDER_MAPPING:
113 | sex = _CSV_ID_CHECK_GENDER_MAPPING[trial_result]
114 | result.setdefault(psc1, []).append(sex)
115 | else:
116 | logging.error("%s: invalid 'id_check_gender': %s",
117 | psc1, trial_result)
118 | else:
119 | logging.info('%s: cannot interpret as PSC1 code', psc1)
120 |
121 | return result
122 |
123 |
124 | def _decide_from_counter(counter):
125 | female = counter[FEMALE]
126 | male = counter[MALE]
127 | total = sum(counter.values())
128 | if total:
129 | if female > male:
130 | sex = FEMALE
131 | percentage = ((200 * female) // total + 1) // 2 # closest integer percentage
132 | elif male > female:
133 | sex = MALE
134 | percentage = ((200 * male) // total + 1) // 2 # closest integer percentage
135 | else:
136 | sex = None
137 | percentage = 50
138 | else:
139 | sex = None
140 | percentage = None
141 |
142 | return sex, percentage
143 |
144 |
145 | def psytools_timepoint(path):
146 | todo_list = list(list_psytools_timepoint(path))
147 |
148 | pool = Pool(WORKER_PROCESSES)
149 | results = pool.map(process_psytools_timepoint, todo_list)
150 | pool.close()
151 | pool.join()
152 |
153 | sex_counter = {}
154 | for result in results:
155 | for psc1, sex in result.items():
156 | sex_counter.setdefault(psc1, Counter()).update(sex)
157 |
158 | return {psc1: _decide_from_counter(counter)
159 | for psc1, counter in sex_counter.items()}
160 |
161 |
162 | def main():
163 | psytools_BL = psytools_timepoint(BL_PSYTOOLS)
164 | psytools_FU1 = psytools_timepoint(FU1_PSYTOOLS)
165 | psytools_FU2 = psytools_timepoint(FU2_PSYTOOLS)
166 | psytools_FU3 = psytools_timepoint(FU3_PSYTOOLS)
167 | psytools = (psytools_BL, psytools_FU1, psytools_FU2, psytools_FU3)
168 |
169 | with open('imagen_sex_psytools.csv', 'w', newline='') as csvfile:
170 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
171 | sex.writerow(['PSC1',
172 | 'Psytools BL', 'Psytools FU1',
173 | 'Psytools FU2', 'Psytools FU3'])
174 | psc1s = set()
175 | for timepoint in psytools:
176 | psc1s = psc1s.union(set(timepoint.keys()))
177 | for psc1 in sorted(psc1s):
178 | row = [psc1]
179 | for timepoint in psytools:
180 | if psc1 in timepoint:
181 | row.append(timepoint[psc1][0])
182 | else:
183 | row.append(None)
184 | sex.writerow(row)
185 |
186 | if any(psc1 in timepoint and timepoint[psc1][1] != 100
187 | for timepoint in psytools):
188 | s = '%s: inconsistent sex:'
189 | if psc1 in psytools_BL:
190 | s += '\n\tBL: {} {}%%'.format(psytools_BL[psc1][0], psytools_BL[psc1][1])
191 | if psc1 in psytools_FU1:
192 | s += '\n\tFU1: {} {}%%'.format(psytools_FU1[psc1][0], psytools_FU1[psc1][1])
193 | if psc1 in psytools_FU2:
194 | s += '\n\tFU2: {} {}%%'.format(psytools_FU2[psc1][0], psytools_FU2[psc1][1])
195 | if psc1 in psytools_FU3:
196 | s += '\n\tFU3: {} {}%%'.format(psytools_FU3[psc1][0], psytools_FU3[psc1][1])
197 | logging.warning(s, psc1)
198 |
199 |
200 | if __name__ == "__main__":
201 | main()
202 |
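203 | # Note: _decide_from_counter() rounds to the closest integer percentage with
204 | # integer arithmetic only, e.g. (hypothetical counts) female=2, male=1 gives
205 | # ((200 * 2) // 3 + 1) // 2 = (133 + 1) // 2 = 67, i.e. 67% for 2 out of 3.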
--------------------------------------------------------------------------------
/sex/imagen_sex_recruitment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | from multiprocessing import Pool
5 | import csv
6 | from datetime import datetime
7 | import logging
8 |
9 | logging.basicConfig(level=logging.INFO)
10 |
11 | BL_RECRUITMENT_INFO = os.path.join('/neurospin/imagen/BL/RAW/PSC1/recruitment')
12 |
13 | WORKER_PROCESSES = 16
14 |
15 |
16 | FEMALE = 'F'
17 | MALE = 'M'
18 |
19 | _RECRUITMENT_SEX_MAPPING = {
20 | 'f': FEMALE,
21 | 'F': FEMALE,
22 | 'm': MALE,
23 | 'M': MALE,
24 | 'w': FEMALE,
25 | }
26 |
27 | _RECRUITMENT_SEX_VOID = {
28 | '',
29 | '0',
30 | '0.0',
31 | 'Test',
32 | 'not known',
33 | }
34 |
35 |
36 | def _recruitment_center(s):
37 | s = s.strip()
38 |
39 | if set(s).issubset('12345678.0'):
40 | if '.' in s:
41 | try:
42 | s = float(s)
43 | except ValueError:
44 | logging.info('%s: cannot interpret as center code', s)
45 | return None
46 | else:
47 | s = str(int(s // 1)) # integral part
48 | if len(s) == 1:
49 | return s
50 | else:
51 | logging.error('%s: incorrect center code', s)
52 | else:
53 | logging.debug('%s: skipping center code', s)
54 |
55 | return None
56 |
57 |
58 | def _recruitment_psc1(s, center):
59 | s = s.strip()
60 |
61 | if s.isdigit():
62 | if len(s) < 7:
63 | s = '0' + center + s.zfill(10)
64 | if len(s) == 12:
65 | return s
66 | else:
67 | logging.error('%s: incorrect PSC1 code', s)
68 | elif s:
69 |         logging.warning('%s: cannot interpret as PSC1 code', s)
70 | else:
71 | logging.debug('empty PSC1 code')
72 |
73 | return None
74 |
75 |
76 | def _recruitment_choice(psc1, timestamps):
77 |     from collections import Counter  # not imported at module level
78 |     # use data with the most recent time stamp
79 |     counter = Counter(timestamps[max(timestamps.keys())])
80 |     female = counter[FEMALE]
81 |     male = counter[MALE]
82 |     if female and male:
83 |         logging.error('%s: inconsistent information about sex', psc1)
84 |         return None
85 |     elif female:
86 |         return FEMALE
87 |     elif male:
88 |         return MALE
89 |     else:
90 |         logging.error('%s: cannot find information about sex', psc1)
91 |         return None
92 |
93 |
94 | def list_recruitment_BL(path):
95 | """List recruitment CSV files sent by recruitment centres.
96 |
97 | Parameters
98 | ----------
99 | path : str
100 | Directory to read CSV recruitment files from.
101 |
102 | Yields
103 | ------
104 | str
105 | Path to CSV file.
106 |
107 | """
108 | for f in os.listdir(path):
109 | root, ext = os.path.splitext(f)
110 | if ext == '.csv':
111 | yield os.path.join(path, f)
112 |
113 |
114 | def process_recruitment_BL(path):
115 | timestamp = os.path.getmtime(path)
116 |
117 | recruitment_sex = {}
118 |
119 | with open(path, encoding='latin1', newline='') as csvfile:
120 | recruitment = csv.reader(csvfile, delimiter=',')
121 | for row in recruitment:
122 | center = _recruitment_center(row[0])
123 | if center:
124 | psc1 = _recruitment_psc1(row[1], center)
125 | if psc1:
126 | gender = row[2].strip()
127 | if gender in _RECRUITMENT_SEX_MAPPING:
128 | sex = _RECRUITMENT_SEX_MAPPING[gender]
129 | if psc1 in recruitment_sex:
130 | if recruitment_sex[psc1] != sex:
131 | logging.error('%s: inconsistent duplicate line',
132 | psc1)
133 | else:
134 | logging.error('%s: duplicate line',
135 | psc1)
136 | else:
137 | recruitment_sex[psc1] = sex
138 | elif gender not in _RECRUITMENT_SEX_VOID:
139 | logging.error("%s: incorrect 'gender': %s",
140 | psc1, gender)
141 |
142 | return timestamp, recruitment_sex
143 |
144 |
145 | def recruitment_BL(path):
146 | """Process CSV recruitment files sent by recruitment centres at baseline.
147 |
148 | First list the files to process, then read these files in parallel.
149 |
150 | Parameters
151 | ----------
152 | path : str
153 | Directory to read CSV recruitment files from.
154 |
155 | Returns
156 | -------
157 | dict
158 |         Key is PSC1 and value the sex found in recruitment files.
159 |
160 | """
161 | todo_list = list(list_recruitment_BL(path))
162 |
163 | pool = Pool(WORKER_PROCESSES)
164 | results = pool.map(process_recruitment_BL, todo_list)
165 | pool.close()
166 | pool.join()
167 |
168 | sex_by_timestamp = {}
169 | for timestamp, result in results:
170 | for psc1, sex in result.items():
171 |             sex_by_timestamp.setdefault(psc1, {})[timestamp] = sex
172 |
173 | recruitment_sex = {}
174 | for psc1, timestamps in sex_by_timestamp.items():
175 | max_timestamp = max(timestamps)
176 | sex = timestamps[max_timestamp]
177 | for k, v in timestamps.items():
178 | if v != sex:
179 | logging.error("%s: inconsistent 'gender' across time stamps\n"
180 | '\t%s: %s\n'
181 | '\t%s: %s',
182 | psc1,
183 | datetime.fromtimestamp(k).date(), v,
184 | datetime.fromtimestamp(max_timestamp).date(), sex)
185 | recruitment_sex[psc1] = sex
186 |
187 | return recruitment_sex
188 |
189 |
190 | def main():
191 | recruitment = recruitment_BL(BL_RECRUITMENT_INFO)
192 |
193 | with open('imagen_sex_recruitment.csv', 'w', newline='') as csvfile:
194 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
195 | sex.writerow(['PSC1', 'Recruitment'])
196 | for psc1 in sorted(recruitment):
197 | row = [psc1]
198 | row.append(recruitment[psc1])
199 | sex.writerow(row)
200 |
201 |
202 | if __name__ == "__main__":
203 | main()
204 |
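205 | # Note: _recruitment_psc1() rebuilds short PSC1 codes by prepending '0' and
206 | # the centre digit to the zero-padded number, e.g. (hypothetical values)
207 | # s='1234' at centre '8' gives '0' + '8' + '0000001234' = '080000001234'.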
--------------------------------------------------------------------------------
/sex/imagen_sex_xnat.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | from multiprocessing import Pool
5 | from xml.etree import ElementTree
6 | from imagen_databank import PSC1_FROM_PSC2
7 | import csv
8 | import logging
9 |
10 | logging.basicConfig(level=logging.INFO)
11 |
12 | BL_XNAT = '/neurospin/imagen/export/xml'
13 |
14 | WORKER_PROCESSES = 16
15 |
16 |
17 | FEMALE = 'F'
18 | MALE = 'M'
19 |
20 | _XNAT_GENDER_MAPPING = {
21 | 'female': FEMALE,
22 | 'male': MALE,
23 | }
24 |
25 | _XNAT_EXPERIMENT_GENDER_MAPPING = {
26 | 'f': FEMALE,
27 | 'F': FEMALE,
28 | 'm': MALE,
29 | 'M': MALE,
30 | 'w': FEMALE,
31 | 'female': FEMALE, # single occurrence!
32 | }
33 |
34 | _XNAT_EXPERIMENT_GENDER_VOID = {
35 | '0',
36 | 'Test',
37 | 'not known',
38 | }
39 |
40 |
41 | def list_xnat_BL(path):
42 | """List XML files exported from XNAT.
43 |
44 | Yields only files with standard names:
45 |     IMAGEN_<PSC2>.xml
46 |
47 | Parameters
48 | ----------
49 | path : str
50 | Directory to read XML files from.
51 |
52 | Yields
53 | ------
54 | tuple of str
55 | Yields a pair (psc2, path).
56 |
57 | """
58 | for f in os.listdir(path):
59 | root, ext = os.path.splitext(f)
60 | if ext == '.xml':
61 | PREFIX = 'IMAGEN_'
62 | if root.startswith(PREFIX):
63 | psc2 = root[len(PREFIX):]
64 | logging.debug('%s: found XML file: %s', psc2, f)
65 | assert(psc2.isdigit() and len(psc2) == 12)
66 | yield (psc2, os.path.join(path, f))
67 | else:
68 | logging.error('unexpected XML file: %s', f)
69 | else:
70 | logging.debug('skipping non-XML file: %s', f)
71 |
72 |
73 | def process_xnat_BL(arguments):
74 | """Read subject sex from XML file exported from XNAT.
75 |
76 | Looks for this information in two distinct places.
77 |
78 | Parameters
79 | ----------
80 | arguments : tuple of str
81 | Expects a pair (psc2, path)
82 |
83 | Returns
84 | -------
85 | tuple of str
86 |         A pair (xnat_sex, xnat_experiment_sex).
87 |
88 | """
89 | (psc2, path) = arguments # unpack multiple arguments
90 |
91 | tree = ElementTree.parse(path)
92 | root = tree.getroot()
93 |
94 | xnat_sex = None
95 | xnat_gender = root.find('.//{http://nrg.wustl.edu/xnat}gender')
96 | if xnat_gender is None:
97 |         logging.warning("%s: missing 'gender' in XML file", psc2)
98 | else:
99 | xnat_gender = xnat_gender.text
100 | if xnat_gender in _XNAT_GENDER_MAPPING:
101 | xnat_sex = _XNAT_GENDER_MAPPING[xnat_gender]
102 | else:
103 | logging.error("%s: incorrect 'gender' (%s) in XML file",
104 | psc2, xnat_gender)
105 |
106 | xnat_experiment_sex = None
107 | xnat_experiment_gender = root.find('.//{http://nrg.wustl.edu/xnat}experiment[@gender]')
108 | if xnat_experiment_gender is None:
109 |         logging.warning("%s: missing 'experiment[@gender]' in XML file", psc2)
110 | else:
111 | xnat_experiment_gender = xnat_experiment_gender.attrib['gender']
112 | xnat_experiment_gender = xnat_experiment_gender.strip()
113 | if xnat_experiment_gender in _XNAT_EXPERIMENT_GENDER_MAPPING:
114 | xnat_experiment_sex = _XNAT_EXPERIMENT_GENDER_MAPPING[xnat_experiment_gender]
115 | elif xnat_experiment_gender not in _XNAT_EXPERIMENT_GENDER_VOID:
116 | logging.error("%s: incorrect 'experiment[@gender]' (%s) in XML file",
117 | psc2, xnat_experiment_gender)
118 |
119 | return xnat_sex, xnat_experiment_sex
120 |
121 |
122 | def xnat_BL(path):
123 | """Process XML files exported from XNAT.
124 |
125 | First list the files to process, then read these files in parallel.
126 |
127 | Parameters
128 | ----------
129 | path : str
130 | Directory to read XML files from.
131 |
132 | Returns
133 | -------
134 | dict
135 | Key is PSC2 and value a pair (xnat_sex, xnat_experiment_sex).
136 |
137 | """
138 |     todo_list = list(list_xnat_BL(path))
139 |
140 | pool = Pool(WORKER_PROCESSES)
141 | results = pool.map(process_xnat_BL, todo_list)
142 | pool.close()
143 | pool.join()
144 |
145 |     psc2, _ = zip(*todo_list)  # PSC2 codes, in the same order as results
146 |     return dict(zip(psc2, results))
147 |
148 |
149 | def main():
150 | xnat = xnat_BL(BL_XNAT)
151 |
152 | xnat = {PSC1_FROM_PSC2[psc2]: v for psc2, v in xnat.items()}
153 |
154 | with open('imagen_sex_xnat.csv', 'w', newline='') as csvfile:
155 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
156 | sex.writerow(['PSC1',
157 | 'XNAT gender'])
158 | for psc1 in sorted(xnat):
159 | row = [psc1]
160 | if xnat[psc1][0] and xnat[psc1][1]:
161 | if xnat[psc1][0] != xnat[psc1][1]:
162 | logging.error("%s: inconsistent 'gender' (%s) / 'experiment@gender' (%s)",
163 | psc1, xnat[psc1][0], xnat[psc1][1])
164 | row.append('?')
165 | else:
166 | row.append(xnat[psc1][0])
167 | elif xnat[psc1][0]:
168 | row.append(xnat[psc1][0])
169 | elif xnat[psc1][1]:
170 | row.append(xnat[psc1][1])
171 | else:
172 | row.append(None)
173 | sex.writerow(row)
174 |
175 |
176 | if __name__ == "__main__":
177 | main()
178 |
--------------------------------------------------------------------------------
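Note: ``imagen_sex_xnat.py`` reads sex from two distinct places in each XNAT
export: a subject-level gender element and a gender attribute on the
experiment element, both in the http://nrg.wustl.edu/xnat namespace. A sketch
of the two ElementTree lookups on a hypothetical, minimal export (the element
layout is an assumption for illustration)::

    from xml.etree import ElementTree

    XNAT = '{http://nrg.wustl.edu/xnat}'
    XML = ('<subject xmlns="http://nrg.wustl.edu/xnat">'
           '<demographics><gender>female</gender></demographics>'
           '<experiments><experiment gender="f"/></experiments>'
           '</subject>')

    root = ElementTree.fromstring(XML)
    gender = root.find('.//' + XNAT + 'gender')                   # subject level
    experiment = root.find('.//' + XNAT + 'experiment[@gender]')  # session level
    print(gender.text, experiment.attrib['gender'])               # female f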
/stratify_demographics/demographics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 |
4 | import os
5 | from csv import reader
6 | from csv import DictWriter
7 | import xlrd
8 | from imagen_databank import PSC2_FROM_PSC1, CENTER_NAME
9 |
10 | import logging
11 | logging.basicConfig(level=logging.ERROR)
12 |
13 |
14 | _DEBUG_PSYTOOLS_SEX = '/imagen/STRATIFY/RAW/PSC1/meta_data/STRATIFY_SEX_2024-10-17.txt'
15 |
16 | _DEMOGRAPHIC_RECORDS_DIR = '/imagen/STRATIFY/RAW/PSC1/meta_data'
17 | _DEMOGRAPHIC_RECORDS = [
18 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_SOUTHAMPTON_2024-10-16.xlsx'),
19 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_LONDON_2024-03-14.xlsx'),
20 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'ESTRA_recruitment_file_LONDON_2024-08-16.xlsx'),
21 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'ESTRA_recruitment_file_LONDON_CONTROLS_2023-07-24.xlsx'),
22 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_BERLIN_2024-10-16.xlsx'),
23 | ]
24 |
25 | _FINAL_COLUMNS = (
26 | 'PSC2',
27 | 'sex',
28 | 'recruitment site',
29 | 'scanning site',
30 | 'patient group',
31 | 'complete',
32 | #'missing data',
33 | )
34 |
35 | _DEMOGRAPHIC_COLUMNS = {
36 | # handle separately 'PSC1 Code'
37 | # Stratify
38 | 'Sex': _FINAL_COLUMNS[1],
39 | 'Acquisition Centre (and Scanning Site)': _FINAL_COLUMNS[3],
40 | 'Acquisition Centre': _FINAL_COLUMNS[3],
41 | 'Patient Group': _FINAL_COLUMNS[4],
42 | 'Fully Complete? Y/N': _FINAL_COLUMNS[5],
43 | #'Missing Data (Please Specify)': _FINAL_COLUMNS[6],
44 | # ESTRA
45 | # (skip 'Recruitment Centre')
46 | 'Scanning Site': _FINAL_COLUMNS[3],
47 | 'Gender ': _FINAL_COLUMNS[1],
48 | 'Diagnosis ': _FINAL_COLUMNS[4],
49 | 'Diagnosis': _FINAL_COLUMNS[4],
50 | # Stratify 20 additional controls
51 | 'Site': _FINAL_COLUMNS[3],
52 | 'Group': _FINAL_COLUMNS[4],
53 | 'Gender': _FINAL_COLUMNS[1],
54 | # LONDON CONTROLS
55 |
56 | # BERLIN
57 | 'sex': _FINAL_COLUMNS[1],
58 | 'scanning site': _FINAL_COLUMNS[3],
59 | 'patient group': _FINAL_COLUMNS[4],
60 | 'complete': _FINAL_COLUMNS[5],
61 | #'missing data': _FINAL_COLUMNS[6],
62 | }
63 |
64 | _CONTROL_GROUP = 'Control'
65 | _CONTROL_GROUP_ESTRA = 'Control_ESTRA'
66 | _ADHD_GROUP = 'ADHD'
67 | _AUD_GROUP = 'AUD'
68 | _AN_GROUP = 'AN'
69 | _RECAN_GROUP = 'recAN'
70 | _BN_GROUP = 'BN'
71 | _RECBN_GROUP = 'recBN'
72 | _MDD_GROUP = 'MDD'
73 | _PSYCHOSIS_GROUP = 'Psychosis'
74 | _BED_GROUP = 'BED'
75 |
76 | _PATIENT_GROUPS = {
77 | _CONTROL_GROUP,
78 | _CONTROL_GROUP_ESTRA,
79 | _ADHD_GROUP,
80 | _AUD_GROUP,
81 | _AN_GROUP,
82 | _RECAN_GROUP,
83 | _BN_GROUP,
84 | _RECBN_GROUP,
85 | _MDD_GROUP,
86 | _PSYCHOSIS_GROUP,
87 | _BED_GROUP,
88 | }
89 |
90 |
91 | def normalize_patient_group(s):
92 | table = {
93 | 'control': _CONTROL_GROUP,
94 | 'Control_ESTRA': _CONTROL_GROUP_ESTRA,
95 | 'depression': _MDD_GROUP,
96 | 'psychosis': _PSYCHOSIS_GROUP,
97 | 'Alcohol Use Disorder': _AUD_GROUP,
98 | 'Major Depressive Disorder': _MDD_GROUP,
99 | 'Healthy Control': _CONTROL_GROUP,
100 |
101 | }
102 | if s in table:
103 | s = table[s]
104 |
105 | return s
106 |
107 |
108 | def normalize_scanning_site(s):
109 | table = {
110 | # LONDON: 'CNS' or 'Invicro'
111 | 'KCL': 'CNS',
112 | 'Denmark Hill': 'CNS',
113 | # SOUTHAMPTON
114 | 'Southampton': None,
115 | # BERLIN
116 | 'BERLIN': None,
117 | }
118 | if s in table:
119 | s = table[s]
120 |
121 | return s
122 |
123 |
124 | def normalize_sex(s):
125 | s = s.upper()
126 |
127 | table = {
128 | 'FEMALE': 'F',
129 | 'MALE': 'M',
130 | }
131 | if s in table:
132 | s = table[s]
133 |
134 | return s
135 |
136 |
137 | def strip_cell(s):
138 | try:
139 | s = s.strip()
140 | except AttributeError: # floats and other types
141 | pass
142 | return s
143 |
144 |
145 | def read_demographic_record(path):
146 | demographics = {}
147 |
148 | with xlrd.open_workbook(path) as workbook:
149 | worksheet = workbook.sheet_by_index(0)
150 |
151 | # read header
152 | psc1_index = None
153 | index = {}
154 | row = [strip_cell(x) for x in worksheet.row_values(0)]
155 | print(path)
156 | for i, value in enumerate(row):
157 | if value in _DEMOGRAPHIC_COLUMNS:
158 | index[_DEMOGRAPHIC_COLUMNS[value]] = i
159 | print(i, value, '→', _DEMOGRAPHIC_COLUMNS[value])
160 | elif value == 'PSC1 Code' or value == 'PSC1':
161 | psc1_index = i
162 | else:
163 | print(i, value, '→', '?????')
164 |
165 | if psc1_index is None:
166 | logging.error('%s: cannot find PSC1 code', path)
167 | return demographics
168 |
169 | # read data
170 | for i in range(1, worksheet.nrows):
171 | row = [strip_cell(x) for x in worksheet.row_values(i)]
172 |
173 | psc1 = row[psc1_index]
174 | psc1 = psc1[:12] # remove trailing FU3 or SB
175 | if psc1 not in PSC2_FROM_PSC1:
176 | logging.error('%s: invalid PSC1 code', psc1)
177 | continue
178 |
179 | demographics[psc1] = {}
180 |
181 |             for name, col in index.items():  # 'col' avoids shadowing the row index i
182 |                 value = row[col]
183 | if name == 'sex':
184 | value = normalize_sex(value)
185 | if value not in {'F', 'M'}:
186 | logging.error('%s: invalid sex: %s', psc1, value)
187 | continue
188 | elif name == 'patient group':
189 | value = normalize_patient_group(value)
190 | if value not in _PATIENT_GROUPS:
191 | logging.error('%s: invalid patient group: %s',
192 | psc1, value)
193 | continue
194 | elif name == 'scanning site':
195 | value = normalize_scanning_site(value)
196 | elif name == 'complete':
197 | if value not in {'Y', 'N', ''}:
198 | logging.error('%s: invalid completeness: %s',
199 | psc1, value)
200 | continue
201 | elif name == 'missing data':
202 | value = value.rstrip(',.')
203 | if value.lower() == 'none':
204 | value = None
205 | demographics[psc1][name] = value
206 |
207 | return demographics
208 |
209 |
210 | def read_demographic_records(paths):
211 | demographic_records = {}
212 |
213 | for path in paths:
214 | demographic_records.update(read_demographic_record(path))
215 |
216 | return demographic_records
217 |
218 |
219 | def main():
220 | demographics = read_demographic_records(_DEMOGRAPHIC_RECORDS)
221 |
222 | with open(_DEBUG_PSYTOOLS_SEX, 'r') as sex_file:
223 | sex_reader = reader(sex_file, dialect='excel')
224 |
225 |         with open('STRATIFY_participants.csv', 'w', newline='') as demographics_file:
226 | demographics_writer = DictWriter(demographics_file,
227 | _FINAL_COLUMNS,
228 | dialect='excel')
229 | demographics_writer.writeheader()
230 | for row in sex_reader:
231 | psc1 = row[0]
232 | psc2 = PSC2_FROM_PSC1[psc1]
233 |                 center = int(psc1[1])
234 |                 if center > 8:  # centre codes above 8 are encoded on two digits
235 |                     center = int(psc1[1:3])
236 | center = CENTER_NAME[center]
237 | sex = row[1]
238 | if psc1 in demographics:
239 | data = demographics[psc1]
240 | data['PSC2'] = psc2
241 | data['recruitment site'] = center
242 | if 'sex' in data:
243 | if data['sex'] != sex:
244 | logging.error('%s: inconsistent sex between Psytools and recruitment file', psc1)
245 |                     data['sex'] = sex  # the validated Psytools value takes precedence
246 | else:
247 | data = {
248 | 'PSC2': psc2,
249 | 'sex': sex,
250 | 'recruitment site': center,
251 | }
252 |                 row = {x: data.get(x)
253 |                        for x in _FINAL_COLUMNS}
254 | demographics_writer.writerow(row)
255 |
256 |
257 | if __name__ == "__main__":
258 | main()
259 |
--------------------------------------------------------------------------------
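Note: ``demographics.py`` opens the .xlsx recruitment files with xlrd, which
requires xlrd < 2.0 (support for .xlsx workbooks was removed in xlrd 2.0).
A minimal sketch of the same first-sheet scan written against openpyxl
instead, assuming openpyxl is available::

    from openpyxl import load_workbook

    def first_sheet_rows(path):
        """Yield rows of the first worksheet, with string cells stripped."""
        workbook = load_workbook(path, read_only=True)
        worksheet = workbook.worksheets[0]
        for row in worksheet.iter_rows(values_only=True):
            yield [x.strip() if isinstance(x, str) else x for x in row]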
/stratify_demographics/stratify_debug_psytools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import os
4 | from multiprocessing import Pool
5 | import csv
6 | from datetime import datetime, date
7 | from collections import Counter
8 | import logging
9 |
10 | logging.basicConfig(level=logging.INFO)
11 |
12 | STRATIFY_PSYTOOLS = '/neurospin/imagen/STRATIFY/RAW/PSC1/psytools'
13 | STRATIFY_DOB = '/neurospin/imagen/STRATIFY/RAW/PSC1/meta_data/dob_validation.csv'
14 | STRATIFY_SEX = '/neurospin/imagen/STRATIFY/RAW/PSC1/meta_data/sex_validation.csv'
15 |
16 | WORKER_PROCESSES = 24
17 |
18 |
19 | FEMALE = 'F'
20 | MALE = 'M'
21 |
22 | _CSV_ID_CHECK_GENDER_MAPPING = {
23 | '1': MALE,
24 | '2': FEMALE,
25 | 'female': FEMALE,
26 | 'male': MALE,
27 | }
28 |
29 | _LSRC2_ID_CHECK_GENDER_MAPPING = {
30 | 'F': FEMALE,
31 | 'M': MALE,
32 | }
33 |
34 | _CANTAB_GENDER_MAPPING = {
35 | 'Female': FEMALE,
36 | 'Male': MALE,
37 | }
38 |
39 |
40 | def list_psytools_timepoint(path):
41 | """List Psytools CSV files exported from Delosis.
42 |
43 | Parameters
44 | ----------
45 | path : str
46 | Directory to read Psytools CSV files from.
47 |
48 | Yields
49 | ------
50 |     tuple
51 |         Triple (lsrc2, path, name); lsrc2 is True for LSRC2 exports.
52 |
53 | """
54 | CSV_PREFIX = ('IMAGEN-', 'STRATIFY-')
55 | LSRC2_PREFIX = ('Imagen_', 'STRATIFY_Core') # exclude STRATIFY_Screening
56 |
57 | for f in os.listdir(path):
58 | root, ext = os.path.splitext(f)
59 | if ext == '.csv':
60 | if any(root.startswith(prefix) for prefix in CSV_PREFIX):
61 | yield (False, os.path.join(path, f), root)
62 | elif any(root.startswith(prefix) for prefix in LSRC2_PREFIX):
63 | yield (True, os.path.join(path, f), root)
64 | else:
65 | logging.error('skipping unknown CSV file: %s', f)
66 |
67 |
68 | def process_psytools_timepoint(arguments):
69 | (lsrc2, path, name) = arguments # unpack multiple arguments
70 |
71 | sex_counter = {}
72 | dob_counter = {}
73 |
74 | with open(path, 'r') as f:
75 | reader = csv.DictReader(f, dialect='excel')
76 | for row in reader:
77 | if lsrc2:
78 | psc1 = row['id']
79 | if psc1.endswith('SB'):
80 | psc1 = psc1[:-len('SB')]
81 | if psc1.endswith('FU'):
82 | psc1 = psc1[:-len('FU')]
83 | if psc1.isdigit() and len(psc1) == 12:
84 | if 'IdCheckGender' in row:
85 | id_check_gender = row['IdCheckGender']
86 | if id_check_gender in _LSRC2_ID_CHECK_GENDER_MAPPING:
87 | id_check_gender = _LSRC2_ID_CHECK_GENDER_MAPPING[id_check_gender]
88 | sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update(('IdCheckGender',))
89 | elif id_check_gender:
90 | logging.error("%s: %s: invalid 'IdCheckGender': %s",
91 | name, psc1, id_check_gender)
92 | else:
93 | logging.debug("%s: %s: empty 'IdCheckGender': %s",
94 | name, psc1, id_check_gender)
95 | if 'IdCheckDob' in row:
96 | id_check_dob = row['IdCheckDob']
97 | try:
98 | id_check_dob = datetime.strptime(id_check_dob, '%Y-%m-%d %H:%M:%S')
99 |                         except ValueError:
100 | if id_check_dob:
101 | logging.error("%s: %s: invalid 'IdCheckDob': %s",
102 | name, psc1, id_check_dob)
103 | else:
104 | logging.debug("%s: %s: empty 'IdCheckDob': %s",
105 | name, psc1, id_check_dob)
106 | else:
107 | id_check_dob = id_check_dob.date()
108 | if id_check_dob.year > 2012 or id_check_dob.year < 1990:
109 | logging.error("%s: %s: skip 'IdCheckDob': %d",
110 | name, psc1, id_check_dob.year)
111 | else:
112 | dob_counter.setdefault(psc1, {}).setdefault(id_check_dob, Counter()).update(('IdCheckDob',))
113 | else:
114 | logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
115 | else:
116 | psc1_suffix = row['User code'].rsplit('-', 1)
117 | psc1 = psc1_suffix[0]
118 | if psc1.endswith('SB'):
119 | psc1 = psc1[:-len('SB')]
120 | completed = row['Completed']
121 | if completed == 't':
122 | trial = row['Trial']
123 | if trial == 'id_check_gender':
124 | if psc1.isdigit() and len(psc1) == 12:
125 | trial_result = row['Trial result']
126 | if trial_result in _CSV_ID_CHECK_GENDER_MAPPING:
127 | id_check_gender = _CSV_ID_CHECK_GENDER_MAPPING[trial_result]
128 | sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update((trial,))
129 | else:
130 | logging.error("%s: %s: invalid 'id_check_gender': %s",
131 | name, psc1, trial_result)
132 | else:
133 | logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
134 | elif trial == 'ni_gender':
135 | if psc1.isdigit() and len(psc1) == 12:
136 | trial_result = row['Trial result']
137 | if trial_result in _LSRC2_ID_CHECK_GENDER_MAPPING:
138 | id_check_gender = _LSRC2_ID_CHECK_GENDER_MAPPING[trial_result]
139 | sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update((trial,))
140 | else:
141 | logging.error("%s: %s: invalid 'ni_gender': %s",
142 | name, psc1, trial_result)
143 | else:
144 | logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
145 | elif trial == 'id_check_dob':
146 | if psc1.isdigit() and len(psc1) == 12:
147 | trial_result = row['Trial result']
148 | try:
149 | month, year = trial_result.rsplit('_')
150 | month = int(month)
151 | year = int(year)
152 |                             except ValueError:
153 |                                 logging.error("%s: invalid 'id_check_dob': %s",
154 |                                               psc1, trial_result)
155 | else:
156 | if year > 2012 or year < 1990:
157 | logging.error("%s: skip 'id_check_dob': %d",
158 | psc1, year)
159 | else:
160 | dob_counter.setdefault(psc1, {}).setdefault((year, month), Counter()).update((trial,))
161 | else:
162 | logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
163 |
164 | return sex_counter, dob_counter
165 |
166 |
167 | def psytools_timepoint(path):
168 | todo_list = list(list_psytools_timepoint(path))
169 |
170 | pool = Pool(WORKER_PROCESSES)
171 | results = pool.map(process_psytools_timepoint, todo_list)
172 | pool.close()
173 | pool.join()
174 |
175 | sex = {}
176 | dob = {}
177 | for (sex_counter, dob_counter), (lsrc2, path, name) in zip(results, todo_list):
178 | for psc1, values in sex_counter.items():
179 | for value, variables in values.items():
180 | for variable, count in variables.items():
181 | sex.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update({name: count})
182 | for psc1, values in dob_counter.items():
183 | for value, variables in values.items():
184 | for variable, count in variables.items():
185 | dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update({name: count})
186 |
187 | clean_dob = {}
188 | for psc1, values in dob.items():
189 | exact_dates = set()
190 | for value, variables in values.items():
191 |             if isinstance(value, date):
192 | for variable, counter in variables.items():
193 | exact_dates.add(value)
194 | clean_dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update(counter)
195 | for value, variables in values.items():
196 |             if isinstance(value, tuple):
197 | year, month = value
198 | for variable, counter in variables.items():
199 | for d in exact_dates:
200 | if d.year == year and d.month == month:
201 | clean_dob.setdefault(psc1, {}).setdefault(d, {}).setdefault(variable, Counter()).update(counter)
202 | break
203 | else:
204 | clean_dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update(counter)
205 |
206 | return sex, clean_dob
207 |
208 |
209 | def cantab_timepoint(path):
210 | sex = {}
211 | for center in os.listdir(path):
212 | center_path = os.path.join(path, center)
213 | if os.path.isdir(center_path):
214 | for psc1 in os.listdir(center_path):
215 | psc1_path = os.path.join(center_path, psc1)
216 | if os.path.isdir(psc1_path):
217 | if psc1.isdigit() and len(psc1) == 12:
218 | additional_data_path = os.path.join(psc1_path, 'AdditionalData')
219 | for f in os.listdir(additional_data_path):
220 | if f.startswith('datasheet_'):
221 | if f == ('datasheet_' + psc1 + 'SB.csv'):
222 | f_path = os.path.join(additional_data_path, f)
223 | with open(f_path, newline='') as csvfile:
224 | reader = csv.DictReader(csvfile)
225 |                                         # fall back to ';' when the ',' dialect lacks a 'Gender' column
226 |                                         if 'Gender' not in (reader.fieldnames or ()):
227 |                                             csvfile.seek(0)
228 |                                             reader = csv.DictReader(csvfile, delimiter=';')
229 |                                             if 'Gender' not in (reader.fieldnames or ()):
230 |                                                 logging.error('bad cantab datasheet for %s', psc1)
231 |                                                 continue
232 | 
233 |                                         for row in reader:
234 | if 'Gender' in row:
235 | if row['Gender']:
236 | sex[psc1] = _CANTAB_GENDER_MAPPING[row['Gender']]
237 | else:
238 | logging.warning('%s: missing Gender value: %s', psc1, f)
239 | else:
240 | logging.warning('%s: missing Gender column (%s): %s', psc1, reader.fieldnames, f)
241 | else:
242 | logging.error('%s: incorrect file name: %s', psc1, f)
243 | else:
244 | logging.info('%s: not a directory', psc1)
245 | else:
246 | logging.debug('%s: not a PSC1 code', psc1)
247 |
248 | return sex
249 |
250 |
251 | def main():
252 | sex, dob = psytools_timepoint(STRATIFY_PSYTOOLS)
253 | cantab_sex = cantab_timepoint('/neurospin/imagen/STRATIFY/RAW/PSC1')
254 |
255 | validated_dob = {}
256 | with open(STRATIFY_DOB, 'r') as f:
257 | reader = csv.reader(f, dialect='excel')
258 | for row in reader:
259 | validated_dob[row[0]] = datetime.strptime(row[1], '%Y-%m-%d').date()
260 |
261 | validated_sex = {}
262 | with open(STRATIFY_SEX, 'r') as f:
263 | reader = csv.reader(f, dialect='excel')
264 | for row in reader:
265 | validated_sex[row[0]] = row[1]
266 |
267 | for psc1 in cantab_sex:
268 | if psc1 in sex:
269 | sex[psc1].setdefault(cantab_sex[psc1], {}).setdefault('Gender', Counter()).update({'datasheet_' + psc1 + 'SB': 1})
270 | else:
271 | logging.error('%s: found in Cantab but missing from Psytools', psc1)
272 |
273 | today = datetime.today()
274 |
275 | with open('STRATIFY_SEX_' + today.strftime('%Y-%m-%d') + '.txt', 'w') as f:
276 | for psc1, values in sex.items():
277 | if psc1 in validated_sex:
278 | print(','.join((psc1, validated_sex[psc1])), file=f)
279 | elif len(values) > 1:
280 | message = '{}: multiple sex values:\n'.format(psc1)
281 | for value, variables in values.items():
282 | count_value = 0
283 | message_variable = ''
284 | for variable, counters in variables.items():
285 | count_variable = 0
286 | message_name = ''
287 | for name, count in counters.items():
288 |                             message_name += '\t\t\t{} ({})\n'.format(name, count)
289 | count_variable += count
290 | message_variable += '\t\t{} ({})\n'.format(variable, count_variable) + message_name
291 | count_value += count_variable
292 | message_value = '\t{} ({})\n'.format(value, count_value) + message_variable
293 | message += message_value
294 | logging.error(message)
295 | else:
296 | value = next(iter(values.keys()))
297 | print(','.join((psc1, value)), file=f)
298 |
299 | with open('STRATIFY_DOB_' + today.strftime('%Y-%m-%d') + '.txt', 'w') as f:
300 | for psc1, values in dob.items():
301 | if psc1 in validated_dob:
302 | print(','.join((psc1, validated_dob[psc1].strftime('%Y-%m-%d'),
303 | today.strftime('%Y-%m-%d_%H:%M:%S.0'))),
304 | file=f)
305 | elif len(values) > 1:
306 | message = '{}: multiple date of birth values:\n'.format(psc1)
307 | for value, variables in values.items():
308 | count_value = 0
309 | message_variable = ''
310 | for variable, counters in variables.items():
311 | count_variable = 0
312 | message_name = ''
313 | for name, count in counters.items():
314 | message_name += '\t\t\t{} ({})\n'.format(name, count)
315 | count_variable += count
316 | message_variable += '\t\t{} ({})\n'.format(variable, count_variable) + message_name
317 | count_value += count_variable
318 | message_value = '\t{} ({})\n'.format(value, count_value) + message_variable
319 | message += message_value
320 | logging.error(message)
321 | else:
322 | value = next(iter(values.keys()))
323 |                 if isinstance(value, date):
324 | value = value.strftime('%Y-%m-%d')
325 | print(','.join((psc1, value,
326 | today.strftime('%Y-%m-%d_%H:%M:%S.0'))),
327 | file=f)
328 | else:
329 | logging.error('%s: skipping incomplete date: %s', psc1, str(value))
330 |
331 |
332 | if __name__ == "__main__":
333 | main()
334 |
--------------------------------------------------------------------------------
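Note: ``stratify_debug_psytools.py`` accumulates every sex and date-of-birth
observation into nested dictionaries of Counters, keyed as
``sex[psc1][value][variable][file name] -> count``, so conflicting values can
be reported together with their provenance. A sketch of that aggregation on
hypothetical data::

    from collections import Counter

    sex = {}
    observations = (
        ('000000000001', 'F', 'IdCheckGender', 'STRATIFY_Core1'),
        ('000000000001', 'F', 'id_check_gender', 'STRATIFY-IDENT'),
        ('000000000001', 'M', 'ni_gender', 'STRATIFY-NI'),
    )
    for psc1, value, variable, name in observations:
        sex.setdefault(psc1, {}).setdefault(value, {}).setdefault(
            variable, Counter()).update((name,))

    for psc1, values in sex.items():
        if len(values) > 1:  # same subject, more than one sex value
            print(psc1, 'has conflicting values:', sorted(values))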