├── AUTHOR
├── LICENSE
├── MANIFEST.in
├── README.rst
├── cantab
│   ├── imagen_cantab_age_at_session_start_time.py
│   └── imagen_cantab_extract_deidentify.py
├── dawba
│   └── imagen_dawba_deidentify.py
├── genomics
│   ├── rna_seq_deidentify_imagen.py
│   └── rna_seq_deidentify_stratify.py
├── geolocation
│   └── geolocation.sh
├── imagen_databank
│   ├── __init__.py
│   ├── additional_data.py
│   ├── behavioral.py
│   ├── cantab.py
│   ├── core.py
│   ├── dicom_utils.py
│   ├── image_data.py
│   ├── sanity
│   │   ├── __init__.py
│   │   ├── cantab.py
│   │   └── imaging.py
│   └── scanning.py
├── mri
│   └── imagen_sample_FU3_mri_deidentify.py
├── onsets
│   ├── imagen_onsets_copy_FU3.sh
│   ├── imagen_onsets_copy_STRATIFY.sh
│   └── imagen_onsets_extract_deidentify.py
├── psc
│   └── imagen_update_dawba_codes_from_tokens.py
├── psytools
│   ├── imagen_psytools_deidentify.py
│   └── imagen_psytools_download.py
├── setup.py
├── sex
│   ├── imagen_sex.py
│   ├── imagen_sex_dataset.py
│   ├── imagen_sex_methylation.py
│   ├── imagen_sex_psytools.py
│   ├── imagen_sex_recruitment.py
│   └── imagen_sex_xnat.py
└── stratify_demographics
    ├── demographics.py
    └── stratify_debug_psytools.py

--------------------------------------------------------------------------------
/AUTHOR:
--------------------------------------------------------------------------------
Dimitri Papadopoulos
David Goyard
Antoine Grigis
Vincent Frouin
Robin Cherbonnier
Thomas Gareau

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include AUTHOR LICENSE MANIFEST.in setup.py README.rst

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
=========================================
Databank operations of the Imagen project
=========================================

Databank operations are mostly documented internally at NeuroSpin.

Basic information is available from the `project wiki`_.

This Python package combines a Python library *imagen_databank* for basic
sanity checks and preprocessing of Imagen data and a set of scripts to
extract, check, anonymize and transform raw Imagen data.

``imagen_databank``
    Read and perform sanity checks on raw datasets.

``cantab``
    Extract age from FU2 Cantab data.

``dawba``
    Remove identifying data and convert PSC1 to PSC2 in Dawba data,
    after manual download from the youthinmind_ server.

``stratify_demographics``
    Cross-check Stratify age and sex with `stratify_debug_psytools.py`.
    Print demographics with `demographics.py`, using recruitment files and
    validated age/sex from the output of the previous script.

``geolocation``
    Merge and convert geolocation data from PSC1 to PSC2.

``mri``
    De-identify some NIfTI files that used to contain the PSC1 code.

``onsets``
    Remove identifying data and convert PSC1 to PSC2 in FU3 onsets files.

``psc``
    Update FU3 Dawba codes from token tables maintained on the Delosis_
    server.

``psytools``
    Download Psytools data as CSV files from the Delosis_ server.
    Remove identifying data and convert PSC1 to PSC2.

``sex``
    Derive reference sex of Imagen subjects from multiple sources.
    There had been errors at baseline.

.. _`project wiki`: https://github.com/imagen2/imagen_databank/wiki
.. _youthinmind: http://youthinmind.com
.. _Delosis: https://www.delosis.com
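A minimal sketch of how the *imagen_databank* library is typically used; the
datasheet filename and PSC1 code below are made up for illustration:

    from imagen_databank import PSC2_FROM_PSC1, read_datasheet

    psc1 = '012345678901'  # hypothetical PSC1 code
    subject_ids, start_times, rows, columns, fields = read_datasheet(
        'datasheet_012345678901FU.csv')  # hypothetical Cantab datasheet
    print(PSC2_FROM_PSC1.get(psc1))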
--------------------------------------------------------------------------------
/cantab/imagen_cantab_age_at_session_start_time.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
"""Extract age in days at Cantab session start time from FU2 datasheets.

==========
Attributes
==========

Input
-----

FU2_MASTER_DIR : str
    Location of FU2 PSC1-encoded data.

Output
------

A CSV table of PSC2 codes and ages in days, printed to standard output.

"""

FU2_MASTER_DIR = '/neurospin/imagen/FU2/RAW/PSC1'

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

import os
import glob
from datetime import date

# import ../imagen_databank
import sys
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
from imagen_databank import PSC2_FROM_PSC1
from imagen_databank import DOB_FROM_PSC1
from imagen_databank import read_datasheet


def main():
    # find datasheet_*.csv files
    logger.info('start globbing datasheet_*.csv files')
    datasheets = glob.glob(os.path.join(FU2_MASTER_DIR,
                                        '*/*/AdditionalData/datasheet_*.csv'))
    logger.info('finished globbing datasheet_*.csv files')

    for datasheet in datasheets:
        subject_ids, session_start_times, dummy_r, dummy_c, dummy_f = read_datasheet(datasheet)
        if len(subject_ids) != 1:
            logger.warning('Proper "Subject ID" not found: %s', datasheet)
            continue
        psc1 = subject_ids.pop()[:12]

        # find age
        if psc1 not in DOB_FROM_PSC1:
            logger.error('unknown age for PSC1 code %s: %s', psc1, datasheet)
            continue
        dob = DOB_FROM_PSC1[psc1]
        session_start_times = set(sst.date() for sst in session_start_times)
        if len(session_start_times) != 1:
            logger.warning('Proper "Session start time" not found: %s',
                           datasheet)
            continue
        session_start_time = session_start_times.pop()
        if session_start_time < date(2007, 1, 1):
            logger.error('Bogus "Session start time" %s: %s',
                         session_start_time, datasheet)
            continue
        age = (session_start_time - dob).days

        # find PSC2
        if psc1 not in PSC2_FROM_PSC1:
            logger.error('unknown PSC1 code %s: %s', psc1, datasheet)
            continue
        psc2 = PSC2_FROM_PSC1[psc1]

        print('{0},{1}'.format(psc2, age))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/dawba/imagen_dawba_deidentify.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Re-encode and anonymize DAWBA files (BL, FU1, FU2 and FU3).

This script replaces the Scito anonymization pipeline which does not
seem to be working anymore for DAWBA files.

==========
Attributes
==========

Input
-----

DAWBA_BL_MASTER_DIR : str
    Location of BL PSC1-encoded files.
DAWBA_FU1_MASTER_DIR : str
    Location of FU1 PSC1-encoded files.
DAWBA_FU2_MASTER_DIR : str
    Location of FU2 PSC1-encoded files.
DAWBA_FU3_MASTER_DIR : str
    Location of FU3 PSC1-encoded files.
DAWBA_SB_MASTER_DIR : str
    Location of Stratify PSC1-encoded files.

Output
------

DAWBA_BL_PSC2_DIR : str
    Location of BL PSC2-encoded files.
DAWBA_FU1_PSC2_DIR : str
    Location of FU1 PSC2-encoded files.
DAWBA_FU2_PSC2_DIR : str
    Location of FU2 PSC2-encoded files.
DAWBA_FU3_PSC2_DIR : str
    Location of FU3 PSC2-encoded files.
DAWBA_SB_PSC2_DIR : str
    Location of Stratify PSC2-encoded files.

"""

DAWBA_BL_MASTER_DIR = '/neurospin/imagen/BL/RAW/PSC1/dawba'
DAWBA_BL_PSC2_DIR = '/neurospin/imagen/BL/RAW/PSC2/dawba'
DAWBA_FU1_MASTER_DIR = '/neurospin/imagen/FU1/RAW/PSC1/dawba'
DAWBA_FU1_PSC2_DIR = '/neurospin/imagen/FU1/RAW/PSC2/dawba'
DAWBA_FU2_MASTER_DIR = '/neurospin/imagen/FU2/RAW/PSC1/dawba'
DAWBA_FU2_PSC2_DIR = '/neurospin/imagen/FU2/RAW/PSC2/dawba'
DAWBA_FU3_MASTER_DIR = '/neurospin/imagen/FU3/RAW/PSC1/dawba'
DAWBA_FU3_PSC2_DIR = '/neurospin/imagen/FU3/RAW/PSC2/dawba'
DAWBA_SB_MASTER_DIR = '/neurospin/imagen/STRATIFY/RAW/PSC1/dawba'
DAWBA_SB_PSC2_DIR = '/neurospin/imagen/STRATIFY/RAW/PSC2/dawba'

WITHDRAWN_DAWBA_CODES = {
    # DAWBA1 codes, missing for some reason - just ignore them...
    '19042',
    '19044',
    '19045',
    '19046',
    '19047',
    '19048',
    '19049',
    '19050',
    '19051',
    '23094',
    '23095',
    '23096',
    '23097',
    '23098',
    '23099',
    '23100',
    '23101',
    '23102',
    '23103',
    '23104',
    '23105',
    '23106',
    '23107',
    '23108',
    '23109',
    '23110',
    '23112',
    '23881',
    '27361',
    '27512',
    '28117',
    '28694',
    '31469',
    '31470',
    '31471',
    '31473',
    '38297',
    '38298',
    '38299',
    '38300',
    '38301',
    # see thread "DAWBA3 codes conversion table" from 2015-05-18
    '127657',
    # see thread "DAWBA3 codes conversion table" from 2015-12-15
    '128847',
    '127658',
    '132983',
    '129716',
    '129500',
    # see thread "Imagen: Dawba data 201490 acquired on 13 September 2015" on 2019-05-27
    '201490',
    # see thread "Imagen FU3 Dawba code 221867" on 2019-05-08
    '221867',
    # see thread "token management in Imagen FU3" on 2019-05-03
    '228686',
    '228691',
    # see thread "token management in Imagen FU3" on 2019-05-03
    '239204',
    '239230',
    # see thread "Imagen FU3 Dawba code 252346" on 2019-05-04
    '252346',
    # see thread "Re: AW:Imagen FU3 token management: 272443 / 272444" on 2019-06-25
    # 244471 and 244513 are the same participant, we were told to keep the former
    '244513',
    # see thread "AW: [ext] Fwd: Pause to production of new teams" on 2019-07-23
    '265683',
    '265684',
    '265685',
    '265686',
    '265687',
    '265689',
    # see thread "IMAGEN FU3, DAWBA-PSC1 clarification" on 2019-09-04
    # 236038 and 254243 are the same participant, we were told to keep the former
    '254243',
}

import os
from datetime import datetime

# import ../imagen_databank
import sys
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))
from imagen_databank import PSC1_FROM_DAWBA
from imagen_databank import PSC2_FROM_PSC1
from imagen_databank import DOB_FROM_PSC1

import logging
logging.basicConfig(level=logging.INFO)


def _create_psc2_file(dawba_path, psc2_path):
    """Anonymize and re-encode a DAWBA questionnaire from DAWBA to PSC2.

    DAWBA questionnaire files are CSV files.
    Columns containing a date will be modified and the date will be
    converted to the age of the subject in days, as required by the
    anonymization process.

    Parameters
    ----------
    dawba_path: str
        Input: DAWBA-encoded CSV file.
    psc2_path: str
        Output: PSC2-encoded CSV file.

    """
    with open(dawba_path, 'r') as dawba_file:
        # identify columns to anonymize/remove in header
        header = next(iter(dawba_file))
        items = header.split('\t')
        convert = {i for i, item in enumerate(items)
                   if 'sstartdate' in item or 'p1startdate' in item}
        skip = {i for i, item in enumerate(items)
                if 'ratername' in item or 'ratedate' in item}

        with open(psc2_path, 'w') as psc2_file:
            # write header
            items = [item for i, item in enumerate(items)
                     if i not in skip]
            psc2_file.write('\t'.join(items))
            if not items[-1].endswith('\n'):
                psc2_file.write('\n')

            # write data
            for line in dawba_file:
                items = line.split('\t')
                dawba = items[0]
                if dawba not in PSC1_FROM_DAWBA:
                    if dawba in WITHDRAWN_DAWBA_CODES:
                        logging.info('withdrawn DAWBA code: %s', dawba)
                    else:
                        logging.error('DAWBA code missing from conversion table: %s',
                                      dawba)
                    continue
                psc1 = PSC1_FROM_DAWBA[dawba]
                if psc1 not in PSC2_FROM_PSC1:
                    logging.error('PSC1 code missing from conversion table: %s',
                                  psc1)
                    continue
                psc2 = PSC2_FROM_PSC1[psc1]
                logging.info('converting subject %s from DAWBA to PSC2',
                             psc1)
                items[0] = psc2
                # convert dates to subject age in days
                for i in convert:
                    if items[i] != '':
                        if psc1 in DOB_FROM_PSC1:
                            startdate = datetime.strptime(items[i],
                                                          '%d.%m.%y').date()
                            birthdate = DOB_FROM_PSC1[psc1]
                            age = startdate - birthdate
                            logging.info('age of subject %s: %d',
                                         psc1, age.days)
                            items[i] = str(age.days)
                        else:
                            items[i] = ''
                items = [item for i, item in enumerate(items)
                         if i not in skip]
                psc2_file.write('\t'.join(items))
                if not items[-1].endswith('\n'):
                    psc2_file.write('\n')


def create_psc2_files(master_dir, psc2_dir, prefix=None):
    """Anonymize and re-encode all DAWBA questionnaires within a directory.

    DAWBA-encoded files are read from `master_dir`, anonymized and converted
    from DAWBA codes to PSC2, and the result is written in `psc2_dir`.

    Parameters
    ----------
    master_dir: str
        Input directory with DAWBA-encoded questionnaires.
    psc2_dir: str
        Output directory with PSC2-encoded and anonymized questionnaires.
    prefix: str, optional
        Prefix prepended to the output file names.

    """
    for master_file in os.listdir(master_dir):
        master_path = os.path.join(master_dir, master_file)
        if prefix:
            master_file = prefix + master_file
        psc2_path = os.path.join(psc2_dir, master_file)
        _create_psc2_file(master_path, psc2_path)


def main():
    create_psc2_files(DAWBA_BL_MASTER_DIR, DAWBA_BL_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_FU1_MASTER_DIR, DAWBA_FU1_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_FU2_MASTER_DIR, DAWBA_FU2_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_FU3_MASTER_DIR, DAWBA_FU3_PSC2_DIR, prefix='IMAGEN_')
    create_psc2_files(DAWBA_SB_MASTER_DIR, DAWBA_SB_PSC2_DIR, prefix='STRATIFY_')


if __name__ == "__main__":
    main()
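The heart of `_create_psc2_file` is the conversion of absolute dates to an age
in days; a self-contained sketch of that computation, with made-up dates:

    from datetime import datetime, date

    startdate = datetime.strptime('15.06.10', '%d.%m.%y').date()  # sstartdate format
    birthdate = date(1994, 3, 1)  # hypothetical DOB_FROM_PSC1 entry
    print((startdate - birthdate).days)  # 5950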
--------------------------------------------------------------------------------
/genomics/rna_seq_deidentify_imagen.py:
--------------------------------------------------------------------------------
import csv
import os
from imagen_databank import PSC2_FROM_PSC1

file_labID_PSC1_conv='/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_metadata.tsv'

# use either the first or the second block, for gene_counts or gene_tpm

input_dir_imagen_PSC1='/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
output_dir_imagen_BL_PSC2='/imagen/BL/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_BL.tsv'
output_dir_imagen_FU2_PSC2='/imagen/FU2/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_FU2.tsv'
output_dir_imagen_FU3_PSC2='/imagen/FU3/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2_FU3.tsv'
"""
input_dir_imagen_PSC1="/imagen/FU3/RAW/PSC1/genomics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv"
output_dir_imagen_BL_PSC2='/imagen/BL/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_BL.tsv'
output_dir_imagen_FU2_PSC2='/imagen/FU2/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_FU2.tsv'
output_dir_imagen_FU3_PSC2='/imagen/FU3/processed/genetics/rna/env_IMAGEN_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2_FU3.tsv'
"""


def convert_labID_to_PSC2_with_timepoint(labID):
    labID_index = headers.index("Lab_Code")
    psc1_index = headers.index("PSC1")
    timepoint_index = headers.index("TimePoint")
    for line in tab_conv_labID_psc1:
        if line[labID_index] == labID:
            try:
                if len(line[psc1_index]) < 12:
                    # restore the leading zero lost by spreadsheet software
                    psc1 = "0" + line[psc1_index]
                else:
                    psc1 = line[psc1_index]
                psc2 = PSC2_FROM_PSC1[psc1]
                return (psc2, line[timepoint_index])
            except KeyError:
                print("invalid PSC1 code:", line[psc1_index])
                #return ("###", line[timepoint_index])

    print("PSC1 not found for labID: ", labID)

"""
for line in file_labID_PSC1:
    columns = line.strip().split(",")
    #print("check:",columns[labID_index],labID==columns[labID_index])
    if columns[labID_index] == labID:
        #print("deidentified: ",columns[psc1_index], "****", columns[timepoint_index])
        psc2 = PSC2_FROM_PSC1["0"+columns[psc1_index]]
        return(psc2,columns[timepoint_index])
print("PSC1 not found for labID: ", labID)
"""

if __name__ == "__main__":

    with open(file_labID_PSC1_conv, 'r', errors='ignore') as file_labID_PSC1:
        reader = csv.reader(file_labID_PSC1, delimiter=',')
        tab_conv_labID_psc1 = [row for row in reader]
        headers = tab_conv_labID_psc1[0]
        #headers = list(next(reader))
        print(headers)
        print(convert_labID_to_PSC2_with_timepoint("GB97ENVKCLR301518"))

    with open(input_dir_imagen_PSC1, 'r', newline='', errors='ignore') as labID_infile:
        reader_input = csv.reader(labID_infile, delimiter='\t')
        data = [row for row in reader_input]

    #print(data[0])
    # initialize list of lists that will be written in the output file
    data_psc2_BL = [[] for i in range(len(data))]
    data_psc2_FU2 = [[] for i in range(len(data))]
    data_psc2_FU3 = [[] for i in range(len(data))]
    # initialize the two first columns of the three timepoints
    for i in range(len(data)):
        #print(data[i][0]," ***** ", data[i][1], " ***** ", data[i][2])
        #print(data_psc2_BL[i])
        data_psc2_BL[i].append(data[i][0])
        data_psc2_BL[i].append(data[i][1])

        data_psc2_FU2[i].append(data[i][0])
        data_psc2_FU2[i].append(data[i][1])

        data_psc2_FU3[i].append(data[i][0])
        data_psc2_FU3[i].append(data[i][1])

    count_BL = 0
    count_FU2 = 0
    count_FU3 = 0
    # copy the remaining columns to the respective matrix depending on the timepoint
    for col_index in range(2, len(data[0])):
        #print(col_index)
        lab_id = data[0][col_index]
        lab_id = lab_id.strip()  # str.strip() returns a new string
        #print(convert_labID_to_PSC2_with_timepoint(lab_id))
        try:
            (psc2, timepoint) = convert_labID_to_PSC2_with_timepoint(lab_id)
            if timepoint == "BL":
                count_BL = count_BL + 1
                data_psc2_BL[0].append(psc2)
                for i in range(1, len(data)):
                    data_psc2_BL[i].append(data[i][col_index])
            elif timepoint == "FU2":
                count_FU2 = count_FU2 + 1
                data_psc2_FU2[0].append(psc2)
                for i in range(1, len(data)):
                    data_psc2_FU2[i].append(data[i][col_index])
            elif timepoint == "FU3":
                count_FU3 = count_FU3 + 1
                data_psc2_FU3[0].append(psc2)
                for i in range(1, len(data)):
                    data_psc2_FU3[i].append(data[i][col_index])
            else:
                print("invalid timepoint:", timepoint)
        except TypeError:  # conversion failed and returned None
            continue
    print("BL", count_BL)
    print("FU2", count_FU2)
    print("FU3", count_FU3)

    # write the output to the files
    print("writing ...")
    with open(output_dir_imagen_BL_PSC2, 'w', newline='') as PSC2_BL_outfile:
        writer_BL = csv.writer(PSC2_BL_outfile, delimiter='\t')
        writer_BL.writerows(data_psc2_BL)

    with open(output_dir_imagen_FU2_PSC2, 'w', newline='') as PSC2_FU2_outfile:
        writer_FU2 = csv.writer(PSC2_FU2_outfile, delimiter='\t')
        writer_FU2.writerows(data_psc2_FU2)

    with open(output_dir_imagen_FU3_PSC2, 'w', newline='') as PSC2_FU3_outfile:
        writer_FU3 = csv.writer(PSC2_FU3_outfile, delimiter='\t')
        writer_FU3.writerows(data_psc2_FU3)
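convert_labID_to_PSC2_with_timepoint above rescans the whole metadata table
for every lab ID; a sketch of a one-pass alternative that builds the lookup
once, assuming the same Lab_Code / PSC1 / TimePoint metadata columns:

    def build_conversion_table(tab_conv_labID_psc1):
        headers = tab_conv_labID_psc1[0]
        labID_index = headers.index("Lab_Code")
        psc1_index = headers.index("PSC1")
        timepoint_index = headers.index("TimePoint")
        table = {}
        for line in tab_conv_labID_psc1[1:]:
            # zfill restores leading zeros lost by spreadsheet software
            psc1 = line[psc1_index].zfill(12)
            if psc1 in PSC2_FROM_PSC1:
                table[line[labID_index]] = (PSC2_FROM_PSC1[psc1],
                                            line[timepoint_index])
        return table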
--------------------------------------------------------------------------------
/genomics/rna_seq_deidentify_stratify.py:
--------------------------------------------------------------------------------
import csv
import os
from imagen_databank import PSC2_FROM_PSC1

file_labID_PSC1_conv_stratify='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_metadata.tsv'
file_labID_PSC1_conv_estra='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_metadata.tsv'

# use either the first or the second block, for gene_counts or gene_tpm

input_dir_STRATIFY_PSC1_counts='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
input_dir_ESTRA_PSC1_counts='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts.tsv'
output_dir_STRATIFY_PSC2_counts='/imagen/STRATIFY/processed/genetics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2.tsv'
output_dir_ESTRA_PSC2_counts='/imagen/STRATIFY/processed/genetics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_counts_PSC2.tsv'


input_dir_STRATIFY_PSC1_tpm='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv'
input_dir_ESTRA_PSC1_tpm='/imagen/STRATIFY/RAW/PSC1/genomics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm.tsv'
output_dir_STRATIFY_PSC2_tpm='/imagen/STRATIFY/processed/genetics/rna/env_STRATIFY_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2.tsv'
output_dir_ESTRA_PSC2_tpm='/imagen/STRATIFY/processed/genetics/rna/env_ESTRA_align60_no.dups_no.sex.mismatch_salmon.merged.gene_tpm_PSC2.tsv'


def convert_labID_to_PSC2_with_timepoint(labID, tab_conv_labID_psc1):
    headers = tab_conv_labID_psc1[0]
    labID_index = headers.index("Lab_Code")
    psc1_index = headers.index("PSC1")
    timepoint_index = headers.index("TimePoint")
    for line in tab_conv_labID_psc1:
        if line[labID_index] == labID:
            try:
                if len(line[psc1_index]) < 12:
                    # restore the leading zero lost by spreadsheet software
                    psc1 = "0" + line[psc1_index]
                else:
                    psc1 = line[psc1_index]
                psc2 = PSC2_FROM_PSC1[psc1]

                return (psc2, line[timepoint_index])
            except KeyError:
                print("invalid PSC1 code:", line[psc1_index])
                #return ("###", line[timepoint_index])

    print("PSC1 not found for labID: ", labID)


def convert_file_to_PSC2(file_labID_PSC1_conv, input_dir_PSC1, output_dir_PSC2, delimiter_metadata):
    print("converting ", input_dir_PSC1, " to PSC2...")
    with open(file_labID_PSC1_conv, 'r', errors='ignore') as file_labID_PSC1:
        reader = csv.reader(file_labID_PSC1, delimiter=delimiter_metadata)
        tab_conv_labID_psc1 = [row for row in reader]
        headers = tab_conv_labID_psc1[0]
        # headers = list(next(reader))
        print(headers)
        # print(convert_labID_to_PSC2_with_timepoint("GB97ENVKCLR301518"))

    with open(input_dir_PSC1, 'r', newline='', errors='ignore') as labID_infile:
        reader_input = csv.reader(labID_infile, delimiter='\t')
        data = [row for row in reader_input]

    #print(data[0])
    # initialize list of lists that will be written in the output file
    data_psc2 = [[] for i in range(len(data))]

    # initialize the two first columns
    for i in range(len(data)):
        data_psc2[i].append(data[i][0])
        data_psc2[i].append(data[i][1])

    count = 0

    # copy the remaining columns
    for col_index in range(2, len(data[0])):
        lab_id = data[0][col_index]
        lab_id = lab_id.strip()  # str.strip() returns a new string
        try:
            (psc2, timepoint) = convert_labID_to_PSC2_with_timepoint(lab_id, tab_conv_labID_psc1)
            count = count + 1
            data_psc2[0].append(psc2)
            for i in range(1, len(data)):
                data_psc2[i].append(data[i][col_index])

        except TypeError:  # conversion failed and returned None
            continue
    print("number of lines in file: ", count)

    # write the output to the files
    print("writing ...")
    with open(output_dir_PSC2, 'w', newline='') as PSC2_outfile:
        writer = csv.writer(PSC2_outfile, delimiter='\t')
        writer.writerows(data_psc2)


if __name__ == "__main__":
    convert_file_to_PSC2(file_labID_PSC1_conv_stratify, input_dir_STRATIFY_PSC1_counts, output_dir_STRATIFY_PSC2_counts, ",")

    convert_file_to_PSC2(file_labID_PSC1_conv_stratify, input_dir_STRATIFY_PSC1_tpm, output_dir_STRATIFY_PSC2_tpm, ",")

    convert_file_to_PSC2(file_labID_PSC1_conv_estra, input_dir_ESTRA_PSC1_counts, output_dir_ESTRA_PSC2_counts, "\t")

    convert_file_to_PSC2(file_labID_PSC1_conv_estra, input_dir_ESTRA_PSC1_tpm, output_dir_ESTRA_PSC2_tpm, "\t")

--------------------------------------------------------------------------------
/geolocation/geolocation.sh:
--------------------------------------------------------------------------------
#!/bin/sh

#
# process geolocation at each time point
#
for timepoint in BL FU1 FU2 FU3
do
    DIR_PSC1="/neurospin/imagen/${timepoint}/RAW/PSC1/geolocation"
    FILE_PSC2="/neurospin/imagen/${timepoint}/processed/geolocation/IMAGEN_geolocation_${timepoint}.csv"

    # print output file header line
    echo "PSC2,latitude,longitude,notes" > "$FILE_PSC2"
    # process each input file
    for file in "${DIR_PSC1}/IMAGEN_geolocation_"*"_${timepoint}.csv"
    do
        # some commands cannot process DOS line endings
        tmpfile=`mktemp -t tmp.geolocation.XXXXXXXXXX`
        dos2unix -n "$file" "$tmpfile" 2>/dev/null
        # some sites lack a "Notes" column
        if head -1 "$tmpfile" | grep -q "Notes"
        then
            ADD_NOTES=0
        else
            ADD_NOTES=1
        fi
        # skip input file header line
        tail -n +2 "$tmpfile" |
        # some sites lack a "Notes" column: append an empty one where needed,
        # otherwise "cat" keeps the pipeline flowing unchanged
        if [ "$ADD_NOTES" -eq 1 ]
        then
            sed 's/$/,/'
        else
            cat
        fi
        # clean up
        rm -f "$tmpfile"
    done | psc2psc.py 2>/dev/null | sort >> "$FILE_PSC2"
    unix2dos -o "$FILE_PSC2" 2>/dev/null
done


#
# process geolocation backdated from BL
#
BACKDATED_PSC1="/neurospin/imagen/FU3/RAW/PSC1/geolocation/IMAGEN_geolocation_ALL_SITES_backdated_Dublin_updated.csv"
BACKDATED_PSC2="/neurospin/imagen/FU3/processed/geolocation/IMAGEN_geolocation_backdated.csv"

# print output file header line
echo "PSC2,year,latitude,longitude" > "$BACKDATED_PSC2"
# skip input file header line
tail -n +2 "$BACKDATED_PSC1" | psc2psc.py 2>/dev/null | sort >> "$BACKDATED_PSC2"
unix2dos -o "$BACKDATED_PSC2" 2>/dev/null

--------------------------------------------------------------------------------
/imagen_databank/__init__.py:
--------------------------------------------------------------------------------
# noqa

# Copyright (c) 2014-2018 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

__all__ = ['additional_data', 'behavioral', 'cantab', 'core', 'dicom_utils',
           'image_data', 'scanning', 'sanity']

from . import core
from .core import (LONDON, NOTTINGHAM, DUBLIN, BERLIN,
                   HAMBURG, MANNHEIM, PARIS, DRESDEN,
                   SOUTHAMPTON, AACHEN)
from .core import CENTER_NAME
from .core import (PSC2_FROM_PSC1, PSC1_FROM_PSC2,
                   PSC1_FROM_DAWBA, PSC2_FROM_DAWBA,  # PSC2_FROM_DAWBA is obsolete
                   DOB_FROM_PSC1, DOB_FROM_PSC2)  # DOB_FROM_PSC2 is obsolete
from .core import (detect_psc1, detect_psc2, guess_psc1)
from .core import Error

from . import additional_data
from .additional_data import (walk_additional_data, report_additional_data)

from . import behavioral
from .behavioral import (MID_CSV, FT_CSV, SS_CSV, RECOG_CSV)
from .behavioral import (read_mid, read_ft, read_ss, read_recog)

from . import cantab
from .cantab import (CANTAB_CCLAR, DETAILED_DATASHEET_CSV, DATASHEET_CSV,
                     REPORT_HTML)
from .cantab import (read_cant, read_datasheet, read_detailed_datasheet,
                     read_report)

from . import dicom_utils
from .dicom_utils import read_metadata

from . import image_data
from .image_data import (SEQUENCE_LOCALIZER_CALIBRATION,
                         SEQUENCE_T2, SEQUENCE_T2_FLAIR,
                         SEQUENCE_ADNI_MPRAGE,
                         SEQUENCE_MID, SEQUENCE_FT, SEQUENCE_SST,
                         SEQUENCE_B0_MAP, SEQUENCE_DTI,
                         SEQUENCE_RESTING_STATE,
                         SEQUENCE_NODDI)
from .image_data import SEQUENCE_NAME
from .image_data import NONSTANDARD_DICOM
from .image_data import series_type_from_description
from .image_data import walk_image_data, report_image_data

from . import scanning
from .scanning import read_scanning
from . import sanity

__author__ = 'Dimitri Papadopoulos'
__copyright__ = 'Copyright (c) 2014-2018 CEA'
__license__ = 'CeCILL'
__version__ = '0.1.0'
__email__ = 'imagendatabase@cea.fr'
__status__ = 'Development'

--------------------------------------------------------------------------------
/imagen_databank/additional_data.py:
--------------------------------------------------------------------------------
# Copyright (c) 2014-2017 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
import os
import re

from .cantab import (CANTAB_CCLAR, DETAILED_DATASHEET_CSV, DATASHEET_CSV,
                     REPORT_HTML,
                     read_cant, read_datasheet, read_detailed_datasheet,
                     read_report)
from .behavioral import (MID_CSV, FT_CSV, SS_CSV, RECOG_CSV,
                         read_mid, read_ft, read_ss, read_recog)

import logging
logger = logging.getLogger(__name__)

__all__ = ['walk_additional_data', 'report_additional_data']


#
# check filenames against these regex'es when exploring Additional Data
#
# in some cases order is important, for example:
# - first match 'detailed_datasheet'
# - then match 'datasheet'
#
_LOOSE_ADDITIONAL_DATA_REGEXES = (
    (re.compile(r'(\w+_)?cant(_\w+)?\.cclar', re.IGNORECASE), CANTAB_CCLAR),
    # Mannheim sends 'detailed datasheet' files (space instead of underscore)
    (re.compile(r'(\w+_)?detailed[_ ]datasheet(_\w+)?\.csv', re.IGNORECASE),
     DETAILED_DATASHEET_CSV),
    (re.compile(r'(\w+_)?datasheet(_\w+)?\.csv', re.IGNORECASE), DATASHEET_CSV),
    (re.compile(r'(\w+_)?report(_\w+)?\.html', re.IGNORECASE), REPORT_HTML),
    (re.compile(r'ft_\w+\.csv', re.IGNORECASE), FT_CSV),
    (re.compile(r'mid_\w+\.csv', re.IGNORECASE), MID_CSV),
    (re.compile(r'recog_\w+\.csv', re.IGNORECASE), RECOG_CSV),
    (re.compile(r'ss_\w+\.csv', re.IGNORECASE), SS_CSV),
)

_EXACT_ADDITIONAL_DATA_REGEXES = (
    (re.compile(r'cant_\d{12}(fu|FU)?\.cclar'), CANTAB_CCLAR),
    (re.compile(r'detailed_datasheet_\d{12}(fu|FU)?\.csv'), DETAILED_DATASHEET_CSV),
    (re.compile(r'datasheet_\d{12}(fu|FU)?\.csv'), DATASHEET_CSV),
    (re.compile(r'report_\d{12}(fu|FU)?\.html'), REPORT_HTML),
    (re.compile(r'ft_\d{12}(fu|FU)?\.csv'), FT_CSV),
    (re.compile(r'mid_\d{12}(fu|FU)?\.csv'), MID_CSV),
    (re.compile(r'recog_\d{12}(fu|FU)?\.csv'), RECOG_CSV),
    (re.compile(r'ss_\d{12}(fu|FU)?\.csv', re.IGNORECASE), SS_CSV),
)


def _match_additional_data_sops(filename, exact=False):
    """Compare filename to filenames defined in Imagen FU2 SOPs.

    Compare the actual filename to the filenames expected for Additional
    Data in the SOPs, either in a strict way or a loose way. This matching
    function is empirical and based on experimentation.

    Parameters
    ----------
    filename : unicode
        The file basename to match.

    exact : bool
        Exact match if True else loose match.

    Returns
    -------
    str
        If the filename loosely matches a file type defined in the SOPs,
        return the file type, else return None.

    """
    if exact:
        regex_list = _EXACT_ADDITIONAL_DATA_REGEXES
    else:
        regex_list = _LOOSE_ADDITIONAL_DATA_REGEXES
    for regex, filetype in regex_list:
        if regex.match(filename):
            logger.debug('assign type "%s" to filename: %s',
                         filetype, filename)
            return filetype
    logger.info('filename does not match any known type: %s', filename)
    return None


def walk_additional_data(path):
    """Generate information on Additional Data files in a directory.

    Parameters
    ----------
    path : unicode
        The directory to look for files into.

    Returns
    -------
    tuple
        Yield a 2-tuple: the name and the path of each file relative to path.

    """
    for root, dummy_dirs, files in os.walk(path):
        for filename in files:
            relpath = os.path.relpath(os.path.join(root, filename), path)
            yield filename, relpath
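
# A quick illustration of the loose vs exact matching above. Filenames are
# made up; return values are the file-type constants imported from .cantab:
#
#     _match_additional_data_sops('datasheet_012345678901FU.csv')   # 'datasheet'
#     _match_additional_data_sops('detailed datasheet_0123.csv')    # 'detailed_datasheet'
#     _match_additional_data_sops('detailed datasheet_0123.csv',
#                                 exact=True)                       # None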

def report_additional_data(path, psc1, exact=False):
    """Find Additional Data files that fit the Imagen FU2 SOPs.

    The Imagen FU2 SOPs define a precise file organization for Additional
    Data. In practice we have found the SOPs are only loosely followed by
    acquisition centres, hence the tolerant optional argument.

    This function scans the directory where we expect to find the Additional
    Data of a dataset and builds a collection of files identified as the
    files described in the SOPs.

    Parameters
    ----------
    path : unicode
        The directory to look for Additional Data into.

    psc1 : str
        PSC1 code of the subject.

    exact : bool
        Exact match if True, else loose match.

    Returns
    -------
    dict
        The key identifies the type of identified files and the value
        lists the relative path of the files.

    """
    additional_files = {}

    for filename, relpath in walk_additional_data(path):
        filetype = _match_additional_data_sops(filename, exact)
        if filetype:
            logger.debug('assign type "%s" to file: %s',
                         filetype, relpath)
            additional_files.setdefault(filetype, []).append(relpath)
        else:
            logger.warning('cannot match any known type: %s', relpath)

    additional_data = {}

    # read cant_*.cclar where available
    if CANTAB_CCLAR in additional_files:
        for f in additional_files[CANTAB_CCLAR]:
            f_path = os.path.join(path, f)
            subject_ids = read_cant(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(CANTAB_CCLAR, {})[f] = subject_ids
    # read datasheet_*.csv where available
    if DATASHEET_CSV in additional_files:
        for f in additional_files[DATASHEET_CSV]:
            f_path = os.path.join(path, f)
            subject_ids, dummy_st, dummy_r, dummy_c, dummy_f = read_datasheet(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(DATASHEET_CSV, {})[f] = subject_ids
    # read detailed_datasheet_*.csv where available
    if DETAILED_DATASHEET_CSV in additional_files:
        for f in additional_files[DETAILED_DATASHEET_CSV]:
            f_path = os.path.join(path, f)
            subject_ids = read_detailed_datasheet(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(DETAILED_DATASHEET_CSV, {})[f] = subject_ids
    # read report_*.html where available
    if REPORT_HTML in additional_files:
        for f in additional_files[REPORT_HTML]:
            f_path = os.path.join(path, f)
            subject_ids = read_report(f_path)
            if psc1 in subject_ids:
                subject_ids.remove(psc1)
            additional_data.setdefault(REPORT_HTML, {})[f] = subject_ids
    # read Scanning/ft_*.csv where available
    if FT_CSV in additional_files:
        for f in additional_files[FT_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_ft(f_path)
            if subject_id:
                additional_data.setdefault(FT_CSV, {})[f] = {subject_id}
    # read Scanning/mid_*.csv where available
    if MID_CSV in additional_files:
        for f in additional_files[MID_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_mid(f_path)
            if subject_id:
                additional_data.setdefault(MID_CSV, {})[f] = {subject_id}
    # read Scanning/recog_*.csv where available
    if RECOG_CSV in additional_files:
        for f in additional_files[RECOG_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_recog(f_path)
            if subject_id:
                additional_data.setdefault(RECOG_CSV, {})[f] = {subject_id}
    # read Scanning/ss_*.csv where available
    if SS_CSV in additional_files:
        for f in additional_files[SS_CSV]:
            f_path = os.path.join(path, f)
            subject_id, _, _, _ = read_ss(f_path)
            if subject_id:
                additional_data.setdefault(SS_CSV, {})[f] = {subject_id}

    return additional_data
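A sketch of how report_additional_data can drive a sanity check; the subject
directory and PSC1 code below are made up for illustration:

    from imagen_databank import report_additional_data

    path = '/neurospin/imagen/FU2/RAW/PSC1/LONDON/012345678901/AdditionalData'  # hypothetical
    for filetype, files in report_additional_data(path, '012345678901').items():
        for relpath, ids in files.items():
            if ids:  # IDs left over (or found) per file
                print(filetype, relpath, sorted(ids))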
--------------------------------------------------------------------------------
/imagen_databank/behavioral.py:
--------------------------------------------------------------------------------
# Copyright (c) 2014-2017 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import csv
from datetime import datetime

from .core import Error

import logging
logger = logging.getLogger(__name__)

__all__ = ['MID_COLUMNS', 'FT_COLUMNS', 'SS_COLUMNS', 'RECOG_COLUMNS',
           'read_mid', 'read_ft', 'read_ss', 'read_recog']

#
# types of files we expect to find under AdditionalData/Scanning
#
FT_CSV = 'ft'
MID_CSV = 'mid'
SS_CSV = 'ss'
RECOG_CSV = 'recog'


def _parse_behavioral_datetime(date_string):
    """Read date in the format found in CSV files.

    * LONDON      01/02/2015 01:02:03
    * NOTTINGHAM  01/02/2015 01:02:03
    * DUBLIN      01/02/2015 01:02:03   2/1/2015 1:02:03 AM
    * BERLIN      01.02.2015 01:02:03
    * HAMBURG     01.02.2015 01:02:03
    * MANNHEIM    01.02.2015 01:02:03
    * PARIS       01/02/2015 01:02:03
    * DRESDEN     01.02.2015 01:02:03

    """
    DATE_FORMATS = (
        '%d.%m.%Y %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%m/%d/%Y %I:%M:%S %p',
    )
    for date_format in DATE_FORMATS:
        try:
            dt = datetime.strptime(date_string, date_format)
            return dt
        except ValueError:
            pass
    return None


def _fix_spurious_quotes(s):
    if s.startswith('"'):
        last = s.rfind('"')
        if last > 0:
            main = s[1:last]
            last += 1
            tail = s[last:]
            if tail.isspace():
                s = main + tail
    return s


def _fix_terminal_tab(s):
    last = s.rfind('\t')
    if last > 0:
        main = s[:last]
        last += 1
        tail = s[last:]
        if tail.isspace():
            s = main + tail
    return s


MID_COLUMNS = (
    'Trial',
    'Trial Category',
    'Trial Start Time (Onset)',
    'Pre-determined Onset',
    'Cue Presented',
    'Anticipation Phase Start Time',
    'Anticipation Phase Duration',
    'Target Phase Start Time',
    'Target Phase Duration',
    'Response Made by Subject',
    'Response time',
    'Feedback Phase Start Time',
    'Outcome',
    'Amount',
    'Fixation Phase Start Time (Lasts until next trial start time)',
    'Success Rate',
    'Scanner Pulse',
)

FT_COLUMNS = (
    'Trial Start Time (Onset)',
    'Video Clip Name',
)

SS_COLUMNS = (
    'Trial',
    'Trial Category',
    'Trial Start Time (Onset)',
    'Pre-determined/randomised onset',
    'Go Stimulus Presentation Time',  # 'Go Stimulus Presentation Time '
    'Stimulus Presented',
    'Delay',
    'Stop Stimulus Presentation Time',
    'Response made by subject',
    'Absolute Response Time',
    'Relative Response Time',
    'Response Outcome',
    'Real Jitter',
    'Pre-determined Jitter',
    'Success Rate of Variable Delay Stop Trials',
    'Scanner Pulse',
)

RECOG_COLUMNS = (
    'TimePassed',
    'UserResponse',
    'ImageFileName',
)

# for each of the 4 tasks we provide a tuple:
# * first word in the behavioral file that identifies the task
# * list of columns in the 2nd line
# * column from which to extract the last ascending numerical sequence
# * True if the numerical sequence is strictly ascending
_TASK_SPECIFICS = {
    MID_CSV: ('MID_TASK', MID_COLUMNS, 0, True),
    FT_CSV: ('FACE_TASK', FT_COLUMNS, 0, True),
    SS_CSV: ('STOP_SIGNAL_TASK', SS_COLUMNS, 0, False),
    RECOG_CSV: ('RECOGNITION_TASK', RECOG_COLUMNS, 0, True),
}


def _read_generic_behavioral(path, task, strict=True):
    """Read behavioral files and return part of the contents and errors.

    Sometimes complete lines are enclosed in quotes. Such quotes
    must be fixed before the contents can be read as CSV.

    Parameters
    ----------
    path : str
        Path to the behavioral file to read from.

    task : str
        Type of task, one of MID_CSV, FT_CSV, SS_CSV or RECOG_CSV.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        Last ascending sequence of trials.
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    psc1 = None
    timestamp = None
    sequence = []
    errors = []

    with open(path, 'r', newline='') as behavioral:
        lines = behavioral.readlines()

    # attempt to handle broken CSV files with fully quoted lines
    reader = csv.reader(lines, delimiter='\t')
    if not strict and max(len(row) for row in reader) < 2:
        lines = [_fix_spurious_quotes(line) for line in lines]

    # remove spurious terminal tab
    lines = [_fix_terminal_tab(line) for line in lines]

    # now re-read file contents
    reader = csv.reader(lines, delimiter='\t')

    # 1st line
    header = next(reader)
    if header:
        header = [x.strip() for x in header]
        if len(header) != 4:
            errors.append(Error(path, 'Line 1 contains {0} columns instead of 4'
                                .format(len(header)), header))
        if len(header) > 3:
            COLUMN = 'Task type: Scanning'
            if header[3] != COLUMN:
                errors.append(Error(path, 'Column 4 of line 1 must be "{0}" '
                                    'instead of "{1}"'
                                    .format(COLUMN, header[3]), header))
        if len(header) > 2:
            COLUMN = 'Subject ID:'
            if header[2].startswith(COLUMN):
                psc1 = header[2][len(COLUMN):].lstrip()
            else:
                errors.append(Error(path, 'Column 3 of line 1 "{0}" must start '
                                    'with "{1}"'
                                    .format(header[2], COLUMN), header))
        if len(header) > 1:
            timestamp = _parse_behavioral_datetime(header[1])
            if not timestamp:
                errors.append(Error(path, 'Column 2 of line 1 "{0}" is not a standard time stamp'
                                    .format(header[1]), header))
        if len(header) > 0:
            COLUMN = '{0} task'.format(_TASK_SPECIFICS[task][0])
            if header[0] != COLUMN:
                errors.append(Error(path, 'Column 1 of line 1 must be "{0}" '
                                    'instead of "{1}"'
                                    .format(COLUMN, header[0]), header))
    else:
        errors.append(Error(path, 'Empty file'))

    # 2nd line
    try:
        header = next(reader)
        header = [x.strip() for x in header]
        COLUMNS = _TASK_SPECIFICS[task][1]
        if len(header) != len(COLUMNS):
            errors.append(Error(path, 'Line 2 contains {0} columns instead of {1}'
                                .format(len(header), len(COLUMNS)),
                                header))
        for i, (h, c) in enumerate(zip(header, COLUMNS)):
            if h != c:
                errors.append(Error(path, 'Column {0} of line 2 must be {1} instead of {2}'
                                    .format(i + 1, c, h), header))
                break
    except StopIteration:
        errors.append(Error(path, 'Missing 2nd line'))

    # data
    last = None
    for n, row in enumerate(reader, 3):
        row = [x.strip() for x in row]
        COLUMNS = _TASK_SPECIFICS[task][1]
        if not any(row):  # get rid of empty rows
            continue
        elif len(row) != len(COLUMNS):
            errors.append(Error(path, 'Line {0} contains {1} columns instead of {2}'
                                .format(n, len(row), len(COLUMNS)),
                                row))
        # column to check for ascending numerical sequence
        current = row[_TASK_SPECIFICS[task][2]].strip()
        try:
            # expect ascending numerical sequences
            current = int(current)
            if last:
                if _TASK_SPECIFICS[task][3]:  # strictly ascending
                    if current <= last:
                        sequence = []  # start new ascending sequence
                else:
                    if current < last:
                        sequence = []  # start new ascending sequence
            sequence.append(current)
            last = current
        except ValueError:
            errors.append(Error(path, 'Column {0} of line {1} "{2}" should contain '
                                'only numbers'
                                .format(_TASK_SPECIFICS[task][2] + 1, n, current), row))
            if last:
                last = None

    return psc1, timestamp, sequence, errors


def read_mid(path, strict=True):
    """Return "Subject ID" and other information extracted from mid_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    mid_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the mid_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        The last ascending sequence of trials ('Trial' column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, MID_CSV, strict)


def read_ft(path, strict=True):
    """Return "Subject ID" and other information extracted from ft_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    ft_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the ft_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        The last ascending sequence of onsets ('Trial Start Time (Onset)'
        column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, FT_CSV, strict)


def read_ss(path, strict=True):
    """Return "Subject ID" and other information extracted from ss_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    ss_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the ss_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    trials : array_like
        The last ascending sequence of trials ('Trial' column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, SS_CSV, strict)


def read_recog(path, strict=True):
    """Return "Subject ID" and other information extracted from recog_*.csv.

    Sometimes complete lines are enclosed in quotes. In that case
    recog_*.csv content must be fixed before it can be read as CSV.

    Parameters
    ----------
    path : unicode
        Path to the recog_*.csv file to read from.

    strict : bool
        Be more lenient and let wholly quoted lines through if False,
        else do report the error.

    Returns
    -------
    psc1 : str
        PSC1 code.
    timestamp : datetime
        Time stamp extracted from the header.
    times : array_like
        The last ascending sequence of trials ('TimePassed' column).
    errors : array_like
        List of Error.

    Raises
    ------
    FileNotFoundError
        If path does not exist.

    """
    return _read_generic_behavioral(path, RECOG_CSV, strict)


def main():
    import os.path

    ROOT_DIR = '/neurospin/imagen/FU2/RAW/PSC1'
    for center in os.listdir(ROOT_DIR):
        center_path = os.path.join(ROOT_DIR, center)
        for subject in os.listdir(center_path):
            subject_path = os.path.join(center_path, subject)
            behavioral_path = os.path.join(subject_path,
                                           'AdditionalData', 'Scanning')
            if os.path.isdir(behavioral_path):
                #~ mid_files = tuple(os.path.join(behavioral_path, b)
                #~                   for b in os.listdir(behavioral_path)
                #~                   if 'mid_' in b)
                #~ for mid_file in mid_files:
                #~     (psc1, _timestamp, onsets, errors) = read_mid(mid_file, False)
                #~     print('▸ {0} MID {1}'.format(psc1, len(onsets)))
                #~     for error in errors:
                #~         print('  ✗ {0}: {1}'.format(error.message,
                #~               os.path.relpath(error.path, ROOT_DIR)))
                #~ ft_files = tuple(os.path.join(behavioral_path, b)
                #~                  for b in os.listdir(behavioral_path)
                #~                  if 'ft_' in b)
                #~ for ft_file in ft_files:
                #~     (psc1, _timestamp, onsets, errors) = read_ft(ft_file, False)
                #~     print('▸ {0} FT {1}'.format(psc1, len(onsets)))
                #~     for error in errors:
                #~         print('  ✗ {0}: {1}'.format(error.message,
                #~               os.path.relpath(error.path, ROOT_DIR)))
                ss_files = tuple(os.path.join(behavioral_path, b)
                                 for b in os.listdir(behavioral_path)
                                 if 'ss_' in b)
                for ss_file in ss_files:
                    (psc1, timestamp, onsets, errors) = read_ss(ss_file,  # pylint: disable=unused-variable
                                                                False)
                    print('▸ {0} SS {1}'.format(psc1, len(onsets)))
                    for error in errors:
                        print('  ✗ {0}: {1}'.format(error.message,
                              os.path.relpath(error.path, ROOT_DIR)))
                #~ recog_files = tuple(os.path.join(behavioral_path, b)
                #~                     for b in os.listdir(behavioral_path)
                #~                     if 'recog_' in b)
                #~ for recog_file in recog_files:
                #~     (psc1, timestamp, onsets, errors) = read_recog(recog_file, False)
                #~     print('▸ {0} RECOG {1}'.format(psc1, len(onsets)))
                #~     for error in errors:
                #~         print('  ✗ {0}: {1}'.format(error.message,
                #~               os.path.relpath(error.path, ROOT_DIR)))


if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/imagen_databank/cantab.py:
--------------------------------------------------------------------------------
# Copyright (c) 2014-2017 CEA
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

from zipfile import ZipFile
from lxml import etree
import datetime
import csv
import re
import sys

import logging
logger = logging.getLogger(__name__)

__all__ = ['CANTAB_CCLAR', 'DETAILED_DATASHEET_CSV', 'DATASHEET_CSV',
           'REPORT_HTML',
           'read_cant', 'read_datasheet', 'read_detailed_datasheet',
           'read_report']


#
# types of files we expect to find under AdditionalData
#
CANTAB_CCLAR = 'cantab'
DETAILED_DATASHEET_CSV = 'detailed_datasheet'
DATASHEET_CSV = 'datasheet'
REPORT_HTML = 'report'

_ID_XPATH = ".//{http://www.camcog.com/proteus/entity/xml}attribute[@name='ID']"


def read_cant(path):
    """Return "Subject ID" values found in a cant_*.cclar file.

    Parameters
    ----------
    path : unicode
        Path to the cant_*.cclar file to read from.

    Returns
    -------
    set
        "Subject ID" values found in the file.

    """
    subject_ids = set()
    cantfile = ZipFile(path, 'r')
    for name in cantfile.namelist():
        if name.endswith('index.xml'):
            root = etree.fromstring(cantfile.read(name))
            for element in root.findall(_ID_XPATH):
                subject_ids.add(element.attrib['value'])
    cantfile.close()
    return subject_ids


def _parse_csv_datetime(date_string):
    """Read date in the format found in CSV files.

    * LONDON      01-Feb-2015 12:34:56
    * NOTTINGHAM  01-Feb-2015 12:34:56   01/02/2015 12:34
    * DUBLIN      01-Feb-2015 12:34:56
    * BERLIN      01.02.2015 12:34:56
    * HAMBURG     01.02.2015 12:34:56
    * MANNHEIM    01.02.2015 12:34:56
    * PARIS       01 Feb 2015 12:34:56
    * DRESDEN     12:34:56 01.02.2015

    """
    DATE_FORMATS = (
        '%d-%b-%Y %H:%M:%S',  # 01-Feb-2015 12:34:56
        '%d/%m/%Y %H:%M',     # 01/02/2015 12:34
        '%d.%m.%Y %H:%M:%S',  # 01.02.2015 12:34:56
        '%d %b %Y %H:%M:%S',  # 01 Feb 2015 12:34:56
        '%H:%M:%S %d.%m.%Y',  # 12:34:56 01.02.2015
    )
    for date_format in DATE_FORMATS:
        try:
            dt = datetime.datetime.strptime(date_string, date_format)
            return dt
        except ValueError:
            pass
    return None


def read_datasheet(path):
    """Return "Subject ID" and other information extracted from datasheet_*.csv.
112 | 113 | Parameters 114 | ---------- 115 | path : unicode 116 | Path to the datasheet_*.csv file to read from. 117 | 118 | Returns 119 | ------- 120 | list 121 | * "Subject ID" values found in the file. 122 | * "Session start time" values found in the file. 123 | * number of rows. 124 | * minimal number of columns. 125 | * list of column titles. 126 | 127 | """ 128 | with open(path) as csvfile: 129 | # read header 130 | dialect = csv.Sniffer().sniff(csvfile.read()) 131 | csvfile.seek(0) 132 | reader = csv.reader(csvfile, dialect) 133 | rows = 0 134 | columns_max = columns_min = 0 135 | fields = {} 136 | header = next(reader) 137 | if header: 138 | fields = {v: i for i, v in enumerate(header)} 139 | columns_max = columns_min = len(header) 140 | rows += 1 141 | subject_ids = set() 142 | session_start_times = set() 143 | # read values from the rest of the table 144 | for row in reader: 145 | if len(row) > 0: 146 | if "Subject ID" in fields: 147 | subject_id = row[fields["Subject ID"]] 148 | else: 149 | subject_id = row[0] 150 | subject_ids.add(subject_id) 151 | if "Session start time" in fields: 152 | session_start_time = _parse_csv_datetime(row[fields["Session start time"]]) 153 | if session_start_time is not None: 154 | if session_start_time < datetime.datetime(2007, 1, 1): 155 | logger.warning('"Session start time" for %s anterior to 2007: %s', 156 | subject_id, session_start_time.date()) 157 | session_start_times.add(session_start_time) 158 | columns_min = min(len(row), columns_min) 159 | columns_max = max(len(row), columns_max) 160 | rows += 1 161 | return (subject_ids, session_start_times, rows, columns_min, fields) 162 | 163 | 164 | # 165 | # match lines with "Subject ID" 166 | # 167 | _DETAILED_DATASHEET_REGEX = re.compile(r'"?Subject ID : (\w*)"?') 168 | 169 | 170 | def read_detailed_datasheet(path): 171 | """Return "Subject ID" values found in a detailed_datasheet_*.csv file. 172 | 173 | Parameters 174 | ---------- 175 | path : unicode 176 | Path to the detailed_datasheet_*.csv file to read from. 177 | 178 | Returns 179 | ------- 180 | list 181 | "Subject ID" values found in the file. 182 | 183 | """ 184 | with open(path, encoding='latin1') as f: 185 | subject_ids = set() 186 | for line in f: 187 | match = _DETAILED_DATASHEET_REGEX.match(line) 188 | if match: 189 | subject_ids.add(match.group(1)) 190 | return subject_ids 191 | 192 | 193 | _REPORT_REGEX = re.compile('Subject ID(.*)Gender(.*)') 194 | 195 | 196 | def read_report(path): 197 | """Return "Subject ID" values found in a report_*.html file. 198 | 199 | Parameters 200 | ---------- 201 | path : unicode 202 | Path to the report_*.html to read from. 203 | 204 | Returns 205 | ------- 206 | list 207 | "Subject ID" values found in the file. 208 | 209 | """ 210 | with open(path, encoding='latin-1') as report_html: 211 | subject_ids = set() 212 | for line in report_html: 213 | match = _REPORT_REGEX.match(line) 214 | if match: 215 | subject_ids.add(match.group(1)) 216 | return subject_ids 217 | -------------------------------------------------------------------------------- /imagen_databank/core.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2019 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. 
You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import re
import datetime

import logging
logger = logging.getLogger(__name__)

__all__ = ['LONDON', 'NOTTINGHAM', 'DUBLIN', 'BERLIN',
           'HAMBURG', 'MANNHEIM', 'PARIS', 'DRESDEN',
           'SOUTHAMPTON', 'AACHEN',
           'CENTER_NAME',
           'PSC2_FROM_PSC1', 'PSC1_FROM_PSC2',
           'PSC1_FROM_DAWBA', 'PSC2_FROM_DAWBA',  # PSC2_FROM_DAWBA is obsolete
           'DOB_FROM_PSC1',
           'detect_psc1', 'detect_psc2', 'guess_psc1',
           'Error']


#
# numerical ID of acquisition centers of Imagen
#
LONDON = 1
NOTTINGHAM = 2
DUBLIN = 3
BERLIN = 4
HAMBURG = 5
MANNHEIM = 6
PARIS = 7
DRESDEN = 8
SOUTHAMPTON = 90  # Stratify
AACHEN = 91  # Stratify

#
# from numerical ID to standard name of acquisition centers of Imagen
#
CENTER_NAME = {
    LONDON: 'LONDON',
    NOTTINGHAM: 'NOTTINGHAM',
    DUBLIN: 'DUBLIN',
    BERLIN: 'BERLIN',
    HAMBURG: 'HAMBURG',
    MANNHEIM: 'MANNHEIM',
    PARIS: 'PARIS',
    DRESDEN: 'DRESDEN',
    SOUTHAMPTON: 'SOUTHAMPTON',  # Stratify
    AACHEN: 'AACHEN',  # Stratify
}

#
# file that maps PSC1 to PSC2 and DAWBA codes to PSC1
#
_PSC2PSC = '/neurospin/imagen/src/scripts/psc_tools/psc2psc.csv'
_PSC2PSC_STRATIFY = '/neurospin/imagen/src/scripts/psc_tools/psc2psc_SB.csv'

#
# file that maps PSC1 codes to date of birth
#
_DOB = '/neurospin/imagen/src/scripts/psc_tools/DOB.csv'
_DOB_STRATIFY = '/neurospin/imagen/src/scripts/psc_tools/DOB_SB.csv'


def _initialize_psc1_dawba_psc2():
    """Return dictionaries to map PSC1 to PSC2 and DAWBA codes to PSC1.

    Mappings are read from the PSC1=DAWBA=PSC2 tables listed in _PSC2PSC
    and _PSC2PSC_STRATIFY.

    Returns
    -------
    tuple
        Pair of PSC1→PSC2 and DAWBA→PSC1 dictionaries.
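
    A minimal sketch of the expected file contents (these particular codes
    are made up for illustration):

        PSC1=DAWBA=PSC2
        010000123456=123456=000000654321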

    """
    psc2_from_psc1 = {}
    psc1_from_dawba = {}
    for psc2psc in (_PSC2PSC, _PSC2PSC_STRATIFY):
        with open(psc2psc) as f:  # universal newlines are the default in Python 3
            for line in f:
                psc1, dawba, psc2 = line.strip('\n').split('=')
                # 1st line is: PSC1=DAWBA=PSC2
                if psc1 == 'PSC1' and dawba == 'DAWBA' and psc2 == 'PSC2':
                    continue
                if psc1 in psc2_from_psc1:  # check duplicate PSC1 entries for consistency
                    if psc2_from_psc1[psc1] != psc2:
                        logger.critical('inconsistent PSC1/PSC2 mapping: %s', psc2psc)
                        raise Exception('inconsistent PSC1/PSC2 mapping')
                else:
                    psc2_from_psc1[psc1] = psc2
                psc1_from_dawba[dawba] = psc1
    return psc2_from_psc1, psc1_from_dawba


_REGEX_DOB = re.compile(r'(\d{4})-(\d{2})-(\d{2})')


def _initialize_dob():
    """Return dictionary to map PSC1 code to date of birth.

    Dates of birth are read from the DOB.csv files left over by the
    initial Imagen team (_DOB and _DOB_STRATIFY).

    Returns
    -------
    dict
        Dictionary mapping PSC1 codes to dates of birth.

    """
    dob_from_psc1 = {}
    for dob_path in (_DOB, _DOB_STRATIFY):  # distinct from the "dob" values read below
        with open(dob_path) as f:
            for line in f:
                psc1, dob, dummy_when = line.strip('\n').split(',')
                match = _REGEX_DOB.match(dob)
                if match:
                    year = int(match.group(1))
                    month = int(match.group(2))
                    day = int(match.group(3))
                    if year > 2012 or year < 1987:
                        raise Exception('unexpected date of birth: {0} ({1}-{2}-{3})'.format(dob, year, month, day))
                    dob_from_psc1[psc1] = datetime.date(year, month, day)
                else:
                    raise Exception('unexpected line in DOB.csv: {0}'.format(line))
    return dob_from_psc1


PSC2_FROM_PSC1, PSC1_FROM_DAWBA = _initialize_psc1_dawba_psc2()
PSC2_FROM_DAWBA = {k: PSC2_FROM_PSC1[v]  # obsolete
                   for k, v in PSC1_FROM_DAWBA.items() if v in PSC2_FROM_PSC1}
PSC1_FROM_PSC2 = {v: k for k, v in PSC2_FROM_PSC1.items()}
DOB_FROM_PSC1 = _initialize_dob()
DOB_FROM_PSC2 = {PSC2_FROM_PSC1[k]: v  # obsolete
                 for k, v in DOB_FROM_PSC1.items() if k in PSC2_FROM_PSC1}


#
# the heuristic to detect a PSC1 code is that:
# - it starts with 0 followed by the digit associated to each center
# - it is a series of 12 digits
#
_PSC1_REGEX = re.compile('(0[' +
                         ''.join([str(c) for c in CENTER_NAME]) +
                         r']\d{10})(?!\d)')  # a PSC1 code is not part of a longer run of digits


def detect_psc1(string):
    """Find potential PSC1 codes in a filename.

    PSC1 codes are sequences of 12 digits starting with 0 followed by a
    different digit for each center, followed by 10 digits.

    Parameters
    ----------
    string : str
        The string to search for PSC1.

    Returns
    -------
    str
        Potential PSC1 code or None.

    """
    match = _PSC1_REGEX.search(string)
    if match:
        return match.group(1)
    else:
        return None


#
# the heuristic to detect a PSC2 code is that:
# - it starts with 0 followed by a different digit for each center
# - it is a series of 12 digits
#
_PSC2_REGEX = re.compile(r'(0\d{11})(?!\d)')


def detect_psc2(string):
    """Find potential PSC2 codes in a filename.

    PSC2 codes are sequences of 12 digits starting with 0.

    Parameters
    ----------
    string : str
        The string to search for PSC2.
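        For example, a made-up name like 'datasheet_012345678901.csv'
        would yield '012345678901'.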
217 | 218 | Returns 219 | ------- 220 | str 221 | Potential PSC2 code or None. 222 | 223 | """ 224 | match = _PSC2_REGEX.search(string) 225 | if match: 226 | return match.group(1) 227 | else: 228 | return None 229 | 230 | 231 | def guess_psc1(subject_id, center): 232 | subject_id = subject_id.split('_')[0] 233 | if subject_id.upper().startswith('FU2'): 234 | subject_id = subject_id[3:] 235 | if subject_id.upper().endswith('FU3'): 236 | subject_id = subject_id[:-3] 237 | elif subject_id.upper().endswith('FU2'): 238 | subject_id = subject_id[:-3] 239 | elif subject_id.upper().endswith('FU'): 240 | subject_id = subject_id[:-2] 241 | # this is very empirical and based on cases seen so far! 242 | if len(subject_id) < 10: 243 | subject_id = '0' + str(center) + subject_id.rjust(10, '0') 244 | elif len(subject_id) < 11: 245 | if len(subject_id) < 10: 246 | subject_id = subject_id.rjust(10, '0') 247 | subject_id = '0' + str(center) + subject_id 248 | elif len(subject_id) < 12: 249 | subject_id = subject_id[0:2] + '0' + subject_id[2:] 250 | # check this is an existing PSC1 code 251 | if subject_id in PSC2_FROM_PSC1: 252 | return subject_id 253 | return None 254 | 255 | 256 | class Error: 257 | """Error while parsing files. 258 | 259 | Returned by functions that parse Cantab and behavioral files. 260 | 261 | Attributes 262 | ---------- 263 | path : str 264 | File name. 265 | message : str 266 | Message explaining the error. 267 | sample : str 268 | Part of the file that generated the error. 269 | 270 | """ 271 | _SAMPLE_LEN = 30 272 | 273 | def __init__(self, path, message, sample=None): 274 | self.path = path 275 | self.message = message 276 | self.sample = sample 277 | 278 | def __str__(self): 279 | if self.path: 280 | if self.sample: 281 | sample = repr(self.sample) 282 | if len(sample) > self._SAMPLE_LEN: 283 | sample = sample[:self._SAMPLE_LEN] + '...' 284 | return '{0}: <{1}>: {2}'.format(self.message, sample, self.path) 285 | else: 286 | return '{0}: {1}'.format(self.message, self.path) 287 | else: 288 | return '{0}'.format(self.message) 289 | -------------------------------------------------------------------------------- /imagen_databank/dicom_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2017 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 14 | # 15 | # In this respect, the user's attention is drawn to the risks associated 16 | # with loading, using, modifying and/or developing or reproducing the 17 | # software by the user in light of its specific status of free software, 18 | # that may mean that it is complicated to manipulate, and that also 19 | # therefore means that it is reserved for developers and experienced 20 | # professionals having in-depth computer knowledge. 
Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import re
import datetime
import dateutil.tz
try:
    import pydicom
    from pydicom.filereader import InvalidDicomError
    from pydicom.filereader import dcmread
except ImportError:  # a bare "except" would also mask unrelated errors
    # pydicom < 1.0 ships as the "dicom" module and names dcmread() read_file()
    import dicom as pydicom
    from dicom.filereader import InvalidDicomError
    from dicom.filereader import read_file as dcmread

import logging
logger = logging.getLogger(__name__)

__all__ = ['read_metadata']


#
# parse DICOM DateTime and Time tags
#
_REGEX_DT = re.compile(r"((\d{4,14})(\.(\d{1,6}))?)([+-]\d{4})?")
_REGEX_TM = re.compile(r"(\d{2,6})(\.(\d{1,6}))?")


def _datetime_from_dt(dt):
    """Convert DICOM DateTime to Python datetime.

    Parameters
    ----------
    dt : str
        DateTime tag from DICOM image.

    Returns
    -------
    datetime

    """
    match = _REGEX_DT.match(dt)
    if match and len(dt) <= 26:
        dt_match = match.group(2)
        year = int(dt_match[0:4])
        if len(dt_match) < 6:
            month = 1
        else:
            month = int(dt_match[4:6])
        if len(dt_match) < 8:
            day = 1
        else:
            day = int(dt_match[6:8])
        if len(dt_match) < 10:
            hour = 0
        else:
            hour = int(dt_match[8:10])
        if len(dt_match) < 12:
            minute = 0
        else:
            minute = int(dt_match[10:12])
        if len(dt_match) < 14:
            second = 0
            microsecond = 0
        else:
            second = int(dt_match[12:14])
            ms_match = match.group(4)
            if ms_match:
                microsecond = int(ms_match.rstrip().ljust(6, '0'))
            else:
                microsecond = 0
        tz_match = match.group(5)
        if tz_match:
            offset = (int(tz_match[1:3]) * 60 + int(tz_match[3:5])) * 60
            if tz_match[0] == '-':
                offset = -offset
            tzinfo = dateutil.tz.tzoffset(tz_match, offset)
        else:
            tzinfo = None
        return datetime.datetime(year, month, day,
                                 hour, minute, second, microsecond,
                                 tzinfo)
    else:
        logger.error('incorrect DICOM DT: %s', dt)
        return None


def _date_from_da(da):
    """Convert DICOM Date to Python date.

    Parameters
    ----------
    da : str
        Date tag from DICOM image.

    Returns
    -------
    date

    """
    if len(da) == 8:
        year = int(da[0:4])
        month = int(da[4:6])
        day = int(da[6:8])
        return datetime.date(year, month, day)
    elif len(da) == 10 and da[4] == '.' and da[7] == '.':
        # ACR-NEMA Standard 300, predecessor to DICOM - for compatibility
        year = int(da[0:4])
        month = int(da[5:7])
        day = int(da[8:10])
        return datetime.date(year, month, day)
    else:
        logger.error('incorrect DICOM DA: %s', da)
        return None


def _time_from_tm(tm):
    """Convert DICOM Time to Python time.

    Parameters
    ----------
    tm : str
        Time tag from DICOM image.
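        For example, a typical value such as '123456.789012' maps to
        datetime.time(12, 34, 56, 789012).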
147 | 148 | Returns 149 | ------- 150 | time 151 | 152 | """ 153 | match = _REGEX_TM.match(tm) 154 | if match and len(tm) <= 16: 155 | tm_match = match.group(1) 156 | hour = int(tm_match[0:2]) 157 | if len(tm_match) < 4: 158 | minute = 0 159 | else: 160 | minute = int(tm_match[2:4]) 161 | if len(tm_match) < 6: 162 | second = 0 163 | microsecond = 0 164 | else: 165 | second = int(tm_match[4:6]) 166 | ms_match = match.group(3) 167 | if ms_match: 168 | microsecond = int(ms_match.rstrip().ljust(6, '0')) 169 | else: 170 | microsecond = 0 171 | return datetime.time(hour, minute, second, microsecond) 172 | else: 173 | logger.error('incorrect DICOM TM: %s', tm) 174 | return None 175 | 176 | 177 | def read_metadata(path, force=False): 178 | """Read select metadata from a DICOM file. 179 | 180 | We always attempt to read the following DICOM tags. An exception is raised 181 | if one of the tags cannot be read: 182 | - SOPClassUID 183 | - SeriesInstanceUID 184 | - SeriesNumber 185 | - SeriesDescription 186 | - SOPInstanceUID 187 | 188 | We also attempt to read the following DICOM tags if they are present: 189 | - ImageType 190 | - AcquisitionDateTime 191 | - AcquisitionDate 192 | - AcquisitionTime 193 | - StationName 194 | - Manufacturer 195 | - ManufacturerModelName 196 | - DeviceSerialNumber 197 | - SoftwareVersions 198 | - PatientID 199 | 200 | Parameters 201 | ---------- 202 | path : str 203 | Path name of the DICOM file. 204 | force : bool 205 | If True read nonstandard files, typically without "Part 10" headers. 206 | 207 | Returns 208 | ------- 209 | dict 210 | 211 | """ 212 | dataset = dcmread(path, force=force) 213 | 214 | # missing compulsory tags will raise exceptions 215 | if 'SeriesDescription' in dataset: 216 | description = dataset.SeriesDescription 217 | elif 'ProtocolName' in dataset: 218 | description = dataset.ProtocolName 219 | else: 220 | description = dataset.SeriesDescription # raise an exception! 
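    # note: when both SeriesDescription and ProtocolName are missing, the
    # last branch above re-reads dataset.SeriesDescription so that pydicom
    # raises AttributeError, which walk_image_data() catches and logs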
221 | 222 | metadata = { 223 | 'SOPClassUID': dataset.SOPClassUID, 224 | 'SOPInstanceUID': dataset.SOPInstanceUID, 225 | 'SeriesInstanceUID': dataset.SeriesInstanceUID, 226 | 'SeriesNumber': dataset.SeriesNumber, 227 | 'SeriesDescription': description, 228 | } 229 | 230 | # optional tags 231 | if 'ImageType' in dataset: 232 | metadata['ImageType'] = dataset.ImageType 233 | if 'AcquisitionDateTime' in dataset: 234 | dt = _datetime_from_dt(dataset.AcquisitionDateTime) 235 | metadata['AcquisitionDate'] = dt.date() 236 | metadata['AcquisitionTime'] = dt.time() 237 | else: 238 | if 'AcquisitionDate' in dataset: 239 | metadata['AcquisitionDate'] = _date_from_da(dataset.AcquisitionDate) 240 | if 'AcquisitionTime' in dataset: 241 | metadata['AcquisitionTime'] = _time_from_tm(dataset.AcquisitionTime) 242 | if 'StationName' in dataset: 243 | metadata['StationName'] = dataset.StationName 244 | if 'Manufacturer' in dataset: 245 | metadata['Manufacturer'] = dataset.Manufacturer 246 | if 'ManufacturerModelName' in dataset: 247 | metadata['ManufacturerModelName'] = dataset.ManufacturerModelName 248 | if 'DeviceSerialNumber' in dataset: 249 | metadata['DeviceSerialNumber'] = dataset.DeviceSerialNumber 250 | if 'SoftwareVersions' in dataset: 251 | if pydicom.dataelem.isMultiValue(dataset.SoftwareVersions): 252 | # usually the last part is the more informative 253 | # for example on Philips scanners: 254 | # ['3.2.1', '3.2.1.1'] → '3.2.1.1' 255 | metadata['SoftwareVersions'] = dataset.SoftwareVersions[-1] 256 | else: 257 | metadata['SoftwareVersions'] = dataset.SoftwareVersions 258 | if 'StudyComments' in dataset: # DUBLIN 259 | metadata['StudyComments'] = dataset.StudyComments 260 | if 'PatientName' in dataset: # BERLIN, NOTTINGHAM 261 | metadata['PatientName'] = dataset.PatientName 262 | if 'ImageComments' in dataset: # HAMBURG, DRESDEN 263 | metadata['ImageComments'] = dataset.ImageComments 264 | if 'StudyDescription' in dataset: # LONDON 265 | metadata['StudyDescription'] = dataset.StudyDescription 266 | if 'PerformedProcedureStepDescription' in dataset: # LONDON 267 | metadata['PerformedProcedureStepDescription'] = dataset.PerformedProcedureStepDescription 268 | if 'PatientID' in dataset: # BERLIN, MANNHEIM, PARIS 269 | metadata['PatientID'] = dataset.PatientID 270 | 271 | return metadata 272 | -------------------------------------------------------------------------------- /imagen_databank/image_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2018 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 
14 | # 15 | # In this respect, the user's attention is drawn to the risks associated 16 | # with loading, using, modifying and/or developing or reproducing the 17 | # software by the user in light of its specific status of free software, 18 | # that may mean that it is complicated to manipulate, and that also 19 | # therefore means that it is reserved for developers and experienced 20 | # professionals having in-depth computer knowledge. Users are therefore 21 | # encouraged to load and test the software's suitability as regards their 22 | # requirements in conditions enabling the security of their systems and/or 23 | # data to be ensured and, more generally, to use and operate it in the 24 | # same conditions as regards security. 25 | # 26 | # The fact that you are presently reading this means that you have had 27 | # knowledge of the CeCILL license and that you accept its terms. 28 | 29 | import os 30 | import re 31 | import time 32 | import datetime 33 | 34 | from .core import (LONDON, NOTTINGHAM, DUBLIN, BERLIN, 35 | HAMBURG, MANNHEIM, PARIS, DRESDEN, 36 | SOUTHAMPTON, AACHEN) 37 | from .dicom_utils import read_metadata 38 | from .dicom_utils import InvalidDicomError 39 | 40 | import logging 41 | logger = logging.getLogger(__name__) 42 | 43 | __all__ = ['SEQUENCE_LOCALIZER_CALIBRATION', 'SEQUENCE_T2', 44 | 'SEQUENCE_T2_FLAIR', 'SEQUENCE_ADNI_MPRAGE', 45 | 'SEQUENCE_MID', 'SEQUENCE_FT', 'SEQUENCE_SST', 46 | 'SEQUENCE_B0_MAP', 'SEQUENCE_DTI', 47 | 'SEQUENCE_RESTING_STATE', 48 | 'SEQUENCE_NODDI', 49 | 'SEQUENCE_NAME', 50 | 'NONSTANDARD_DICOM', 51 | 'series_type_from_description', 52 | 'walk_image_data', 'report_image_data'] 53 | 54 | 55 | # 56 | # information sent by Anna Cattrell to Dimitri on 13 June 2014: 57 | # Standard Operating Procedure IMAGEN Follow-up 2 study 58 | # 59 | # 2.2.1 Overview of Imaging Session: 60 | # 61 | # 2. 3 plane localizer / Parallel imaging calibration 62 | # 3. Axial T2 slices (site specific duration) 63 | # 4. Axial T2 Flair slices (site specific duration) 64 | # 5. 3D Sagittal ADNI MPRAGE (Long) 65 | # 7. Monetary Incentive Delay Task (MID) 66 | # 9. Face task (FT) 67 | # 11. Stop-signal task (SST) 68 | # 12. B0 Map 69 | # 13. DTI (duration is heart-rate dependent at sites with cardiac gating) 70 | # 14. Resting State 71 | # 15. Short MPRAGE (baseline only) 72 | # 16. EPI Global (JBP suggestion followed by a few centres at baseline) 73 | # 17. 
NODDI (optional, added in Follow-up 3) 74 | # 75 | # the following constants attempt to describe each of these sequences 76 | # 77 | SEQUENCE_LOCALIZER_CALIBRATION = 2 78 | SEQUENCE_T2 = 3 79 | SEQUENCE_T2_FLAIR = 4 80 | SEQUENCE_ADNI_MPRAGE = 5 81 | SEQUENCE_MID = 7 82 | SEQUENCE_FT = 9 83 | SEQUENCE_SST = 11 84 | SEQUENCE_B0_MAP = 12 85 | SEQUENCE_DTI = 13 86 | SEQUENCE_RESTING_STATE = 14 87 | SEQUENCE_SHORT_MPRAGE = 15 88 | SEQUENCE_GLOBAL = 16 89 | SEQUENCE_NODDI = 17 90 | 91 | # 92 | # from sequence ID to sequence name 93 | # 94 | SEQUENCE_NAME = { 95 | SEQUENCE_LOCALIZER_CALIBRATION: 'Localizer/Calibration', 96 | SEQUENCE_T2: 'T2', 97 | SEQUENCE_T2_FLAIR: 'T2 Flair', 98 | SEQUENCE_ADNI_MPRAGE: 'ADNI MPRAGE', 99 | SEQUENCE_MID: 'EPI MID', 100 | SEQUENCE_FT: 'EPI Faces', 101 | SEQUENCE_SST: 'EPI Signal Stop', 102 | SEQUENCE_B0_MAP: 'B0 Map', 103 | SEQUENCE_DTI: 'DTI', 104 | SEQUENCE_RESTING_STATE: 'Resting State', 105 | SEQUENCE_SHORT_MPRAGE: 'Short MPRAGE', 106 | SEQUENCE_GLOBAL: 'EPI Global', 107 | SEQUENCE_NODDI: 'NODDI', 108 | } 109 | 110 | # 111 | # check sequence names against these regex'es when trying to identify 112 | # the type of a sequence from its name 113 | # 114 | # in some case order is important, for example: 115 | # - first match 'FLAIR' and 'short MPRAGE' 116 | # - then match 'T2' and 'MPRAGE' 117 | # 118 | _LOOSE_IMAGE_DATA_REGEXES = ( 119 | (re.compile(r'LOCALI[ZS]ER', re.IGNORECASE), SEQUENCE_LOCALIZER_CALIBRATION), 120 | # LONDON calibration 121 | (re.compile(r'ASSET[- ]Cal', re.IGNORECASE), SEQUENCE_LOCALIZER_CALIBRATION), 122 | # NOTTINGHAM 3-plane scout 123 | (re.compile(r'Survey_SHC'), SEQUENCE_LOCALIZER_CALIBRATION), 124 | # LONDON FU3 3-plane Localizer 125 | (re.compile(r'3Plane'), SEQUENCE_LOCALIZER_CALIBRATION), 126 | # first search for "FLAIR" then for "T2" 127 | (re.compile(r'FLAIR', re.IGNORECASE), SEQUENCE_T2_FLAIR), 128 | (re.compile(r'T2', re.IGNORECASE), SEQUENCE_T2), 129 | (re.compile(r'short MPRAGE', re.IGNORECASE), SEQUENCE_SHORT_MPRAGE), 130 | (re.compile(r'MPRAGE', re.IGNORECASE), SEQUENCE_ADNI_MPRAGE), 131 | (re.compile(r'MID', re.IGNORECASE), SEQUENCE_MID), 132 | # "EPI short reward" and "EPI reward short" are the same as "EPI short MID" 133 | (re.compile(r'reward', re.IGNORECASE), SEQUENCE_MID), 134 | (re.compile(r'face', re.IGNORECASE), SEQUENCE_FT), 135 | (re.compile(r'stop[- ]signal', re.IGNORECASE), SEQUENCE_SST), 136 | # LONDON stop signal DICOM files contain "SST" 137 | (re.compile(r'SST', re.IGNORECASE), SEQUENCE_SST), 138 | (re.compile(r'global', re.IGNORECASE), SEQUENCE_GLOBAL), 139 | (re.compile(r'B0'), SEQUENCE_B0_MAP), 140 | # LONDON B0 maps made of 3 DICOM files containing "FIELDMAP" 141 | (re.compile(r'FIELDMAP', re.IGNORECASE), SEQUENCE_B0_MAP), 142 | (re.compile(r'DTI'), SEQUENCE_DTI), 143 | (re.compile(r'REST', re.IGNORECASE), SEQUENCE_RESTING_STATE), 144 | ) 145 | 146 | # 147 | # some acquisition centers may send nonstandard DICOM files 148 | # 149 | # for example Hamburg have sent DICOM files without "PART 10" headers 150 | # 151 | NONSTANDARD_DICOM = { 152 | LONDON: False, 153 | NOTTINGHAM: False, 154 | DUBLIN: False, 155 | BERLIN: False, 156 | HAMBURG: True, 157 | MANNHEIM: False, 158 | PARIS: False, 159 | DRESDEN: False, 160 | SOUTHAMPTON: False, 161 | AACHEN: False, 162 | } 163 | 164 | # 165 | # the SOP Class UIDs we expect to find while scanning DICOM files: 166 | # - those we process 167 | # - those we discard silently 168 | # 169 | # any other SOP Class UID generates a warning 170 | # 171 | _ALLOWED_SOP_CLASS_UIDS 
= { 172 | 'MR Image Storage', 173 | 'Enhanced MR Image Storage', 174 | } 175 | _IGNORED_SOP_CLASS_UIDS = { 176 | 'Grayscale Softcopy Presentation State Storage SOP Class', 177 | 'Raw Data Storage', 178 | 'Enhanced SR Storage', 179 | 'Philips Private Gyroscan MR Serie Data', 180 | 'Private MR Series Data Storage', '1.3.46.670589.11.0.0.12.2', 181 | 'Private MR Examcard Storage', '1.3.46.670589.11.0.0.12.4', 182 | 'Secondary Capture Image Storage', 183 | } 184 | 185 | 186 | def series_type_from_description(series_description): 187 | """Match series description to those listed in Imagen FU2 SOPs. 188 | 189 | This matching function is empirical and based on experimentation. 190 | 191 | Parameters 192 | ---------- 193 | series_description : unicode 194 | The series description to match. 195 | 196 | Returns 197 | ------- 198 | str 199 | If the series description loosely matches a series type listed 200 | in the SOPs, return this series type, else return None. 201 | 202 | """ 203 | for regex, series_type in _LOOSE_IMAGE_DATA_REGEXES: 204 | if regex.search(series_description): 205 | return series_type 206 | return None 207 | 208 | 209 | def walk_image_data(path, force=False): 210 | """Generate information on DICOM files in a directory. 211 | 212 | File that cannot be read are skipped and an error message is logged. 213 | 214 | Parameters 215 | ---------- 216 | path : unicode 217 | Directory to read DICOM files from. 218 | force : bool 219 | Try reading nonstandard DICOM files, typically without "PART 10" headers. 220 | 221 | Yields 222 | ------ 223 | tuple 224 | Yields a pair (metadata, relpath) where metadata is a dictionary 225 | of extracted DICOM metadata. 226 | 227 | """ 228 | n = 0 229 | start = time.time() 230 | 231 | logger.info('start processing files under: %s', path) 232 | 233 | for root, dummy_dirs, files in os.walk(path): 234 | n += len(files) 235 | for filename in files: 236 | abspath = os.path.join(root, filename) 237 | relpath = os.path.normpath(os.path.relpath(abspath, path)) 238 | # skip DICOMDIR since we are going to read all DICOM files anyway 239 | # beware, Nottigham had sent a DICOMDIR2 file! 240 | if filename.startswith('DICOMDIR'): 241 | continue 242 | logger.debug('read file: %s', relpath) 243 | try: 244 | metadata = read_metadata(abspath, force=force) 245 | except OSError as e: 246 | logger.error('cannot read file (%s): %s', str(e), relpath) 247 | except InvalidDicomError as e: 248 | logger.error('cannot read nonstandard DICOM file: %s: %s', str(e), relpath) 249 | except AttributeError as e: 250 | logger.error('missing attribute: %s: %s', str(e), relpath) 251 | else: 252 | yield (metadata, relpath) 253 | 254 | elapsed = time.time() - start 255 | logger.info('processed %d files in %.2f s: %s', n, elapsed, path) 256 | 257 | 258 | def report_image_data(path, force=False): 259 | """Find DICOM files loosely organized according to the Imagen FU2 SOPs. 260 | 261 | The Imagen FU2 SOPs define a precise file organization for Image Data. In 262 | practice we have found the SOPs are only loosely followed. A method to find 263 | DICOM files while adapting to local variations is to read all DICOM files, 264 | then filter and break them down into series based on their contents. 265 | 266 | This function scans the directory where we expect to find the Image Data 267 | of a dataset and reports series of valid DICOM files. 268 | 269 | Parameters 270 | ---------- 271 | path : unicode 272 | Directory to read DICOM files from. 
    force : bool
        Try reading nonstandard DICOM files, typically without "PART 10" headers.

    Returns
    -------
    dict
        The key identifies a series while the value is a pair
        (series_data, images).

    """
    series_dict = {}

    for (image_data, relpath) in walk_image_data(path, force=force):
        if str(image_data['SOPClassUID']) in _IGNORED_SOP_CLASS_UIDS:
            continue
        # extract DICOM tags of interest, throw exceptions if missing tags!
        series_uid = image_data['SeriesInstanceUID']
        image_uid = image_data['SOPInstanceUID']
        series_number = image_data['SeriesNumber']
        series_description = image_data['SeriesDescription']
        image_types = image_data.get('ImageType', [])
        station_name = image_data.get('StationName', None)
        manufacturer = image_data.get('Manufacturer', None)
        manufacturer_model_name = image_data.get('ManufacturerModelName', None)
        software_versions = image_data.get('SoftwareVersions', None)
        device_serial_number = image_data.get('DeviceSerialNumber', None)
        if 'AcquisitionDate' in image_data:
            acquisition_date = image_data['AcquisitionDate']
            if 'AcquisitionTime' in image_data:
                acquisition_time = image_data['AcquisitionTime']
                timestamp = datetime.datetime.combine(acquisition_date,
                                                      acquisition_time)
            else:
                timestamp = datetime.datetime(acquisition_date.year,
                                              acquisition_date.month,
                                              acquisition_date.day)
        else:
            logger.error('missing acquisition time: %s', relpath)
            # skip files without a usable timestamp: falling through here
            # would leave 'timestamp' unset, or silently reuse the value
            # from the previous file
            continue
        # find PSC1 code
        if 'CommentsOnThePerformedProcedureStep' in image_data:  # DUBLIN
            psc1 = image_data['CommentsOnThePerformedProcedureStep']
        elif 'ImageComments' in image_data:  # HAMBURG, DRESDEN
            psc1 = image_data['ImageComments']
        elif 'PatientID' in image_data:  # LONDON, NOTTINGHAM, BERLIN, MANNHEIM, PARIS
            psc1 = image_data['PatientID']
        elif 'PatientName' in image_data:  # LONDON, NOTTINGHAM, BERLIN, MANNHEIM, PARIS
            psc1 = image_data['PatientName']
        else:
            psc1 = None
        # build the dictionary of series using 'SeriesInstanceUID' as a key
        if series_uid not in series_dict:
            series_data = {
                'SeriesNumber': series_number,
                'SeriesDescription': series_description,
                'ImageType': set(image_types),
                'MinAcquisitionDateTime': timestamp,
                'MaxAcquisitionDateTime': timestamp,
            }
            if station_name:
                series_data['StationName'] = station_name
            if manufacturer:
                series_data['Manufacturer'] = manufacturer
            if manufacturer_model_name:
                series_data['ManufacturerModelName'] = manufacturer_model_name
            if software_versions:
                series_data['SoftwareVersions'] = software_versions
            if device_serial_number:
                series_data['DeviceSerialNumber'] = device_serial_number
            if psc1:
                series_data['PSC1'] = psc1
            # populate series with relative path to DICOM files
            series_dict[series_uid] = (series_data, {image_uid: relpath})
        else:
            series_dict[series_uid][0]['ImageType'] |= set(image_types)
            # check consistency within series:
            if series_number != series_dict[series_uid][0]['SeriesNumber']:
                logger.error('inconsistent series number '
                             '"%s" / "%s":\n  %s\n  %s',
                             series_dict[series_uid][0]['SeriesNumber'],
                             series_number,
                             next(iter(series_dict[series_uid][1].values())),
                             relpath)
            elif series_description != 
series_dict[series_uid][0]['SeriesDescription']: 356 | logger.error('inconsistent series description ' 357 | '"%s" / "%s":\n %s\n %s', 358 | series_dict[series_uid][0]['SeriesDescription'], 359 | series_description, 360 | next(iter(series_dict[series_uid][1].values())), 361 | relpath) 362 | if station_name: 363 | if 'StationName' in series_dict[series_uid][0]: 364 | if station_name != series_dict[series_uid][0]['StationName']: 365 | logger.error('inconsistent station name ' 366 | '"%s" / "%s":\n %s\n %s', 367 | series_dict[series_uid][0]['StationName'], 368 | station_name, 369 | next(iter(series_dict[series_uid][1].values())), 370 | relpath) 371 | else: 372 | series_dict[series_uid][0]['StationName'] = station_name 373 | if manufacturer: 374 | if 'Manufacturer' in series_dict[series_uid][0]: 375 | if manufacturer != series_dict[series_uid][0]['Manufacturer']: 376 | logger.error('inconsistent manufacturer ' 377 | '"%s" / "%s":\n %s\n %s', 378 | series_dict[series_uid][0]['Manufacturer'], 379 | manufacturer, 380 | next(iter(series_dict[series_uid][1].values())), 381 | relpath) 382 | else: 383 | series_dict[series_uid][0]['Manufacturer'] = manufacturer 384 | if manufacturer_model_name: 385 | if 'ManufacturerModelName' in series_dict[series_uid][0]: 386 | if manufacturer_model_name != series_dict[series_uid][0]['ManufacturerModelName']: 387 | logger.error('inconsistent manufacturer model name ' 388 | '"%s" / "%s":\n %s\n %s', 389 | series_dict[series_uid][0]['ManufacturerModelName'], 390 | manufacturer_model_name, 391 | next(iter(series_dict[series_uid][1].values())), 392 | relpath) 393 | else: 394 | series_dict[series_uid][0]['ManufacturerModelName'] = manufacturer_model_name 395 | if software_versions: 396 | if 'SoftwareVersions' in series_dict[series_uid][0]: 397 | if software_versions != series_dict[series_uid][0]['SoftwareVersions']: 398 | logger.error('inconsistent software versions ' 399 | '"%s" / "%s":\n %s\n %s', 400 | series_dict[series_uid][0]['SoftwareVersions'], 401 | software_versions, 402 | next(iter(series_dict[series_uid][1].values())), 403 | relpath) 404 | else: 405 | series_dict[series_uid][0]['SoftwareVersions'] = software_versions 406 | if device_serial_number: 407 | if 'DeviceSerialNumber' in series_dict[series_uid][0]: 408 | if device_serial_number != series_dict[series_uid][0]['DeviceSerialNumber']: 409 | logger.error('inconsistent device serial number ' 410 | '"%s" / "%s":\n %s\n %s', 411 | series_dict[series_uid][0]['DeviceSerialNumber'], 412 | device_serial_number, 413 | next(iter(series_dict[series_uid][1].values())), 414 | relpath) 415 | else: 416 | series_dict[series_uid][0]['DeviceSerialNumber'] = device_serial_number 417 | 418 | if psc1: 419 | if 'PSC1' in series_dict[series_uid][0]: 420 | if psc1 != series_dict[series_uid][0]['PSC1']: 421 | logger.error('inconsistent PSC1 code ' 422 | '"%s" / "%s":\n %s\n %s', 423 | series_dict[series_uid][0]['PSC1'], 424 | psc1, 425 | next(iter(series_dict[series_uid][1].values())), 426 | relpath) 427 | else: 428 | series_dict[series_uid][0]['PSC1'] = psc1 429 | # populate series with relative path to DICOM files 430 | if image_uid not in series_dict[series_uid][1]: 431 | series_dict[series_uid][1][image_uid] = relpath 432 | else: 433 | logger.error('duplicate image in series (%s):' 434 | '\n %s\n %s', 435 | series_description, 436 | series_dict[series_uid][1][image_uid], 437 | relpath) 438 | # update acquisition date/time range by series 439 | if timestamp < series_dict[series_uid][0]['MinAcquisitionDateTime']: 440 | 
series_dict[series_uid][0]['MinAcquisitionDateTime'] = timestamp 441 | if timestamp > series_dict[series_uid][0]['MaxAcquisitionDateTime']: 442 | series_dict[series_uid][0]['MaxAcquisitionDateTime'] = timestamp 443 | 444 | return series_dict 445 | -------------------------------------------------------------------------------- /imagen_databank/sanity/__init__.py: -------------------------------------------------------------------------------- 1 | # noqa 2 | 3 | # Copyright (c) 2014-2017 CEA 4 | # 5 | # This software is governed by the CeCILL license under French law and 6 | # abiding by the rules of distribution of free software. You can use, 7 | # modify and/ or redistribute the software under the terms of the CeCILL 8 | # license as circulated by CEA, CNRS and INRIA at the following URL 9 | # "http://www.cecill.info". 10 | # 11 | # As a counterpart to the access to the source code and rights to copy, 12 | # modify and redistribute granted by the license, users are provided only 13 | # with a limited warranty and the software's author, the holder of the 14 | # economic rights, and the successive licensors have only limited 15 | # liability. 16 | # 17 | # In this respect, the user's attention is drawn to the risks associated 18 | # with loading, using, modifying and/or developing or reproducing the 19 | # software by the user in light of its specific status of free software, 20 | # that may mean that it is complicated to manipulate, and that also 21 | # therefore means that it is reserved for developers and experienced 22 | # professionals having in-depth computer knowledge. Users are therefore 23 | # encouraged to load and test the software's suitability as regards their 24 | # requirements in conditions enabling the security of their systems and/or 25 | # data to be ensured and, more generally, to use and operate it in the 26 | # same conditions as regards security. 27 | # 28 | # The fact that you are presently reading this means that you have had 29 | # knowledge of the CeCILL license and that you accept its terms. 30 | 31 | __all__ = ['cantab', 'imaging'] 32 | 33 | 34 | from . import cantab 35 | __all__.extend(cantab.__all__) 36 | from .cantab import check_cant_name 37 | from .cantab import check_datasheet_name 38 | from .cantab import check_detailed_datasheet_name 39 | from .cantab import check_report_name 40 | from .cantab import check_cant_content 41 | from .cantab import check_datasheet_content 42 | from .cantab import check_detailed_datasheet_content 43 | from .cantab import check_report_content 44 | 45 | from . import imaging 46 | __all__.extend(imaging.__all__) 47 | from .imaging import check_zip_name 48 | from .imaging import check_zip_content 49 | from .imaging import ZipTree 50 | -------------------------------------------------------------------------------- /imagen_databank/scanning.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2014-2017 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 
8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 14 | # 15 | # In this respect, the user's attention is drawn to the risks associated 16 | # with loading, using, modifying and/or developing or reproducing the 17 | # software by the user in light of its specific status of free software, 18 | # that may mean that it is complicated to manipulate, and that also 19 | # therefore means that it is reserved for developers and experienced 20 | # professionals having in-depth computer knowledge. Users are therefore 21 | # encouraged to load and test the software's suitability as regards their 22 | # requirements in conditions enabling the security of their systems and/or 23 | # data to be ensured and, more generally, to use and operate it in the 24 | # same conditions as regards security. 25 | # 26 | # The fact that you are presently reading this means that you have had 27 | # knowledge of the CeCILL license and that you accept its terms. 28 | 29 | import re 30 | 31 | from . core import detect_psc1 32 | 33 | import logging 34 | logger = logging.getLogger(__name__) 35 | 36 | 37 | _SUBJECT_ID_REGEX = re.compile('\d{2}[/\.]\d{2}[/\.]\d{4} \d{2}:\d{2}:\d{2}\tSubject ID: (\w+)') 38 | 39 | 40 | def read_scanning(path): 41 | """Return "Subject ID" values found in a Scanning/*.csv file. 42 | 43 | Parameters 44 | ---------- 45 | path : unicode 46 | Path to the Scanning/*.csv to read from. 47 | 48 | Returns 49 | ------- 50 | str 51 | "Subject ID" value found in the file. 52 | 53 | """ 54 | 55 | with open(path) as scanning: 56 | subject_ids = set() 57 | for line in scanning: 58 | match = _SUBJECT_ID_REGEX.match(line) 59 | if match: 60 | subject_id = detect_psc1(match.group(1)) 61 | if subject_id is None: 62 | subject_id = match.group(1) 63 | subject_ids.add(subject_id) 64 | return subject_ids 65 | -------------------------------------------------------------------------------- /mri/imagen_sample_FU3_mri_deidentify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) 2010-2019 CEA 4 | # 5 | # This software is governed by the CeCILL license under French law and 6 | # abiding by the rules of distribution of free software. You can use, 7 | # modify and/ or redistribute the software under the terms of the CeCILL 8 | # license as circulated by CEA, CNRS and INRIA at the following URL 9 | # "http://www.cecill.info". 10 | # 11 | # As a counterpart to the access to the source code and rights to copy, 12 | # modify and redistribute granted by the license, users are provided only 13 | # with a limited warranty and the software's author, the holder of the 14 | # economic rights, and the successive licensors have only limited 15 | # liability. 16 | # 17 | # In this respect, the user's attention is drawn to the risks associated 18 | # with loading, using, modifying and/or developing or reproducing the 19 | # software by the user in light of its specific status of free software, 20 | # that may mean that it is complicated to manipulate, and that also 21 | # therefore means that it is reserved for developers and experienced 22 | # professionals having in-depth computer knowledge. 
Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import zipfile
import zlib
import tempfile
from datetime import datetime
import shutil
import subprocess
from imagen_databank import PSC2_FROM_PSC1
import json
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


QUARANTINE_PATH = '/imagen/FU3/RAW/QUARANTINE'
BIDS_PATH = '/neurospin/tmp/imagen/dcm2niix'
SKIP_PATH = '/imagen/mri_skip.json'


def quarantine_filename_semantics(filename):
    root, ext = os.path.splitext(filename)

    if ext != '.zip':
        logger.debug('%s: filename without ".zip" extension', filename)

    increment, suffix = root.split('_data_')
    increment = int(increment)

    psc1 = suffix[:-6]  # last 6 characters added by the upload portal
    if len(psc1) > 12:
        timepoint = psc1[12:]
        psc1 = psc1[:12]
    else:
        timepoint = None  # callers must handle file names without a timepoint
        logger.error('%s: missing timepoint', psc1)

    return increment, psc1, timepoint


def timestamps(top, include_dirs=True):
    min_timestamp = datetime.max
    max_timestamp = datetime.min

    for root, dirs, files in os.walk(top):
        if include_dirs:
            for dirname in dirs:
                path = os.path.join(root, dirname)
                timestamp = datetime.fromtimestamp(os.path.getmtime(path))
                min_timestamp = min(timestamp, min_timestamp)
                max_timestamp = max(timestamp, max_timestamp)
        for filename in files:
            path = os.path.join(root, filename)
            timestamp = datetime.fromtimestamp(os.path.getmtime(path))
            min_timestamp = min(timestamp, min_timestamp)
            max_timestamp = max(timestamp, max_timestamp)

    return (min_timestamp, max_timestamp)


def list_datasets(path):
    datasets = {}

    for zip_file in os.listdir(path):
        zip_path = os.path.join(path, zip_file)
        root, ext = os.path.splitext(zip_file)

        if ext != '.zip':
            logger.debug('%s: this is not a ZIP file', zip_file)
            continue
        elif not zipfile.is_zipfile(zip_path):
            logger.warning('%s: skip invalid ZIP file', zip_file)
            continue

        # Unix timestamp of the ZIP file
        timestamp = os.path.getmtime(zip_path)

        # semantics of ZIP file name
        increment, psc1, timepoint = quarantine_filename_semantics(zip_file)
        if timepoint is None:
            continue  # skip malformed file names instead of failing later

        # compare increment/timestamp of ZIP files, keep most recent
        timepoint_datasets = datasets.setdefault(timepoint, {})
        if psc1 in timepoint_datasets:
            old_zip_path, old_increment, old_timestamp = timepoint_datasets[psc1]
            if increment <= old_increment or timestamp <= old_timestamp:
                if increment >= old_increment or timestamp >= old_timestamp:
                    logger.error('%s: inconsistent timestamps', zip_file)
                continue
        timepoint_datasets[psc1] = (zip_path, increment, timestamp)

    return datasets


def dcm2nii(src, dst, comment):
    status = 0

    logger.info('%s: running dcm2niix: %s', src, dst)

    dcm2niix = ['dcm2niix',
                '-z', 'y', '-9',  # note the comma: '-9' and '-c' are separate options
                '-c', comment,
                '-o', dst,
                src]
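    # dcm2niix options, as understood here: "-z y" writes compressed .nii.gz
    # output, "-9" picks the highest gzip compression level, "-c" stores the
    # comment (PSC2 code and timepoint) in the NIfTI header and "-o" sets the
    # output directory - treat this reading as a sketch and check `dcm2niix -h`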
    completed = subprocess.run(dcm2niix,
                               capture_output=True)
    if completed.returncode:
        logger.error('%s: dcm2niix failed: %s',
                     src, completed.stdout)
        status = completed.returncode

    return status


def deidentify(timepoint, psc1, zip_path, bids_path):
    logger.info('%s/%s: deidentify', psc1, timepoint)

    psc2 = PSC2_FROM_PSC1[psc1]
    out_sub_path = os.path.join(bids_path, 'sub-' + psc2)
    out_ses_path = os.path.join(out_sub_path, 'ses-' + timepoint)

    # skip ZIP files that have already been processed
    if os.path.isdir(out_ses_path):
        zip_timestamp = datetime.fromtimestamp(os.path.getmtime(zip_path))
        min_timestamp, max_timestamp = timestamps(out_ses_path)
        if min_timestamp > zip_timestamp:
            return
        else:
            shutil.rmtree(out_ses_path)
    # out_ses_path is created below, once the ZIP file has been successfully
    # unpacked - creating it here would leave an empty session directory
    # behind whenever the ZIP file turns out to be corrupt

    status = 0
    prefix = 'imagen-mri-' + psc1
    with tempfile.TemporaryDirectory(prefix=prefix) as tempdir:
        # unpack ZIP file into temporary directory
        try:
            with zipfile.ZipFile(zip_path) as zip_file:
                zip_file.extractall(tempdir)
        except (zipfile.BadZipFile, OSError, EOFError, zlib.error) as e:
            logger.error('%s/%s: corrupt ZIP file: %s',
                         psc1, timepoint, str(e))
            return

        os.makedirs(out_ses_path)
        status = dcm2nii(tempdir, out_ses_path,
                         psc2 + '/' + timepoint)

    if status:
        shutil.rmtree(out_ses_path)
        if not os.listdir(out_sub_path):  # empty directory
            os.rmdir(out_sub_path)

    return status


def main():
    datasets = list_datasets(QUARANTINE_PATH)

    # read the skip list once instead of once per dataset
    with open(SKIP_PATH) as skip_file:
        skip = json.load(skip_file)

    for timepoint, timepoint_datasets in datasets.items():
        for psc1, (zip_path, increment, timestamp) in timepoint_datasets.items():
            if timepoint in skip and psc1 in skip[timepoint]:
                continue
            deidentify(timepoint, psc1, zip_path, BIDS_PATH)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/onsets/imagen_onsets_copy_FU3.sh:
--------------------------------------------------------------------------------
#!/bin/sh

SOURCE='/neurospin/imagen/FU3/RAW/PSC2/onsets'
TARGET='/neurospin/imagen/FU3/processed/nifti'

for f in "${SOURCE}/"*.csv
do
    basename=`basename "$f" '.csv'`
    psc2=`echo "$basename" | sed -e 's/^.*_//; s/FU3$//'`
    if [ -d "${TARGET}/${psc2}" ]
    then
        mkdir -p "${TARGET}/${psc2}/BehaviouralData"
        cp -p "${SOURCE}/${basename}.csv" "${TARGET}/${psc2}/BehaviouralData/"
    else
        >&2 echo "ERROR: $psc2: missing folder!"
    fi
done
--------------------------------------------------------------------------------
/onsets/imagen_onsets_copy_STRATIFY.sh:
--------------------------------------------------------------------------------
#!/bin/sh

SOURCE='/neurospin/imagen/STRATIFY/RAW/PSC2/onsets'
TARGET='/neurospin/imagen/STRATIFY/processed/nifti'

for f in "${SOURCE}/"*.csv
do
    basename=`basename "$f" '.csv'`
    psc2=`echo "$basename" | sed -e 's/^.*_//; s/SB$//'`
    if [ -d "${TARGET}/${psc2}" ]
    then
        mkdir -p "${TARGET}/${psc2}/BehaviouralData"
        cp -p "${SOURCE}/${basename}.csv" "${TARGET}/${psc2}/BehaviouralData/"
    else
        >&2 echo "ERROR: $psc2: missing folder!"
16 | fi 17 | done 18 | -------------------------------------------------------------------------------- /onsets/imagen_onsets_extract_deidentify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import zipfile 5 | from datetime import datetime 6 | from tempfile import TemporaryDirectory 7 | from multiprocessing import Pool 8 | from imagen_databank import PSC2_FROM_PSC1, DOB_FROM_PSC1 9 | import logging 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | WORKER_PROCESSES = 8 14 | 15 | DATASETS_FU3_SB = '/neurospin/imagen/FU3/RAW/QUARANTINE' 16 | ONSETS = { 17 | 'FU3': '/neurospin/imagen/FU3/RAW/PSC2/onsets', 18 | 'SB': '/neurospin/imagen/STRATIFY/RAW/PSC2/onsets', 19 | } 20 | 21 | 22 | def _parse_onsets_datetime(date_string): 23 | """Read date in the format found in CSV files. 24 | 25 | """ 26 | DATE_FORMATS = ( 27 | '%d.%m.%Y %H:%M:%S', 28 | '%d/%m/%Y %H:%M:%S', 29 | ) 30 | for date_format in DATE_FORMATS: 31 | try: 32 | dt = datetime.strptime(date_string, date_format) 33 | return dt 34 | except ValueError: 35 | pass 36 | return None 37 | 38 | 39 | def _extract_psc1_timestamp(path): 40 | """Extract time stamp from FU3 / Stratify zip files in QUARANTINE. 41 | 42 | Parameters 43 | ---------- 44 | path : unicode 45 | Zip file name. 46 | 47 | Returns 48 | ------- 49 | tuple (str, int) 50 | PSC1 code and database increment number from tarball file name. 51 | 52 | """ 53 | path = os.path.basename(path) 54 | root, ext = os.path.splitext(path) 55 | 56 | # extract database increment number and PSC1 57 | increment, data, psc1 = root.split('_', 2) 58 | assert(increment.isdigit()) 59 | increment = int(increment) 60 | while not psc1[:12].isdigit(): 61 | split = psc1.split('_', 1) 62 | if len(split) > 1: 63 | psc1 = split[-1] 64 | else: 65 | psc1 = None 66 | break 67 | else: 68 | psc1 = psc1[:12] 69 | 70 | return psc1, increment 71 | 72 | 73 | def process_behavioural(path, timepoint, prefix, psc1, psc2): 74 | logging.info('%s: processing behavioural file...', path) 75 | 76 | with open(path, encoding='latin-1', newline='') as content: 77 | output_path = ONSETS[timepoint] 78 | output = os.path.join(output_path, prefix + '_' + psc2 + timepoint + '.csv') 79 | with open(output, 'w') as output: 80 | # de-identify 1st line 81 | line = next(iter(content)) 82 | column = line.split('\t') 83 | if psc1 in DOB_FROM_PSC1: 84 | column[1] = str((_parse_onsets_datetime(column[1]).date() - 85 | DOB_FROM_PSC1[psc1]).days) 86 | else: 87 | column[1] = '' 88 | column[2] = column[2].replace(psc1, psc2) 89 | line = '\t'.join(column) 90 | # write to target file 91 | output.write(line) 92 | for line in content: 93 | output.write(line) 94 | 95 | 96 | def process_dataset(arguments): 97 | (timepoint, psc1, psc2, dataset_path) = arguments # unpack multiple arguments 98 | 99 | logging.info('%s: processing zipped %s dataset...', psc1, timepoint) 100 | 101 | with TemporaryDirectory(prefix='imagen_behavioural_') as tmp: 102 | with zipfile.ZipFile(dataset_path) as dataset_zipfile: 103 | members = dataset_zipfile.infolist() 104 | 105 | for prefix in ('ft', 'mid', 'recog', 'ss'): 106 | for member in members: 107 | if member.filename == (psc1 + timepoint + '/AdditionalData/Scanning/' + 108 | prefix + '_' + psc1 + timepoint + '.csv'): 109 | dataset_zipfile.extract(member, path=tmp) 110 | behavioural_path = os.path.join(tmp, member.filename) 111 | process_behavioural(behavioural_path, timepoint, prefix, psc1, psc2) 112 | break 113 | else: 114 | 
logging.warning('%s: missing %s_*.csv file', psc1, prefix) 115 | 116 | logging.info('%s: processed zipped %s dataset', psc1, timepoint) 117 | 118 | 119 | def list_datasets(path, timepoint): 120 | # list zip files to process 121 | # for subjects with multiple zip files, keep the most recent one 122 | datasets = {} 123 | for dataset in os.listdir(path): 124 | root, ext = os.path.splitext(dataset) 125 | if ext != '.zip': 126 | continue 127 | increment, data, psc1 = root.split('_', 2) 128 | assert(increment.isdigit() and data == 'data' and 129 | psc1[:12].isdigit()) 130 | if psc1[12:12+len(timepoint)] != timepoint: 131 | continue 132 | 133 | psc1, timestamp = _extract_psc1_timestamp(dataset) 134 | dataset_path = os.path.join(path, dataset) 135 | datasets.setdefault(psc1, {})[timestamp] = dataset_path 136 | 137 | logging.info('found %d zipped %s datasets', len(datasets), timepoint) 138 | 139 | return[(psc1, timestamps[max(timestamps.keys())]) # keep latest dataset 140 | for (psc1, timestamps) in datasets.items()] 141 | 142 | 143 | def process_datasets(path, timepoint): 144 | todo_list = list(list_datasets(path, timepoint)) 145 | todo_list = [(timepoint, psc1, PSC2_FROM_PSC1[psc1], path) for (psc1, path) in todo_list] 146 | 147 | pool = Pool(WORKER_PROCESSES) 148 | results = pool.map(process_dataset, todo_list) 149 | pool.close() 150 | pool.join() 151 | return results 152 | 153 | 154 | def main(): 155 | for timepoint in ('FU3', 'SB'): 156 | results = process_datasets(DATASETS_FU3_SB, timepoint) 157 | 158 | 159 | if __name__ == "__main__": 160 | main() 161 | -------------------------------------------------------------------------------- /psc/imagen_update_dawba_codes_from_tokens.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Download Dawba codes for Imagen FU3 and Stratify and update conversion table. 3 | 4 | ========== 5 | Attributes 6 | ========== 7 | 8 | Output 9 | ------ 10 | 11 | PSC2PSC : str 12 | Table of conversion between participant codes (PSC1, Dawba, PSC2). 13 | 14 | """ 15 | 16 | import os 17 | import requests 18 | import json 19 | import base64 20 | from urllib.parse import urlparse 21 | import datetime 22 | import logging 23 | from imagen_databank import PSC2_FROM_PSC1 24 | logging.basicConfig(level=logging.INFO) 25 | 26 | # The LSRC2 service at Delosis. 27 | LSRC2_BASE_URL = 'https://www.delosis.com/qs/index.php/admin/remotecontrol' 28 | # Since credentials are different between the legacy and the LSRC2 service, 29 | # and ~/.netrc allows only a single set of credentials per server, store 30 | # LSRC2 credentials in an alternate file. 
31 | LSRC2_NETRC_FILE = '~/.lsrc2' 32 | # The PSC1, Dawba, PSC2 conversion table 33 | PSC2PSC = '/neurospin/imagen/src/scripts/psc_tools/psc2psc.csv' 34 | PSC2PSC_SB = '/neurospin/imagen/src/scripts/psc_tools/psc2psc_SB.csv' 35 | 36 | 37 | class LimeSurveyError(Exception): 38 | def __init__(self, message, code): 39 | super().__init__(message) 40 | self.code = code 41 | 42 | 43 | def error2exception(func): 44 | def wrapper(*args, **kwargs): 45 | response, error = func(*args, **kwargs) 46 | if error: 47 | try: 48 | code = error['code'] 49 | message = error['message'] 50 | except (TypeError, KeyError): 51 | code = -32603 # internal JSON-RPC error 52 | message = 'Unexpected JSON-RPC error type' 53 | raise LimeSurveyError(message, code) 54 | return response 55 | return wrapper 56 | 57 | 58 | class LimeSurveySession: 59 | """LimeSurvey JSON-RPC LSRC2 session 60 | 61 | Documented here: 62 | https://www.delosis.com/qs/index.php/admin/remotecontrol 63 | https://manual.limesurvey.org/RemoteControl_2_API 64 | 65 | """ 66 | __request_id = 0 67 | 68 | def __init__(self, url, username, password): 69 | self.url = url 70 | # start a Requests session 71 | self.session = requests.Session() 72 | # Keep-alive is 100% automatic in Requests, thanks to urllib3 73 | self.session.headers.update({'content-type': 'application/json'}) 74 | # start a LimeSurvey RemoteControl 2 session 75 | self.key = self._get_session_key(username, password) 76 | 77 | def __enter__(self): 78 | return self 79 | 80 | def __exit__(self, type, value, traceback): 81 | self.close() 82 | return False # re-raises the exception 83 | 84 | def close(self): 85 | """Release LimeSurvey session key, then close Requests session""" 86 | self._release_session_key(self.key) 87 | self.key = None 88 | self.session.close() 89 | 90 | @staticmethod 91 | def _generate_request_id(): 92 | LimeSurveySession.__request_id += 1 93 | return LimeSurveySession.__request_id 94 | 95 | @staticmethod 96 | def _request(method, params): 97 | return { 98 | 'jsonrpc': '2.0', 99 | 'id': LimeSurveySession._generate_request_id(), 100 | 'method': method, 101 | 'params': params, 102 | } 103 | 104 | def _post(self, request): 105 | logging.debug('JSON-RPC request: %s', request) 106 | assert 'method' in request and 'params' in request and 'id' in request 107 | response = self.session.post(self.url, data=json.dumps(request)) 108 | response = response.json() 109 | logging.debug('JSON-RPC response: %s', response) 110 | assert response['id'] == request['id'] 111 | result = response['result'] 112 | error = response['error'] 113 | if error: 114 | logging.error('JSON-RPC error: %s', error) 115 | return result, error 116 | 117 | def _get_session_key(self, username, password): 118 | request = self._request('get_session_key', [username, password]) 119 | response, error = self._post(request) 120 | 121 | # fix non-sensical LSRC2 error handling 122 | # completely at odds with JSON-RPC error handling 123 | try: 124 | status = response['status'] 125 | except (TypeError, KeyError): 126 | if error is not None: 127 | logging.error('LSRC2 failed to create a session key') 128 | response = None 129 | else: 130 | logging.info('LSRC2 new session key: %s', response) 131 | else: 132 | logging.error(status) 133 | error = { 134 | 'code': -32099, # implementation-defined error in JSON-RPC 135 | 'message': status, 136 | } 137 | response = None 138 | 139 | return response 140 | 141 | def _release_session_key(self, key): 142 | request = self._request('release_session_key', [key]) 143 | logging.info('LSRC2 release 
session key: %s', key) 144 | dummy_response, dummy_error = self._post(request) # returns ('OK', None) even if bogus key 145 | 146 | @error2exception 147 | def surveys(self): 148 | request = self._request('list_surveys', [self.key]) 149 | return self._post(request) 150 | 151 | @error2exception 152 | def participants(self, survey, attributes=False): 153 | request = self._request('list_participants', 154 | [self.key, survey, 0, 5000, False, attributes]) 155 | responses, error = self._post(request) 156 | 157 | # fix non-sensical LSRC2 error handling 158 | # completely at odds with JSON-RPC error handling 159 | try: 160 | status = responses['status'] 161 | except (TypeError, KeyError): 162 | pass 163 | else: 164 | # LSRC2 returns errors as a dict with a 'status' attribute 165 | if status == 'No Tokens found': 166 | # When a survey is empty, LSRC2 also returns a dict: 167 | # {"status": "No Tokens found"} 168 | if error is not None: 169 | logging.error('JSON-RPC error report does not match "status"') 170 | error = None 171 | else: 172 | error = { 173 | 'code': -32099, # implementation-defined error in JSON-RPC 174 | 'message': status, 175 | } 176 | responses = [] 177 | 178 | return responses, error 179 | 180 | @error2exception 181 | def participant_properties(self, survey, participant, attributes): 182 | request = self._request('get_participant_properties', 183 | [self.key, survey, participant, attributes]) 184 | return self._post(request) 185 | 186 | @error2exception 187 | def responses(self, survey, status='all'): 188 | request = self._request('export_responses', 189 | [self.key, survey, 'csv', None, status]) 190 | responses, error = self._post(request) 191 | 192 | try: 193 | responses = base64.b64decode(responses).decode('utf_8').split('\n') 194 | except TypeError: 195 | # fix non-sensical LSRC2 error handling 196 | # completely at odds with JSON-RPC error handling 197 | try: 198 | status = responses['status'] 199 | except (TypeError, KeyError): 200 | message = 'JSON-RPC function "export_responses" expected a Base64-encoded string' 201 | logging.error(message) 202 | error = { 203 | 'code': -32099, # implementation-defined error in JSON-RPC 204 | 'message': message, 205 | } 206 | else: 207 | # LSRC2 returns errors as a dict with a 'status' attribute 208 | if status == 'No Data, could not get max id.': 209 | # When a survey is empty, LSRC2 also returns a dict: 210 | # {"status": "No Data, could not get max id."} 211 | if error is not None: 212 | logging.error('JSON-RPC error report does not match "status"') 213 | error = None 214 | else: 215 | error = { 216 | 'code': -32099, # implementation-defined error in JSON-RPC 217 | 'message': status, 218 | } 219 | responses = [] 220 | 221 | return responses, error 222 | 223 | 224 | def _get_netrc_auth(url): 225 | try: 226 | netrc_path = os.path.expanduser(LSRC2_NETRC_FILE) 227 | except KeyError: 228 | import warnings 229 | warnings.warn('Unable to find home directory') 230 | return 231 | if not os.path.exists(netrc_path): 232 | return 233 | 234 | netloc = urlparse(url).netloc 235 | 236 | try: 237 | from netrc import netrc, NetrcParseError 238 | try: 239 | authenticators = netrc(netrc_path).authenticators(netloc) 240 | except (NetrcParseError, OSError): 241 | return 242 | if authenticators: 243 | return (authenticators[0], authenticators[2]) 244 | except (ImportError): 245 | return 246 | 247 | 248 | def download_lsrc2_tokens(base_url, startswith=None): 249 | """JSON RPC calls to LSRC2 service to retrieve tokens. 
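Parameters
----------
base_url : str
URL of the LSRC2 JSON-RPC endpoint.
startswith : str
Process only surveys whose title starts with this prefix.

Returns
-------
dict
Dawba code, indexed by PSC1 code. Subjects associated with multiple
inconsistent Dawba codes are left out.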
250 | 251 | """ 252 | username, password = _get_netrc_auth(base_url) 253 | with LimeSurveySession(base_url, username, password) as session: 254 | dawba_from_psc1 = {} 255 | 256 | surveys = session.surveys() 257 | for survey in surveys: 258 | title = survey['surveyls_title'] 259 | sid = survey['sid'] 260 | active = survey['active'] 261 | 262 | if title.startswith(startswith): 263 | if active == 'N': 264 | logging.info('skip inactive survey: %s', title) 265 | continue 266 | else: 267 | logging.info('read survey: %s', title) 268 | else: 269 | logging.info('skip survey: %s', title) 270 | continue 271 | 272 | # subjects in surveys are identified by "sid" and "token" 273 | # retrieve correlation between "token" and PSC1 and Dawba codes 274 | psc1_from_token = {} 275 | dawba_from_token = {} 276 | participants = session.participants(sid, ['completed', 'reminded', 'attribute_1', 'attribute_2']) 277 | 278 | for participant in participants: 279 | token = participant['token'] 280 | if ('reminded' in participant and participant['reminded'] == 'Duplicate' or 281 | 'completed' in participant and participant['completed'] == 'N'): 282 | continue 283 | # PSC1 284 | if 'attribute_1' in participant: 285 | psc1 = participant['attribute_1'].strip() 286 | if psc1.endswith('SB'): 287 | psc1 = psc1[:-2] 288 | if psc1.endswith('FU3'): 289 | psc1 = psc1[:-3] 290 | if psc1.isdigit(): 291 | if token in psc1_from_token: 292 | if psc1 != psc1_from_token[token]: 293 | logging.error('survey: %s: duplicate token has inconsistent PSC1 codes: %s / %s', 294 | title, psc1_from_token[token], psc1) 295 | else: 296 | logging.warning('survey: %s: duplicate token for PSC1 code: %s', 297 | title, psc1) 298 | else: 299 | psc1_from_token[token] = psc1 300 | else: 301 | logging.info('survey: %s: skipping invalid PSC1 code: %s', 302 | title, psc1) 303 | else: 304 | logging.error('survey: %s: token %s lacks a PSC1 code', 305 | title, token) 306 | # Dawba 307 | if 'attribute_2' in participant: 308 | dawba = participant['attribute_2'] 309 | if dawba: 310 | dawba = dawba.strip() 311 | if dawba.isdigit(): 312 | if token in dawba_from_token: 313 | if dawba != dawba_from_token[token]: 314 | logging.error('survey: %s: duplicate token has inconsistent Dawba codes: %s / %s', 315 | title, dawba_from_token[token], dawba) 316 | else: 317 | logging.warning('survey: %s: duplicate token for Dawba code: %s', 318 | title, dawba) 319 | else: 320 | dawba_from_token[token] = dawba 321 | elif dawba == '-': 322 | logging.warning("survey: %s: %s: skipping empty Dawba code '-'", 323 | title, psc1) 324 | else: 325 | logging.info('survey: %s: %s: skipping invalid Dawba code: %s', 326 | title, psc1, dawba) 327 | else: 328 | logging.info('survey: %s: %s: skipping empty Dawba code', 329 | title, psc1) 330 | else: 331 | logging.error('survey: %s: token %s lacks a Dawba code', 332 | title, token) 333 | 334 | for token in psc1_from_token.keys() & dawba_from_token.keys(): 335 | psc1 = psc1_from_token[token] 336 | dawba = dawba_from_token[token] 337 | dawba_from_psc1.setdefault(psc1, {}).setdefault(dawba, set()) 338 | dawba_from_psc1[psc1][dawba].add(title) 339 | 340 | for psc1, codes in dawba_from_psc1.items(): 341 | if len(codes) > 1: 342 | message_details = '' 343 | for dawba, titles in codes.items(): 344 | message_details += '\t{}:\n\t\t{}\n'.format(dawba, '\n\t\t'.join(title for title in titles)) 345 | logging.error('%s: multiple Dawba codes:\n%s', 346 | psc1, message_details) 347 | dawba_from_psc1[psc1] = None 348 | else: 349 | dawba_from_psc1[psc1] = 
next(iter(dawba_from_psc1[psc1].keys())) 350 | dawba_from_psc1 = {psc1: dawba for psc1, dawba in dawba_from_psc1.items() 351 | if dawba} 352 | 353 | return dawba_from_psc1 354 | 355 | 356 | def main(): 357 | projects = ( 358 | (PSC2PSC, 'Imagen FUIII - Core'), 359 | (PSC2PSC_SB, 'STRATIFY Core'), 360 | ) 361 | 362 | for psc2psc, startswith in projects: 363 | dawba_from_psc1 = download_lsrc2_tokens(LSRC2_BASE_URL, startswith) 364 | 365 | root, ext = os.path.splitext(psc2psc) 366 | output = root + '_' + datetime.date.today().isoformat() + ext 367 | with open(psc2psc, 'r') as p, open(output, 'w') as o: 368 | # skip header line 369 | line = next(p).strip('\n') 370 | print(line, file=o) 371 | 372 | done = set() 373 | for line in p: 374 | line = line.strip('\n') 375 | psc1, dawba, psc2 = line.split('=') 376 | if (int(dawba) > 200000 or # process only FU3 and Stratify 377 | dawba == '000000'): 378 | if psc1 in dawba_from_psc1: 379 | if dawba != dawba_from_psc1[psc1]: 380 | if dawba == '000000': 381 | logging.info('%s: Dawba code initialized from %s to %s', 382 | psc1, dawba, dawba_from_psc1[psc1]) 383 | else: 384 | logging.error('%s: Dawba code changed from %s to %s', 385 | psc1, dawba, dawba_from_psc1[psc1]) 386 | dawba = dawba_from_psc1[psc1] 387 | line = '='.join((psc1, dawba, psc2)) 388 | done.add(psc1) 389 | print(line, file=o) 390 | 391 | for psc1 in (dawba_from_psc1.keys() - done): 392 | dawba = dawba_from_psc1[psc1] 393 | psc2 = PSC2_FROM_PSC1[psc1] 394 | line = '='.join((psc1, dawba, psc2)) 395 | print(line, file=o) 396 | 397 | 398 | if __name__ == "__main__": 399 | main() 400 | -------------------------------------------------------------------------------- /psytools/imagen_psytools_deidentify.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Re-encode and pseudonymize Psytools CSV files (BL, FU1, FU2, FU3 and Stratify). 3 | 4 | This script replaces the Scito pseudonymization pipeline. 5 | 6 | ========== 7 | Attributes 8 | ========== 9 | 10 | Input 11 | ----- 12 | 13 | PSYTOOLS_BL_DERIVED_DIR : str 14 | Location of BL PSC1-encoded files. 15 | PSYTOOLS_FU1_DERIVED_DIR : str 16 | Location of FU1 PSC1-encoded files. 17 | PSYTOOLS_FU2_DERIVED_DIR : str 18 | Location of FU2 PSC1-encoded files. 19 | PSYTOOLS_FU3_DERIVED_DIR : str 20 | Location of FU3 PSC1-encoded files. 21 | PSYTOOLS_STRATIFY_DERIVED_DIR : str 22 | Location of Stratify PSC1-encoded files. 23 | 24 | Output 25 | ------ 26 | 27 | PSYTOOLS_BL_PSC2_DIR : str 28 | Location of BL PSC2-encoded files. 29 | PSYTOOLS_FU1_PSC2_DIR : str 30 | Location of FU1 PSC2-encoded files. 31 | PSYTOOLS_FU2_PSC2_DIR : str 32 | Location of FU2 PSC2-encoded files. 33 | PSYTOOLS_FU3_PSC2_DIR : str 34 | Location of FU3 PSC2-encoded files. 35 | PSYTOOLS_STRATIFY_PSC2_DIR : str 36 | Location of Stratify PSC2-encoded files. 
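PSYTOOLS_STRATIFY_FU_PSC2_DIR : str
Location of Stratify follow-up PSC2-encoded files.
PSYTOOLS_IMACOV19_BL_PSC2_DIR and related constants : str
Location of PSC2-encoded files of the IMACOV19 and STRATICO19
Covid-19 questionnaires, one constant per timepoint (see below).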
37 | 38 | """ 39 | 40 | PSYTOOLS_BL_DERIVED_DIR = '/tmp/imagen/BL/processed/psytools' 41 | PSYTOOLS_BL_PSC2_DIR = '/neurospin/imagen/BL/processed/psytools' 42 | PSYTOOLS_FU1_DERIVED_DIR = '/tmp/imagen/FU1/processed/psytools' 43 | PSYTOOLS_FU1_PSC2_DIR = '/neurospin/imagen/FU1/processed/psytools' 44 | PSYTOOLS_FU2_DERIVED_DIR = '/tmp/imagen/FU2/processed/psytools' 45 | PSYTOOLS_FU2_PSC2_DIR = '/neurospin/imagen/FU2/processed/psytools' 46 | PSYTOOLS_FU3_DERIVED_DIR = '/tmp/imagen/FU3/processed/psytools' 47 | PSYTOOLS_FU3_PSC2_DIR = '/neurospin/imagen/FU3/processed/psytools' 48 | PSYTOOLS_STRATIFY_DERIVED_DIR = '/tmp/imagen/STRATIFY/processed/psytools' 49 | PSYTOOLS_STRATIFY_PSC2_DIR = '/neurospin/imagen/STRATIFY/processed/psytools' 50 | PSYTOOLS_STRATIFY_FU_DERIVED_DIR = '/tmp/imagen/STRATIFY_FU/processed/psytools' 51 | PSYTOOLS_STRATIFY_FU_PSC2_DIR = '/neurospin/imagen/STRATIFY_FU/processed/psytools' 52 | PSYTOOLS_IMACOV19_BL_DERIVED_DIR = '/tmp/imagen/IMACOV19_BL/processed/psytools' 53 | PSYTOOLS_IMACOV19_BL_PSC2_DIR = '/neurospin/imagen/IMACOV19_BL/processed/psytools' 54 | PSYTOOLS_IMACOV19_FU_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU/processed/psytools' 55 | PSYTOOLS_IMACOV19_FU_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU/processed/psytools' 56 | PSYTOOLS_IMACOV19_FU2_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU2/processed/psytools' 57 | PSYTOOLS_IMACOV19_FU2_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU2/processed/psytools' 58 | PSYTOOLS_IMACOV19_FU3_DERIVED_DIR = '/tmp/imagen/IMACOV19_FU3/processed/psytools' 59 | PSYTOOLS_IMACOV19_FU3_PSC2_DIR = '/neurospin/imagen/IMACOV19_FU3/processed/psytools' 60 | PSYTOOLS_STRATICO19_BL_DERIVED_DIR = '/tmp/imagen/STRATICO19_BL/processed/psytools' 61 | PSYTOOLS_STRATICO19_BL_PSC2_DIR = '/neurospin/imagen/STRATICO19_BL/processed/psytools' 62 | PSYTOOLS_STRATICO19_FU_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU/processed/psytools' 63 | PSYTOOLS_STRATICO19_FU_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU/processed/psytools' 64 | PSYTOOLS_STRATICO19_FU2_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU2/processed/psytools' 65 | PSYTOOLS_STRATICO19_FU2_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU2/processed/psytools' 66 | PSYTOOLS_STRATICO19_FU3_DERIVED_DIR = '/tmp/imagen/STRATICO19_FU3/processed/psytools' 67 | PSYTOOLS_STRATICO19_FU3_PSC2_DIR = '/neurospin/imagen/STRATICO19_FU3/processed/psytools' 68 | 69 | 70 | import os 71 | from csv import DictReader 72 | from csv import DictWriter 73 | from datetime import datetime 74 | import logging 75 | logging.basicConfig(level=logging.INFO) 76 | 77 | # import ../imagen_databank 78 | import sys 79 | sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')) 80 | from imagen_databank import PSC2_FROM_PSC1 81 | from imagen_databank import DOB_FROM_PSC1 82 | 83 | 84 | def _deidentify_legacy(psc2_from_psc1, psytools_path, psc2_path): 85 | """Anonymize and re-encode a legacy Psytools questionnaire from PSC1 to PSC2. 86 | 87 | Legacy questionnaires are in long format. 88 | 89 | Parameters 90 | ---------- 91 | psc2_from_psc1: map 92 | Conversion table, from PSC1 to PSC2. 93 | psytools_path: str 94 | Input: PSC1-encoded Psytools file. 95 | psc2_path: str 96 | Output: PSC2-encoded Psytools file. 
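Notes
-----
Rows are keyed on the 'User code' column: the -C/-P/-I suffixes of
Imagen subject identifiers are kept, 'FU'/'SU' follow-up participants
are discarded, and trailing 'FU3'/'SB' suffixes are stripped.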
97 | 
98 | """
99 | with open(psytools_path, 'r') as psc1_file:
100 | psc1_reader = DictReader(psc1_file, dialect='excel')
101 | 
102 | # de-identify columns with timestamps
103 | ANONYMIZED_COLUMNS = {
104 | 'Completed Timestamp': ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'),
105 | 'Processed Timestamp': ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'),
106 | }
107 | convert = [fieldname for fieldname in psc1_reader.fieldnames
108 | if fieldname in ANONYMIZED_COLUMNS]
109 | 
110 | # discard other columns with dates
111 | DISCARDED_COLUMNS = {
112 | 'id_check_dob', 'id_check_gender', 'id_check_relation',
113 | # FU3 / NI DATA
114 | 'DATE_BIRTH_1', 'DATE_BIRTH_2', 'DATE_BIRTH_3',
115 | 'TEST_DATE_1', 'TEST_DATE_2', 'TEST_DATE_3'
116 | }
117 | 
118 | # read/process each row and save for later writing
119 | rows = {}
120 | for row in psc1_reader:
121 | psc1, suffix = row['User code'][:12], row['User code'][12:]
122 | if psc1 in psc2_from_psc1:
123 | psc2 = psc2_from_psc1[psc1]
124 | if suffix in {'-C', '-P', '-I'}:
125 | # keep the suffix of Imagen subject IDs
126 | # -C Child
127 | # -P Parent
128 | # -I Institute
129 | row['User code'] = psc2 + suffix
130 | else:
131 | if suffix in {'FU', 'SU'}:
132 | # short-term decision: discard "FU" and "SU" follow-up participants
133 | # from Stratify and LimeSurvey-derived files
134 | logging.info('discarding STRATIFY follow-up participant %s!',
135 | row['User code'])
136 | continue
137 | elif suffix not in {'FU3', 'SB'}: # 'SU' (ESTRA follow-up) removed temporarily to generate the Stratify Psytools files
138 | # remove "FU3" and "SB" suffixes
139 | # in Stratify and LimeSurvey-derived files
140 | logging.error('unknown suffix %s in user code %s',
141 | suffix, row['User code'])
142 | row['User code'] = psc2
143 | else:
144 | logging.error('unknown PSC1 code %s in user code %s',
145 | psc1, row['User code'])
146 | continue
147 | 
148 | # de-identify columns with timestamps
149 | for fieldname in convert:
150 | if psc1 in DOB_FROM_PSC1:
151 | birth = DOB_FROM_PSC1[psc1]
152 | for timestamp_format in ANONYMIZED_COLUMNS[fieldname]:
153 | try:
154 | timestamp = datetime.strptime(row[fieldname],
155 | timestamp_format).date()
156 | except ValueError:
157 | continue
158 | else:
159 | age = timestamp - birth
160 | row[fieldname] = str(age.days)
161 | break
162 | else:
163 | logging.error('%s: invalid "%s": %s',
164 | psc1, fieldname, row[fieldname])
165 | row[fieldname] = None
166 | else:
167 | row[fieldname] = None
168 | 
169 | # convert dates to age in days at that date - a date of birth should yield 0 if correct!
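# (for example, for a hypothetical subject born on 01-01-2000, an
# 'education_end' value of '15-06-2010' becomes '3818', the age in
# days at that date)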
170 | # FU2 / ESPAD CHILD 171 | # FU2 / NI DATA 172 | for column in ('education_end', 'ni_period', 'ni_date'): 173 | if column in psc1_reader.fieldnames: 174 | if psc1 in DOB_FROM_PSC1: 175 | birth = DOB_FROM_PSC1[psc1] 176 | try: 177 | d = datetime.strptime(row[column], 178 | '%d-%m-%Y').date() 179 | except ValueError: 180 | row[column] = None 181 | else: 182 | age = d - birth 183 | row[column] = str(age.days) 184 | else: 185 | row[column] = None 186 | 187 | # convert to age of parents in days at assessment 188 | # BL/FU1 / PBQ 189 | for column in ('pbq_01', 'pbq_02'): 190 | if column in psc1_reader.fieldnames: 191 | try: 192 | birth = datetime.strptime(row[column], 193 | '%d-%m-%Y').date() 194 | except ValueError: 195 | row[column] = None 196 | else: 197 | # last 'timestamp' ought to be 'Processed timestamp' 198 | age = timestamp - birth 199 | row[column] = str(age.days) 200 | 201 | # discard other columns with dates 202 | for column in DISCARDED_COLUMNS: 203 | if column in psc1_reader.fieldnames: 204 | del row[column] 205 | 206 | rows.setdefault(psc2, []).append(row) 207 | 208 | # save rows into output file, sort by PSC2 209 | with open(psc2_path, 'w') as psc2_file: 210 | fieldnames = [fieldname for fieldname in psc1_reader.fieldnames 211 | if fieldname not in DISCARDED_COLUMNS] 212 | psc2_writer = DictWriter(psc2_file, fieldnames, dialect='excel') 213 | psc2_writer.writeheader() 214 | for psc2 in sorted(rows): 215 | for row in rows[psc2]: 216 | psc2_writer.writerow(row) 217 | 218 | 219 | def _psc1(psc1, psc2_from_psc1): 220 | if 'TEST' in psc1.upper(): 221 | # skip test subjects 222 | logging.debug('skipping test subject "%s"', psc1) 223 | else: 224 | # find and skip subjects with invalid identifier 225 | if psc1[-3:] in {'FU2', 'FU3'}: 226 | psc1 = psc1[:-3] 227 | elif psc1[-2:] in {'SB'}: #removing SU to skip followup acquisitions in SB psytools 228 | psc1 = psc1[:-2] 229 | if psc1 in psc2_from_psc1: 230 | return psc1 231 | elif psc1 in {'0x0000xxxxxx'}: 232 | logging.info('skipping known invalid subject identifier "%s"', 233 | psc1) 234 | else: 235 | logging.error('invalid subject identifier "%s"', psc1) 236 | return None 237 | 238 | 239 | def _deidentify_lsrc2(psc2_from_psc1, psytools_path, psc2_path): 240 | """Anonymize and re-encode an LSRC2 Psytools questionnaire from PSC1 to PSC2. 241 | 242 | LSRC2 questionnaires are in wide format. 243 | 244 | Parameters 245 | ---------- 246 | psc2_from_psc1: map 247 | Conversion table, from PSC1 to PSC2. 248 | psytools_path: str 249 | Input: PSC1-encoded Psytools file. 250 | psc2_path: str 251 | Output: PSC2-encoded Psytools file. 
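Notes
-----
Identifying columns such as 'token', 'ipaddr' or 'IdCheckDob' are
dropped entirely, while the 'startdate', 'datestamp' and 'submitdate'
timestamps are replaced by the age of the subject, in days, at that
date.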
252 | 
253 | """
254 | COLUMNS_TO_REMOVE = {
255 | 'token',
256 | 'ipaddr',
257 | 'IdCheckGender',
258 | 'IdCheckDob',
259 | 'geoLoc_search', # Covid-19 questionnaires
260 | }
261 | COLUMNS_WITH_DATE = {
262 | 'startdate',
263 | 'datestamp',
264 | 'submitdate',
265 | }
266 | 
267 | with open(psytools_path, 'r') as psc1_file:
268 | psc1_reader = DictReader(psc1_file, dialect='excel')
269 | # columns to remove entirely
270 | fieldnames = [x for x in psc1_reader.fieldnames
271 | if x not in COLUMNS_TO_REMOVE]
272 | with open(psc2_path, 'w') as psc2_file:
273 | psc2_writer = DictWriter(psc2_file, fieldnames, dialect='excel')
274 | psc2_writer.writeheader()
275 | for row in psc1_reader:
276 | # skip test and invalid subjects
277 | psc1 = _psc1(row['id'], psc2_from_psc1)
278 | if psc1:
279 | psc2 = psc2_from_psc1[psc1]
280 | # columns to remove entirely
281 | for x in COLUMNS_TO_REMOVE:
282 | if x in row:
283 | del row[x]
284 | # columns to de-identify
285 | row['id'] = psc2
286 | for x in COLUMNS_WITH_DATE:
287 | if x in row and row[x]:
288 | date = datetime.strptime(row[x],
289 | '%Y-%m-%d %H:%M:%S').date()
290 | if psc1 in DOB_FROM_PSC1:
291 | birth = DOB_FROM_PSC1[psc1]
292 | age = date - birth
293 | row[x] = age.days
294 | else:
295 | logging.error('unknown date of birth: "%s"',
296 | psc1)
297 | row[x] = None
298 | psc2_writer.writerow(row)
299 | 
300 | 
301 | def deidentify(psc2_from_psc1, master_dir, psc2_dir):
302 | """Anonymize and re-encode Psytools questionnaires within a directory.
303 | 
304 | PSC1-encoded files are read from `master_dir`, anonymized and converted
305 | from PSC1 codes to PSC2, and the result is written in `psc2_dir`.
306 | 
307 | Parameters
308 | ----------
309 | psc2_from_psc1: map
310 | Conversion table, from PSC1 to PSC2.
311 | master_dir: str
312 | Input directory with PSC1-encoded questionnaires.
313 | psc2_dir: str
314 | Output directory with PSC2-encoded and anonymized questionnaires.
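Notes
-----
Files are routed on their name: legacy Delosis exports (prefixes
'IMAGEN-', 'STRATIFY-', 'IMACOV19-', 'STRATICO19-') go through
_deidentify_legacy() and LSRC2 exports (prefixes 'Imagen_',
'STRATIFY_') through _deidentify_lsrc2().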
315 | 316 | """ 317 | CURRENTLY_NOT_PROPERLY_DEIDENTIFIED = { 318 | 'IMAGEN-IMGN_RELIABILITY_PI_FU2-BASIC_DIGEST.csv', 319 | 'IMAGEN-IMGN_RELIABILITY_FU3-BASIC_DIGEST.csv', 320 | 'STRATIFY_screening_(London).csv', 321 | 'STRATIFY_screening_(Southampton).csv', 322 | 'STRATIFY_screening_(ED).csv', 323 | } 324 | 325 | for filename in os.listdir(master_dir): 326 | if filename in CURRENTLY_NOT_PROPERLY_DEIDENTIFIED: 327 | continue 328 | master_path = os.path.join(master_dir, filename) 329 | psc2_path = os.path.join(psc2_dir, filename) 330 | if filename.startswith('IMAGEN-') or filename.startswith('STRATIFY-') or filename.startswith('IMACOV19-') or filename.startswith('STRATICO19-'): 331 | _deidentify_legacy(psc2_from_psc1, master_path, psc2_path) 332 | elif filename.startswith('Imagen_') or filename.startswith('STRATIFY_'): 333 | _deidentify_lsrc2(psc2_from_psc1, master_path, psc2_path) 334 | else: 335 | logging.error('skipping unknown file: %s', filename) 336 | 337 | 338 | def main(): 339 | # IMAGEN 340 | deidentify(PSC2_FROM_PSC1, 341 | PSYTOOLS_BL_DERIVED_DIR, PSYTOOLS_BL_PSC2_DIR) 342 | deidentify(PSC2_FROM_PSC1, 343 | PSYTOOLS_FU1_DERIVED_DIR, PSYTOOLS_FU1_PSC2_DIR) 344 | deidentify(PSC2_FROM_PSC1, 345 | PSYTOOLS_FU2_DERIVED_DIR, PSYTOOLS_FU2_PSC2_DIR) 346 | deidentify(PSC2_FROM_PSC1, 347 | PSYTOOLS_FU3_DERIVED_DIR, PSYTOOLS_FU3_PSC2_DIR) 348 | # STRATIFY/ESTRA 349 | deidentify(PSC2_FROM_PSC1, 350 | PSYTOOLS_STRATIFY_DERIVED_DIR, PSYTOOLS_STRATIFY_PSC2_DIR) 351 | #deidentify(PSC2_FROM_PSC1, 352 | # PSYTOOLS_STRATIFY_FU_DERIVED_DIR, PSYTOOLS_STRATIFY_FU_PSC2_DIR) 353 | # IMACOV 354 | deidentify(PSC2_FROM_PSC1, 355 | PSYTOOLS_IMACOV19_BL_DERIVED_DIR, PSYTOOLS_IMACOV19_BL_PSC2_DIR) 356 | deidentify(PSC2_FROM_PSC1, 357 | PSYTOOLS_IMACOV19_FU_DERIVED_DIR, PSYTOOLS_IMACOV19_FU_PSC2_DIR) 358 | deidentify(PSC2_FROM_PSC1, 359 | PSYTOOLS_IMACOV19_FU2_DERIVED_DIR, PSYTOOLS_IMACOV19_FU2_PSC2_DIR) 360 | deidentify(PSC2_FROM_PSC1, 361 | PSYTOOLS_IMACOV19_FU3_DERIVED_DIR, PSYTOOLS_IMACOV19_FU3_PSC2_DIR) 362 | # STRATICO 363 | deidentify(PSC2_FROM_PSC1, 364 | PSYTOOLS_STRATICO19_BL_DERIVED_DIR, PSYTOOLS_STRATICO19_BL_PSC2_DIR) 365 | deidentify(PSC2_FROM_PSC1, 366 | PSYTOOLS_STRATICO19_FU_DERIVED_DIR, PSYTOOLS_STRATICO19_FU_PSC2_DIR) 367 | deidentify(PSC2_FROM_PSC1, 368 | PSYTOOLS_STRATICO19_FU2_DERIVED_DIR, PSYTOOLS_STRATICO19_FU2_PSC2_DIR) 369 | deidentify(PSC2_FROM_PSC1, 370 | PSYTOOLS_STRATICO19_FU3_DERIVED_DIR, PSYTOOLS_STRATICO19_FU3_PSC2_DIR) 371 | 372 | 373 | if __name__ == "__main__": 374 | main() 375 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2017 CEA 2 | # 3 | # This software is governed by the CeCILL license under French law and 4 | # abiding by the rules of distribution of free software. You can use, 5 | # modify and/ or redistribute the software under the terms of the CeCILL 6 | # license as circulated by CEA, CNRS and INRIA at the following URL 7 | # "http://www.cecill.info". 8 | # 9 | # As a counterpart to the access to the source code and rights to copy, 10 | # modify and redistribute granted by the license, users are provided only 11 | # with a limited warranty and the software's author, the holder of the 12 | # economic rights, and the successive licensors have only limited 13 | # liability. 
14 | #
15 | # In this respect, the user's attention is drawn to the risks associated
16 | # with loading, using, modifying and/or developing or reproducing the
17 | # software by the user in light of its specific status of free software,
18 | # that may mean that it is complicated to manipulate, and that also
19 | # therefore means that it is reserved for developers and experienced
20 | # professionals having in-depth computer knowledge. Users are therefore
21 | # encouraged to load and test the software's suitability as regards their
22 | # requirements in conditions enabling the security of their systems and/or
23 | # data to be ensured and, more generally, to use and operate it in the
24 | # same conditions as regards security.
25 | #
26 | # The fact that you are presently reading this means that you have had
27 | # knowledge of the CeCILL license and that you accept its terms.
28 | 
29 | from setuptools import setup
30 | from imagen_databank import __version__
31 | from imagen_databank import __author__
32 | from imagen_databank import __email__
33 | from imagen_databank import __license__
34 | 
35 | 
36 | def readme():
37 | with open('README.rst') as f:
38 | return f.read()
39 | 
40 | 
41 | def license():
42 | with open('LICENSE') as f:
43 | return f.read()
44 | 
45 | 
46 | setup(
47 | name='imagen_databank',
48 | version=__version__,
49 | author=__author__,
50 | author_email=__email__,
51 | description='Imagen project databank software',
52 | long_description=readme(),
53 | license=__license__,
54 | url='https://github.com/imagen2/imagen_databank',
55 | packages=['imagen_databank'],
56 | scripts=[
57 | 'cantab/imagen_cantab_age_at_session_start_time.py',
58 | 'dawba/imagen_dawba_deidentify.py',
59 | 'psytools/imagen_psytools_download.py',
60 | 'psytools/imagen_psytools_deidentify.py',
61 | ],
62 | classifiers=[
63 | "License :: OSI Approved :: CEA CNRS Inria Logiciel Libre License, version 2.1 (CeCILL-2.1)",
64 | "Intended Audience :: Developers",
65 | "Intended Audience :: Science/Research",
66 | "Environment :: Console",
67 | "Development Status :: 4 - Beta",
68 | "Programming Language :: Python",
69 | "Programming Language :: Python :: 2.7",
70 | "Programming Language :: Python :: 3",
71 | "Operating System :: OS Independent",
72 | "Topic :: Scientific/Engineering :: Medical Science Apps.",
73 | "Topic :: Utilities",
74 | ],
75 | install_requires=[
76 | 'pydicom',
77 | ],
78 | )
79 | 
--------------------------------------------------------------------------------
/sex/imagen_sex.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | import csv
5 | import logging
6 | from imagen_databank import PSC2_FROM_PSC1
7 | 
8 | logging.basicConfig(level=logging.INFO)
9 | 
10 | WORKER_PROCESSES = 8
11 | 
12 | FU3_VALIDATION = '/neurospin/imagen/FU3/RAW/PSC1/meta_data/sex_validation_2018.csv'
13 | 
14 | FEMALE = 'F'
15 | MALE = 'M'
16 | 
17 | 
18 | def validation_FU3(path):
19 | result = {}
20 | 
21 | with open(path, newline='') as csvfile:
22 | reader = csv.reader(csvfile, delimiter=',')
23 | next(reader) # skip header
24 | for row in reader:
25 | psc1 = row[0]
26 | sex = row[1]
27 | result[psc1] = sex
28 | 
29 | return result
30 | 
31 | 
32 | def main():
33 | # read different sources
34 | with open('imagen_sex_recruitment.csv', 'r') as f:
35 | reader = csv.DictReader(f, dialect='excel')
36 | recruitment = {row['PSC1']:
row['Recruitment'] 37 | for row in reader} 38 | 39 | with open('imagen_sex_dataset.csv', 'r') as f: 40 | reader = csv.DictReader(f, dialect='excel') 41 | dataset = {row['PSC1']: 42 | (row['QualityReport.txt'] if 'QualityReport.txt' in row else None, 43 | row['BL MRI'] if 'BL MRI' in row else None, 44 | row['BL Cantab'] if 'BL Cantab' in row else None, 45 | row['FU2 MRI'] if 'FU2 MRI' in row else None, 46 | row['FU2 Cantab'] if 'FU2 Cantab' in row else None, 47 | row['FU3 MRI'] if 'FU3 MRI' in row else None, 48 | row['FU3 Cantab'] if 'FU3 Cantab' in row else None) 49 | for row in reader} 50 | 51 | with open('imagen_sex_psytools.csv', 'r') as f: 52 | reader = csv.DictReader(f, dialect='excel') 53 | psytools = {row['PSC1']: 54 | (row['Psytools BL'] if 'Psytools BL' in row else None, 55 | row['Psytools FU1'] if 'Psytools FU1' in row else None, 56 | row['Psytools FU2'] if 'Psytools FU2' in row else None, 57 | row['Psytools FU3'] if 'Psytools FU3' in row else None) 58 | for row in reader} 59 | 60 | with open('imagen_sex_xnat.csv', 'r') as f: 61 | reader = csv.DictReader(f, dialect='excel') 62 | xnat = {row['PSC1']: row['XNAT gender'] if 'XNAT gender' in row else None 63 | for row in reader} 64 | 65 | with open('imagen_sex_methylation.csv', 'r') as f: 66 | reader = csv.DictReader(f, dialect='excel') 67 | methylation = {row['PSC1']: 68 | (row['Methylation BL'] if 'Methylation BL' in row else None, 69 | row['Methylation FU'] if 'Methylation FU' in row else None) 70 | for row in reader} 71 | 72 | validation = validation_FU3(FU3_VALIDATION) 73 | 74 | # merge sources 75 | psc1s = set() 76 | for source in (recruitment, psytools, xnat, validation, methylation): 77 | psc1s = psc1s.union(set(source.keys())) 78 | psc1s = psc1s.intersection(set(PSC2_FROM_PSC1.keys())) # LONDON recruitment file 79 | 80 | with open('imagen_sex.csv', 'w', newline='') as csvfile: 81 | fieldnames = ['PSC1', 82 | 'Recruitment', 83 | 'QualityReport.txt', 'MRI BL', 'Cantab BL', 'MRI FU2', 'Cantab FU2', 'MRI FU3', 'Cantab FU3', 84 | 'Psytools BL', 'Psytools FU1', 'Psytools FU2', 'Psytools FU3', 85 | 'XNAT gender', 86 | '2018 validation', 87 | 'Reference', 88 | 'Methylation BL', 'Methylation FU'] 89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 90 | writer.writeheader() 91 | 92 | for psc1 in sorted(psc1s): 93 | row = {} 94 | if psc1 in recruitment: 95 | row['Recruitment'] = recruitment[psc1] 96 | if psc1 in dataset: 97 | if dataset[psc1][0]: 98 | row['QualityReport.txt'] = dataset[psc1][0] 99 | if dataset[psc1][1]: 100 | row['MRI BL'] = dataset[psc1][1] 101 | if dataset[psc1][2]: 102 | row['Cantab BL'] = dataset[psc1][2] 103 | if dataset[psc1][3]: 104 | row['MRI FU2'] = dataset[psc1][3] 105 | if dataset[psc1][4]: 106 | row['Cantab FU2'] = dataset[psc1][4] 107 | if dataset[psc1][5]: 108 | row['MRI FU3'] = dataset[psc1][5] 109 | if dataset[psc1][6]: 110 | row['Cantab FU3'] = dataset[psc1][6] 111 | if psc1 in psytools: 112 | if psytools[psc1][0]: 113 | row['Psytools BL'] = psytools[psc1][0] 114 | if psytools[psc1][1]: 115 | row['Psytools FU1'] = psytools[psc1][1] 116 | if psytools[psc1][2]: 117 | row['Psytools FU2'] = psytools[psc1][2] 118 | if psytools[psc1][3]: 119 | row['Psytools FU3'] = psytools[psc1][3] 120 | if psc1 in xnat: 121 | row['XNAT gender'] = xnat[psc1] 122 | if psc1 in validation: 123 | row['2018 validation'] = validation[psc1] 124 | 125 | if psc1 in xnat and psc1 in validation: 126 | if xnat[psc1] != validation[psc1]: 127 | logging.warning('%s: changed XNAT %s into %s', 128 | psc1, xnat[psc1], 
validation[psc1])
129 | 
130 | values = set(row.values())
131 | if len(values) > 1:
132 | if psc1 in validation:
133 | row['Reference'] = validation[psc1]
134 | elif psc1 in xnat:
135 | row['Reference'] = xnat[psc1]
136 | else:
137 | logging.warning('%s: cannot derive a reference value for sex',
138 | psc1)
139 | else:
140 | row['Reference'] = next(iter(values))
141 | 
142 | if psc1 in methylation:
143 | if methylation[psc1][0]:
144 | row['Methylation BL'] = methylation[psc1][0]
145 | if methylation[psc1][1]:
146 | row['Methylation FU'] = methylation[psc1][1]
147 | 
148 | row['PSC1'] = psc1
149 | writer.writerow(row)
150 | 
151 | 
152 | if __name__ == "__main__":
153 | main()
154 | 
--------------------------------------------------------------------------------
/sex/imagen_sex_methylation.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | import csv
5 | import logging
6 | 
7 | logging.basicConfig(level=logging.INFO)
8 | 
9 | WORKER_PROCESSES = 8
10 | 
11 | METHYLATION = '/neurospin/imagen/TODO/predicted_gender.csv'
12 | PSC1_FROM_CHIP = '/neurospin/imagen/TODO/PSC1/Associated PSC1 codes.csv'
13 | 
14 | FEMALE = 'F'
15 | MALE = 'M'
16 | 
17 | 
18 | def psc1_from_chip(path):
19 | result = {}
20 | 
21 | with open(path, newline='') as csvfile:
22 | reader = csv.reader(csvfile, delimiter=',')
23 | next(reader) # skip header
24 | for row in reader:
25 | chip = row[0]
26 | psc1 = row[1]
27 | if psc1.endswith('FU'):
28 | psc1 = psc1[:-len('FU')]
29 | timepoint = 'FU2'
30 | else:
31 | timepoint = 'BL'
32 | result[chip] = (psc1, timepoint)
33 | 
34 | return result
35 | 
36 | 
37 | def methylation_process(path, psc1_from_chip):
38 | result_BL = {}
39 | result_FU2 = {}
40 | 
41 | with open(path, newline='') as csvfile:
42 | reader = csv.reader(csvfile, delimiter=',')
43 | next(reader) # skip header
44 | for row in reader:
45 | chip = row[0]
46 | sex = row[1]
47 | if sex == '1':
48 | sex = FEMALE
49 | elif sex == '2':
50 | sex = MALE
51 | else:
52 | logging.error('%s: incorrect sex (%s) in prediction CSV file: %s',
53 | chip, sex, path)
54 | continue
55 | if chip in psc1_from_chip:
56 | psc1, timepoint = psc1_from_chip[chip]
57 | if timepoint == 'FU2':
58 | result = result_FU2
59 | elif timepoint == 'BL':
60 | result = result_BL
61 | else:
62 | logging.error('%s: incorrect conversion table', chip)
63 | continue
64 | if psc1 in result:
65 | if result[psc1] != sex:
66 | logging.error('%s: inconsistent sex from methylation', psc1)
67 | result[psc1] = '?'
68 | else: 69 | result[psc1] = sex 70 | 71 | return result_BL, result_FU2 72 | 73 | 74 | def main(): 75 | psc1_from_chip_table = psc1_from_chip(PSC1_FROM_CHIP) 76 | methylation_BL, methylation_FU2 = methylation_process(METHYLATION, psc1_from_chip_table) 77 | methylation = (methylation_BL, methylation_FU2) 78 | 79 | with open('imagen_sex_methylation.csv', 'w', newline='') as csvfile: 80 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) 81 | sex.writerow(['PSC1', 82 | 'Methylation BL', 'Methylation FU']) 83 | psc1s = set() 84 | for timepoint in methylation: 85 | psc1s = psc1s.union(set(timepoint.keys())) 86 | for psc1 in sorted(psc1s): 87 | row = [psc1] 88 | for timepoint in methylation: 89 | if psc1 in timepoint: 90 | row.append(timepoint[psc1]) 91 | else: 92 | row.append(None) 93 | sex.writerow(row) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /sex/imagen_sex_psytools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from multiprocessing import Pool 5 | import csv 6 | from collections import Counter 7 | import logging 8 | 9 | logging.basicConfig(level=logging.INFO) 10 | 11 | BL_PSYTOOLS = '/neurospin/imagen/BL/RAW/PSC1/psytools' 12 | FU1_PSYTOOLS = '/neurospin/imagen/FU1/RAW/PSC1/psytools' 13 | FU2_PSYTOOLS = '/neurospin/imagen/FU2/RAW/PSC1/psytools' 14 | FU3_PSYTOOLS = '/neurospin/imagen/FU3/RAW/PSC1/psytools' 15 | 16 | WORKER_PROCESSES = 24 17 | 18 | 19 | FEMALE = 'F' 20 | MALE = 'M' 21 | 22 | _CSV_ID_CHECK_GENDER_MAPPING = { 23 | '1': MALE, 24 | '2': FEMALE, 25 | 'female': FEMALE, 26 | 'male': MALE, 27 | } 28 | 29 | _LSRC2_ID_CHECK_GENDER_MAPPING = { 30 | 'F': FEMALE, 31 | 'M': MALE, 32 | } 33 | 34 | 35 | def _psytools_choice(psc1, counter): 36 | female = counter[FEMALE] 37 | male = counter[MALE] 38 | total = female + male 39 | if female and male: 40 | logging.error('%s: inconsistent information about sex', psc1) 41 | return None 42 | elif female: 43 | return FEMALE 44 | elif male: 45 | return MALE 46 | else: 47 | logging.error('%s: cannot find information about sex', psc1) 48 | sex = None 49 | 50 | 51 | def list_psytools_timepoint(path): 52 | """List Psytools CSV files exported from Delosis. 53 | 54 | Parameters 55 | ---------- 56 | path : str 57 | Directory to read Psytools CSV files from. 58 | 59 | Yields 60 | ------ 61 | str 62 | Path to Psytools CSV file. 
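Each path is yielded as a pair (lsrc2, path) where the boolean
lsrc2 tells LSRC2-style exports from legacy Delosis exports.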
63 | 64 | """ 65 | CSV_PREFIX = ('IMAGEN-IMGN_', 'IMAGEN-cVEDA_') 66 | LSRC2_PREFIX = ('Imagen_', 'STRATIFY_') 67 | 68 | for f in os.listdir(path): 69 | root, ext = os.path.splitext(f) 70 | if ext == '.csv': 71 | if any(root.startswith(prefix) for prefix in CSV_PREFIX): 72 | yield (False, os.path.join(path, f)) 73 | elif any(root.startswith(prefix) for prefix in LSRC2_PREFIX): 74 | yield (True, os.path.join(path, f)) 75 | else: 76 | logging.error('skipping unknown CSV file: %s', f) 77 | 78 | 79 | def process_psytools_timepoint(arguments): 80 | (lsrc2, path) = arguments # unpack multiple arguments 81 | 82 | result = {} 83 | 84 | with open(path, 'r') as f: 85 | reader = csv.DictReader(f, dialect='excel') 86 | for row in reader: 87 | if lsrc2: 88 | psc1 = row['id'] 89 | if psc1.endswith('FU3'): 90 | psc1 = psc1[:-len('FU3')] 91 | elif psc1.endswith('FU2'): # Parent questionnaires 92 | psc1 = psc1[:-len('FU2')] 93 | if psc1.isdigit() and len(psc1) == 12: 94 | if 'IdCheckGender' in row: 95 | id_check_gender = row['IdCheckGender'] 96 | if id_check_gender in _LSRC2_ID_CHECK_GENDER_MAPPING: 97 | sex = _LSRC2_ID_CHECK_GENDER_MAPPING[id_check_gender] 98 | result.setdefault(psc1, []).append(sex) 99 | else: 100 | logging.error("%s: invalid 'IdCheckGender': %s", 101 | psc1, id_check_gender) 102 | else: 103 | logging.info('%s: cannot interpret as PSC1 code', psc1) 104 | else: 105 | completed = row['Completed'] 106 | trial = row['Trial'] 107 | if completed == 't' and trial == "id_check_gender": 108 | psc1_suffix = row['User code'].rsplit('-', 1) 109 | psc1 = psc1_suffix[0] 110 | if psc1.isdigit() and len(psc1) == 12: 111 | trial_result = row['Trial result'] 112 | if trial_result in _CSV_ID_CHECK_GENDER_MAPPING: 113 | sex = _CSV_ID_CHECK_GENDER_MAPPING[trial_result] 114 | result.setdefault(psc1, []).append(sex) 115 | else: 116 | logging.error("%s: invalid 'id_check_gender': %s", 117 | psc1, trial_result) 118 | else: 119 | logging.info('%s: cannot interpret as PSC1 code', psc1) 120 | 121 | return result 122 | 123 | 124 | def _decide_from_counter(counter): 125 | female = counter[FEMALE] 126 | male = counter[MALE] 127 | total = sum(counter.values()) 128 | if total: 129 | if female > male: 130 | sex = FEMALE 131 | percentage = ((200 * female) // total + 1) // 2 # closest integer percentage 132 | elif male > female: 133 | sex = MALE 134 | percentage = ((200 * male) // total + 1) // 2 # closest integer percentage 135 | else: 136 | sex = None 137 | percentage = 50 138 | else: 139 | sex = None 140 | percentage = None 141 | 142 | return sex, percentage 143 | 144 | 145 | def psytools_timepoint(path): 146 | todo_list = list(list_psytools_timepoint(path)) 147 | 148 | pool = Pool(WORKER_PROCESSES) 149 | results = pool.map(process_psytools_timepoint, todo_list) 150 | pool.close() 151 | pool.join() 152 | 153 | sex_counter = {} 154 | for result in results: 155 | for psc1, sex in result.items(): 156 | sex_counter.setdefault(psc1, Counter()).update(sex) 157 | 158 | return {psc1: _decide_from_counter(counter) 159 | for psc1, counter in sex_counter.items()} 160 | 161 | 162 | def main(): 163 | psytools_BL = psytools_timepoint(BL_PSYTOOLS) 164 | psytools_FU1 = psytools_timepoint(FU1_PSYTOOLS) 165 | psytools_FU2 = psytools_timepoint(FU2_PSYTOOLS) 166 | psytools_FU3 = psytools_timepoint(FU3_PSYTOOLS) 167 | psytools = (psytools_BL, psytools_FU1, psytools_FU2, psytools_FU3) 168 | 169 | with open('imagen_sex_psytools.csv', 'w', newline='') as csvfile: 170 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) 171 | 
sex.writerow(['PSC1',
172 | 'Psytools BL', 'Psytools FU1',
173 | 'Psytools FU2', 'Psytools FU3'])
174 | psc1s = set()
175 | for timepoint in psytools:
176 | psc1s = psc1s.union(set(timepoint.keys()))
177 | for psc1 in sorted(psc1s):
178 | row = [psc1]
179 | for timepoint in psytools:
180 | if psc1 in timepoint:
181 | row.append(timepoint[psc1][0])
182 | else:
183 | row.append(None)
184 | sex.writerow(row)
185 | 
186 | if any(psc1 in timepoint and timepoint[psc1][1] != 100
187 | for timepoint in psytools):
188 | s = '%s: inconsistent sex:'
189 | if psc1 in psytools_BL:
190 | s += '\n\tBL: {} {}%%'.format(psytools_BL[psc1][0], psytools_BL[psc1][1])
191 | if psc1 in psytools_FU1:
192 | s += '\n\tFU1: {} {}%%'.format(psytools_FU1[psc1][0], psytools_FU1[psc1][1])
193 | if psc1 in psytools_FU2:
194 | s += '\n\tFU2: {} {}%%'.format(psytools_FU2[psc1][0], psytools_FU2[psc1][1])
195 | if psc1 in psytools_FU3:
196 | s += '\n\tFU3: {} {}%%'.format(psytools_FU3[psc1][0], psytools_FU3[psc1][1])
197 | logging.warning(s, psc1)
198 | 
199 | 
200 | if __name__ == "__main__":
201 | main()
202 | 
--------------------------------------------------------------------------------
/sex/imagen_sex_recruitment.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import os
4 | from multiprocessing import Pool
5 | import csv
6 | from datetime import datetime
7 | import logging
8 | from collections import Counter
9 | logging.basicConfig(level=logging.INFO)
10 | 
11 | BL_RECRUITMENT_INFO = os.path.join('/neurospin/imagen/BL/RAW/PSC1/recruitment')
12 | 
13 | WORKER_PROCESSES = 16
14 | 
15 | 
16 | FEMALE = 'F'
17 | MALE = 'M'
18 | 
19 | _RECRUITMENT_SEX_MAPPING = {
20 | 'f': FEMALE,
21 | 'F': FEMALE,
22 | 'm': MALE,
23 | 'M': MALE,
24 | 'w': FEMALE,
25 | }
26 | 
27 | _RECRUITMENT_SEX_VOID = {
28 | '',
29 | '0',
30 | '0.0',
31 | 'Test',
32 | 'not known',
33 | }
34 | 
35 | 
36 | def _recruitment_center(s):
37 | s = s.strip()
38 | 
39 | if set(s).issubset('12345678.0'):
40 | if '.' in s:
41 | try:
42 | s = float(s)
43 | except ValueError:
44 | logging.info('%s: cannot interpret as center code', s)
45 | return None
46 | else:
47 | s = str(int(s // 1)) # integral part
48 | if len(s) == 1:
49 | return s
50 | else:
51 | logging.error('%s: incorrect center code', s)
52 | else:
53 | logging.debug('%s: skipping center code', s)
54 | 
55 | return None
56 | 
57 | 
58 | def _recruitment_psc1(s, center):
59 | s = s.strip()
60 | 
61 | if s.isdigit():
62 | if len(s) < 7:
63 | s = '0' + center + s.zfill(10)
64 | if len(s) == 12:
65 | return s
66 | else:
67 | logging.error('%s: incorrect PSC1 code', s)
68 | elif s:
69 | logging.warning('%s: cannot interpret as PSC1 code', s)
70 | else:
71 | logging.debug('empty PSC1 code')
72 | 
73 | return None
74 | 
75 | 
76 | def _recruitment_choice(psc1, timestamps):
77 | # use data with most recent time stamp
78 | counter = Counter(timestamps[max(timestamps.keys())])
79 | 
80 | female = counter[FEMALE]
81 | male = counter[MALE]
82 | if female and male:
83 | logging.error('%s: inconsistent information about sex', psc1)
84 | return None
85 | elif female:
86 | return FEMALE
87 | elif male:
88 | return MALE
89 | else:
90 | logging.error('%s: cannot find information about sex', psc1)
91 | return None
92 | 
93 | 
94 | def list_recruitment_BL(path):
95 | """List recruitment CSV files sent by recruitment centres.
96 | 
97 | Parameters
98 | ----------
99 | path : str
100 | Directory to read CSV recruitment files from.
101 | 
102 | Yields
103 | ------
104 | str
105 | Path to CSV file.
106 | 107 | """ 108 | for f in os.listdir(path): 109 | root, ext = os.path.splitext(f) 110 | if ext == '.csv': 111 | yield os.path.join(path, f) 112 | 113 | 114 | def process_recruitment_BL(path): 115 | timestamp = os.path.getmtime(path) 116 | 117 | recruitment_sex = {} 118 | 119 | with open(path, encoding='latin1', newline='') as csvfile: 120 | recruitment = csv.reader(csvfile, delimiter=',') 121 | for row in recruitment: 122 | center = _recruitment_center(row[0]) 123 | if center: 124 | psc1 = _recruitment_psc1(row[1], center) 125 | if psc1: 126 | gender = row[2].strip() 127 | if gender in _RECRUITMENT_SEX_MAPPING: 128 | sex = _RECRUITMENT_SEX_MAPPING[gender] 129 | if psc1 in recruitment_sex: 130 | if recruitment_sex[psc1] != sex: 131 | logging.error('%s: inconsistent duplicate line', 132 | psc1) 133 | else: 134 | logging.error('%s: duplicate line', 135 | psc1) 136 | else: 137 | recruitment_sex[psc1] = sex 138 | elif gender not in _RECRUITMENT_SEX_VOID: 139 | logging.error("%s: incorrect 'gender': %s", 140 | psc1, gender) 141 | 142 | return timestamp, recruitment_sex 143 | 144 | 145 | def recruitment_BL(path): 146 | """Process CSV recruitment files sent by recruitment centres at baseline. 147 | 148 | First list the files to process, then read these files in parallel. 149 | 150 | Parameters 151 | ---------- 152 | path : str 153 | Directory to read CSV recruitment files from. 154 | 155 | Returns 156 | ------- 157 | dict 158 | Key is PSC1 and value a pair (xnat_sex, xnat_experiment_sex). 159 | 160 | """ 161 | todo_list = list(list_recruitment_BL(path)) 162 | 163 | pool = Pool(WORKER_PROCESSES) 164 | results = pool.map(process_recruitment_BL, todo_list) 165 | pool.close() 166 | pool.join() 167 | 168 | sex_by_timestamp = {} 169 | for timestamp, result in results: 170 | for psc1, sex in result.items(): 171 | sex_by_timestamp.setdefault(psc1, {})[timestamp] = result[psc1] 172 | 173 | recruitment_sex = {} 174 | for psc1, timestamps in sex_by_timestamp.items(): 175 | max_timestamp = max(timestamps) 176 | sex = timestamps[max_timestamp] 177 | for k, v in timestamps.items(): 178 | if v != sex: 179 | logging.error("%s: inconsistent 'gender' across time stamps\n" 180 | '\t%s: %s\n' 181 | '\t%s: %s', 182 | psc1, 183 | datetime.fromtimestamp(k).date(), v, 184 | datetime.fromtimestamp(max_timestamp).date(), sex) 185 | recruitment_sex[psc1] = sex 186 | 187 | return recruitment_sex 188 | 189 | 190 | def main(): 191 | recruitment = recruitment_BL(BL_RECRUITMENT_INFO) 192 | 193 | with open('imagen_sex_recruitment.csv', 'w', newline='') as csvfile: 194 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) 195 | sex.writerow(['PSC1', 'Recruitment']) 196 | for psc1 in sorted(recruitment): 197 | row = [psc1] 198 | row.append(recruitment[psc1]) 199 | sex.writerow(row) 200 | 201 | 202 | if __name__ == "__main__": 203 | main() 204 | -------------------------------------------------------------------------------- /sex/imagen_sex_xnat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from multiprocessing import Pool 5 | from xml.etree import ElementTree 6 | from imagen_databank import PSC1_FROM_PSC2 7 | import csv 8 | import logging 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | 12 | BL_XNAT = '/neurospin/imagen/export/xml' 13 | 14 | WORKER_PROCESSES = 16 15 | 16 | 17 | FEMALE = 'F' 18 | MALE = 'M' 19 | 20 | _XNAT_GENDER_MAPPING = { 21 | 'female': FEMALE, 22 | 'male': MALE, 23 | } 24 | 25 | 
_XNAT_EXPERIMENT_GENDER_MAPPING = {
26 | 'f': FEMALE,
27 | 'F': FEMALE,
28 | 'm': MALE,
29 | 'M': MALE,
30 | 'w': FEMALE,
31 | 'female': FEMALE, # single occurrence!
32 | }
33 | 
34 | _XNAT_EXPERIMENT_GENDER_VOID = {
35 | '0',
36 | 'Test',
37 | 'not known',
38 | }
39 | 
40 | 
41 | def list_xnat_BL(path):
42 | """List XML files exported from XNAT.
43 | 
44 | Yields only files with standard names:
45 | IMAGEN_<PSC2>.xml
46 | 
47 | Parameters
48 | ----------
49 | path : str
50 | Directory to read XML files from.
51 | 
52 | Yields
53 | ------
54 | tuple of str
55 | Yields a pair (psc2, path).
56 | 
57 | """
58 | for f in os.listdir(path):
59 | root, ext = os.path.splitext(f)
60 | if ext == '.xml':
61 | PREFIX = 'IMAGEN_'
62 | if root.startswith(PREFIX):
63 | psc2 = root[len(PREFIX):]
64 | logging.debug('%s: found XML file: %s', psc2, f)
65 | assert(psc2.isdigit() and len(psc2) == 12)
66 | yield (psc2, os.path.join(path, f))
67 | else:
68 | logging.error('unexpected XML file: %s', f)
69 | else:
70 | logging.debug('skipping non-XML file: %s', f)
71 | 
72 | 
73 | def process_xnat_BL(arguments):
74 | """Read subject sex from XML file exported from XNAT.
75 | 
76 | Looks for this information in two distinct places.
77 | 
78 | Parameters
79 | ----------
80 | arguments : tuple of str
81 | Expects a pair (psc2, path).
82 | 
83 | Returns
84 | -------
85 | tuple of str
86 | A pair (xnat_sex, xnat_experiment_sex).
87 | 
88 | """
89 | (psc2, path) = arguments # unpack multiple arguments
90 | 
91 | tree = ElementTree.parse(path)
92 | root = tree.getroot()
93 | 
94 | xnat_sex = None
95 | xnat_gender = root.find('.//{http://nrg.wustl.edu/xnat}gender')
96 | if xnat_gender is None:
97 | logging.warning("%s: missing 'gender' in XML file", psc2)
98 | else:
99 | xnat_gender = xnat_gender.text
100 | if xnat_gender in _XNAT_GENDER_MAPPING:
101 | xnat_sex = _XNAT_GENDER_MAPPING[xnat_gender]
102 | else:
103 | logging.error("%s: incorrect 'gender' (%s) in XML file",
104 | psc2, xnat_gender)
105 | 
106 | xnat_experiment_sex = None
107 | xnat_experiment_gender = root.find('.//{http://nrg.wustl.edu/xnat}experiment[@gender]')
108 | if xnat_experiment_gender is None:
109 | logging.warning("%s: missing 'experiment[@gender]' in XML file", psc2)
110 | else:
111 | xnat_experiment_gender = xnat_experiment_gender.attrib['gender']
112 | xnat_experiment_gender = xnat_experiment_gender.strip()
113 | if xnat_experiment_gender in _XNAT_EXPERIMENT_GENDER_MAPPING:
114 | xnat_experiment_sex = _XNAT_EXPERIMENT_GENDER_MAPPING[xnat_experiment_gender]
115 | elif xnat_experiment_gender not in _XNAT_EXPERIMENT_GENDER_VOID:
116 | logging.error("%s: incorrect 'experiment[@gender]' (%s) in XML file",
117 | psc2, xnat_experiment_gender)
118 | 
119 | return xnat_sex, xnat_experiment_sex
120 | 
121 | 
122 | def xnat_BL(path):
123 | """Process XML files exported from XNAT.
124 | 
125 | First list the files to process, then read these files in parallel.
126 | 
127 | Parameters
128 | ----------
129 | path : str
130 | Directory to read XML files from.
131 | 
132 | Returns
133 | -------
134 | dict
135 | Key is PSC2 and value a pair (xnat_sex, xnat_experiment_sex).
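Either element of the pair may be None when the corresponding
information is missing from the XML file.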
136 | 
137 | """
138 | todo_list = list(list_xnat_BL(path))
139 | 
140 | pool = Pool(WORKER_PROCESSES)
141 | results = pool.map(process_xnat_BL, todo_list)
142 | pool.close()
143 | pool.join()
144 | 
145 | psc2s, dummy_paths = zip(*todo_list)
146 | return dict(zip(psc2s, results))
147 | 
148 | 
149 | def main():
150 | xnat = xnat_BL(BL_XNAT)
151 | 
152 | xnat = {PSC1_FROM_PSC2[psc2]: v for psc2, v in xnat.items()}
153 | 
154 | with open('imagen_sex_xnat.csv', 'w', newline='') as csvfile:
155 | sex = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
156 | sex.writerow(['PSC1',
157 | 'XNAT gender'])
158 | for psc1 in sorted(xnat):
159 | row = [psc1]
160 | if xnat[psc1][0] and xnat[psc1][1]:
161 | if xnat[psc1][0] != xnat[psc1][1]:
162 | logging.error("%s: inconsistent 'gender' (%s) / 'experiment@gender' (%s)",
163 | psc1, xnat[psc1][0], xnat[psc1][1])
164 | row.append('?')
165 | else:
166 | row.append(xnat[psc1][0])
167 | elif xnat[psc1][0]:
168 | row.append(xnat[psc1][0])
169 | elif xnat[psc1][1]:
170 | row.append(xnat[psc1][1])
171 | else:
172 | row.append(None)
173 | sex.writerow(row)
174 | 
175 | 
176 | if __name__ == "__main__":
177 | main()
178 | 
--------------------------------------------------------------------------------
/stratify_demographics/demographics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | 
4 | import os
5 | from csv import reader
6 | from csv import DictWriter
7 | import xlrd
8 | from imagen_databank import PSC2_FROM_PSC1, CENTER_NAME
9 | 
10 | import logging
11 | logging.basicConfig(level=logging.ERROR)
12 | 
13 | 
14 | _DEBUG_PSYTOOLS_SEX = '/imagen/STRATIFY/RAW/PSC1/meta_data/STRATIFY_SEX_2024-10-17.txt'
15 | 
16 | _DEMOGRAPHIC_RECORDS_DIR = '/imagen/STRATIFY/RAW/PSC1/meta_data'
17 | _DEMOGRAPHIC_RECORDS = [
18 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_SOUTHAMPTON_2024-10-16.xlsx'),
19 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_LONDON_2024-03-14.xlsx'),
20 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'ESTRA_recruitment_file_LONDON_2024-08-16.xlsx'),
21 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'ESTRA_recruitment_file_LONDON_CONTROLS_2023-07-24.xlsx'),
22 | os.path.join(_DEMOGRAPHIC_RECORDS_DIR, 'STRATIFY_recruitment_file_BERLIN_2024-10-16.xlsx'),
23 | ]
24 | 
25 | _FINAL_COLUMNS = (
26 | 'PSC2',
27 | 'sex',
28 | 'recruitment site',
29 | 'scanning site',
30 | 'patient group',
31 | 'complete',
32 | #'missing data',
33 | )
34 | 
35 | _DEMOGRAPHIC_COLUMNS = {
36 | # handle separately 'PSC1 Code'
37 | # Stratify
38 | 'Sex': _FINAL_COLUMNS[1],
39 | 'Acquisition Centre (and Scanning Site)': _FINAL_COLUMNS[3],
40 | 'Acquisition Centre': _FINAL_COLUMNS[3],
41 | 'Patient Group': _FINAL_COLUMNS[4],
42 | 'Fully Complete?
Y/N': _FINAL_COLUMNS[5], 43 | #'Missing Data (Please Specify)': _FINAL_COLUMNS[6], 44 | # ESTRA 45 | # (skip 'Recruitment Centre') 46 | 'Scanning Site': _FINAL_COLUMNS[3], 47 | 'Gender ': _FINAL_COLUMNS[1], 48 | 'Diagnosis ': _FINAL_COLUMNS[4], 49 | 'Diagnosis': _FINAL_COLUMNS[4], 50 | # Stratify 20 additional controls 51 | 'Site': _FINAL_COLUMNS[3], 52 | 'Group': _FINAL_COLUMNS[4], 53 | 'Gender': _FINAL_COLUMNS[1], 54 | # LONDON CONTROLS 55 | 56 | # BERLIN 57 | 'sex': _FINAL_COLUMNS[1], 58 | 'scanning site': _FINAL_COLUMNS[3], 59 | 'patient group': _FINAL_COLUMNS[4], 60 | 'complete': _FINAL_COLUMNS[5], 61 | #'missing data': _FINAL_COLUMNS[6], 62 | } 63 | 64 | _CONTROL_GROUP = 'Control' 65 | _CONTROL_GROUP_ESTRA = 'Control_ESTRA' 66 | _ADHD_GROUP = 'ADHD' 67 | _AUD_GROUP = 'AUD' 68 | _AN_GROUP = 'AN' 69 | _RECAN_GROUP = 'recAN' 70 | _BN_GROUP = 'BN' 71 | _RECBN_GROUP = 'recBN' 72 | _MDD_GROUP = 'MDD' 73 | _PSYCHOSIS_GROUP = 'Psychosis' 74 | _BED_GROUP= 'BED' 75 | 76 | _PATIENT_GROUPS = { 77 | _CONTROL_GROUP, 78 | _CONTROL_GROUP_ESTRA, 79 | _ADHD_GROUP, 80 | _AUD_GROUP, 81 | _AN_GROUP, 82 | _RECAN_GROUP, 83 | _BN_GROUP, 84 | _RECBN_GROUP, 85 | _MDD_GROUP, 86 | _PSYCHOSIS_GROUP, 87 | _BED_GROUP, 88 | } 89 | 90 | 91 | def normalize_patient_group(s): 92 | table = { 93 | 'control': _CONTROL_GROUP, 94 | 'Control_ESTRA': _CONTROL_GROUP_ESTRA, 95 | 'depression': _MDD_GROUP, 96 | 'psychosis': _PSYCHOSIS_GROUP, 97 | 'Alcohol Use Disorder': _AUD_GROUP, 98 | 'Major Depressive Disorder': _MDD_GROUP, 99 | 'Healthy Control': _CONTROL_GROUP, 100 | 101 | } 102 | if s in table: 103 | s = table[s] 104 | 105 | return s 106 | 107 | 108 | def normalize_scanning_site(s): 109 | table = { 110 | # LONDON: 'CNS' or 'Invicro' 111 | 'KCL': 'CNS', 112 | 'Denmark Hill': 'CNS', 113 | # SOUTHAMPTON 114 | 'Southampton': None, 115 | # BERLIN 116 | 'BERLIN': None, 117 | } 118 | if s in table: 119 | s = table[s] 120 | 121 | return s 122 | 123 | 124 | def normalize_sex(s): 125 | s = s.upper() 126 | 127 | table = { 128 | 'FEMALE': 'F', 129 | 'MALE': 'M', 130 | } 131 | if s in table: 132 | s = table[s] 133 | 134 | return s 135 | 136 | 137 | def strip_cell(s): 138 | try: 139 | s = s.strip() 140 | except AttributeError: # floats and other types 141 | pass 142 | return s 143 | 144 | 145 | def read_demographic_record(path): 146 | demographics = {} 147 | 148 | with xlrd.open_workbook(path) as workbook: 149 | worksheet = workbook.sheet_by_index(0) 150 | 151 | # read header 152 | psc1_index = None 153 | index = {} 154 | row = [strip_cell(x) for x in worksheet.row_values(0)] 155 | print(path) 156 | for i, value in enumerate(row): 157 | if value in _DEMOGRAPHIC_COLUMNS: 158 | index[_DEMOGRAPHIC_COLUMNS[value]] = i 159 | print(i, value, '→', _DEMOGRAPHIC_COLUMNS[value]) 160 | elif value == 'PSC1 Code' or value == 'PSC1': 161 | psc1_index = i 162 | else: 163 | print(i, value, '→', '?????') 164 | 165 | if psc1_index is None: 166 | logging.error('%s: cannot find PSC1 code', path) 167 | return demographics 168 | 169 | # read data 170 | for i in range(1, worksheet.nrows): 171 | row = [strip_cell(x) for x in worksheet.row_values(i)] 172 | 173 | psc1 = row[psc1_index] 174 | psc1 = psc1[:12] # remove trailing FU3 or SB 175 | if psc1 not in PSC2_FROM_PSC1: 176 | logging.error('%s: invalid PSC1 code', psc1) 177 | continue 178 | 179 | demographics[psc1] = {} 180 | 181 | for name, i in index.items(): 182 | value = row[i] 183 | if name == 'sex': 184 | value = normalize_sex(value) 185 | if value not in {'F', 'M'}: 186 | logging.error('%s: invalid sex: 
                        continue
                elif name == 'patient group':
                    value = normalize_patient_group(value)
                    if value not in _PATIENT_GROUPS:
                        logging.error('%s: invalid patient group: %s',
                                      psc1, value)
                        continue
                elif name == 'scanning site':
                    value = normalize_scanning_site(value)
                elif name == 'complete':
                    if value not in {'Y', 'N', ''}:
                        logging.error('%s: invalid completeness: %s',
                                      psc1, value)
                        continue
                elif name == 'missing data':
                    value = value.rstrip(',.')
                    if value.lower() == 'none':
                        value = None
                demographics[psc1][name] = value

    return demographics


def read_demographic_records(paths):
    """Merge the demographic records read from a list of Excel files."""
    demographic_records = {}

    for path in paths:
        demographic_records.update(read_demographic_record(path))

    return demographic_records


def main():
    demographics = read_demographic_records(_DEMOGRAPHIC_RECORDS)

    with open(_DEBUG_PSYTOOLS_SEX, 'r') as sex_file:
        sex_reader = reader(sex_file, dialect='excel')

        with open('STRATIFY_participants.csv', 'w') as demographics_file:
            demographics_writer = DictWriter(demographics_file,
                                             _FINAL_COLUMNS,
                                             dialect='excel')
            demographics_writer.writeheader()
            for row in sex_reader:
                psc1 = row[0]
                psc2 = PSC2_FROM_PSC1[psc1]
                center = int(psc1[1])
                if center > 8:  # center codes greater than 8 span two digits
                    center = int(psc1[1:3])
                center = CENTER_NAME[center]
                sex = row[1]
                if psc1 in demographics:
                    data = demographics[psc1]
                    data['PSC2'] = psc2
                    data['recruitment site'] = center
                    if 'sex' in data:
                        if data['sex'] != sex:
                            logging.error('%s: inconsistent sex between '
                                          'Psytools and recruitment file',
                                          psc1)
                    data['sex'] = sex
                else:
                    data = {
                        'PSC2': psc2,
                        'sex': sex,
                        'recruitment site': center,
                    }
                row = {x: data.get(x) for x in _FINAL_COLUMNS}
                demographics_writer.writerow(row)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/stratify_demographics/stratify_debug_psytools.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
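"""Cross-check sex and date of birth of STRATIFY subjects.

Tally the sex and date of birth values reported for each PSC1 code in
Psytools CSV exports and Cantab datasheets, log conflicting values for
manual validation, and write consolidated ``STRATIFY_SEX_<date>.txt``
and ``STRATIFY_DOB_<date>.txt`` files.
"""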

import os
from multiprocessing import Pool
import csv
from datetime import datetime, date
from collections import Counter
import logging

logging.basicConfig(level=logging.INFO)

STRATIFY_PSYTOOLS = '/neurospin/imagen/STRATIFY/RAW/PSC1/psytools'
STRATIFY_DOB = '/neurospin/imagen/STRATIFY/RAW/PSC1/meta_data/dob_validation.csv'
STRATIFY_SEX = '/neurospin/imagen/STRATIFY/RAW/PSC1/meta_data/sex_validation.csv'

WORKER_PROCESSES = 24


FEMALE = 'F'
MALE = 'M'

_CSV_ID_CHECK_GENDER_MAPPING = {
    '1': MALE,
    '2': FEMALE,
    'female': FEMALE,
    'male': MALE,
}

_LSRC2_ID_CHECK_GENDER_MAPPING = {
    'F': FEMALE,
    'M': MALE,
}

_CANTAB_GENDER_MAPPING = {
    'Female': FEMALE,
    'Male': MALE,
}


def list_psytools_timepoint(path):
    """List Psytools CSV files exported from Delosis.

    Parameters
    ----------
    path : str
        Directory to read Psytools CSV files from.

    Yields
    ------
    tuple
        3-tuple: True for LSRC2 files and False for legacy CSV files,
        path to the Psytools CSV file, base name of the file.

    """
    CSV_PREFIX = ('IMAGEN-', 'STRATIFY-')
    LSRC2_PREFIX = ('Imagen_', 'STRATIFY_Core')  # exclude STRATIFY_Screening

    for f in os.listdir(path):
        root, ext = os.path.splitext(f)
        if ext == '.csv':
            if any(root.startswith(prefix) for prefix in CSV_PREFIX):
                yield (False, os.path.join(path, f), root)
            elif any(root.startswith(prefix) for prefix in LSRC2_PREFIX):
                yield (True, os.path.join(path, f), root)
            else:
                logging.error('skipping unknown CSV file: %s', f)


def process_psytools_timepoint(arguments):
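    """Tally sex and date of birth values found in a single Psytools file.

    Returns a ``(sex_counter, dob_counter)`` pair of nested dicts, keyed
    by PSC1 code, then value, counting the variables reporting each value.
    """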
    (lsrc2, path, name) = arguments  # unpack multiple arguments

    sex_counter = {}
    dob_counter = {}

    with open(path, 'r') as f:
        reader = csv.DictReader(f, dialect='excel')
        for row in reader:
            if lsrc2:
                psc1 = row['id']
                if psc1.endswith('SB'):
                    psc1 = psc1[:-len('SB')]
                if psc1.endswith('FU'):
                    psc1 = psc1[:-len('FU')]
                if psc1.isdigit() and len(psc1) == 12:
                    if 'IdCheckGender' in row:
                        id_check_gender = row['IdCheckGender']
                        if id_check_gender in _LSRC2_ID_CHECK_GENDER_MAPPING:
                            id_check_gender = _LSRC2_ID_CHECK_GENDER_MAPPING[id_check_gender]
                            sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update(('IdCheckGender',))
                        elif id_check_gender:
                            logging.error("%s: %s: invalid 'IdCheckGender': %s",
                                          name, psc1, id_check_gender)
                        else:
                            logging.debug("%s: %s: empty 'IdCheckGender': %s",
                                          name, psc1, id_check_gender)
                    if 'IdCheckDob' in row:
                        id_check_dob = row['IdCheckDob']
                        try:
                            id_check_dob = datetime.strptime(id_check_dob, '%Y-%m-%d %H:%M:%S')
                        except ValueError:
                            if id_check_dob:
                                logging.error("%s: %s: invalid 'IdCheckDob': %s",
                                              name, psc1, id_check_dob)
                            else:
                                logging.debug("%s: %s: empty 'IdCheckDob': %s",
                                              name, psc1, id_check_dob)
                        else:
                            id_check_dob = id_check_dob.date()
                            if id_check_dob.year > 2012 or id_check_dob.year < 1990:
                                logging.error("%s: %s: skip 'IdCheckDob': %d",
                                              name, psc1, id_check_dob.year)
                            else:
                                dob_counter.setdefault(psc1, {}).setdefault(id_check_dob, Counter()).update(('IdCheckDob',))
                else:
                    logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
            else:
                psc1_suffix = row['User code'].rsplit('-', 1)
                psc1 = psc1_suffix[0]
                if psc1.endswith('SB'):
                    psc1 = psc1[:-len('SB')]
                completed = row['Completed']
                if completed == 't':
                    trial = row['Trial']
                    if trial == 'id_check_gender':
                        if psc1.isdigit() and len(psc1) == 12:
                            trial_result = row['Trial result']
                            if trial_result in _CSV_ID_CHECK_GENDER_MAPPING:
                                id_check_gender = _CSV_ID_CHECK_GENDER_MAPPING[trial_result]
                                sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update((trial,))
                            else:
                                logging.error("%s: %s: invalid 'id_check_gender': %s",
                                              name, psc1, trial_result)
                        else:
                            logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
                    elif trial == 'ni_gender':
                        if psc1.isdigit() and len(psc1) == 12:
                            trial_result = row['Trial result']
                            if trial_result in _LSRC2_ID_CHECK_GENDER_MAPPING:
                                id_check_gender = _LSRC2_ID_CHECK_GENDER_MAPPING[trial_result]
                                sex_counter.setdefault(psc1, {}).setdefault(id_check_gender, Counter()).update((trial,))
                            else:
                                logging.error("%s: %s: invalid 'ni_gender': %s",
                                              name, psc1, trial_result)
                        else:
                            logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)
                    elif trial == 'id_check_dob':
                        if psc1.isdigit() and len(psc1) == 12:
                            trial_result = row['Trial result']
                            try:
                                month, year = trial_result.rsplit('_')
                                month = int(month)
                                year = int(year)
                            except ValueError:
                                logging.error("%s: %s: invalid 'id_check_dob': %s",
                                              name, psc1, trial_result)
                            else:
                                if year > 2012 or year < 1990:
                                    logging.error("%s: %s: skip 'id_check_dob': %d",
                                                  name, psc1, year)
                                else:
                                    dob_counter.setdefault(psc1, {}).setdefault((year, month), Counter()).update((trial,))
                        else:
                            logging.info('%s: %s: cannot interpret as PSC1 code', name, psc1)

    return sex_counter, dob_counter


def psytools_timepoint(path):
    """Tally sex and date of birth values across all Psytools files.

    Exact dates of birth absorb matching (year, month) values, so partial
    dates survive only when no exact date confirms them.
    """
    todo_list = list(list_psytools_timepoint(path))

    pool = Pool(WORKER_PROCESSES)
    results = pool.map(process_psytools_timepoint, todo_list)
    pool.close()
    pool.join()

    sex = {}
    dob = {}
    for (sex_counter, dob_counter), (lsrc2, path, name) in zip(results, todo_list):
        for psc1, values in sex_counter.items():
            for value, variables in values.items():
                for variable, count in variables.items():
                    sex.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update({name: count})
        for psc1, values in dob_counter.items():
            for value, variables in values.items():
                for variable, count in variables.items():
                    dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update({name: count})

    clean_dob = {}
    for psc1, values in dob.items():
        exact_dates = set()
        for value, variables in values.items():
            if isinstance(value, date):
                for variable, counter in variables.items():
                    exact_dates.add(value)
                    clean_dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update(counter)
        for value, variables in values.items():
            if isinstance(value, tuple):  # partial (year, month) date
                year, month = value
                for variable, counter in variables.items():
                    for d in exact_dates:
                        if d.year == year and d.month == month:
                            clean_dob.setdefault(psc1, {}).setdefault(d, {}).setdefault(variable, Counter()).update(counter)
                            break
                    else:
                        clean_dob.setdefault(psc1, {}).setdefault(value, {}).setdefault(variable, Counter()).update(counter)

    return sex, clean_dob


def cantab_timepoint(path):
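    """Collect sex from Cantab ``datasheet_<PSC1>SB.csv`` files.

    Returns a dict mapping PSC1 code to 'F' or 'M'.
    """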
    sex = {}
    for center in os.listdir(path):
        center_path = os.path.join(path, center)
        if os.path.isdir(center_path):
            for psc1 in os.listdir(center_path):
                psc1_path = os.path.join(center_path, psc1)
                if os.path.isdir(psc1_path):
                    if psc1.isdigit() and len(psc1) == 12:
                        additional_data_path = os.path.join(psc1_path, 'AdditionalData')
                        for f in os.listdir(additional_data_path):
                            if f.startswith('datasheet_'):
                                if f == ('datasheet_' + psc1 + 'SB.csv'):
                                    f_path = os.path.join(additional_data_path, f)
                                    with open(f_path, newline='') as csvfile:
                                        reader = csv.DictReader(csvfile)
                                        try:
                                            if 'Gender' not in reader.fieldnames:
                                                # retry with semicolon delimiter
                                                csvfile.seek(0)
                                                reader = csv.DictReader(csvfile, delimiter=';')
                                                if 'Gender' not in reader.fieldnames:
                                                    reader = None
                                        except (TypeError, csv.Error):  # fieldnames is None for empty files
                                            logging.error('bad cantab datasheet for %s', psc1)
                                            reader = None
                                        if reader is None:
                                            continue
                                        for row in reader:
                                            if 'Gender' in row:
                                                if row['Gender']:
                                                    sex[psc1] = _CANTAB_GENDER_MAPPING[row['Gender']]
                                                else:
                                                    logging.warning('%s: missing Gender value: %s', psc1, f)
                                            else:
                                                logging.warning('%s: missing Gender column (%s): %s', psc1, reader.fieldnames, f)
                                else:
                                    logging.error('%s: incorrect file name: %s', psc1, f)
                    else:
                        logging.debug('%s: not a PSC1 code', psc1)
                else:
                    logging.info('%s: not a directory', psc1)

    return sex


def main():
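    """Cross-check sex and date of birth and write consolidated files.

    Manually validated values take precedence; otherwise a unique value is
    written out and conflicting values are logged for manual validation.
    """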
    sex, dob = psytools_timepoint(STRATIFY_PSYTOOLS)
    cantab_sex = cantab_timepoint('/neurospin/imagen/STRATIFY/RAW/PSC1')

    validated_dob = {}
    with open(STRATIFY_DOB, 'r') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            validated_dob[row[0]] = datetime.strptime(row[1], '%Y-%m-%d').date()

    validated_sex = {}
    with open(STRATIFY_SEX, 'r') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            validated_sex[row[0]] = row[1]

    for psc1 in cantab_sex:
        if psc1 in sex:
            sex[psc1].setdefault(cantab_sex[psc1], {}).setdefault('Gender', Counter()).update({'datasheet_' + psc1 + 'SB': 1})
        else:
            logging.error('%s: found in Cantab but missing from Psytools', psc1)

    today = datetime.today()

    with open('STRATIFY_SEX_' + today.strftime('%Y-%m-%d') + '.txt', 'w') as f:
        for psc1, values in sex.items():
            if psc1 in validated_sex:
                print(','.join((psc1, validated_sex[psc1])), file=f)
            elif len(values) > 1:
                message = '{}: multiple sex values:\n'.format(psc1)
                for value, variables in values.items():
                    count_value = 0
                    message_variable = ''
                    for variable, counters in variables.items():
                        count_variable = 0
                        message_name = ''
                        for name, count in counters.items():
                            message_name += '\t\t\t{} ({})\n'.format(name, count)
                            count_variable += count
                        message_variable += '\t\t{} ({})\n'.format(variable, count_variable) + message_name
                        count_value += count_variable
                    message_value = '\t{} ({})\n'.format(value, count_value) + message_variable
                    message += message_value
                logging.error(message)
            else:
                value = next(iter(values.keys()))
                print(','.join((psc1, value)), file=f)

    with open('STRATIFY_DOB_' + today.strftime('%Y-%m-%d') + '.txt', 'w') as f:
        for psc1, values in dob.items():
            if psc1 in validated_dob:
                print(','.join((psc1, validated_dob[psc1].strftime('%Y-%m-%d'),
                                today.strftime('%Y-%m-%d_%H:%M:%S.0'))),
                      file=f)
            elif len(values) > 1:
                message = '{}: multiple date of birth values:\n'.format(psc1)
                for value, variables in values.items():
                    count_value = 0
                    message_variable = ''
                    for variable, counters in variables.items():
                        count_variable = 0
                        message_name = ''
                        for name, count in counters.items():
                            message_name += '\t\t\t{} ({})\n'.format(name, count)
                            count_variable += count
                        message_variable += '\t\t{} ({})\n'.format(variable, count_variable) + message_name
                        count_value += count_variable
                    message_value = '\t{} ({})\n'.format(value, count_value) + message_variable
                    message += message_value
                logging.error(message)
            else:
                value = next(iter(values.keys()))
                if isinstance(value, date):
                    value = value.strftime('%Y-%m-%d')
                    print(','.join((psc1, value,
                                    today.strftime('%Y-%m-%d_%H:%M:%S.0'))),
                          file=f)
                else:
                    logging.error('%s: skipping incomplete date: %s', psc1,
                                  str(value))


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------