├── .gitignore ├── LICENSE ├── Liftover_SNP.py ├── PLINK_file_Utils.py ├── README.md ├── __init__.py ├── annotation.py ├── aud_meta_qq_lambdas.ipynb ├── config.plotgwas.3.cfg ├── convert_cleansumstats_output_to_mixer_format.py ├── data └── biomart_GENCODE_basic.txt.gz ├── fdrmat2csv.py ├── figs └── Z3nns.png ├── lift_rs_numbers.py ├── make_ld_matrix ├── README.md ├── data │ └── EUR_subj.list ├── genotypes2ref.py ├── make_ld_matrix.py └── make_maf_vector.py ├── make_universal_variant_ids.py ├── manhattan.py ├── merge_bed_files.py ├── overCorrect.py ├── plink_utils.py ├── plotgwas.py ├── process_metal.py ├── pyliftover ├── GRCh37ToHg19.over.chain.gz ├── README ├── __init__.py ├── chainfile.py ├── hg17ToHg19.over.chain.gz ├── hg18ToHg19.over.chain.gz ├── hg19ToGRCh37.over.chain.gz ├── intervaltree.py └── liftover.py ├── qq.py ├── sumStats2ref.py ├── summary_stats_Utils.py ├── sumstats.py ├── sumstats2mat.py ├── sumstats_ldsc_helper.py ├── sumstats_utils.py ├── tests ├── 1234_ref.bim ├── case01.mat ├── case01.txt ├── case01.txt.gz ├── case02.txt ├── case02.txt.gz ├── case03.txt ├── case03.txt.gz ├── case04.mat ├── case04.txt ├── case04.txt.gz ├── test_consistent.py └── test_duplicated.py └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/LICENSE -------------------------------------------------------------------------------- /Liftover_SNP.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | import numpy as np 3 | import pandas as pd 4 | import argparse 5 | import logging 6 | import random 7 | from distutils.version import StrictVersion 8 | 9 | from pyliftover import LiftOver 10 | 11 | Intro = r''' 12 | Lifting SNPs 'rs' number and genomic position across different builds. 13 | Option --find-build require biopython. Install it with "pip install biopython". 14 | Lift 'rs' number: 15 | ---------------- 16 | 1. Based on NCBI SNP merge and history files: RsMergeArch and SNPHistory 17 | 2. SNPs with ID coded as 'CHR:POS' will be left untouched 18 | Details: http://genome.sph.umich.edu/wiki/LiftOver 19 | 20 | Lift genomic position: 21 | ---------------------- 22 | 1. Only performed when the CHR and POS columns are specified 23 | 2. Based on UCSC build converting files: 24 | hg17ToHg19.over.chain.gz 25 | hg18ToHg19.over.chain.gz 26 | 3. It is possible that one position in old build maps to multiple 27 | positions in new build. In such case, 28 | The SNP is removed from cleaned data 29 | The matching with highest score were used in dup_*.txt 30 | 4. It is also possible that no new position matches the old 31 | The SNP is removed from the cleaned data 32 | The old position was kept in miss_*.txt 33 | 34 | The results after running the script will be stored in the following files, where * is the name of the original file: 35 | - lifted_* is the main result. 
It contains the original input plus new columns with the lifted data (SNP, CHR, POS) 36 | 37 | - dup_* report duplicated rs and/or CHR:BP entries after lifting 38 | - miss_* report CHR:BP entries that couldn't be lifted by LiftOver tool with given chain file 39 | - multi_* report CHR:BP entries that lift into multiple locations with given chain file 40 | 41 | Reports: 42 | - lift_pos_result reports the results of lifting BP 43 | - lift_rs_result reports the results of lifting RS# numbers 44 | - summary_lift_pos reports summary of lifting BP 45 | - summary_lift_rs reports summary of lifting RS# numbers 46 | ''' 47 | 48 | def myopen(fn): 49 | import gzip 50 | try: 51 | h = gzip.open(fn, mode='rt') 52 | ln = h.read(2) # read arbitrary bytes so check if @param fn is a gzipped file 53 | except: 54 | # cannot read in gzip format 55 | return open(fn, mode='rt') 56 | h.close() 57 | return gzip.open(fn, mode='rt') 58 | 59 | def read_rs_history(histFile): 60 | RS_HISTORY = set() # store rs 61 | 62 | logging.info("Reading '{}' file...".format(histFile)) 63 | for ln in myopen(histFile): 64 | fd = ln.strip().split('\t') 65 | if ln.lower().find('re-activ') < 0: 66 | RS_HISTORY.add(fd[0]) 67 | 68 | return RS_HISTORY 69 | 70 | 71 | def read_rs_merge(mergFile): 72 | RS_MERGE = dict() # high_rs -> (lower_rs, current_rs) 73 | 74 | logging.info("Reading '{}' file...".format(mergFile)) 75 | for ln in myopen(mergFile): 76 | fd = ln.strip().split('\t') 77 | h, l = fd[0], fd[1] 78 | c = fd[6] 79 | RS_MERGE[h] = (l, c) 80 | 81 | return RS_MERGE 82 | 83 | 84 | # Returns list of tuples, where each tuple is (rs#, build, chr, pos) 85 | def fetch_snps(snp_ids, verbose=False): 86 | from Bio import Entrez 87 | Entrez.email = "oleksandr.frei@gmail.com" 88 | 89 | def pull_var(v, line): 90 | return [x for x in line if x.startswith(v)][0].replace(v, '') 91 | 92 | def parse_snp(snp_info): 93 | 94 | snp = snp_info.split('\n') 95 | 96 | rsId = snp[0].split(" | ")[0] 97 | lineset = [x.split(' | ') for x in snp if x.startswith('CTG')] 98 | if len(lineset) == 0: 99 | return None 100 | 101 | try: 102 | build = pull_var("assembly=", lineset[0]) 103 | chr = pull_var("chr=", lineset[0]) 104 | pos = pull_var("chr-pos=", lineset[0]) 105 | except: 106 | return None 107 | 108 | return rsId, build, chr, pos 109 | 110 | logging.info('Querying dbSNP for {} SNPs...'.format(len(snp_ids))) 111 | response = Entrez.efetch(db='SNP', id=','.join(snp_ids), rettype='flt', retmode='flt').read() 112 | logging.info('Done') 113 | if verbose: 114 | print(response) 115 | 116 | snp_infos = [] 117 | for snp_info in filter(None, response.split('\n\n')): 118 | snp_infos.append(parse_snp(snp_info)) 119 | return snp_infos 120 | 121 | def lift_rs(rsvec, RS_HISTORY, RS_MERGE): 122 | RS_LIFTED = rsvec.copy(); nsnps = len(rsvec) 123 | RS_idx = np.empty((nsnps,), dtype='|S10') 124 | logging.info("Lifting rs# numbers for n={} SNPs...".format(nsnps)) 125 | for i in range(nsnps): 126 | rs = rsvec[i] 127 | if (i+1) % 200000 == 0: 128 | logging.info("{} SNPs done".format(i+1)) 129 | if rs not in RS_MERGE: 130 | RS_LIFTED[i] = rs; RS_idx[i] = 'unchanged' 131 | continue 132 | while True: 133 | if rs in RS_MERGE: 134 | rsLow, rsCurrent = RS_MERGE[rs] 135 | if rsCurrent not in RS_HISTORY and rsCurrent != '': 136 | RS_LIFTED[i] = rsCurrent; RS_idx[i] = 'lifted' 137 | break 138 | else: 139 | rs = rsLow 140 | else: 141 | RS_LIFTED[i] = rs; RS_idx[i] = 'unlifted' 142 | break 143 | logging.info("Lifting rs# numbers is finished.") 144 | return RS_LIFTED, RS_idx 145 | 146 | def 
lift_pos(posvec, chrvec, chainFile): 147 | logging.info("Lifting genomic positions...") 148 | nsnps = len(posvec) 149 | posvec = posvec -1; 150 | pos_lifted = np.empty((nsnps,), dtype='int32') 151 | chr_lifted = np.empty((nsnps,), dtype='int32') 152 | pos_indi = np.empty((nsnps,), dtype='|S10') 153 | dup_indi = np.empty((nsnps,), dtype='bool'); dup_indi.fill(False) 154 | lift = LiftOver(chainFile) 155 | for i in range(nsnps): 156 | if (i+1) % 200000 == 0: 157 | logging.info("{} SNPs done".format(i+1)) 158 | pos = posvec[i]; chr = 'chr%d' % (chrvec[i],) 159 | tmp = lift.convert_coordinate(chr, pos) 160 | if not tmp: 161 | pos_lifted[i] = pos; pos_indi[i] = 'miss'; chr_lifted[i]=chrvec[i] 162 | elif len(tmp) > 1: 163 | pos_lifted[i] = tmp[0][1]; 164 | chr_lifted[i] = re.sub('chr', '', tmp[0][0]) 165 | pos_indi[i] = 'multi' 166 | else: 167 | pos_lifted[i] = tmp[0][1] 168 | chr_lifted[i] = re.sub('chr', '', tmp[0][0]) 169 | if pos == tmp[0][1]: 170 | pos_indi[i] = 'unchanged' 171 | else: 172 | pos_indi[i] = 'lifted' 173 | return pos_lifted+1, pos_indi, chr_lifted 174 | 175 | def trim_ch_rs (sum_dat, snpCol, chrCol, with_pos): 176 | nsnps = sum_dat.shape[0] 177 | logging.info("Parsing {} rs# numbers from the input file...".format(nsnps)) 178 | chrnum_vec = np.empty((nsnps, ), dtype='int') 179 | rsvec_num = []; rsPattern = re.compile(r'rs[0-9]*') 180 | rsidx = np.empty((nsnps, ), dtype='bool'); rsidx.fill(False) 181 | for i in range(nsnps): 182 | if (i+1) % 200000 == 0: 183 | logging.info("{} SNPs done".format(i+1)) 184 | rs = sum_dat.loc[:,snpCol][i] 185 | if with_pos: 186 | chr = sum_dat.loc[:,chrCol][i] 187 | if rsPattern.match(rs): 188 | rsidx[i] = True 189 | rsvec_num.append(re.sub('rs', '', rs)) 190 | if with_pos: 191 | chrnum_vec[i] = int(re.sub('[chrCHR]', '', str(chr))) 192 | rsvec_num = np.array(rsvec_num) 193 | return rsvec_num, rsidx, chrnum_vec 194 | 195 | def try_find_build(rs, pos): 196 | snps_info = fetch_snps(rs) 197 | #snps_info = [('rs3737728', 'GRCh38.p2', '1', '1086035'), ('rs3934834', 'GRCh38.p2', '1', '1070426'), ('rs9651273', 'GRCh38.p2', '1', '1096160')] 198 | logging.info("Loading liftover chain files...") 199 | lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz') 200 | lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz') 201 | lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz') 202 | logging.info("Done") 203 | 204 | for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos): 205 | try: 206 | #if build != 'GRCh38.p2': # assume a specific build we get from Entrez.efetch(db='SNP') 207 | # continue 208 | source_pos -= 1 209 | pos_hg19 = lift38_19.convert_coordinate('chr{}'.format(true_chr), int(pos_hg38) - 1)[0][1] 210 | pos_hg18 = lift19_18.convert_coordinate('chr{}'.format(true_chr), pos_hg19)[0][1] 211 | pos_hg17 = lift19_17.convert_coordinate('chr{}'.format(true_chr), pos_hg19)[0][1] 212 | print("build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}".format( 213 | build, rsId, true_chr, source_pos, 214 | pos_hg38, '*' if pos_hg38==source_pos else '', 215 | pos_hg19, '*' if pos_hg19==source_pos else '', 216 | pos_hg18, '*' if pos_hg18==source_pos else '', 217 | pos_hg17, '*' if pos_hg17==source_pos else '')) 218 | except: 219 | pass 220 | 221 | def lift_over(sumFile, outDir, histFile, mergFile, chainFile, 222 | snpCol, chrCol, posCol, bim=False, reffile="", find_build=False): 223 | logging.info("Reading input file '{}'...".format(sumFile)) 224 | sum_dat = pd.read_csv(sumFile, sep=' +|\t', engine='python') 225 | 
logging.info("Done. Columns are: {}".format(", ".join(sum_dat.columns))) 226 | if bim: 227 | logging.info("Setting new column names based on BIM format") 228 | sum_dat.columns = ['CHR', 'SNP', 'GP', 'POS', 'A1', 'A2'] 229 | snpCol='SNP'; chrCol='CHR'; posCol='POS' 230 | 231 | if snpCol not in sum_dat.columns: 232 | raise ValueError("Input file does not have {} column".format(snpCol)) 233 | 234 | with_pos = chrCol is not None and chrCol != '-' and posCol is not None and posCol != '-' 235 | with_ref = reffile != None and reffile != "" 236 | 237 | if with_pos: 238 | if chrCol not in sum_dat.columns: 239 | raise ValueError("Input file does not have {} column".format(chrCol)) 240 | if posCol not in sum_dat.columns: 241 | raise ValueError("Input file does not have {} column".format(posCol)) 242 | 243 | if find_build: 244 | sample_size = 60 245 | sample = random.sample(range(sum_dat.shape[0]), sample_size) 246 | sum_dat_sample = sum_dat.ix[sample, :] 247 | sum_dat_sample = sum_dat_sample.sort_values(chrCol) 248 | sum_dat_sample.reset_index(inplace=True) 249 | try_find_build(sum_dat_sample[snpCol].as_matrix(), sum_dat_sample[posCol].as_matrix()) 250 | return 251 | 252 | logging.info("Checking if there are duplicates by rs# number in the input file... ") 253 | duplicated = sum_dat.duplicated(snpCol, keep=False) 254 | if any(duplicated): 255 | logging.warning("{} duplicated rs# numbers were found in the input file".format(sum(duplicated))) 256 | else: 257 | logging.info("No duplicated rs# numbers were found in the input file") 258 | 259 | rsvec_num, rsidx, chrnum_vec = trim_ch_rs(sum_dat, snpCol, chrCol, with_pos) 260 | 261 | RS_HISTORY = read_rs_history(histFile) if isinstance(histFile, str) else histFile 262 | RS_MERGE = read_rs_merge(mergFile) if isinstance(mergFile, str) else mergFile 263 | 264 | lifted_rs, lift_rs_indi = lift_rs(rsvec_num, RS_HISTORY, RS_MERGE) 265 | summary_lift_rs(sum_dat.loc[:, snpCol][rsidx], lifted_rs, lift_rs_indi, 266 | outDir) 267 | sum_dat.loc[:,'new_ID'] = sum_dat.loc[:, snpCol].copy() 268 | sum_dat.loc[rsidx, 'new_ID'] = np.array(['rs%s' % (s,) for s in lifted_rs]) 269 | # TO-DO: It is better to flag all duplicate but the function with 270 | # keep = False doesnt work ! 
So the first of multiple occurance will 271 | # sneak into the clean dataset 272 | # ofrei: keep = False seems to work well for me with pandas 0.18.0 273 | sum_dat_dup_idx = sum_dat.duplicated(subset = ('new_ID',), keep=False) 274 | if with_pos: 275 | lifted_pos, lift_pos_indi ,chr_lifted = lift_pos(sum_dat[posCol], 276 | chrnum_vec, chainFile) 277 | summary_lift_pos(sum_dat.loc[:,snpCol], sum_dat.loc[:,chrCol], 278 | sum_dat.loc[:, posCol], lifted_pos, lift_pos_indi, outDir) 279 | sum_dat.loc[:,'new_pos'] = lifted_pos 280 | sum_dat.loc[:,'new_chr'] = chr_lifted.astype('int') 281 | sum_dat.loc[:, 'postag'] = np.array(['%s:%s' % (str(c), str(p)) for 282 | c, p in zip(chr_lifted, lifted_pos)]) 283 | sum_dat_dup_idx2 = sum_dat.duplicated(subset = ('postag'), keep=False) 284 | sum_dat_dup_idx = np.logical_or(sum_dat_dup_idx, sum_dat_dup_idx2) 285 | sum_dat_miss_idx = lift_pos_indi=='miss' 286 | sum_dat_multi_idx = lift_pos_indi=='multi' 287 | if np.sum(sum_dat_multi_idx) > 0: 288 | sum_dat_multi = sum_dat.ix[sum_dat_multi_idx, :] 289 | multi_file = os.path.join(outDir, 'multi_%s' % (os.path.basename(sumFile),)) 290 | sum_dat_multi.to_csv(multi_file, index=False, sep='\t') 291 | logging.info("Created {} file with {} entries".format(multi_file, sum_dat_multi.shape[0])) 292 | 293 | elif with_ref: 294 | logging.info("Lifting with BIM reference file...") 295 | refbim = pd.read_csv(reffile,delimiter='\t', header=None) 296 | refbim.columns = ['CHR', 'SNP', 'GP', 'POS', 'A1', 'A2'] 297 | if any(duplicated): 298 | logging.warning("(!!!) pandas.merge has not been tested on how it merges duplicated entries (!!!!)") 299 | tmp = pd.merge(sum_dat, refbim, left_on='new_ID', right_on='SNP', 300 | how='left') 301 | sum_dat_miss_idx = np.isnan(tmp.POS) 302 | sum_dat.loc[~sum_dat_miss_idx, 'new_pos'] = tmp.POS[ 303 | ~sum_dat_miss_idx].astype('int') 304 | sum_dat.loc[~sum_dat_miss_idx, chrCol] = tmp.CHR[ 305 | ~sum_dat_miss_idx].astype('int') 306 | sum_dat_multi_idx = np.empty((sum_dat.shape[0],), dtype='bool') 307 | sum_dat_multi_idx.fill(False) 308 | else: 309 | sum_dat_miss_idx = np.empty((sum_dat.shape[0],), dtype='bool') 310 | sum_dat_miss_idx.fill(False) 311 | sum_dat_multi_idx = np.empty((sum_dat.shape[0],), dtype='bool') 312 | sum_dat_multi_idx.fill(False) 313 | if np.sum(sum_dat_dup_idx) > 0: 314 | sum_dat_dup = sum_dat.ix[sum_dat_dup_idx, :] 315 | dup_file = os.path.join(outDir, 'dup_%s' % (os.path.basename(sumFile),)) 316 | sum_dat_dup.to_csv(dup_file, index=False, sep='\t') 317 | logging.info("Created {} file with {} entries".format(dup_file, sum_dat_dup.shape[0])) 318 | if np.sum(sum_dat_miss_idx) > 0: 319 | sum_dat_miss = sum_dat.ix[sum_dat_miss_idx, :] 320 | miss_file = os.path.join(outDir, 'miss_%s' % (os.path.basename(sumFile),)) 321 | sum_dat_miss.to_csv(miss_file, index=False, sep='\t') 322 | logging.info("Created {} file with {} entries".format(miss_file, sum_dat_miss.shape[0])) 323 | rm_idx = (sum_dat_dup_idx.astype('int') + 324 | sum_dat_miss_idx.astype('int') + 325 | sum_dat_multi_idx.astype('int')) >=1 326 | if np.sum(rm_idx) > 0: 327 | sum_dat = sum_dat.ix[~rm_idx,:] 328 | logging.warning( 329 | "{0} entries removed from the input because of duplication, " 330 | "misses or multiple mappings between builds".format(np.sum(rm_idx))) 331 | if bim: 332 | logging.info("Updating plink files...") 333 | update_plinkfiles(outDir, sumFile, sum_dat, snpCol) 334 | else: 335 | result_file = os.path.join(outDir, 'lifted_%s' % (os.path.basename(sumFile),)) 336 | logging.info("Saving the result to 
{}...".format(result_file)) 337 | sum_dat.to_csv(result_file, index=False, sep='\t') 338 | logging.info("Done.") 339 | 340 | def update_plinkfiles(outDir, sumFile, sum_dat, snpCol): 341 | tmp_ex = os.path.join(outDir, 'tmp_extract.txt') 342 | sum_dat.to_csv(tmp_ex, index=False, sep='\t', columns=(snpCol,)) 343 | bf = re.sub('.bim','', sumFile) 344 | plink_cmd = r'''plink --bfile %s --extract %s --make-bed \ 345 | --out %s''' % (bf, tmp_ex, sumFile+'.tmp') 346 | os.system(plink_cmd) 347 | tmp_bim = pd.read_csv(sumFile+'.tmp.bim',delimiter='\t', header=None) 348 | tmp_bim.columns = ['oCHR', 'oSNP', 'oGP', 'oPOS', 'oA1', 'oA2'] 349 | tmp = pd.merge(tmp_bim, sum_dat, left_on='oSNP', right_on=snpCol, 350 | how='left') 351 | miss_idx = np.isnan(tmp.loc[:,'new_pos']) 352 | assert np.sum(miss_idx) == 0 353 | tmp_pos_file = os.path.join(outDir, 'tmp_update_pos.txt') 354 | tmp.to_csv(tmp_pos_file, index=False, sep='\t', 355 | columns=('oSNP', 'new_pos'), header=None) 356 | plink_cmd = r'''plink --bfile %s --update-map %s 2 --make-bed \ 357 | --out %s''' % (sumFile+'.tmp',tmp_pos_file, 358 | sumFile+'.tmp2') 359 | os.system(plink_cmd) 360 | tmp_bim2 = pd.read_csv(sumFile+'.tmp2.bim',sep='\t', header=None) 361 | tmp_bim2.columns = ['oCHR', 'oSNP', 'oGP', 'oPOS', 'oA1', 'oA2'] 362 | tmp = pd.merge(tmp_bim2, sum_dat, left_on='oSNP', right_on=snpCol, 363 | how='left') 364 | tmp.to_csv(sumFile+'.tmp2.bim', sep='\t', header=None, index=False, 365 | columns=('oCHR', 'new_ID', 'oGP', 'oPOS', 'oA1', 'oA2')) 366 | os.system("mv %s %s " % (sumFile+'.tmp2.bim', bf+'_lifted.bim')) 367 | os.system("mv %s %s " % (sumFile+'.tmp2.bed', bf+'_lifted.bed')) 368 | os.system("mv %s %s " % (sumFile+'.tmp2.fam', bf+'_lifted.fam')) 369 | os.system("rm %s " % (sumFile+'.tmp.bim',)) 370 | os.system("rm %s " % (sumFile+'.tmp.fam',)) 371 | os.system("rm %s " % (sumFile+'.tmp.bed',)) 372 | os.system("rm %s " % (tmp_pos_file,)) 373 | os.system("rm %s " % (tmp_ex,)) 374 | 375 | def summary_lift_rs(orig_rs, new_rs, indivec, outDir): 376 | li_idx = indivec == 'lifted' 377 | ul_idx = indivec == 'unlifted' 378 | uc_idx = indivec == 'unchanged' 379 | orig_rs = np.array(orig_rs) 380 | summary_file = os.path.join(outDir, 'summary_lift_rs.txt') 381 | logging.info("Saving lift summary to '{}'...".format(summary_file)) 382 | with open(summary_file, 'w') as f: 383 | f.write('Total number of SNPs with "rs" number: %d\n' % ( 384 | len(orig_rs,))) 385 | f.write('\t Total number of SNPs with "rs" lifted: %d\n' % ( 386 | np.sum(li_idx,))) 387 | f.write('\t Total number of SNPs with "rs" unchanged: %d\n' % ( 388 | np.sum(uc_idx,))) 389 | f.write('\t Total number of SNPs with "rs" cant lift: %d\n' % ( 390 | np.sum(ul_idx,))) 391 | 392 | results_file = os.path.join(outDir, 'lift_rs_result.txt') 393 | logging.info("Saving lifted SNPs to '{}'...".format(summary_file)) 394 | with open(results_file, 'w') as f: 395 | f.write('ORI_RS\tNEW_RS\tSTATUS\n') 396 | for i in range(len(orig_rs)): 397 | f.write('%s\trs%s\t%s\n' % (orig_rs[i], new_rs[i], indivec[i])) 398 | 399 | def summary_lift_pos(orig_snp, chrvec, posvec, new_posvec, indivec, outDir): 400 | li_idx = indivec == 'lifted' 401 | uc_idx = indivec == 'unchanged' 402 | miss_idx = indivec == 'miss' 403 | multi_idx = indivec == 'multi' 404 | orig_snp = np.array(orig_snp); chrvec = np.array(chrvec) 405 | posvec = np.array(posvec); new_posvec = np.array(new_posvec) 406 | with open(os.path.join(outDir, 'summary_lift_pos.txt'), 'w') as f: 407 | f.write('Total number of SNPs: %d\n' % ( len(orig_snp,))) 408 | 
f.write('\t Total number of SNPs lifted: %d\n' % (np.sum(li_idx,))) 409 | f.write('\t Total number of SNPs unchanged: %d\n' % (np.sum(uc_idx,))) 410 | f.write('\t Total number of SNPs missed: %d\n' % (np.sum(miss_idx,))) 411 | f.write('\t Total number of SNPs with multiple locations in new build: %d\n' % ( np.sum(multi_idx,))) 412 | with open (os.path.join(outDir, 'lift_pos_result.txt'), 'w') as f: 413 | f.write('SNP\tCHR\t\ORI_POS\tNEW_POS\tSTATUS\n') 414 | for i in range(len(orig_snp)): 415 | f.write('%s\t%s\t%d\t%d\%s\n' % (orig_snp[i], str(chrvec[i]), 416 | posvec[i], new_posvec[i], indivec[i])) 417 | 418 | if __name__ == "__main__": 419 | import time 420 | import warnings 421 | warnings.simplefilter('ignore') 422 | tsts = time.time() 423 | parser = argparse.ArgumentParser(prog="Liftover_SNPs", 424 | formatter_class=argparse.RawTextHelpFormatter, 425 | description=Intro) 426 | 427 | parser.add_argument('input_file', type=str, help='Path of the input SNPs file') 428 | 429 | parser.add_argument('-s', '--snp', type=str, required=True, help='The name of the SNP field in the input file', dest='snp_column') 430 | parser.add_argument('-c', '--chr', type=str, help='The name of the Chromosome field name in the input file', default='-', dest='chr_column') 431 | parser.add_argument('-p', '--pos', type=str, help='The name of the BP field in input file', default='-', dest='pos_column') 432 | 433 | parser.add_argument( '--output-folder', type=str, default='.', help='Output directory') 434 | parser.add_argument( '--history-file', type=str, default='pyliftover/SNPHistory.bcp.gz',help='NCBI SNP build history file') 435 | parser.add_argument( '--merge-file', type=str, default='pyliftover/RsMergeArch.bcp.gz', help='NCBI SNP merge file') 436 | parser.add_argument( '--chain-file', type=str, default='pyliftover/hg18ToHg19.over.chain.gz', help='UCSC chain file') 437 | parser.add_argument( '--find-build', action='store_true', help='Attempt to detect the build of the input file', default=False) 438 | 439 | parser.add_argument( '--bim', action='store_true', help='(experimental option) update PLINT fileset bim file', default=False) 440 | parser.add_argument( '--ref', type=str, help='(experimental option) Reference bim file', default='') 441 | 442 | parser.add_argument('-v', '--verbose', action="store_true", help="increase output verbosity") 443 | 444 | args = parser.parse_args() 445 | 446 | logging_level = logging.INFO if args.verbose else logging.WARNING 447 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging_level) 448 | 449 | if StrictVersion(pd.__version__) < StrictVersion("0.17.0"): 450 | logging.warning("Old pandas version detected. 
" 451 | "Liftover_SNP script hasn not been tested with pandas={}", pd.__version__) 452 | 453 | # Check correctness of user-provided arguments 454 | if (args.chr_column == '-') != (args.pos_column == '-'): 455 | raise ValueError("Arguments --chr-column and --pos-column must be provided together") 456 | 457 | if args.chr_column == '-' and args.find_build: 458 | raise ValueError("Unable to find build without CHR:POS information") 459 | 460 | if not os.access(args.output_folder, os.F_OK): 461 | logging.warning("Output directory {} not exists, making one for you".format(args.output_folder)) 462 | os.makedirs(args.output_folder) 463 | 464 | lift_over(args.input_file, args.output_folder, args.history_file, args.merge_file, args.chain_file, 465 | args.snp_column, args.chr_column, args.pos_column, args.bim, args.ref, args.find_build) 466 | 467 | logging.info('Finish at {}'.format(time.ctime())) 468 | ted = time.time() 469 | logging.info('Time taken {} mins {} sec'.format((ted-tsts)//60, np.round(ted-tsts) % 60)) 470 | -------------------------------------------------------------------------------- /PLINK_file_Utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os, sys, logging 4 | 5 | def read_bim(bimFile, logger=None, sep='\t'): 6 | ''' 7 | Read PLINK bim file. 8 | 9 | Input: 10 | ------ 11 | bimFile, PLINK bim file path 12 | logger, python logger for process information 13 | sep, separator of bim file 14 | 15 | Return: 16 | ------ 17 | bimDat, DataFrame with SNP information 18 | 19 | Note: 20 | ----- 21 | * Adding column names for convienience: 22 | CHR SNP GP POS A1 A2 23 | * Change ChrX->23, ChrY->24 and ChrM->25 24 | ''' 25 | if not logger: 26 | logger = logging.getLogger() 27 | logger.addHandler(logging.StreamHandler()) 28 | if not os.access(bimFile, os.R_OK): 29 | logger.error('Unable to read {}'.format(bimFile)) 30 | raise (ValueError, 'Unable to read {}'.format(bimFile)) 31 | bimDat = pd.read_csv(bimFile, sep=sep, header=None, 32 | names=['CHR', 'SNP', 'GP', 'POS', 'A1', 'A2']) 33 | bimDat.loc[:,'CHR'] = bimDat.loc[:, 'CHR'].astype('|S5') 34 | bimDat.loc[bimDat.loc[:, 'CHR']=='X', 'CHR'] = '23' 35 | bimDat.loc[bimDat.loc[:, 'CHR']=='Y', 'CHR'] = '24' 36 | bimDat.loc[bimDat.loc[:, 'CHR']=='M', 'CHR'] = '25' 37 | bimDat.loc[:, 'CHR'] = bimDat.loc[:, 'CHR'].astype('float').astype('int') 38 | bimDat.loc[:, 'POS'] = bimDat.loc[:, 'POS'].astype('int') 39 | logger.info('Read {} SNPs from {}'.format(bimDat.shape[0], bimFile)) 40 | logger.info('Columns: CHR, SNP, GP, POS, A1, A2 were used') 41 | return (bimDat) 42 | 43 | def deduplicate_bim(bimDat, outdir, logger=None): 44 | ''' 45 | Check if PLINK bim data has duplicate SNP by position. 46 | 47 | Input: 48 | ------ 49 | bimDat, DataFrame with PLINK bim data 50 | outdir, Output directory for intemediate files 51 | logger, python logger for process information 52 | 53 | Return: 54 | ------- 55 | dupIdx, Indictor Series for SNPs that should be removed 56 | 57 | Note: 58 | ----- 59 | * Save all duplicated SNPs into a gziped file. 60 | * For duplicated SNPs: 61 | * SNPs with rs-number kept, i.e., A1 is (A,T,C,G) and A2 is (A,T,C,G) and 62 | * SNP ID starts with 'rs' 63 | Otherwise, 64 | the first of the duplicates were kept. 65 | * Also save a text file containing SNP IDs that should be removed by PLINK 66 | Warning: 67 | * Extreme slow for large dataset. So better do it once and update the 68 | * corresponding PLINK bed/bim/fam. 
69 | ''' 70 | if not logger: 71 | logger = logging.getLogger() 72 | logger.addHandler(logging.StreamHandler()) 73 | dupIdx = bimDat.duplicated(subset=['CHR', 'POS'], keep=False) 74 | ndup = np.sum(dupIdx) 75 | if ndup > 0: 76 | outfile = os.path.join(outdir, 'Duplicated_SNPs_by_POS.txt.gz') 77 | logger.warn('Bim file has {} duplicated items by genomic position'.format(ndup)) 78 | logger.warn('Save all duplicated SNPs in Bim to {}'.format(outfile)) 79 | dupDat = bimDat.loc[dupIdx==True,:] 80 | dupDat.to_csv(outfile, index=False, compression='gzip', na_rep='NA', 81 | sep='\t') 82 | grouped = dupDat.groupby(by=['CHR','POS'], sort=False) 83 | for name, x in grouped: 84 | rsIdx = x.loc[:,'SNP'].str.startswith('rs') 85 | A1Idx = x.loc[:,'A1'].str.contains('[A|T|C|G]') 86 | A2Idx = x.loc[:,'A2'].str.contains('[A|T|C|G]') 87 | Idx = rsIdx & A1Idx & A2Idx 88 | if np.sum(Idx) != 1: 89 | Idx.iloc[0] = False 90 | dupIdx.values[Idx.index] = Idx.values 91 | outfile = os.path.join(outdir, 'Duplicated_SNPs_by_POS_excluded.txt.gz') 92 | logger.warn('Save removed duplicated SNPs in Bim to {}'.format(outfile)) 93 | tmpDat = dupDat.loc[dupIdx==True] 94 | tmpDat.to_csv(outfile, index=False, compression='gzip', na_rep='NA', 95 | sep='\t', columns=['SNP']) 96 | return (dupIdx) 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A collection of various utilities for GWAS summary statistics. 2 | 3 | ## sumstats.py 4 | 5 | sumstats.py is a collection of utilities that work with GWAS summary stats. 6 | ``csv`` utility reads raw summary statistics files 7 | and convert them into a standardized format: 8 | tab-separated file with standard 9 | column names, standard chromosome labels, 10 | NA label for missing data, etc. 11 | ``qc`` utility perform a set of highly customizable quality control procedures. 12 | ``mat`` utility re-saves summary stats in MATLAB format for cond/conj pleiotropy analysis. 13 | ``lift`` utility can lift genomic corredinats across genomic builds, and SNP rs numbers to a newer versions of SNPdb. 14 | 15 | Some of the steps require additional data: 16 | ``` 17 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/2558411_ref.bim 18 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/9279485_ref.bim 19 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/b149_RsMergeArch.bcp.gz 20 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/b149_SNPChrPosOnRef_105.bcp.gz 21 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/b149_SNPHistory.bcp.gz 22 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/hg18ToHg19.over.chain.gz 23 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/ref_1kG_phase3_EUR.tar.gz 24 | ``` 25 | 26 | ``` 27 | usage: sumstats.py [-h] 28 | {csv,qc,mat,lift,clump,rs,ls,mat-to-csv,ldsc-to-mat,frq-to-mat,ref-to-mat,ldsum,diff-mat} ... 29 | 30 | A collection of various utilities for GWAS summary statistics. 31 | 32 | positional arguments: 33 | {csv,qc,zscore,mat,lift,clump,rs,ls,mat-to-csv,ldsc-to-mat,frq-to-mat,ref-to-mat,ldsum,diff-mat,neff} 34 | csv Load raw summary statistics file and convert it into a 35 | standardized format: tab-separated file with standard 36 | column names, standard chromosome labels, NA label for 37 | missing data, etc. The conversion does not change the 38 | number of lines in the input files (e.g. no filtering 39 | is done on markers). 
Unrecognized columns are removed 40 | from the summary statistics file. The remaining 41 | utilities in sumstats.py work with summary statistics 42 | files in the standardized format. 43 | qc Miscellaneous quality control and filtering procedures 44 | zscore Calculate z-score from p-value column and effect size 45 | column 46 | mat Create mat files that can be used as an input for 47 | cond/conj FDR and for CM3 model. Takes csv files 48 | (created with the csv task of this script). Require 49 | columns: SNP, P, and one of the signed summary 50 | statistics columns (BETA, OR, Z, LOGODDS). Creates 51 | corresponding mat files which can be used as an input 52 | for the conditional fdr model. Only SNPs from the 53 | reference file are considered. Zscores of strand 54 | ambiguous SNPs are set to NA. To use CHR:POS for 55 | merging summary statistics with reference file 56 | consider 'rs' utility which auguments summary 57 | statistics with SNP column (first run 'sumstats.py rs 58 | ...', then feed the resulting file into sumstats.py 59 | mat ...) 60 | lift Lift RS numbers to a newer version of SNPdb, and/or 61 | liftover chr:pos to another genomic build using UCSC 62 | chain files. WARNING: this utility may use excessive 63 | amount of memory (up and beyong 32 GB of RAM). 64 | clump Perform LD-based clumping of summary stats. This works 65 | similar to FUMA snp2gene functionality 66 | (http://fuma.ctglab.nl/tutorial#snp2gene). Step 1. Re- 67 | save summary stats into one file for each chromosome. 68 | Step 2a Use 'plink --clump' to find independent 69 | significant SNPs (default r2=0.6) Step 2b Use 'plink 70 | --clump' to find lead SNPs, by clumping independent 71 | significant SNPs (default r2=0.1) Step 3. Use 'plink 72 | --ld' to find genomic loci around each independent 73 | significant SNP (default r2=0.6) Step 4. Merge 74 | together genomic loci which are closer than certain 75 | threshold (250 KB) Step 5. Merge together genomic loci 76 | that fall into exclusion regions, such as MHC Step 6. 77 | Output genomic loci report, indicating lead SNPs for 78 | each loci Step 7. Output candidate SNP report 79 | rs Augument summary statistic file with SNP RS number 80 | from reference file. Merging is done on chromosome and 81 | position. If SNP column already exists in --sumstats 82 | file, it will be overwritten. 83 | ls Report information about standard sumstat files, 84 | including the set of columns available, number of 85 | SNPs, etc. 86 | mat-to-csv Convert matlab .mat file with logpvec, zvec and 87 | (optionally) nvec into CSV files. 88 | ldsc-to-mat Convert .sumstats, .ldscore, .M, .M_5_50 and binary 89 | .annot files from LD score regression to .mat files. 90 | frq-to-mat Convert .frq files plink from .mat files. 91 | ref-to-mat Convert reference files to .mat files. 92 | ldsum convert plink .ld.gz files (pairwise ld r2) to ld 93 | scores 94 | diff-mat Compare two .mat files with logpvec, zvec and nvec, 95 | and report the differences. 96 | neff generate N column from NCASE and NCONTROL, as 4 / (1 / 97 | NCASE + 1 / NCONTROL) 98 | 99 | optional arguments: 100 | -h, --help show this help message and exit 101 | ``` 102 | 103 | For more information about each command call ``sumstats.py --help``. 
104 | 105 | Examples: 106 | ``` 107 | python $(python_convert)/sumstats.py csv --sumstats scz2.snp.results.txt.gz --out PGC_SCZ_2014.csv --force --auto --head 5 --chr hg19chrc 108 | python $(python_convert)/sumstats.py mat --sumstats PGC_SCZ_2014.csv --out PGC_SCZ_2014.mat --ref 2558411_ref.bim --force 109 | ``` 110 | 111 | Further examples can be found in [GWAS_SUMSTAT/Makefile](https://github.com/precimed/GWAS_SUMSTAT/blob/master/Makefile). 112 | 113 | ## sumstats.py clump 114 |
115 | ``clump`` utility determines 116 | - independent significant SNPs 117 | - lead SNPs 118 | - genomic loci 119 | - candidate SNPs 120 | using the same logic as FUMA's snp2gene. An example: 121 | 122 | ``` 123 | python sumstats.py clump \ 124 | --clump-field FDR \ 125 | --force \ 126 | --plink /home/oleksandr/plink/plink \ 127 | --sumstats cond0p01_BIP_vs_COG/result.mat.csv \ 128 | --bfile-chr /full/path/to/ref_1kG_phase3_EUR/chr@ \ 129 | --exclude-ranges ['6:25119106-33854733', '8:7200000-12500000'] \ 130 | --clump-p1 0.01 \ 131 | --out cond0p01_BIP_vs_COG/result.clump 132 | ``` 133 | 134 | Here the input file ``result.mat.csv`` was converted from cond/conj FDR results using this script: 135 |
136 | ``` 137 | # Usage: 138 | # python fdrmat2csv.py result.mat /space/syn03/1/data/GWAS/SUMSTAT/misc/9279485_ref.bim 139 | 140 | import pandas as pd 141 | import scipy.io as sio 142 | import sys 143 | import numpy as np 144 | if __name__ == '__main__': 145 | sumstats = sio.loadmat(sys.argv[1]) 146 | ref = pd.read_csv(sys.argv[2], delim_whitespace=True) 147 | ref['FDR'] = sumstats['fdrmat'] 148 | ref[['CHR', 'SNP', 'BP', 'A1', 'A2', 'FDR']].to_csv(sys.argv[1] + '.csv', index=False, sep='\t') 149 | ``` 150 |
151 | ## make_ld_matrix 152 | 153 | Make LD matrix from reference data. Output is either in MATLAB format or a dense lower triangular text file. 154 | To run this tool you need to download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur). 155 | Example: 156 | ``` 157 | python make_ld_matrix.py --ref 2558411_ref.bim --bfile g1000_eur --ld_window_r2 0.1 --savemat ldmat_p1.mat 158 | ``` 159 | For more info see [make_ld_matrix](./make_ld_matrix/README.md). 
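
The ``.mat`` file written by ``--savemat`` is turned into a sparse MATLAB matrix by the Octave snippet in [make_ld_matrix/README.md](./make_ld_matrix/README.md). For reference, a similar symmetrization can be sketched in Python with scipy. This is only a sketch under assumptions: it presumes the saved file exposes the ``id1``, ``id2`` and ``nsnp`` variables used by that Octave snippet, that the indices are 1-based, and it reuses the ``ldmat_p1.mat`` name from the example above.

```
import numpy as np
import scipy.io as sio
from scipy.sparse import coo_matrix, identity

# Load the pairwise index vectors saved by make_ld_matrix.py --savemat
# (variable names as used by the Octave snippet in make_ld_matrix/README.md).
m = sio.loadmat('ldmat_p1.mat')
id1 = m['id1'].ravel().astype(np.int64) - 1   # assuming 1-based (MATLAB-style) indices
id2 = m['id2'].ravel().astype(np.int64) - 1
nsnp = int(m['nsnp'].ravel()[0])

# Mirror the effect of the Octave step: a symmetric boolean matrix with a unit diagonal.
ld = coo_matrix((np.ones(len(id1), dtype=np.int8), (id1, id2)), shape=(nsnp, nsnp)).tocsr()
ld = (ld + ld.T + identity(nsnp, dtype=np.int8, format='csr')).astype(bool)

print('SNPs:', nsnp, 'non-zero LD entries:', ld.nnz)
```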
160 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/__init__.py -------------------------------------------------------------------------------- /annotation.py: -------------------------------------------------------------------------------- 1 | from time import localtime, strftime 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | from collections import namedtuple 6 | from multiprocessing import Pool 7 | import scipy.io as sio 8 | 9 | 10 | annotation_categories = ["transcript", "exon", "intron", "utr5", "utr3", 11 | "coding", "upstream_1kb", "downstream_1kb"] 12 | 13 | valid_chromosomes = [str(i) for i in range(1,23)] 14 | 15 | required_biomart_cols = {"Associated Gene Name": "gene_name", 16 | "Ensembl Gene ID": "gene_id", 17 | "Gene type": "gene_type", 18 | "Ensembl Transcript ID": "transcript_id", 19 | "Transcript Start (bp)": "transcript_start", 20 | "Transcript End (bp)": "transcript_end", 21 | "Chromosome Name": "chr", 22 | "Strand": "strand", 23 | "5' UTR Start": "utr5_start", 24 | "5' UTR End": "utr5_end", 25 | "3' UTR Start": "utr3_start", 26 | "3' UTR End": "utr3_end", 27 | "Ensembl Exon ID": "exon_id", 28 | "Exon Chr Start (bp)": "exon_start", 29 | "Exon Chr End (bp)": "exon_end", 30 | "Genomic coding start": "coding_start", 31 | "Genomic coding end": "coding_end", 32 | "Exon Rank in Transcript": "exon_rank"} 33 | 34 | required_bim_cols = {"SNP": "snp", 35 | "CHR": "chr", 36 | "BP": "pos"} 37 | 38 | snp_annotation = namedtuple("snp_annotation", annotation_categories) 39 | 40 | 41 | def is_in_upstream_1kb(snp, pos_chr_mart_df): 42 | forward_upstream = ( (pos_chr_mart_df.strand == 1) & 43 | (int(snp.pos) < pos_chr_mart_df.transcript_start) ).any() 44 | if not forward_upstream: 45 | reverse_upstream = ( (pos_chr_mart_df.strand == -1) & 46 | (int(snp.pos) >= pos_chr_mart_df.transcript_end) ).any() 47 | return forward_upstream or reverse_upstream 48 | 49 | 50 | def is_in_downstream_1kb(snp, pos_chr_mart_df): 51 | forward_downstream = ( (pos_chr_mart_df.strand == 1) & 52 | (int(snp.pos) >= pos_chr_mart_df.transcript_end) ).any() 53 | if not forward_downstream: 54 | reverse_downstream = ( (pos_chr_mart_df.strand == -1) & 55 | (int(snp.pos) < pos_chr_mart_df.transcript_start) ).any() 56 | return forward_downstream or reverse_downstream 57 | 58 | 59 | def is_in_transcript(snp, pos_chr_mart_df): 60 | return ( (pos_chr_mart_df.transcript_start <= snp.pos) & 61 | (int(snp.pos) < pos_chr_mart_df.transcript_end) ).any() 62 | 63 | 64 | def is_in_exon(snp, pos_chr_mart_df): 65 | return ( (pos_chr_mart_df.exon_start <= snp.pos) & 66 | (int(snp.pos) < pos_chr_mart_df.exon_end) ).any() 67 | 68 | 69 | def is_in_utr5(snp, coding_pos_chr_mart_df): 70 | return ( (coding_pos_chr_mart_df.utr5_start <= snp.pos) & 71 | (int(snp.pos) < coding_pos_chr_mart_df.utr5_end) ).any() 72 | 73 | 74 | def is_in_utr3(snp, coding_pos_chr_mart_df): 75 | return ( (coding_pos_chr_mart_df.utr3_start <= snp.pos) & 76 | (int(snp.pos) < coding_pos_chr_mart_df.utr3_end) ).any() 77 | 78 | 79 | def is_in_coding(snp, coding_pos_chr_mart_df): 80 | return ( (coding_pos_chr_mart_df.coding_start <= snp.pos) & 81 | (int(snp.pos) < coding_pos_chr_mart_df.coding_end) ).any() 82 | 83 | 84 | def annotate(arg): 85 | chr_snp_df, chr_mart_df = arg 86 | annot_df = 
pd.DataFrame(columns=annotation_categories) 87 | for snp_row in chr_snp_df.itertuples(): 88 | annot = dict.fromkeys(annotation_categories, False) 89 | i = ( (chr_mart_df.upstream_1kb <= snp_row.pos) & 90 | (int(snp_row.pos) < chr_mart_df.downstream_1kb) ) 91 | pos_chr_mart_df = chr_mart_df[i] 92 | if len(pos_chr_mart_df) != 0: 93 | annot["upstream_1kb"] = is_in_upstream_1kb(snp_row, pos_chr_mart_df) 94 | annot["downstream_1kb"] = is_in_downstream_1kb(snp_row, pos_chr_mart_df) 95 | annot["transcript"] = is_in_transcript(snp_row, pos_chr_mart_df) 96 | if annot["transcript"]: 97 | annot["exon"] = is_in_exon(snp_row, pos_chr_mart_df) 98 | annot["intron"] = not annot["exon"] 99 | coding_i = (pos_chr_mart_df.gene_type == "protein_coding") 100 | coding_pos_chr_mart_df = pos_chr_mart_df[coding_i] 101 | if annot["exon"] and len(pos_chr_mart_df) > 0: 102 | annot["utr5"] = is_in_utr5(snp_row, coding_pos_chr_mart_df) 103 | annot["utr3"] = is_in_utr3(snp_row, coding_pos_chr_mart_df) 104 | annot["coding"] = is_in_coding(snp_row, coding_pos_chr_mart_df) 105 | annot_df.loc[snp_row.snp] = snp_annotation(**annot) 106 | return annot_df 107 | 108 | 109 | def make_annotation_from_biomart(biomart_file, bim_file, out_txt, out_mat, 110 | test_run, test_n_snps, n_proc): 111 | mart_df = pd.read_csv(biomart_file, usecols=list(required_biomart_cols)) 112 | mart_df.columns = [required_biomart_cols[c] for c in mart_df.columns] 113 | print("%d exones in the input file" % len(mart_df)) 114 | 115 | # change to 0-based coordinates 116 | mart_df.transcript_start -= 1 117 | mart_df.exon_start -= 1 118 | mart_df.coding_start -= 1 119 | 120 | mart_df = mart_df[mart_df.chr.isin(valid_chromosomes)] 121 | print("%d exones on valid chromosomes" % len(mart_df)) 122 | mart_df["upstream_1kb"] = mart_df.transcript_start - 1000 123 | mart_df["downstream_1kb"] = mart_df.transcript_end + 1000 124 | 125 | #WARN: hardcoded CHR column 126 | snp_df = pd.read_csv(bim_file, usecols=list(required_bim_cols), 127 | dtype={"CHR":str}) 128 | snp_df.columns = [required_bim_cols[c] for c in snp_df.columns] 129 | print("%d snps in bim file" % len(snp_df)) 130 | snp_df = snp_df[snp_df.chr.isin(valid_chromosomes)] 131 | print("%d snps on valid chromosomes" % len(snp_df)) 132 | snp_df.drop_duplicates("snp", inplace=True) 133 | print("%d non duplicated snps on valid chromosomes" % len(snp_df)) 134 | 135 | if test_run: 136 | # Test with random subset 137 | print("Taking random %d snps for testing" % test_n_snps) 138 | random_ind = np.random.permutation(len(snp_df))[:test_n_snps] 139 | snp_df = snp_df.loc[random_ind,:] 140 | 141 | arg_gen = ( (snp_df[snp_df.chr == c], mart_df[mart_df.chr == c]) 142 | for c in valid_chromosomes ) 143 | if n_proc > 1: 144 | pool = Pool(processes=n_proc) 145 | annotation_dfs = pool.map(annotate, arg_gen) 146 | else: 147 | annotation_dfs = [annotate(arg) for arg in arg_gen] 148 | annot_df = pd.concat(annotation_dfs) 149 | print("%d SNPs were annotated" % len(annot_df)) 150 | annot_df[annotation_categories] = annot_df[annotation_categories].astype(int) 151 | annot_df = annot_df.reindex(index=snp_df.snp) 152 | if not out_txt is None: 153 | annot_df.to_csv(out_txt, sep='\t', index_label="snp") 154 | print("%s saved" % out_txt) 155 | if not out_mat is None: 156 | mat_dict = {"annomat": annot_df.values, "annonames": list(annot_df.columns)} 157 | sio.savemat(out_mat, mat_dict, format="5", appendmat=False) 158 | print("%s saved" % out_mat) 159 | 160 | 161 | 162 | if __name__ == "__main__": 163 | # Implementation notes: 164 | # - A 
file "biomart_GENCODE_basic.txt" was created using Ensembl Biomart 165 | # tool. Using "Homo sapiens genes (GRCh37.p13)" dataset. Only transcripts 166 | # included into GENCODE basic annotation were taken, i.e. the only fileter 167 | # applied was: "GENCODE basic annotation: Only". All keys from 168 | # required_biomart_cols dict presented above were taken as attributes. 169 | # - Definition of GENCODE basic annotation can be found here: 170 | # http://grch37.ensembl.org/Help/Glossary?id=500 171 | # - Biomart has 1-based coordinate system, while dbSNP has 0-based 172 | # coordinates. In this script everything is converted to 0-based. 173 | 174 | parser = argparse.ArgumentParser(description="Classify SNPs from reference " 175 | "template based on the biomart annotations.") 176 | parser.add_argument("--biomart", default="data/biomart_GENCODE_basic.txt.gz", 177 | type=str, help="File with Biomart annotations.") 178 | parser.add_argument("--ref", default="2558411_ref.bim", 179 | type=str, help="Reference template file.") 180 | parser.add_argument("--out-txt", default="annotations.txt", type=str, 181 | help="Output text file name or None.") 182 | parser.add_argument("--out-mat", default="annotations.mat", type=str, 183 | help="Output mat file name or None.") 184 | parser.add_argument("--test", action="store_true", 185 | help="Run test with randomly picked test_n_snps SNPs.") 186 | parser.add_argument("--test-n-snps", default=10000, type=int, 187 | help="Number of SNPs for testing.") 188 | parser.add_argument("--n-proc", default=1, type=int, help="Number of cores " 189 | "to use for calculation.") 190 | args = parser.parse_args() 191 | 192 | print("Started on %s" % strftime("%a, %d %b %Y %H:%M:%S", localtime())) 193 | 194 | make_annotation_from_biomart(args.biomart, args.ref, args.out_txt, 195 | args.out_mat, args.test, args.test_n_snps, args.n_proc) 196 | 197 | print("Finished on %s" % strftime("%a, %d %b %Y %H:%M:%S", localtime())) 198 | -------------------------------------------------------------------------------- /config.plotgwas.3.cfg: -------------------------------------------------------------------------------- 1 | # Path to input sumstats file 2 | sumstats = "/cluster/projects/p33/users/alexeas/aud_gwas/meta/scripts/generic-metal/AUD.EUR.PGC_MVP_UKB_FINNGEN.1.190622.metal.processed.tsv.gz.csv" 3 | 4 | # sumstats column names (p-value, chromosome, position and marker ID columns are required) 5 | p_col = "PVAL" 6 | chrom_col = "CHR" 7 | bp_col = "BP" 8 | id_col = "SNP" 9 | 10 | # Plot elements 11 | legend_label = "AUD EUR" # label to use in the legend 12 | legend_label_color = "#994455" # color hex code 13 | y_axis_label = "p-value" # y axis will be labeled -log10(y_axis_label) 14 | gws_threshold = 5E-8 15 | p_cutoff_low = 0.05 # all variants with p > p_cutoff_low are ignored 16 | p_cutoff_high = 1E-40 # all variants with p < p_cutoff_high are ignored 17 | allign_y_max = True # if True, make y max limit equal in top and bottom axis 18 | # Normal dots 19 | color1 = "#EE99AA" # color (hex code) of markers on odd chromosomes 20 | color2 = "#994455" # color of markers on even chromosomes 21 | size = 8 # size of the marker 22 | marker = "o" # shape of the marker, see matplotlib marker specification for available options 23 | alpha = 0.7 # transparency of markers ranging from 0 (fully transparent) to 1 (opaque) 24 | # Bold dots 25 | bold = "bold.test.txt" # file (path) with marker ids to plot with potentially different size/marker/color/transparency; set to empty string to ignore 26 | 
color1_bold = "#EE99AA" 27 | color2_bold = "#994455" 28 | size_bold = 48 29 | marker_bold = "o" 30 | alpha_bold = 1 31 | # Outlined dots 32 | outlined = "outlined.test.txt" # file (path) with marker ids to plot with outline; set to empty string to ignore 33 | color1_outlined = "#EE99AA" 34 | color2_outlined = "#994455" 35 | size_outlined = 144 36 | marker_outlined = "o" 37 | alpha_outlined = 1 38 | 39 | -------------------------------------------------------------------------------- /convert_cleansumstats_output_to_mixer_format.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | # python convert_cleansumstats_output_to_mixer_format.py /cluster/projects/p697/projects/SUMSTATv3/v3.1/STD_GRCh37/CTG_COG_2018.sumstats.gz CTG_COG_2018_mixer.sumstats.gz 6 | if __name__ == "__main__": 7 | fname = sys.argv[1] 8 | fname_out = sys.argv[2] 9 | print(f'processing {fname} -> {fname_out}...') 10 | df = pd.read_csv(fname, sep='\t', dtype=str) 11 | idx = df['CHR'].astype('str').str.lower().str.replace('chr', '').isin([str(i) for i in range(1, 23)]) 12 | print(f'keep autosomes only (CHR column contains 1-22): {np.sum(~idx)} variants removed, {np.sum(idx)} variants retained') 13 | df = df[idx].copy() 14 | df['CHR'] = df['CHR'].astype(int) 15 | print('original columns: ' + ' '.join(df.columns)) 16 | if 'POS' in df.columns: df.rename(columns={'POS':'BP'}, inplace=True) 17 | if 'RSID' in df.columns: df.rename(columns={'RSID':'SNP'}, inplace=True) 18 | if 'EffectAllele' in df.columns: df.rename(columns={'EffectAllele':'A1'}, inplace=True) 19 | if 'OtherAllele' in df.columns: df.rename(columns={'OtherAllele':'A2'}, inplace=True) 20 | if 'B' in df.columns: df.rename(columns={'B':'BETA'}, inplace=True) 21 | if 'EAF' in df.columns: df.rename(columns={'EAF':'FRQ'}, inplace=True) 22 | print('renamed columns: ' + ' '.join(df.columns)) 23 | 24 | sumstats_len = len(df) 25 | df['BP'] = pd.to_numeric(df['BP'], errors='coerce') 26 | df.dropna(subset=['BP'], inplace=True) 27 | df['BP'] = df['BP'].astype(int) 28 | print(f'Drop {sumstats_len - len(df)} variants due to non-numeric or missing values in BP column') 29 | 30 | idx = (df['CHR'] == 6) & (df['BP'] >= 25e6) & (df['BP'] < 35e6) 31 | print(f'drop MHC variants (chr6:25-35): {np.sum(idx)} variants removed, {np.sum(~idx)} variants retained') 32 | df = df[~idx].copy() 33 | 34 | print(f'writing {fname_out}...') 35 | df.to_csv(fname_out, sep='\t', index=False) 36 | print('done.') 37 | -------------------------------------------------------------------------------- /data/biomart_GENCODE_basic.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/data/biomart_GENCODE_basic.txt.gz -------------------------------------------------------------------------------- /fdrmat2csv.py: -------------------------------------------------------------------------------- 1 | ### CONVERT PLEIOFDR RESULTS TO CSV ######################## 2 | 3 | # -- Modules ------------------------- 4 | 5 | import pandas as pd 6 | import scipy.io as sio 7 | import os 8 | import argparse 9 | 10 | # -- Parse arguments ------------------------- 11 | 12 | parser = argparse.ArgumentParser(description="Convert PleioFDR result.mat file to csv") 13 | requiredNamed = parser.add_argument_group('Required arguments') 14 | requiredNamed.add_argument("--mat", help="Path to result.mat file from 
PleioFDR ouput", required=True) 15 | requiredNamed.add_argument("--ref", help="Path to .ref file", required=True) 16 | parser.add_argument("--out", help="Path to file after conversion is done (default: result.mat.csv)") 17 | parser.add_argument("--head", default=5, type=int, help="Number of lines to show (default: 5)") 18 | parser.add_argument("--compress", default=False, action='store_true', help="Compress to .gz archive (default: False)") 19 | args = parser.parse_args() 20 | 21 | matfile = args.mat 22 | reffile = args.ref 23 | if args.out is not None: 24 | outname = args.out 25 | else: 26 | outname = matfile + '.csv' 27 | 28 | # -- Convert result.mat ------------------------- 29 | 30 | if __name__ == '__main__': 31 | print('Load {}'.format(matfile)) 32 | sumstats = sio.loadmat(matfile) 33 | 34 | print('Load {}'.format(reffile)) 35 | ref = pd.read_csv(reffile, delim_whitespace=True) 36 | ref['FDR'] = None 37 | ref['FDR'] = sumstats['fdrmat'] 38 | 39 | print('Write {}'.format(outname)) 40 | ref[['CHR', 'SNP', 'BP', 'A1', 'A2', 'FDR']].to_csv(outname, index=False, sep='\t') 41 | 42 | if args.head: 43 | print(ref.head(args.head)) 44 | 45 | if args.compress: 46 | print('Compress {}'.format(outname)) 47 | os.system('gzip {}'.format(outname)) 48 | 49 | print('Done!') 50 | 51 | pass 52 | -------------------------------------------------------------------------------- /figs/Z3nns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/figs/Z3nns.png -------------------------------------------------------------------------------- /lift_rs_numbers.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | 3 | class LiftRsNumbers: 4 | def _read_rs_history(self, histFile): 5 | RS_HISTORY = set() # store rs 6 | 7 | print("Reading '{}' file...".format(histFile)) 8 | for ln in gzip.open(histFile, mode='rt'): 9 | fd = ln.strip().split('\t') 10 | # Some very few entries in SNPHistory file are about 11 | # re-activation SNPs (not about deleting them). 
12 | # We just need to ignore those entries 13 | if ln.lower().find('re-activ') < 0: 14 | RS_HISTORY.add(fd[0]) 15 | print('{} entries found'.format(len(RS_HISTORY))) 16 | return RS_HISTORY 17 | 18 | def _read_rs_merge(self, mergFile): 19 | RS_MERGE = dict() # high_rs -> (lower_rs, current_rs) 20 | 21 | print("Reading '{}' file...".format(mergFile)) 22 | for ln in gzip.open(mergFile, mode='rt'): 23 | fd = ln.strip().split('\t') 24 | h, l = fd[0], fd[1] 25 | c = fd[6] 26 | RS_MERGE[h] = (l, c) 27 | 28 | print('{} entries found'.format(len(RS_MERGE))) 29 | return RS_MERGE 30 | 31 | def __init__(self, hist_file=None, merge_file=None): 32 | self._RS_HISTORY = self._read_rs_history(hist_file) 33 | self._RS_MERGE = self._read_rs_merge(merge_file) 34 | 35 | def lift(self, rsvec): 36 | unchanged = 0; lifted = 0; deleted = 0; not_rs_number = 0; 37 | RS_LIFTED = rsvec.copy(); nsnps = len(rsvec) 38 | print("Lifting rs# numbers for n={} SNPs...".format(nsnps)) 39 | is_rs_number = [x.startswith('rs') and x[2:].isdigit() for x in rsvec] 40 | rsvec = [x[2:] for x in rsvec] 41 | for i in range(nsnps): 42 | if not is_rs_number[i]: 43 | not_rs_number += 1 44 | continue 45 | rs = rsvec[i] 46 | if rs not in self._RS_MERGE and rs not in self._RS_HISTORY: 47 | unchanged += 1 48 | continue 49 | while True: 50 | if rs in self._RS_MERGE: 51 | rsLow, rsCurrent = self._RS_MERGE[rs] 52 | if rsCurrent not in self._RS_HISTORY and rsCurrent != '': 53 | RS_LIFTED[i] = 'rs' + rsCurrent; lifted += 1 54 | break 55 | else: 56 | rs = rsLow 57 | else: 58 | # Such SNPs were deleted from SNPdb, 59 | # look it up here: https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=71800898 60 | RS_LIFTED[i] = None; deleted += 1 61 | break 62 | 63 | return RS_LIFTED, {'invalid rs#':not_rs_number, 'unchanged':unchanged, 'lifted':lifted, 'deleted':deleted} 64 | -------------------------------------------------------------------------------- /make_ld_matrix/README.md: -------------------------------------------------------------------------------- 1 | # make_ld_matrix.py 2 | 3 | Make LD matrix from reference data. Output either in matlab format or as dense lower triangular text file. 4 | To run this tool you need to download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur). 5 | 6 | For info run `python make_ld_matrix.py --help`. 
7 | 8 | 9 | ## Usage 10 | 11 | ### only abel: 12 | ``` 13 | qlogin --account=nn9114k --mem-per-cpu=2000 --cpus-per-task=2 14 | 15 | module load python2 16 | module load plink2 17 | module load octave 18 | 19 | pip install --user pandas 20 | pip install --user scipy 21 | 22 | cd /work/users/$USER/ 23 | ``` 24 | 25 | ### general: 26 | 27 | Download data: 28 | ``` 29 | ( 30 | mkdir data10g 31 | cd data10g 32 | wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr[0-9]*vcf.gz 33 | ) 34 | ``` 35 | 36 | Download code: 37 | ``` 38 | git clone https://github.com/precimed/python_convert 39 | cd python_convert/make_ld_matrix 40 | ``` 41 | 42 | Set path to your reference file: 43 | ``` 44 | reffile=/work/users/$USER/2558411_ref.bim 45 | ``` 46 | 47 | Create Matlab matrices: 48 | ``` 49 | for i in ../../data10g/*.vcf.gz; do 50 | python make_ld_matrix.py --vcf $i \ 51 | --ref $reffile --savemat tmp/$(basename $i).map \ 52 | --plink 'plink --memory 3600 --threads 2'; 53 | done 54 | ``` 55 | 56 | Convert to Matlab sparse matrices: 57 | ``` 58 | for f in tmp/*.map.mat; do 59 | # repace ".map.mat" with ".sparse.mat" 60 | outfile=${f%.map.mat}.sparse.mat 61 | # matlab script 62 | mscript=" 63 | load $f 64 | LDmat = sparse(double(id1),double(id2),true,double(nsnp),double(nsnp)); 65 | LDmat = LDmat | speye(double(nsnp)); 66 | LDmat = LDmat | (LDmat - LDmat'); 67 | save(\"$outfile\", 'LDmat', '-v7.3') 68 | " 69 | # run matlab script 70 | echo "$mscript" | octave --silent 71 | done 72 | ``` 73 | Inside the folder "./tmp/" you now have a file "*.sparse.mat" for each chromosome. 74 | -------------------------------------------------------------------------------- /make_ld_matrix/data/EUR_subj.list: -------------------------------------------------------------------------------- 1 | HG00096 HG00096 2 | HG00097 HG00097 3 | HG00099 HG00099 4 | HG00100 HG00100 5 | HG00101 HG00101 6 | HG00102 HG00102 7 | HG00103 HG00103 8 | HG00105 HG00105 9 | HG00106 HG00106 10 | HG00107 HG00107 11 | HG00108 HG00108 12 | HG00109 HG00109 13 | HG00110 HG00110 14 | HG00111 HG00111 15 | HG00112 HG00112 16 | HG00113 HG00113 17 | HG00114 HG00114 18 | HG00115 HG00115 19 | HG00116 HG00116 20 | HG00117 HG00117 21 | HG00118 HG00118 22 | HG00119 HG00119 23 | HG00120 HG00120 24 | HG00121 HG00121 25 | HG00122 HG00122 26 | HG00123 HG00123 27 | HG00125 HG00125 28 | HG00126 HG00126 29 | HG00127 HG00127 30 | HG00128 HG00128 31 | HG00129 HG00129 32 | HG00130 HG00130 33 | HG00131 HG00131 34 | HG00132 HG00132 35 | HG00133 HG00133 36 | HG00136 HG00136 37 | HG00137 HG00137 38 | HG00138 HG00138 39 | HG00139 HG00139 40 | HG00140 HG00140 41 | HG00141 HG00141 42 | HG00142 HG00142 43 | HG00143 HG00143 44 | HG00145 HG00145 45 | HG00146 HG00146 46 | HG00148 HG00148 47 | HG00149 HG00149 48 | HG00150 HG00150 49 | HG00151 HG00151 50 | HG00154 HG00154 51 | HG00155 HG00155 52 | HG00157 HG00157 53 | HG00158 HG00158 54 | HG00159 HG00159 55 | HG00160 HG00160 56 | HG00171 HG00171 57 | HG00173 HG00173 58 | HG00174 HG00174 59 | HG00176 HG00176 60 | HG00177 HG00177 61 | HG00178 HG00178 62 | HG00179 HG00179 63 | HG00180 HG00180 64 | HG00181 HG00181 65 | HG00182 HG00182 66 | HG00183 HG00183 67 | HG00185 HG00185 68 | HG00186 HG00186 69 | HG00187 HG00187 70 | HG00188 HG00188 71 | HG00189 HG00189 72 | HG00190 HG00190 73 | HG00231 HG00231 74 | HG00232 HG00232 75 | HG00233 HG00233 76 | HG00234 HG00234 77 | HG00235 HG00235 78 | HG00236 HG00236 79 | HG00237 HG00237 80 | HG00238 HG00238 81 | HG00239 HG00239 82 | HG00240 HG00240 83 | HG00242 HG00242 84 | HG00243 
HG00243 85 | HG00244 HG00244 86 | HG00245 HG00245 87 | HG00246 HG00246 88 | HG00250 HG00250 89 | HG00251 HG00251 90 | HG00252 HG00252 91 | HG00253 HG00253 92 | HG00254 HG00254 93 | HG00255 HG00255 94 | HG00256 HG00256 95 | HG00257 HG00257 96 | HG00258 HG00258 97 | HG00259 HG00259 98 | HG00260 HG00260 99 | HG00261 HG00261 100 | HG00262 HG00262 101 | HG00263 HG00263 102 | HG00264 HG00264 103 | HG00265 HG00265 104 | HG00266 HG00266 105 | HG00267 HG00267 106 | HG00268 HG00268 107 | HG00269 HG00269 108 | HG00271 HG00271 109 | HG00272 HG00272 110 | HG00273 HG00273 111 | HG00274 HG00274 112 | HG00275 HG00275 113 | HG00276 HG00276 114 | HG00277 HG00277 115 | HG00278 HG00278 116 | HG00280 HG00280 117 | HG00281 HG00281 118 | HG00282 HG00282 119 | HG00284 HG00284 120 | HG00285 HG00285 121 | HG00288 HG00288 122 | HG00290 HG00290 123 | HG00304 HG00304 124 | HG00306 HG00306 125 | HG00308 HG00308 126 | HG00309 HG00309 127 | HG00310 HG00310 128 | HG00311 HG00311 129 | HG00313 HG00313 130 | HG00315 HG00315 131 | HG00318 HG00318 132 | HG00319 HG00319 133 | HG00320 HG00320 134 | HG00321 HG00321 135 | HG00323 HG00323 136 | HG00324 HG00324 137 | HG00325 HG00325 138 | HG00326 HG00326 139 | HG00327 HG00327 140 | HG00328 HG00328 141 | HG00329 HG00329 142 | HG00330 HG00330 143 | HG00331 HG00331 144 | HG00332 HG00332 145 | HG00334 HG00334 146 | HG00335 HG00335 147 | HG00336 HG00336 148 | HG00337 HG00337 149 | HG00338 HG00338 150 | HG00339 HG00339 151 | HG00341 HG00341 152 | HG00342 HG00342 153 | HG00343 HG00343 154 | HG00344 HG00344 155 | HG00345 HG00345 156 | HG00346 HG00346 157 | HG00349 HG00349 158 | HG00350 HG00350 159 | HG00351 HG00351 160 | HG00353 HG00353 161 | HG00355 HG00355 162 | HG00356 HG00356 163 | HG00357 HG00357 164 | HG00358 HG00358 165 | HG00360 HG00360 166 | HG00361 HG00361 167 | HG00362 HG00362 168 | HG00364 HG00364 169 | HG00365 HG00365 170 | HG00366 HG00366 171 | HG00367 HG00367 172 | HG00368 HG00368 173 | HG00369 HG00369 174 | HG00371 HG00371 175 | HG00372 HG00372 176 | HG00373 HG00373 177 | HG00375 HG00375 178 | HG00376 HG00376 179 | HG00378 HG00378 180 | HG00379 HG00379 181 | HG00380 HG00380 182 | HG00381 HG00381 183 | HG00382 HG00382 184 | HG00383 HG00383 185 | HG00384 HG00384 186 | HG01334 HG01334 187 | HG01500 HG01500 188 | HG01501 HG01501 189 | HG01503 HG01503 190 | HG01504 HG01504 191 | HG01506 HG01506 192 | HG01507 HG01507 193 | HG01509 HG01509 194 | HG01510 HG01510 195 | HG01512 HG01512 196 | HG01513 HG01513 197 | HG01515 HG01515 198 | HG01516 HG01516 199 | HG01518 HG01518 200 | HG01519 HG01519 201 | HG01521 HG01521 202 | HG01522 HG01522 203 | HG01524 HG01524 204 | HG01525 HG01525 205 | HG01527 HG01527 206 | HG01528 HG01528 207 | HG01530 HG01530 208 | HG01531 HG01531 209 | HG01536 HG01536 210 | HG01537 HG01537 211 | HG01602 HG01602 212 | HG01603 HG01603 213 | HG01605 HG01605 214 | HG01606 HG01606 215 | HG01607 HG01607 216 | HG01608 HG01608 217 | HG01610 HG01610 218 | HG01612 HG01612 219 | HG01613 HG01613 220 | HG01615 HG01615 221 | HG01617 HG01617 222 | HG01618 HG01618 223 | HG01619 HG01619 224 | HG01620 HG01620 225 | HG01623 HG01623 226 | HG01624 HG01624 227 | HG01625 HG01625 228 | HG01626 HG01626 229 | HG01628 HG01628 230 | HG01630 HG01630 231 | HG01631 HG01631 232 | HG01632 HG01632 233 | HG01668 HG01668 234 | HG01669 HG01669 235 | HG01670 HG01670 236 | HG01672 HG01672 237 | HG01673 HG01673 238 | HG01675 HG01675 239 | HG01676 HG01676 240 | HG01678 HG01678 241 | HG01679 HG01679 242 | HG01680 HG01680 243 | HG01682 HG01682 244 | HG01684 HG01684 245 | HG01685 HG01685 246 | HG01686 
HG01686 247 | HG01694 HG01694 248 | HG01695 HG01695 249 | HG01697 HG01697 250 | HG01699 HG01699 251 | HG01700 HG01700 252 | HG01702 HG01702 253 | HG01704 HG01704 254 | HG01705 HG01705 255 | HG01707 HG01707 256 | HG01708 HG01708 257 | HG01709 HG01709 258 | HG01710 HG01710 259 | HG01746 HG01746 260 | HG01747 HG01747 261 | HG01756 HG01756 262 | HG01757 HG01757 263 | HG01761 HG01761 264 | HG01762 HG01762 265 | HG01765 HG01765 266 | HG01766 HG01766 267 | HG01767 HG01767 268 | HG01768 HG01768 269 | HG01770 HG01770 270 | HG01771 HG01771 271 | HG01773 HG01773 272 | HG01775 HG01775 273 | HG01776 HG01776 274 | HG01777 HG01777 275 | HG01779 HG01779 276 | HG01781 HG01781 277 | HG01783 HG01783 278 | HG01784 HG01784 279 | HG01785 HG01785 280 | HG01786 HG01786 281 | HG01789 HG01789 282 | HG01790 HG01790 283 | HG01791 HG01791 284 | HG02215 HG02215 285 | HG02219 HG02219 286 | HG02220 HG02220 287 | HG02221 HG02221 288 | HG02223 HG02223 289 | HG02224 HG02224 290 | HG02230 HG02230 291 | HG02231 HG02231 292 | HG02232 HG02232 293 | HG02233 HG02233 294 | HG02235 HG02235 295 | HG02236 HG02236 296 | HG02238 HG02238 297 | HG02239 HG02239 298 | NA06984 NA06984 299 | NA06985 NA06985 300 | NA06986 NA06986 301 | NA06989 NA06989 302 | NA06994 NA06994 303 | NA07000 NA07000 304 | NA07037 NA07037 305 | NA07048 NA07048 306 | NA07051 NA07051 307 | NA07056 NA07056 308 | NA07347 NA07347 309 | NA07357 NA07357 310 | NA10847 NA10847 311 | NA10851 NA10851 312 | NA11829 NA11829 313 | NA11830 NA11830 314 | NA11831 NA11831 315 | NA11832 NA11832 316 | NA11840 NA11840 317 | NA11843 NA11843 318 | NA11881 NA11881 319 | NA11892 NA11892 320 | NA11893 NA11893 321 | NA11894 NA11894 322 | NA11918 NA11918 323 | NA11919 NA11919 324 | NA11920 NA11920 325 | NA11930 NA11930 326 | NA11931 NA11931 327 | NA11932 NA11932 328 | NA11933 NA11933 329 | NA11992 NA11992 330 | NA11994 NA11994 331 | NA11995 NA11995 332 | NA12003 NA12003 333 | NA12004 NA12004 334 | NA12005 NA12005 335 | NA12006 NA12006 336 | NA12043 NA12043 337 | NA12044 NA12044 338 | NA12045 NA12045 339 | NA12046 NA12046 340 | NA12058 NA12058 341 | NA12144 NA12144 342 | NA12154 NA12154 343 | NA12155 NA12155 344 | NA12156 NA12156 345 | NA12234 NA12234 346 | NA12249 NA12249 347 | NA12272 NA12272 348 | NA12273 NA12273 349 | NA12275 NA12275 350 | NA12282 NA12282 351 | NA12283 NA12283 352 | NA12286 NA12286 353 | NA12287 NA12287 354 | NA12340 NA12340 355 | NA12341 NA12341 356 | NA12342 NA12342 357 | NA12347 NA12347 358 | NA12348 NA12348 359 | NA12383 NA12383 360 | NA12399 NA12399 361 | NA12400 NA12400 362 | NA12413 NA12413 363 | NA12414 NA12414 364 | NA12489 NA12489 365 | NA12546 NA12546 366 | NA12716 NA12716 367 | NA12717 NA12717 368 | NA12718 NA12718 369 | NA12748 NA12748 370 | NA12749 NA12749 371 | NA12750 NA12750 372 | NA12751 NA12751 373 | NA12760 NA12760 374 | NA12761 NA12761 375 | NA12762 NA12762 376 | NA12763 NA12763 377 | NA12775 NA12775 378 | NA12776 NA12776 379 | NA12777 NA12777 380 | NA12778 NA12778 381 | NA12812 NA12812 382 | NA12813 NA12813 383 | NA12814 NA12814 384 | NA12815 NA12815 385 | NA12827 NA12827 386 | NA12828 NA12828 387 | NA12829 NA12829 388 | NA12830 NA12830 389 | NA12842 NA12842 390 | NA12843 NA12843 391 | NA12872 NA12872 392 | NA12873 NA12873 393 | NA12874 NA12874 394 | NA12878 NA12878 395 | NA12889 NA12889 396 | NA12890 NA12890 397 | NA20502 NA20502 398 | NA20503 NA20503 399 | NA20504 NA20504 400 | NA20505 NA20505 401 | NA20506 NA20506 402 | NA20507 NA20507 403 | NA20508 NA20508 404 | NA20509 NA20509 405 | NA20510 NA20510 406 | NA20511 NA20511 407 | NA20512 NA20512 408 
| NA20513 NA20513 409 | NA20514 NA20514 410 | NA20515 NA20515 411 | NA20516 NA20516 412 | NA20517 NA20517 413 | NA20518 NA20518 414 | NA20519 NA20519 415 | NA20520 NA20520 416 | NA20521 NA20521 417 | NA20522 NA20522 418 | NA20524 NA20524 419 | NA20525 NA20525 420 | NA20527 NA20527 421 | NA20528 NA20528 422 | NA20529 NA20529 423 | NA20530 NA20530 424 | NA20531 NA20531 425 | NA20532 NA20532 426 | NA20533 NA20533 427 | NA20534 NA20534 428 | NA20535 NA20535 429 | NA20536 NA20536 430 | NA20538 NA20538 431 | NA20539 NA20539 432 | NA20540 NA20540 433 | NA20541 NA20541 434 | NA20542 NA20542 435 | NA20543 NA20543 436 | NA20544 NA20544 437 | NA20581 NA20581 438 | NA20582 NA20582 439 | NA20585 NA20585 440 | NA20586 NA20586 441 | NA20587 NA20587 442 | NA20588 NA20588 443 | NA20589 NA20589 444 | NA20752 NA20752 445 | NA20753 NA20753 446 | NA20754 NA20754 447 | NA20755 NA20755 448 | NA20756 NA20756 449 | NA20757 NA20757 450 | NA20758 NA20758 451 | NA20759 NA20759 452 | NA20760 NA20760 453 | NA20761 NA20761 454 | NA20762 NA20762 455 | NA20763 NA20763 456 | NA20764 NA20764 457 | NA20765 NA20765 458 | NA20766 NA20766 459 | NA20767 NA20767 460 | NA20768 NA20768 461 | NA20769 NA20769 462 | NA20770 NA20770 463 | NA20771 NA20771 464 | NA20772 NA20772 465 | NA20773 NA20773 466 | NA20774 NA20774 467 | NA20775 NA20775 468 | NA20778 NA20778 469 | NA20783 NA20783 470 | NA20785 NA20785 471 | NA20786 NA20786 472 | NA20787 NA20787 473 | NA20790 NA20790 474 | NA20792 NA20792 475 | NA20795 NA20795 476 | NA20796 NA20796 477 | NA20797 NA20797 478 | NA20798 NA20798 479 | NA20799 NA20799 480 | NA20800 NA20800 481 | NA20801 NA20801 482 | NA20802 NA20802 483 | NA20803 NA20803 484 | NA20804 NA20804 485 | NA20805 NA20805 486 | NA20806 NA20806 487 | NA20807 NA20807 488 | NA20808 NA20808 489 | NA20809 NA20809 490 | NA20810 NA20810 491 | NA20811 NA20811 492 | NA20812 NA20812 493 | NA20813 NA20813 494 | NA20814 NA20814 495 | NA20815 NA20815 496 | NA20818 NA20818 497 | NA20819 NA20819 498 | NA20821 NA20821 499 | NA20822 NA20822 500 | NA20826 NA20826 501 | NA20827 NA20827 502 | NA20828 NA20828 503 | NA20832 NA20832 504 | -------------------------------------------------------------------------------- /make_ld_matrix/genotypes2ref.py: -------------------------------------------------------------------------------- 1 | # Align genotypes to reference file, e.g. 
2 | # - Extract SNPs from the reference (merge by CHR:POS --- require data to be on the same genomic build) 3 | # - Extract subset of individuals (for example, european population) 4 | # - Merge SNPs together (for example if input data is split by chromosome) 5 | # 6 | # To run this tool: 7 | # - Download *.vcf.gz files from 1000 Genome project ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ 8 | # - Run the tool as follows 9 | # python genotypes2ref.py --vcf ~/1000Genome/phase3/build37_released/*.vcf.gz --ref 2558411_ref.bim 10 | 11 | import argparse 12 | import glob 13 | import itertools 14 | import os.path 15 | import os 16 | import subprocess 17 | import sys 18 | import pandas as pd 19 | 20 | def parse_args(args): 21 | parser = argparse.ArgumentParser(description="Generate LD matrix from genotype matrix") 22 | parser.add_argument("--ref", type=str, help="Reference file (for example 2558411_ref.bim or 9279485_ref.bim.") 23 | parser.add_argument("--vcf", type=str, help="Filename of input .vcf file, or pattern (for example '~/1000Genome/phase3/build37_released/*.vcf.gz')") 24 | parser.add_argument("--keep", default=r"data/EUR_subj.list", type=str, help="Extract SNPs and keep only EUR individuals") 25 | parser.add_argument("--out", default=r"tmp", type=str, help="Folder to output the result") 26 | parser.add_argument("--plink", default="plink", type=str, help="location of plink executable") 27 | return parser.parse_args(args) 28 | 29 | def execute_command(command): 30 | print("Execute command: {}".format(command)) 31 | print(subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0].decode("utf-8")) 32 | #print(subprocess.check_output(command.split()).decode("utf-8")) 33 | 34 | def process_vcf_file(vcf_file, df_ref, keep_file, output_dir, plink): 35 | [_, filename] = os.path.split(vcf_file) 36 | bfile = os.path.join(output_dir, filename) 37 | snpidlist = os.path.join(output_dir, filename + '.snpidlist.txt') 38 | join_file = os.path.join(output_dir, filename) 39 | 40 | if not os.path.exists(output_dir): 41 | os.makedirs(output_dir) 42 | 43 | # Convert vcf into bed 44 | execute_command(r'{0} --vcf {1} --make-bed --out {2}'.format(plink, vcf_file, bfile)) 45 | 46 | # Read bim file 47 | df_bim = pd.read_csv('{}.bim'.format(bfile), header=None, delim_whitespace=True) 48 | df_bim.columns=['CHR','SNP','GP','POS','A1','A2'] 49 | 50 | # Left-merge with reference file by CHR:POS, then output merged RS numberes into snpidlist.txt 51 | df_bim2 = pd.merge(df_bim, df_ref, how='left', left_on = ['CHR', 'POS'], right_on = ['CHR', 'BP']) 52 | df_bim2[df_bim2['SNP_y'].notnull()]['SNP_x'].to_csv(snpidlist, index=False) 53 | 54 | # Extract SNPs and keep only EUR individuals 55 | execute_command(r'{0} --bfile {1} --extract {2} --keep {3} --make-bed --out {4}'.format(plink, bfile, snpidlist, keep_file, join_file)) 56 | return join_file 57 | 58 | if __name__ == "__main__": 59 | args = parse_args(sys.argv[1:]) 60 | 61 | vcf_files=[file for file in glob.glob(args.vcf) 62 | if ('chrX' not in file) and ('chrY' not in file)] 63 | print(vcf_files) 64 | # Read reference file 65 | df_ref = pd.read_csv(args.ref, delim_whitespace=True) 66 | assert df_ref.duplicated(['CHR', 'BP']).sum() == 0 67 | assert df_ref.duplicated(['SNP']).sum() == 0 68 | 69 | for vcf_file in vcf_files: process_vcf_file(vcf_file, df_ref, args.keep, args.out, args.plink) 70 | 71 | print("Done.") 72 | -------------------------------------------------------------------------------- 
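To make the SNP-selection step in genotypes2ref.py above concrete, here is a toy sketch (all values hypothetical) of the CHR:POS left-merge that process_vcf_file uses to decide which variant IDs are written to snpidlist.txt:

import pandas as pd

# Toy genotype .bim and toy reference; only CHR:POS pairs present in the reference survive
df_bim = pd.DataFrame({'CHR': [1, 1, 2], 'SNP': ['rs1', '1:200', 'rs9'], 'GP': 0,
                       'POS': [100, 200, 300], 'A1': 'A', 'A2': 'G'})
df_ref = pd.DataFrame({'CHR': [1, 2], 'SNP': ['rs1', 'rs9'], 'BP': [100, 300]})

merged = pd.merge(df_bim, df_ref, how='left', left_on=['CHR', 'POS'], right_on=['CHR', 'BP'])
keep = merged.loc[merged['SNP_y'].notnull(), 'SNP_x']
print(list(keep))  # ['rs1', 'rs9'] -- these IDs would go into snpidlist.txt and on to plink --extract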
/make_ld_matrix/make_ld_matrix.py: -------------------------------------------------------------------------------- 1 | # Download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur) 2 | # Then you can run the tool as follows: 3 | # python make_ld_matrix.py --ref 2558411_ref.bim --bfile g1000_eur --ld_window_r2 0.1 --savemat ldmat_p1.mat 4 | # 5 | # Another example is for situation where you've already generated LD matrix by plink: 6 | # python make_ld_matrix.py --ref 2558411_ref.bim --ldfile tmp.ld --savemat ldmat.mat 7 | 8 | from subprocess import call, check_output 9 | import subprocess 10 | import pandas as pd 11 | import numpy as np 12 | import argparse 13 | import sys 14 | import os.path 15 | from make_maf_vector import make_maf_vector 16 | from genotypes2ref import process_vcf_file 17 | 18 | def execute_command(command): 19 | print("Execute command: {}".format(command)) 20 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 21 | print(process.communicate()[0].decode("utf-8")) 22 | #print(subprocess.check_output(command.split()).decode("utf-8")) 23 | 24 | 25 | def parse_args(args): 26 | parser = argparse.ArgumentParser(description="Generate LD matrix from genotype matrix") 27 | parser.add_argument("--ref", type=str, help="Reference file (for example 2558411_ref.bim or 9279485_ref.bim.") 28 | parser.add_argument("--bfile", type=str, help="Genotypes in plink binary format") 29 | parser.add_argument("--vcf", type=str, help="Filename of input .vcf file, or pattern (for example '~/1000Genome/phase3/build37_released/*.vcf.gz')") 30 | parser.add_argument("--keep", default=None, type=str, help="Extract SNPs and keep only EUR individuals") 31 | parser.add_argument("--ldfile", type=str, default=None, help="Path to .ld file generated by plink (takes priority over bfile, ld_window_kb and ld_window_r2") 32 | parser.add_argument("--ld_window_kb", default=10000, type=int, help="Window in KB") 33 | parser.add_argument("--ld_window_r2", default=0.1, type=float, help="LD r2 threshold") 34 | parser.add_argument("--chunksize", default=1000000, type=int, help="Chunk size when reading ld matrix") 35 | parser.add_argument("--plink", default="plink", type=str, help="location of plink executable") 36 | parser.add_argument("--savemat", default=None, type=str, help="Generate matfile for Matlab.") 37 | parser.add_argument("--saveltm", default=None, type=str, help="Generate 'ltm' --- lower triangular matrix in plain text format.") 38 | return parser.parse_args(args) 39 | 40 | def make_ld_matrix(args): 41 | if not args.savemat and not args.saveltm: 42 | raise ValueError('No output requested, use --savemat or --saveltm') 43 | if args.savemat and os.path.isfile(args.savemat): 44 | raise ValueError('Output file already exist: {}'.format(args.savemat)) 45 | if args.saveltm and os.path.isfile(args.saveltm): 46 | raise ValueError('Output file already exist: {}'.format(args.saveltm)) 47 | 48 | # Read the template 49 | print('Reading {0}...'.format(args.ref)) 50 | ref = pd.read_csv(args.ref, delim_whitespace=True, usecols=['BP', 'CHR']) 51 | nsnp = ref.shape[0] 52 | chrpos_to_id = dict([((chr, pos), index) for chr, pos, index in zip(ref['CHR'], ref['BP'], ref.index)]) 53 | if len(chrpos_to_id) != nsnp: raise ValueError("Duplicated CHR:POS pairs found in the reference file") 54 | 55 | if args.vcf is not None: 56 | args.bfile = process_vcf_file(args.vcf, ref, args.keep, 'tmp', args.plink) 57 | 58 | if args.bfile is not None: 59 | execute_command('{0} 
--bfile {1} --freq --out {1}'.format(args.plink, args.bfile)) 60 | mafvec = make_maf_vector(chrpos_to_id, nsnp, args.bfile); 61 | else: 62 | mafvec = np.empty((nsnp, 1)) 63 | mafvec[:] = np.NAN 64 | 65 | if args.ldfile is None: 66 | # Create LD file in table format 67 | execute_command('{0} --bfile {1} --r2 gz --ld-window-kb {2} --ld-window 999999 --ld-window-r2 {3} --out {1}'.format(args.plink, args.bfile, args.ld_window_kb, args.ld_window_r2)) 68 | args.ldfile = '{0}.ld.gz'.format(args.bfile) 69 | 70 | # Read resulting LD matrix 71 | reader = pd.read_csv(args.ldfile, delim_whitespace=True, chunksize=args.chunksize) 72 | 73 | print('Parsing {0}...'.format(args.ldfile)) 74 | total_df = None 75 | for i, df in enumerate(reader): 76 | df_len_original = len(df) 77 | df = df[df['R2'] >= args.ld_window_r2].copy() 78 | id1tmp = [chrpos_to_id.get((chr, pos), None) for chr, pos in zip(df['CHR_A'], df['BP_A'])] 79 | id2tmp = [chrpos_to_id.get((chr, pos), None) for chr, pos in zip(df['CHR_B'], df['BP_B'])] 80 | mask = [(i1 is not None and i2 is not None) for i1, i2 in zip(id1tmp, id2tmp)] 81 | id1 = [value for index, value in enumerate(id1tmp) if mask[index] == True] 82 | id2 = [value for index, value in enumerate(id2tmp) if mask[index] == True] 83 | val = [value for index, value in enumerate(df['R2']) if mask[index] == True] 84 | df_tmp = pd.DataFrame(data={'id1': id1, 'id2': id2, 'val': val}) 85 | total_df = df_tmp if total_df is None else total_df.append(df_tmp, ignore_index=True) 86 | print('\rFinish {0} entries ({1} after joining with ref and applying r2 threshold)'.format(i * args.chunksize + df_len_original, total_df.shape[0])) 87 | print('. Done.') 88 | 89 | print('Detecting duplicated entries...') 90 | old_size = total_df.shape[0] 91 | total_df.drop_duplicates(subset=['id1', 'id2'], keep='first', inplace=True) 92 | print('Drop {} duplicated entries'.format(old_size-total_df.shape[0])) 93 | 94 | # Output the result as lower diagonal matrix 95 | if args.saveltm: 96 | print('Save result as lower diagonal matrix to {0}...'.format(args.saveltm)) 97 | from scipy.sparse import csr_matrix 98 | id1=list(total_df['id1']); id2 = list(total_df['id2']); val = list(total_df['val']) 99 | assert(all([(i < j) for (i, j) in zip(id1, id2)])) # expect that plink output lower diagonal matrix 100 | csr = csr_matrix((val, (id2, id1)), shape=(nsnp, nsnp)) 101 | 102 | with open(args.saveltm, 'w') as result: 103 | result.write('1.0\n') 104 | for i in range(1, nsnp): 105 | values = csr[i, :].todense()[0, 0:i].A1 106 | values_str = '\t'.join(str(x) for x in values) 107 | result.write('{0}\t1.0\n'.format(values_str)) 108 | 109 | # Output the result in matlab format 110 | if args.savemat: 111 | print('Save result in matlab format to {0}...'.format(args.savemat)) 112 | import scipy.io as sio 113 | sio.savemat( 114 | args.savemat, {'id1':[i + 1 for i in total_df['id1']], 'id2':[i + 1 for i in total_df['id2']], 'val':list(total_df['val']), 'nsnp':nsnp, 'mafvec':mafvec }, 115 | format='5', do_compression=False, oned_as='column') 116 | 117 | print(""" 118 | The results are saved into {0}. 
Now you should open matlab and execute the following commands to re-save the result as matlab sparse matrix: 119 | load {0} 120 | LDmat = sparse(double(id1),double(id2),true,double(nsnp),double(nsnp)); 121 | LDmat = LDmat | speye(double(nsnp)); 122 | LDmat = LDmat | (LDmat - LDmat'); 123 | save('LDmat.mat', 'LDmat', '-v7.3') 124 | 125 | Or, to save with the actual r^2 values: 126 | load {0} 127 | i1 = [id1; id2; (1:nsnp)'];i2 = [id2; id1; (1:nsnp)']; v = [val; val; ones(nsnp, 1)]; 128 | LDmat = sparse(double(i1),double(i2),double(v),double(nsnp),double(nsnp)); 129 | save('LDmat.mat', 'LDmat', '-v7.3') 130 | """.format(args.savemat)) 131 | 132 | 133 | if __name__ == "__main__": 134 | args = parse_args(sys.argv[1:]) 135 | make_ld_matrix(args) 136 | print("Done.") 137 | -------------------------------------------------------------------------------- /make_ld_matrix/make_maf_vector.py: -------------------------------------------------------------------------------- 1 | # Download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur) 2 | # Then you can run the tool as follows: 3 | # python make_maf_vector.py --ref 2558411_ref.bim --bfile merged --savemat mafvec.mat 4 | 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import argparse 9 | import sys 10 | import scipy.io as sio 11 | 12 | 13 | def parse_args(args): 14 | parser = argparse.ArgumentParser(description="Generate LD matrix from genotype matrix") 15 | parser.add_argument("--ref", type=str, help="Reference file (for example 2558411_ref.bim or 9279485_ref.bim.") 16 | parser.add_argument("--bfile", type=str, help="Genotypes in plink binary format (only bim and frq files are required)") 17 | parser.add_argument("--savemat", default=None, type=str, help="Generate matfile for Matlab.") 18 | return parser.parse_args(args) 19 | 20 | 21 | def make_maf_vector(chrpos_to_id, nsnp, bfile): 22 | print('Reading {0}...'.format(bfile + '.bim')) 23 | df_bim = pd.read_csv(bfile + '.bim', delim_whitespace=True, header=None) 24 | df_bim.columns=['CHR','SNP','GP','POS','A1','A2'] 25 | print('Reading {0}...'.format(bfile + '.frq')) 26 | df_frq = pd.read_csv(bfile + '.frq', delim_whitespace=True) 27 | df_frq['POS'] = df_bim['POS'] # Assume that bim and frq files are aligned 28 | df_frq['INDEX'] = [chrpos_to_id.get((chr, pos), -1) for chr, pos in zip(df_frq['CHR'], df_frq['POS'])] 29 | 30 | mafvec = np.zeros((nsnp, 1)); mafvec[:] = np.NAN 31 | df = df_frq[df_frq['INDEX'] != -1] 32 | for index, value in zip(df['INDEX'], df['MAF']): 33 | mafvec[index] = value 34 | return mafvec 35 | 36 | if __name__ == "__main__": 37 | args = parse_args(sys.argv[1:]) 38 | 39 | print('Reading {0}...'.format(args.ref)) 40 | ref = pd.read_csv(args.ref, delim_whitespace=True) 41 | nsnp = ref.shape[0] 42 | chrpos_to_id = dict([((chr, pos), index) for chr, pos, index in zip(ref['CHR'], ref['BP'], ref.index)]) 43 | if len(chrpos_to_id) != nsnp: raise ValueError("Duplicated CHR:POS pairs found in the reference file") 44 | 45 | mafvec = make_maf_vector(chrpos_to_id=chrpos_to_id, nsnp=nsnp, bfile=args.bfile) 46 | 47 | print('Found {0} SNPs with non-zero MAF, {1} with zero MAF, {2} missing in genotypes --- {3} SNPs in total'.format( 48 | (~np.isnan(mafvec) & (mafvec>0)).sum(), (mafvec == 0).sum(), np.isnan(mafvec).sum(), len(mafvec))) 49 | 50 | print('Saving result to {0}...'.format(args.savemat)) 51 | sio.savemat(args.savemat, {'mafvec':mafvec}, format='5', do_compression=False, oned_as='column') 52 | 53 | print("Done.") 54 | 
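Roughly the same post-processing that the Matlab/Octave snippets above describe can also be done directly in Python with scipy.sparse, if a Matlab-free workflow is preferred. This is only a sketch, assuming the .mat file was produced by make_ld_matrix.py --savemat (so it holds 1-based id1/id2 index vectors, the r2 values in val, and nsnp); the output file name is arbitrary:

import numpy as np
import scipy.io as sio
from scipy.sparse import csr_matrix, save_npz

mat = sio.loadmat('ldmat_p1.mat')
id1 = mat['id1'].ravel().astype(int) - 1      # back to 0-based indices
id2 = mat['id2'].ravel().astype(int) - 1
val = mat['val'].ravel()
nsnp = int(mat['nsnp'])

# Mirror each pair and put 1.0 on the diagonal, as in the "actual r^2 values" Matlab variant
rows = np.concatenate([id1, id2, np.arange(nsnp)])
cols = np.concatenate([id2, id1, np.arange(nsnp)])
vals = np.concatenate([val, val, np.ones(nsnp)])
ldmat = csr_matrix((vals, (rows, cols)), shape=(nsnp, nsnp))
save_npz('LDmat.npz', ldmat)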
-------------------------------------------------------------------------------- /make_universal_variant_ids.py: -------------------------------------------------------------------------------- 1 | """ 2 | Input: whitespace-delimited csv file with CHR, BP, A1, A2 and ID columns. 3 | Output: tab-separated csv file with two columns ID and UID, where ID column is taken from the input file, 4 | UID column is constructed as CHR:BP:AA1:AA2, where CHR and BP are from the input file, AA1 is min(A1, A2, A1_complementary, A2_complementary), 5 | AA2 = A2 if AA1 == A1, 6 | AA2 = A1 if AA1 == A2, 7 | AA2 = A2_complementary if AA1 == A1_complementary, 8 | AA2 = A1_complementary if AA1 == A2_complementary, 9 | min is taken based on lexicographical order. 10 | If either A1 or A2 contains non-ATGC char, original ID is retained, i.e. UID = ID. 11 | Example: 12 | python make_universal_variant_ids.py --fname /cluster/projects/p33/users/alexeas/hrc/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz \ 13 | --chr "#CHROM" --bp POS --a1 REF --a2 ALT --id ID --out hrc.hg19.uid.txt 14 | Resulting hrc.hg19.uid.txt file will contain two columns (no header): (1) original ID from the input file, (2) constructed univeral ID (UID) 15 | Assume you need to map different type of variant ids between two different files: 16 | FILE1 has rsid 17 | FILE2 has chr:bp 18 | and both FILE1 and FILE2 contain CHR, BP, A1, A2 columns with coordinates in the same genomic build. 19 | Then you can apply the script to each FILE1 and FILE2 separately to generate UIDs for each file. For FILE1 you will get FILE1.uid and for FILE2 you'll get FILE2.uid. 20 | Then you can get the mapping between rsid IDs from FILE1 and chr:bp IDs from FILE2 using: 21 | join -t$'\t' -1 2 -2 2 <(sort -k2,2 FILE1.uid) <(sort -k2,2 FILE2.uid) > FILE1_FILE2.uid 22 | Resulting FILE1_FILE2.uid file will have three columns: (1) univeral ID (2) rsid from FILE1, (3) chr:bp from FILE2. 23 | """ 24 | 25 | import pandas as pd 26 | import argparse 27 | 28 | COMPL_DICT = {"A":"T", "T":"A", "G":"C", "C":"G"} 29 | DEL_ACGT_TT = str.maketrans({c:"" for c in "ATGC"}) 30 | STD_COL_NAMES = ["CHR", "BP", "A1", "A2", "ID"] 31 | 32 | def std_format(df, col_names, std_col_names): 33 | # standardize format: 34 | # - retain only relevant columns (col_names) in the specified order 35 | # - rename columns to standard names, std_col_names[i] should be an std col name for col_names[i] 36 | # - A1 and A2 to uppercase 37 | df = df[col_names].copy(deep=True) 38 | col_rename_dict = dict(zip(col_names, std_col_names)) 39 | df.rename(columns=col_rename_dict, inplace=True) 40 | df["A1"] = df["A1"].str.upper() 41 | df["A2"] = df["A2"].str.upper() 42 | return df 43 | 44 | def reverse_compl(seq): 45 | # seq is an uppercase string. 46 | return COMPL_DICT[seq] if len(seq) == 1 else "".join([COMPL_DICT[b] for b in seq][::-1]) 47 | 48 | def get_uid_col(df): 49 | # df is DataFrame with standard column names with CHR, BP, A1, A2 and ID columns in the corresponding order. 50 | # A1 and A2 must be capitalized. 
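    # Worked example (hypothetical variant): CHR=1, BP=12345, A1=T, A2=C.
    # The complementary alleles are A and G, so the candidate pairs are (C, T) and (A, G);
    # (A, G) sorts first and the UID becomes "1:12345:A:G" -- the same UID as for the
    # equivalent variant reported on the opposite strand with A1=A, A2=G.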
51 | uid_col = [] 52 | for chrom, bp, a1, a2, vid in df.itertuples(index=False): 53 | if a1.translate(DEL_ACGT_TT) == "" and a2.translate(DEL_ACGT_TT) == "": 54 | min_orig, max_orig = (a1, a2) if a1 < a2 else (a2, a1) 55 | a1c, a2c = reverse_compl(a1), reverse_compl(a2) 56 | min_compl, max_compl = (a1c, a2c) if a1c < a2c else (a2c, a1c) 57 | a1u, a2u = (min_orig, max_orig) if min_orig < min_compl else (min_compl, max_compl) 58 | uid = f"{chrom}:{bp}:{a1u}:{a2u}" 59 | else: 60 | uid = vid 61 | uid_col.append(uid) 62 | return uid_col 63 | 64 | 65 | # Parse arguments -------------------------- 66 | parser = argparse.ArgumentParser(description="Constract universal variant IDs (see detailed description and example of use in the script file).") 67 | parser.add_argument("--fname", help="Path to input file with chromosome, position, allele 1, allele 2 and variant ID columns.") 68 | parser.add_argument("--chr", default="CHR", help="Chromosome column.") 69 | parser.add_argument("--bp", default="BP", help="Position column.") 70 | parser.add_argument("--a1", default="A1", help="Allele 1 column.") 71 | parser.add_argument("--a2", default="A2", help="Allele 2 column.") 72 | parser.add_argument("--id", default="ID", help="Variant ID column.") 73 | parser.add_argument("--save-all", action="store_true", help="Save all columns from the input file.") 74 | parser.add_argument("--out", help="Output file name.") 75 | args = parser.parse_args() 76 | 77 | 78 | # Main ------------------------------------- 79 | col_names = [args.chr, args.bp, args.a1, args.a2, args.id] # the order should correspond to the order in STD_COL_NAMES 80 | df = pd.read_csv(args.fname, delim_whitespace=True, usecols=None if args.save_all else col_names, dtype=str) 81 | assert not "UID" in df.columns 82 | print(f"{df.shape[0]} variants loaded from {args.fname}") 83 | df_std = std_format(df, col_names, STD_COL_NAMES) 84 | uid_col = get_uid_col(df_std) 85 | assert df.shape[0] == df_std.shape[0] 86 | df["UID"] = uid_col 87 | 88 | if args.save_all: 89 | df.to_csv(args.out, sep='\t', index=False) 90 | else: 91 | df[[args.id, "UID"]].to_csv(args.out, sep='\t', index=False, header=False) 92 | 93 | print(f"{args.out} saved.") 94 | 95 | -------------------------------------------------------------------------------- /manhattan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib 7 | matplotlib.use("Agg") 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as mpatches 10 | from matplotlib.collections import PatchCollection 11 | import matplotlib.patheffects as mpe 12 | 13 | # Default colors are similar to matplotlib 2.0 defaults and are taken from: 14 | # https://github.com/vega/vega/wiki/Scales#scale-range-literals 15 | DEFAULT_COLOR_NAMES = [1,3,5,7,9,11,13,15,17,19] 16 | DEFAULT_COLOR_NAMES_ANNOT = [1,3,5,7,9,11,13,15,17,19] # [2,4,6,8,10,12,14,16,18,20] 17 | # colors corresponding to even indices are lighter analogs of colors with odd indices, e.g. 
DEFAULT_COLORS[2] is a light version of DEFAULT_COLORS[1] 18 | DEFAULT_COLORS = {1:"#1f77b4", 2:"#aec7e8", 3:"#ff7f0e", 4:"#ffbb78", 19 | 5:"#2ca02c", 6:"#98df8a", 7:"#d62728", 8:"#ff9896", 20 | 9:"#9467bd", 10:"#c5b0d5", 11:"#8c564b", 12:"#c49c94", 21 | 13:"#e377c2", 14:"#f7b6d2", 15:"#7f7f7f", 16:"#c7c7c7", 22 | 17:"#bcbd22", 18:"#dbdb8d", 19:"#17becf", 20:"#9edae5"} 23 | 24 | # colors from http://mkweb.bcgsc.ca/colorblind/ 25 | CB_COLOR_NAMES = ["orange","sky_blue","bluish_green","yellow","blue", 26 | "vermillion","reddish_purple","black"] 27 | CB_COLOR_NAMES_ANNOT = ["orange","sky_blue","bluish_green","yellow","blue", 28 | "vermillion","reddish_purple","black"] 29 | CB_COLORS = {"orange":"#e69f00", 30 | "sky_blue":"#56b4e9", 31 | "bluish_green":"#009e73", 32 | "yellow":"#f0e442", 33 | "blue":"#0072b2", 34 | "vermillion":"#d55e00", 35 | "reddish_purple":"#cc79a7", 36 | "black":"#000000"} 37 | 38 | example_text = """Example: 39 | python manhattan.py result.mat.csv \\ 40 | --lead conj.result.clump.lead.csv --indep conj.result.clump.indep.csv \\ 41 | --p FDR --y-label conjFDR --color-list 1 --legend-label 'Trait1 & Trait2' \\ 42 | --legend-location 'upper right' --p-thresh 0.05 --out conjfdr_manhattan""" 43 | 44 | 45 | def parse_args(args): 46 | parser = argparse.ArgumentParser( 47 | formatter_class=argparse.RawDescriptionHelpFormatter, 48 | description="A tool to draw Manhattan plot from sumstat files.", 49 | epilog=example_text) 50 | 51 | parser.add_argument("sumstats", nargs="+", help="A list of sumstat files") 52 | parser.add_argument("--sep", nargs="+", default=['\t'], 53 | help="A list of column separators in sumstat files") 54 | parser.add_argument("--snp", nargs="+", default=["SNP"], 55 | help="A list of columns with SNP ids in sumstat files") 56 | parser.add_argument("--chr", nargs="+", default=["CHR"], 57 | help="A list of columns with SNP chromosomes in sumstat files") 58 | parser.add_argument("--bp", nargs="+", default=["BP"], 59 | help="A list of columns with SNP positions in sumstat files") 60 | parser.add_argument("--p", nargs="+", default=["PVAL"], 61 | help="A list of columns with SNP p-values in sumstat files") 62 | 63 | parser.add_argument("--outlined", nargs="+", default=["NA"], 64 | help=("A list of files with ids of SNPs to mark with outlined bold dots, 'NA' if absent. " 65 | "These files should contain a single column with SNP ids without header")) 66 | parser.add_argument("--bold", nargs="+", default=["NA"], 67 | help=("A list of files with ids of SNPs to mark with bold dots, 'NA' if absent. " 68 | "These files should contain a single column with SNP ids without header")) 69 | parser.add_argument("--annot", nargs="+", default=["NA"], 70 | help=("A list of files with ids (1st column) and labels (2nd column) of SNPs to annotate, 'NA' if absent. " 71 | "These files should contain two tab-delimited columns (1st: SNP ids, 2nd: SNP labels) without header")) 72 | # the next two options are shortcuts for --outlined and --bold to work 73 | # directly with the output of "sumstats.py clump". These options probably 74 | # should be removed in future for clarity 75 | parser.add_argument("--lead", nargs="+", default=["NA"], 76 | help=("A list of files with ids of lead SNPs, 'NA' if absent. " 77 | "These files should be the output of 'sumstats.py clump'")) 78 | parser.add_argument("--indep", nargs="+", default=["NA"], 79 | help=("A list of files with ids of independent significant SNPs, 'NA' if absent. 
" 80 | "These files should be the output of 'sumstats.py clump'")) 81 | 82 | parser.add_argument("--p-thresh", type=float, default=5.E-8, 83 | help="Significance threshold for p-values") 84 | parser.add_argument("--transparency", type=float, nargs="+", default=[1], 85 | help="Transparency level of points") 86 | parser.add_argument("--between-chr-gap", type=float, default=0.1, 87 | help="Size of the gap between chromosomes in the figure") 88 | parser.add_argument("--snps-to-keep", nargs="+", default=["NA"], 89 | help="A list of files with ids of SNPs to take for plotting, 'NA' if absent. " 90 | "These sets of SNPs are further reduced according to '--downsample-frac' argument. " 91 | "These files should contain a single column with SNP ids without header") 92 | parser.add_argument("--downsample-frac", nargs="+", type=float, 93 | default=[0.005], help="Fraction of SNPs to take for plotting") 94 | parser.add_argument("--downsample-thresh", nargs="+", type=float, 95 | default=[None], help="Only SNPs with p-values larger than the threshold are downsampled") 96 | parser.add_argument("--chr2use", type=str, default="1-22", 97 | help=("Chromosome ids to plot (e.g. 1,2,3 or 1-4,12,16-20 or 19-22,X,Y). " 98 | "The order in the figure will correspond to the order in this argument. " 99 | "Chromosomes with non-integer ids should be indicated separately")) 100 | parser.add_argument("--striped-background", action="store_true", 101 | help="Draw grey background for every second chromosome") 102 | parser.add_argument("--color-list", nargs="+", default=[], 103 | help="Use specified color list, e.g. 1 3 5 7 9 11 13 15 17 19; 2 4 6 8 10 12 14 16 18 20; orange sky_blue bluish_green yellow blue vermillion reddish_purple black, or any colors listed on https://python-graph-gallery.com/100-calling-a-color-with-seaborn") 104 | parser.add_argument("--cb-colors", action="store_true", 105 | help="Use colors designed for color-blind people") 106 | parser.add_argument("--seed", type=int, default=1, help="Random seed") 107 | parser.add_argument("--out", default="manhattan", help="Out file name") 108 | parser.add_argument("--separate-sumstats", action="store_true", 109 | help="Plot each sumstat in a separate subplot.") 110 | 111 | parser.add_argument("--y-label", default="P", 112 | help="Label of y axis. Label in the figure will be: -log10(y_label).") 113 | parser.add_argument("--y-max", type=float, default=-1, help="Upper limit of y axis. Default: autodetect.") 114 | parser.add_argument("--legend-location", default="best", 115 | help="Legend location: 'best', 'upper right', 'upper left', 'lower left', 'lower right', 'right', 'center left', 'center right', 'lower center', 'upper center', 'center'") 116 | parser.add_argument("--no-legend", action="store_true", 117 | help="Don't add legend to the figure.") 118 | parser.add_argument("--legend-labels", nargs="+", default=["NA"], 119 | help="A list of labels for sumstats to use in the legend in the corresponding order. " 120 | "If '--no-legend' is specified, this argument is ignored. If both this and " 121 | "'--no-legend' arguments are absent, corresponding file names are used in " 122 | "the legend.") 123 | 124 | return parser.parse_args(args) 125 | 126 | 127 | def process_args(args): 128 | """ 129 | Check whether provided arguments are correct, change list-type arguments 130 | with single value to have a length = length of sumstats argument and process 131 | chr2use arument. 
132 | """ 133 | for f in args.sumstats: 134 | assert os.path.isfile(f), "'%s' file doesn't exist" % f 135 | for f in args.outlined: 136 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 137 | for f in args.bold: 138 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 139 | for f in args.lead: 140 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 141 | for f in args.indep: 142 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 143 | for f in args.annot: 144 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 145 | 146 | n = len(args.sumstats) 147 | arg_dict = vars(args) 148 | for arg_name, arg_val in arg_dict.items(): 149 | if (type(arg_val) is list) and (len(arg_val)0,:], should be ~ 1.5x faster 231 | df.dropna(subset=[pval_col], how="all", inplace = True) 232 | print("%d SNPs with defined p-value" % len(df)) 233 | df = df.loc[df[chr_col].isin(chr2use),:] 234 | print("%d SNPs within specified chromosomes" % len(df)) 235 | # TODO: zero filtering step is very slow, should be optimized 236 | df = df.loc[df[pval_col]>0,:] 237 | print("%d SNPs with non-zero p-value" % len(df)) 238 | # TODO: drop duplicates as it is done in qq.py 239 | return df 240 | 241 | 242 | def get_df2plot(df, outlined_snps_f, bold_snps_f, lead_snps_f, indep_snps_f, 243 | annot_f, snps_to_keep_f, downsample_frac, downsample_thresh, pval_col): 244 | """ 245 | Select variants which will be plotted. Mark lead and independent significant 246 | variants if corresponding information is provided. 247 | Args: 248 | df: DataFrame for variant selection 249 | outlined_snps_f: a name of file with SNP ids to plot with outlined bold dots 250 | bold_snps_f: a name of file with SNP ids to plot with bold dots 251 | lead_snps_f: a name of file with lead variants 252 | indep_snps_f: a name of file with independent significant variants 253 | snps_to_keep_f: a list of variants to consider for plotting, only these 254 | variants are considered when downsampling take place 255 | downsample_frac: a fraction of variants which will be sampled from df 256 | for plotting 257 | downsample_thresh: only variants with p-value larger than this threshold 258 | are downsampled 259 | pval_col: a column with p-values in df 260 | Returns: 261 | df2plot: DataFrame with variants for plotting 262 | """ 263 | print("Preparing SNPs for plotting") 264 | # define a subset of variants which will be plotted: 265 | # [outlined + lead] + [bold + indep] + sample 266 | outlined_snp_ids = get_snp_ids(outlined_snps_f) 267 | bold_snp_ids = get_snp_ids(bold_snps_f) 268 | lead_snp_id = get_lead(lead_snps_f) 269 | indep_snp_id = get_indep_sig(indep_snps_f) 270 | annot_series = get_annot(annot_f) 271 | outlined_snp_ids = np.unique(np.concatenate((outlined_snp_ids, lead_snp_id))) 272 | bold_snp_ids = np.unique(np.concatenate((bold_snp_ids, indep_snp_id))) 273 | # sample variants 274 | if snps_to_keep_f != "NA": 275 | snps2keep = get_snp_ids(snps_to_keep_f) 276 | ii = df.index.intersection(snps2keep) 277 | df = df.loc[ii,:] 278 | print("%d SNPs overlap with %s" % (len(df),snps_to_keep_f)) 279 | if not downsample_thresh is None: 280 | i2downsample = df[pval_col]>downsample_thresh 281 | df2downsample = df.loc[i2downsample,:] 282 | snps2downsample = df2downsample.index 283 | snps2downsample_pvals = df2downsample[pval_col] 284 | snps2keep = df.loc[~i2downsample,:].index.values 285 | else: 286 | snps2downsample = df.index 287 | snps2downsample_pvals = df[pval_col] 288 | snps2keep = [] 289 | n = 
int(downsample_frac*len(snps2downsample)) 290 | # w = 1/df[pval_col].values 291 | w = -np.log10(snps2downsample_pvals.values) 292 | w /= sum(w) 293 | 294 | snp_sample = np.random.choice(snps2downsample,size=n,replace=False,p=w) 295 | # TODO: keep SNPs within identified loci with higher prob? 296 | # NOTE: it could be that there are snp ids in outlined_snp_ids or bold_snp_ids which 297 | # are not in df.index, therefore we should take an index.intersection first. 298 | outlined_snp_ids = df.index.intersection(outlined_snp_ids) 299 | bold_snp_ids = df.index.intersection(bold_snp_ids) 300 | annot_snp_ids = df.index.intersection(annot_series.index) 301 | snps2keep = np.unique(np.concatenate((snps2keep, outlined_snp_ids, bold_snp_ids, 302 | snp_sample, annot_snp_ids))) 303 | df2plot = df.loc[snps2keep,:] 304 | df2plot.loc[:,"outlined"] = False 305 | df2plot.loc[outlined_snp_ids,"outlined"] = True 306 | df2plot.loc[:,"bold"] = False 307 | df2plot.loc[bold_snp_ids,"bold"] = True 308 | df2plot.loc[:,"annot"] = "" 309 | df2plot.loc[annot_snp_ids,"annot"] = annot_series[annot_snp_ids] 310 | print("%d outlined SNPs" % len(outlined_snp_ids)) 311 | print("%d bold SNPs" % len(bold_snp_ids)) 312 | print("%d annotated SNPs" % len(annot_snp_ids)) 313 | print("%d SNPs will be plotted in total" % len(df2plot)) 314 | return df2plot 315 | 316 | 317 | def get_chr_df(dfs2plot, bp_cols, chr_cols, between_chr_gap, chr2use): 318 | """ 319 | Construct DataFrame with index = chromosome names and 5 columns: 320 | min: minimum coordinate on each chromosome among all dfs in dfs2plot 321 | max: maximum coordinate on each chromosome among all dfs in dfs2plot 322 | ind: index of the chromosome = 1:N, where N - nuumber of different chromosomes 323 | rel_size: size of the chromosome relative to the first chromosome (i.e. 
324 | rel_size of the first chr = 1) 325 | start: start coordinate of the chromosome on the x axis, where the first 326 | chromosome starts at x = 0 and ends at x = 1 (if its size = 1), taking 327 | into account between_chr_gap 328 | Args: 329 | dfs2plot: a list of DataFrames that will be plotted 330 | bp_cols: name of marker position on chromosome columns 331 | chr_cols: name of marker chromosome columns 332 | between_chr_gap: gap between end of chr K and start of chr K+1 333 | chr2use: chromosomes to use for plotting (other are dropped) 334 | Returns: 335 | chr_df: a DataFrame with chromosome information as described above 336 | """ 337 | unique_chr = np.unique(np.concatenate([df[chr_cols[i]].unique() for i,df in enumerate(dfs2plot)])) 338 | unique_chr = [c for c in chr2use if c in unique_chr] 339 | chr_df = pd.DataFrame(index=unique_chr, columns=["min","max","ind","start","rel_size"]) 340 | min_df = pd.DataFrame(index=unique_chr) 341 | max_df = pd.DataFrame(index=unique_chr) 342 | for i,df in enumerate(dfs2plot): 343 | chr_min = df.groupby(chr_cols[i])[bp_cols[i]].min() 344 | chr_max = df.groupby(chr_cols[i])[bp_cols[i]].max() 345 | min_df[i] = chr_min 346 | max_df[i] = chr_max 347 | chr_df["min"] = min_df.min(axis=1) 348 | chr_df["max"] = max_df.max(axis=1) 349 | chr_df["ind"] = np.arange(len(unique_chr)) 350 | # use the first chr form unique_chr as a reference unit size 351 | ref_unit_size = chr_df.loc[chr_df.index[0],"max"] - chr_df.loc[chr_df.index[0],"min"] 352 | chr_df["rel_size"] = (chr_df["max"] - chr_df["min"])/ref_unit_size 353 | chr_df["start"] = chr_df["rel_size"].cumsum() - chr_df["rel_size"] + between_chr_gap*chr_df["ind"] 354 | return chr_df 355 | 356 | 357 | def add_coords(df2plot, chr_col, bp_col, pval_col, chr_df): 358 | """ 359 | Modify provided DataFrame df2plot by adding columns with x-y coordinates for 360 | plotting to it. 361 | Args: 362 | df2plot: DataFrame with variants for plotting (produced by get_df2plot) 363 | chr_col: a column with chromosome of variants in df2plot 364 | bp_col: a column with position on chromosome of variants in df2plot 365 | pval_col: a column with variant p-values 366 | chr_df: a DataFrame with chromosome information (produced by get_chr_df) 367 | """ 368 | chr_start = chr_df.loc[df2plot[chr_col], "start"].values 369 | chr_min = chr_df.loc[df2plot[chr_col], "min"].values 370 | df2plot.loc[:,"x_coord"] = (df2plot[bp_col] - chr_min)/chr_df.loc[chr_df.index[0],"max"] + chr_start 371 | df2plot.loc[:,"log10p"] = -np.log10(df2plot[pval_col]) # y coord 372 | 373 | 374 | def add_striped_background(chr_df, ax, y_up): 375 | """ 376 | Add grey background rectagle for every second chromosome. 
377 | """ 378 | height = y_up 379 | background_rect = [] 380 | for c in chr_df.index[1::2]: 381 | x = chr_df.loc[c,"start"] 382 | y = 0 383 | width = chr_df.loc[c,"rel_size"] 384 | rect = mpatches.Rectangle((x, y), width, height) 385 | background_rect.append(rect) 386 | pc = PatchCollection(background_rect, facecolor='#AEA79F', alpha=0.3, 387 | edgecolor='None') 388 | ax.add_collection(pc) 389 | 390 | 391 | if __name__ == "__main__": 392 | args = parse_args(sys.argv[1:]) 393 | process_args(args) 394 | 395 | np.random.seed(args.seed) 396 | 397 | if args.color_list: 398 | assert len(args.sumstats) <= len(args.color_list), "%d is maximum number of sumstats to plot simultaneously with specified color scheme" % len(color_list) 399 | color_names = [int(x) if x.isdigit() else x for x in args.color_list] 400 | color_names_annot = color_names 401 | color_dict = {**DEFAULT_COLORS, **CB_COLORS} 402 | for x in args.color_list: 403 | if x not in color_dict: 404 | color_dict[x] = x 405 | elif args.cb_colors: 406 | assert len(args.sumstats) <= len(CB_COLOR_NAMES), "%d is maximum number of sumstats to plot simultaneously with color-blind color scheme" % len(CB_COLOR_NAMES) 407 | color_names = CB_COLOR_NAMES 408 | color_names_annot = CB_COLOR_NAMES_ANNOT 409 | color_dict = CB_COLORS 410 | else: 411 | # use default colors 412 | assert len(args.sumstats) <= len(DEFAULT_COLOR_NAMES), "%d is maximum number of sumstats to plot simultaneously with default color scheme" % len(DEFAULT_COLOR_NAMES) 413 | color_names = DEFAULT_COLOR_NAMES 414 | color_names_annot = DEFAULT_COLOR_NAMES_ANNOT 415 | color_dict = DEFAULT_COLORS 416 | 417 | legend_labels = [os.path.splitext(os.path.basename(args.sumstats[i]))[0] if ll == "NA" else ll 418 | for i,ll in enumerate(args.legend_labels)] 419 | legends_handles = [] 420 | 421 | sumstat_dfs = [ 422 | filter_sumstats(s, args.sep[i], args.snp[i], args.p[i], args.chr[i], args.bp[i], args.chr2use) 423 | for i,s in enumerate(args.sumstats)] 424 | 425 | dfs2plot = [get_df2plot(df, args.outlined[i], args.bold[i], args.lead[i], args.indep[i], 426 | args.annot[i], args.snps_to_keep[i], args.downsample_frac[i], 427 | args.downsample_thresh[i], args.p[i]) 428 | for i, df in enumerate(sumstat_dfs)] 429 | 430 | chr_df = get_chr_df(dfs2plot, args.bp, args.chr, args.between_chr_gap, args.chr2use) 431 | 432 | for i,df in enumerate(dfs2plot): 433 | add_coords(df, args.chr[i], args.bp[i], args.p[i], chr_df) 434 | 435 | n_subplots = len(dfs2plot) if args.separate_sumstats else 1 436 | 437 | # make plot 438 | print("Making plot") 439 | plt.rc('legend',fontsize=15) 440 | fig, axarr = plt.subplots(n_subplots, squeeze=False, figsize=(14,5*n_subplots), dpi=200) 441 | axarr = axarr[:,0] # squeeze second dimention since we don't need it here 442 | 443 | 444 | # find upper limit for Y axis 445 | if args.y_max > 0: 446 | y_up = args.y_max 447 | else: 448 | y_up = max([df["log10p"].max() for df in dfs2plot]) 449 | y_up = max(y_up, -np.log10(args.p_thresh)) 450 | y_up *= 1.05 451 | 452 | if args.striped_background: 453 | for ax in axarr: 454 | add_striped_background(chr_df, ax, y_up) 455 | 456 | for i, df in enumerate(dfs2plot): 457 | # plot normal points 458 | ax_i = i if args.separate_sumstats else 0 459 | ax = axarr[ax_i] 460 | 461 | color = color_dict[color_names[i]] 462 | ax.plot(df["x_coord"], df["log10p"], ls=' ', marker='.', ms=2, 463 | color=color, alpha=args.transparency[i]) 464 | patch = mpatches.Patch(color=color, label=legend_labels[i]) 465 | legends_handles.append(patch) 466 | for i, df in 
enumerate(dfs2plot): 467 | # plot bold significant and outlined variants "on top" of normal points 468 | ax_i = i if args.separate_sumstats else 0 469 | ax = axarr[ax_i] 470 | 471 | color = color_dict[color_names[i]] 472 | df_tmp = df.loc[df["bold"],:] 473 | ax.plot(df_tmp["x_coord"], df_tmp["log10p"], ls=' ', marker='o', ms=5, 474 | color=color) 475 | df_tmp = df.loc[df["outlined"],:] 476 | ax.plot(df_tmp["x_coord"], df_tmp["log10p"], ls=' ', marker='o', ms=8, 477 | markeredgewidth=0.6, markeredgecolor='k', color=color) 478 | df_tmp = df.loc[df["annot"]!="",["annot","x_coord", "log10p"]] 479 | pe = [mpe.Stroke(linewidth=0.8, foreground='black')] 480 | for row in df_tmp.itertuples(): 481 | color = color_dict[color_names_annot[i]] 482 | ax.annotate(row.annot, xy=(row.x_coord, row.log10p), xycoords='data', 483 | xytext=(2,2), textcoords='offset points', color=color, 484 | fontsize=12, style='italic', fontweight='heavy', 485 | # path_effects=pe, # uncomment path_effects to have a black border of the label symbols 486 | bbox={"boxstyle":"square, pad=0.02", "facecolor":"white", 487 | "edgecolor":"none","alpha":0.6}) 488 | 489 | for i,ax in enumerate(axarr): 490 | ax.hlines([-np.log10(args.p_thresh)], 0, 1, colors='k', linestyles='dotted', 491 | transform=ax.get_yaxis_transform()) 492 | 493 | ax.tick_params(axis='y', which='major', labelsize=15) 494 | ax.tick_params(axis='x', which='major', labelsize=15) 495 | x_ticks = chr_df["start"] + 0.5*chr_df["rel_size"] 496 | ax.set_xticks(x_ticks) 497 | ax.set_xticklabels(map(str, x_ticks.index), fontsize=14) 498 | 499 | ax.set_xlim((-0.1, 500 | chr_df.loc[chr_df.index[-1], "start"] + chr_df.loc[chr_df.index[-1], "rel_size"] + 0.1)) 501 | y_low = ax.get_ylim()[0] 502 | ax.set_ylim((0-0.005*y_up, y_up)) 503 | # remove top and right spines 504 | ax.spines['right'].set_visible(False) 505 | ax.spines['top'].set_visible(False) 506 | # add offset for left spine 507 | ax.spines['left'].set_position(('outward',5)) 508 | ax.spines['bottom'].set_position(('outward',5)) 509 | 510 | ax.set_xlabel("Chromosome", fontsize=20) 511 | ax.set_ylabel(r"$\mathrm{-log_{10}(%s)}$" % args.y_label, fontsize=20) 512 | 513 | if args.legend_location: 514 | handles = legends_handles[i:i+1] if args.separate_sumstats else legends_handles 515 | ax.legend(handles=handles, loc=args.legend_location) 516 | elif not args.no_legend: 517 | handles = legends_handles[i:i+1] if args.separate_sumstats else legends_handles 518 | ax.legend(handles=handles, loc='best') 519 | 520 | 521 | plt.tight_layout() 522 | 523 | # save/show 524 | # plt.savefig(args.out) 525 | plt.savefig(args.out+'.png') 526 | plt.savefig(args.out+'.pdf') 527 | plt.savefig(args.out+'.svg') 528 | # plt.show() 529 | print("%s was generated" % args.out) 530 | -------------------------------------------------------------------------------- /merge_bed_files.py: -------------------------------------------------------------------------------- 1 | # Merge together several .bed files, ignoring all potential warnings from plink. 
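# If plink aborts the merge (typically because of variants with 3+ alleles), the offending
# variant IDs end up in a .missnp file; those variants are then excluded from every input
# fileset and the merge is retried, following the plink documentation linked in the code below.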
2 | # 3 | # To run this tool: 4 | # python merge_bed_files.py --bed ~/1000Genome/phase3/build37_released/*.bed --out merged 5 | 6 | import argparse 7 | import glob 8 | import itertools 9 | import os.path 10 | import os 11 | import subprocess 12 | import sys 13 | import pandas as pd 14 | 15 | def parse_args(args): 16 | parser = argparse.ArgumentParser(description="Merge together several .bed files") 17 | parser.add_argument("--bed", type=str, help="Filename of input .bed file, or pattern (for example '~/1000Genome/phase3/build37_released/*.bed')") 18 | parser.add_argument("--out", default=r"merged", type=str, help="Filename of output .bed file (without extention") 19 | return parser.parse_args(args) 20 | 21 | def execute_command(command): 22 | print("Execute command: {}".format(command)) 23 | print(subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0].decode("utf-8")) 24 | #print(subprocess.check_output(command.split()).decode("utf-8")) 25 | 26 | def exclude_snps(bfile_in, snps_file, bfile_out): 27 | execute_command('plink --memory 4096 --bfile {0} --exclude {1} --make-bed --out {2}'.format(bfile_in, snps_file, bfile_out)) 28 | 29 | def merge(files, output_bfile): 30 | missnp_file = '{0}-merge.missnp'.format(output_bfile) 31 | if os.path.exists(missnp_file): 32 | os.remove(missnp_file) 33 | first = files[0] 34 | with open('mergelist.txt', 'w') as mergelist: 35 | for filename in files[1:]: 36 | mergelist.write('{0}.bed {0}.bim {0}.fam\n'.format(filename)) 37 | execute_command('plink --memory 4096 --bfile {0} --merge-list mergelist.txt --allow-no-sex --make-bed --out {1}'.format(first, output_bfile)) 38 | os.remove('mergelist.txt') 39 | 40 | if __name__ == "__main__": 41 | args = parse_args(sys.argv[1:]) 42 | 43 | # Find all .bed filenames (without extention) 44 | files = [os.path.splitext(file)[0] for file in glob.glob(args.bed)] 45 | 46 | merge(files, args.out) 47 | missnp_file = '{0}-merge.missnp'.format(args.out) 48 | if os.path.exists(missnp_file): 49 | # Handle merge failure as described here: https://www.cog-genomics.org/plink2/data#merge3 50 | for file in files: 51 | exclude_snps(file, missnp_file, '{0}.filter'.format(file)) 52 | merge(['{0}.filter'.format(file) for file in files], args.out) 53 | for file in files: map(os.remove, glob.glob('{0}.filter'.format(file))) 54 | 55 | print("Done.") 56 | -------------------------------------------------------------------------------- /overCorrect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import scipy.stats as stats 4 | import scipy.special as special 5 | import scipy.linalg as linalg 6 | import scipy.interpolate as interp 7 | 8 | # Decorrelation in Statistics: The Mahalanobis Transformation 9 | # Added material to Data Compression: The Complete Reference 10 | # http://www.davidsalomon.name/DC2advertis/DeCorr.pdf 11 | def overCorrect(z1, z2, idx): 12 | defidx = np.isfinite(z1+z2) 13 | z11 = z1[np.logical_and(defidx, idx)]; 14 | z22 = z2[np.logical_and(defidx, idx)] 15 | if np.sum(z1 < 0) != 0 and np.sum(z2 < 0) == 0: 16 | C = np.corrcoef(np.power(z11,2), z22) 17 | print "correlation between squred Z-score: ", C[0,1] 18 | elif np.sum(z1 < 0) == 0 and np.sum(z2 < 0) != 0: 19 | C = np.corrcoef(z11, np.power(z22,2)) 20 | print "correlation between squred Z-score: ", C[0,1] 21 | else: 22 | C = np.corrcoef(z11, z22) 23 | print "correlation between Z score: ", C[0,1] 24 | Z = np.row_stack([z1, z2]) 
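    # Whitening (Mahalanobis) step: multiplying the stacked z-scores by C^(-1/2) yields two
    # decorrelated z-score vectors (their correlation, printed below, should be close to zero),
    # which are then converted back to -log10(p) values.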
25 | print Z.shape 26 | z_adj = np.dot(linalg.fractional_matrix_power(C, -1/2), Z) 27 | print np.corrcoef(z_adj[0,:], z_adj[1,:])[0,1] 28 | return z2logp(z_adj[0,:]), z2logp(z_adj[1,:]) 29 | 30 | def z2logp(zscores, tails = 2): 31 | """ 32 | compute coresponding -log10 p values of given z values. 33 | """ 34 | return -np.log10(tails * stats.norm.cdf(-np.fabs(zscores))) 35 | -------------------------------------------------------------------------------- /plink_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def get_byte_map(): 6 | """ 7 | Construct mapping between bytes 0..255 and 4-element arrays of a1 genotypes 8 | from plink bed file. 9 | Return 256 x 4 array A, where A[i] = [a1, a2, a3, a4], each ai is from {2, -1, 1, 0}. 10 | """ 11 | genotype_codes = np.array([2, -1, 1, 0],dtype=np.int8) 12 | byte_map = np.empty((256,4), dtype=np.int8) 13 | for b in range(256): 14 | for a in range(4): 15 | byte_map[b,a] = genotype_codes[(b >> 2*a) & 3] 16 | return byte_map 17 | 18 | 19 | class Plink(object): 20 | bim = None # pd.DataFrame 21 | fam = None # pd.DataFrame 22 | bed = None # np.memmap(dtype=np.uint8), this also contain extra bits if n_samples%4 != 0 23 | 24 | byte_map = get_byte_map() 25 | 26 | def __init__(self, bfile): 27 | print(f"Loading plink {bfile}") 28 | self._load_bim(bfile) 29 | self._load_fam(bfile) 30 | self._load_bed(bfile) 31 | 32 | def _load_bim(self, bfile): 33 | self.bim = pd.read_csv(f'{bfile}.bim', sep='\t', header=None, 34 | names=["chr","snp","cm","bp","a1","a2"]) 35 | 36 | def _load_fam(self, bfile): 37 | self.fam = pd.read_csv(f'{bfile}.fam',sep='\t',header=None, 38 | names=["fid","iid","father_id","mother_id","sex","pheno"]) 39 | 40 | def _load_bed(self, bfile): 41 | if self.bim is None: 42 | self._load_bim(bfile) 43 | if self.fam is None: 44 | self._load_fam(bfile) 45 | bedf = f'{bfile}.bed' 46 | magic_bits = np.fromfile(bedf, count=3, dtype=np.uint8) 47 | if (magic_bits != [108,27,1]).any(): 48 | raise Exception(f"{bedf} file is not a valid bed file!") 49 | n_snps = self.bim.shape[0] 50 | n_samples = self.fam.shape[0] 51 | n_cols = n_samples//4 52 | if 4*n_cols != n_samples: 53 | n_cols += 1 54 | self.bed = np.memmap(bedf, dtype=np.uint8, offset=3, mode='r', shape=(n_snps,n_cols)) 55 | 56 | def get_geno(self, snp_ii=None): 57 | """ 58 | Get genotypes for SNPs with indices from snp_ii. 
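        Genotypes are returned as A1 allele counts (2, 1 or 0), with -1 for missing calls,
        following the byte map constructed above.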
59 | Args: 60 | snp_ii : np.array of SNP indices 61 | """ 62 | n_snps = self.bim.shape[0] 63 | n_samples = self.fam.shape[0] 64 | if snp_ii is None: 65 | snp_ii = np.arange(n_snps) 66 | assert max(snp_ii) < n_snps, f"SNP index cannot be > {n_snps-1}" 67 | n_cols = 4*(n_samples//4) 68 | if n_cols != n_samples: 69 | n_cols += 4 70 | samp_geno = self.byte_map[self.bed[snp_ii]].reshape((len(snp_ii),n_cols)) 71 | return samp_geno[:,:n_samples] 72 | 73 | 74 | if __name__ == '__main__': 75 | # Example: 76 | bfile = '/path/to/plink_bfile' 77 | plink = Plink(bfile) 78 | 79 | # read genotypes of 0-th and 10-th variants 80 | geno_arr = plink.get_geno([0,10]) 81 | print(geno_arr.shape) 82 | 83 | # read genotypes of all variants in the bfile 84 | geno_arr = plink.get_geno() 85 | print(geno_arr.shape) 86 | -------------------------------------------------------------------------------- /plotgwas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import os 7 | import sys 8 | import argparse 9 | 10 | """ 11 | Run examples: 12 | (1) Manhattan plot with one sumstats: 13 | python plotgwas.py manhattan --config config.plotgwas.3.cfg --out manhattan.3.svg 14 | (2) Miami plot with two sumstats in the top panel and one sumstats in the bottom panel saved as png and as svg files: 15 | python plotgwas.py miami --config-top config.plotgwas.1.cfg config.plotgwas.2.cfg --config-bottom config.plotgwas.3.cfg --out miami.1.2.3.png miami.1.2.3.svg 16 | """ 17 | 18 | 19 | def parse_args(args): 20 | parser = argparse.ArgumentParser(description="Tools to plot GWAS summary statistic data.") 21 | subparsers = parser.add_subparsers() 22 | 23 | parser_manhattan = subparsers.add_parser("manhattan", help="Make Manhattan plot.") 24 | parser_manhattan.add_argument("--config", type=str, nargs='+', required=True, help="List of config files to plot.") 25 | parser_manhattan.add_argument("--out", type=str, nargs='+', required=True, help="List of output file names.") 26 | parser_manhattan.set_defaults(func=make_manhattan) 27 | 28 | parser_miami = subparsers.add_parser("miami", help="Make Miami plot.") 29 | parser_miami.add_argument("--config-top", type=str, nargs='+', required=True, help="List of config files to plot on top.") 30 | parser_miami.add_argument("--config-bottom", type=str, nargs='+', required=True, help="List of config files to plot on bottom.") 31 | parser_miami.add_argument("--out", type=str, nargs='+', required=True, help="List of output file names.") 32 | parser_miami.set_defaults(func=make_miami) 33 | 34 | return parser.parse_args(args) 35 | 36 | 37 | def add_coord(dfs): 38 | # df is assume to have "CHR" and "BP" columns. Chromosomes here are assumed to be integers. 
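    # A sketch of the layout this function produces (numbers are hypothetical):
    # if chr1 spans BP 1..1,000,000 and chr2 spans BP 5,000..800,000, chr1 gets
    # COORD 0..999,999, then a gap of 0.5% of the total plotted span is inserted,
    # and chr2 starts right after the gap (COORD = BP - 5,000 + next_chrom_start).
    # Using the shared per-chromosome min/max over all dfs presumably keeps the
    # top and bottom panels of a Miami plot on the same x coordinates.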
39 | min_chr_bp, max_chr_bp = get_min_max_chr_bp(dfs) 40 | next_chrom_start = 0 41 | between_chr_gap = 0.005*(sum(max_chr_bp.values()) - sum(min_chr_bp.values())) 42 | for df in dfs: 43 | next_chrom_start = 0 44 | for chrom in sorted(min_chr_bp): 45 | i_chrom = df.CHR==chrom 46 | min_bp, max_bp = min_chr_bp[chrom], max_chr_bp[chrom] 47 | coord = df.loc[i_chrom, "BP"] - min_bp + next_chrom_start 48 | df.loc[i_chrom, "COORD"] = coord 49 | next_chrom_start += max_bp - min_bp + between_chr_gap 50 | 51 | def get_min_max_chr_bp(dfs): 52 | min_max_chr_bp = [] 53 | for df in dfs: 54 | df_chr_bp = df.groupby("CHR").agg({"BP":['min', 'max']}).BP 55 | min_max_chr_bp.append(df_chr_bp) 56 | df_concat = pd.concat(min_max_chr_bp, axis=1) 57 | min_chr_bp = df_concat.min(axis=1).to_dict() 58 | max_chr_bp = df_concat.max(axis=1).to_dict() 59 | return min_chr_bp, max_chr_bp 60 | 61 | def drop_marginal_snps(df, p_cutoff_low, p_cutoff_high): 62 | # df is assume to have "P" column. 63 | # Add coordinates in the figure. 64 | # Drop non-autosomes and convert chromosomes to int. 65 | autosomes = [str(i) for i in range(1,23)] 66 | i_autosome = df.CHR.isin(autosomes) 67 | df = df.loc[i_autosome,:] 68 | i2plot = (p_cutoff_high`_. 12 | 13 | The primary usage example, supported by the library is the following:: 14 | 15 | >> from pyliftover import LiftOver 16 | >> lo = LiftOver('hg17', 'hg18') 17 | >> lo.convert_coordinate('chr1', 1000000) 18 | 19 | The first line will automatically download the hg17-to-hg18 coordinate conversion `chain file` from UCSC, 20 | unless it is already cached or available in the current directory. Alternatively, you may provide your own chain file:: 21 | 22 | >> lo = LiftOver('hg17ToHg18.over.chain.gz') 23 | >> lo.convert_coordinate('chr1', 1000000, '-') 24 | 25 | The result of ``lo.convert_coordinate`` call is either ``None`` (if the source chromosome name is unrecognized) or a list of target positions in the 26 | new assembly. The list may be empty (locus is deleted in the new assembly), have a single element (locus matched uniquely), or, in principle, 27 | have multiple elements (although this is probably a rare occasion for most default intra-species genomic conversions). 28 | 29 | Although you may try to apply the tool with arbitrary chain files, like the original ``liftOver`` tool, it makes most sense for conversion of 30 | coordinates between different assemblies of the same species. 31 | ''' 32 | 33 | from .liftover import LiftOver -------------------------------------------------------------------------------- /pyliftover/chainfile.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pure-python implementation of UCSC "liftover" genome coordinate conversion. 3 | Class for dealing with "xx.over.chain" files. 4 | 5 | Copyright 2013, Konstantin Tretyakov. 6 | http://kt.era.ee/ 7 | 8 | Licensed under MIT license. 
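Typical use of this module (a sketch; the hg18/hg19 names are only an example):

    f = open_liftover_chain_file('hg18', 'hg19')   # may download the chain file from UCSC
    chain_file = LiftOverChainFile(f)
    hits = chain_file.query('chr1', 1000000)       # overlapping chain blocks, or None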
9 | ''' 10 | 11 | import os.path 12 | import gzip 13 | import urllib 14 | import shutil 15 | import sys 16 | 17 | from .intervaltree import IntervalTree 18 | 19 | if sys.version_info >= (3, 0): 20 | import urllib.request 21 | FancyURLopener = urllib.FancyURLopener if sys.version_info < (3, 0) else urllib.request.FancyURLopener 22 | 23 | class ErrorAwareURLOpener(FancyURLopener): 24 | def http_error_default(self, url, fp, errcode, errmsg, headers): 25 | raise Exception("404") 26 | _urlopener = ErrorAwareURLOpener() 27 | 28 | def open_liftover_chain_file(from_db, to_db, search_dir='.', cache_dir=os.path.expanduser("~/.pyliftover"), use_web=True, write_cache=True): 29 | ''' 30 | A "smart" way of obtaining liftover chain files. 31 | By default acts as follows: 32 | 1. If the file ``To.over.chain.gz`` exists in , 33 | opens it for reading via gzip.open. 34 | 2. Otherwise, if the file ``To.over.chain`` exists 35 | in the opens it (as uncompressed file). 36 | Steps 1 and 2 may be disabled if search_dir is set to None. 37 | 3. Otherwise, checks whether ``/To.over.chain.gz`` exists. 38 | This step may be disabled by specifying cache_dir = None. 39 | 4. If file still not found attempts to download the file from the URL 40 | 'http://hgdownload.cse.ucsc.edu/goldenPath//liftOver/To.over.chain.gz' 41 | to a temporary location. This step may be disabled by specifying use_web=False. In this case the operation fails and 42 | the function returns None. 43 | 5. At this point, if write_cache=True and cache_dir is not None and writable, the file is copied to cache_dir and opened from there. 44 | Otherwise it is opened from the temporary location. 45 | 46 | In case of errors (e.g. URL cannot be opened), None is returned. 47 | ''' 48 | to_db = to_db[0].upper() + to_db[1:] 49 | FILE_NAME_GZ = '%sTo%s.over.chain.gz' % (from_db, to_db) 50 | FILE_NAME = '%sTo%s.over.chain' % (from_db, to_db) 51 | 52 | if search_dir is not None: 53 | FILE_GZ = os.path.join(search_dir, FILE_NAME_GZ) 54 | FILE = os.path.join(search_dir, FILE_NAME) 55 | if os.path.isfile(FILE_GZ): 56 | return gzip.open(FILE_GZ, 'rb') 57 | elif os.path.isfile(FILE): 58 | return open(FILE, 'rb') 59 | if cache_dir is not None: 60 | FILE_GZ = os.path.join(cache_dir, FILE_NAME_GZ) 61 | if os.path.isfile(FILE_GZ): 62 | return gzip.open(FILE_GZ, 'rb') 63 | if use_web: 64 | # Download file from the web. 65 | try: 66 | url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%sTo%s.over.chain.gz' % (from_db, from_db, to_db) 67 | (filename, headers) = _urlopener.retrieve(url) 68 | except: 69 | # Download failed, exit 70 | return None 71 | # Move the file to cache? 72 | if write_cache and (cache_dir is not None): 73 | try: 74 | if not os.path.isdir(cache_dir): 75 | os.mkdir(cache_dir) 76 | shutil.move(filename, FILE_GZ) 77 | # Move successful, open from cache 78 | return gzip.open(FILE_GZ, 'rb') 79 | except: 80 | # Move failed, open file from temp location 81 | return gzip.open(filename, 'rb') 82 | else: 83 | # Open from temp location 84 | return gzip.open(filename, 'rb') 85 | # If we didn't quit before this place, all failed. 86 | return None 87 | 88 | 89 | class LiftOverChainFile: 90 | ''' 91 | The class, which loads and indexes USCS's .over.chain files. 92 | 93 | Specification of the chain format can be found here: http://genome.ucsc.edu/goldenPath/help/chain.html 94 | ''' 95 | 96 | def __init__(self, f): 97 | ''' 98 | Reads chain data from the file and initializes an interval index. 99 | f must be a file object open for reading. 
100 | If any errors are detected, an Exception is thrown. 101 | ''' 102 | self.chains = self._load_chains(f) 103 | self.chain_index = self._index_chains(self.chains) 104 | 105 | @staticmethod 106 | def _load_chains(f): 107 | ''' 108 | Loads all LiftOverChain objects from a file into an array. Returns the result. 109 | ''' 110 | chains = [] 111 | while True: 112 | line = f.readline() 113 | if not line: 114 | break 115 | if line.startswith(b'#') or line.startswith(b'\n') or line.startswith(b'\r'): 116 | continue 117 | if line.startswith(b'chain'): 118 | # Read chain 119 | chains.append(LiftOverChain(line, f)) 120 | continue 121 | return chains 122 | 123 | @staticmethod 124 | def _index_chains(chains): 125 | ''' 126 | Given a list of LiftOverChain objects, creates a 127 | dict: source_name --> 128 | IntervalTree: --> 129 | (target_from, target_to, chain) 130 | Returns the resulting dict. 131 | Throws an exception on any errors or inconsistencies among chains (e.g. different sizes specified for the same chromosome in various chains). 132 | ''' 133 | chain_index = {} 134 | source_size = {} 135 | target_size = {} 136 | for c in chains: 137 | # Verify that sizes of chromosomes are consistent over all chains 138 | source_size.setdefault(c.source_name, c.source_size) 139 | if source_size[c.source_name] != c.source_size: 140 | raise Exception("Chains have inconsistent specification of source chromosome size for %s (%d vs %d)" % (c.source_name, source_size[c.source_name], c.source_size)) 141 | target_size.setdefault(c.target_name, c.target_size) 142 | if target_size[c.target_name] != c.target_size: 143 | raise Exception("Chains have inconsistent specification of target chromosome size for %s (%d vs %d)" % (c.target_name, target_size[c.target_name], c.target_size)) 144 | chain_index.setdefault(c.source_name, IntervalTree(0, c.source_size)) 145 | # Register all blocks from the chain in the corresponding interval tree 146 | tree = chain_index[c.source_name] 147 | for (sfrom, sto, tfrom, tto) in c.blocks: 148 | tree.add_interval(sfrom, sto, (tfrom, tto, c)) 149 | 150 | # Sort all interval trees 151 | for k in chain_index: 152 | chain_index[k].sort() 153 | return chain_index 154 | 155 | def query(self, chromosome, position): 156 | ''' 157 | Given a chromosome and position, returns all matching records from the chain index. 158 | Each record is an interval (source_from, source_to, data) 159 | where data = (target_from, target_to, chain). Note that depending on chain.target_strand, the target values may need to be reversed (e.g. pos --> chain.target_size - pos). 160 | 161 | If chromosome is not found in the index, None is returned. 162 | ''' 163 | # A somewhat-ugly hack to allow both 'bytes' and 'str' objects to be used as 164 | # chromosome names in Python 3. As we store chromosome names as strings, 165 | # we'll transparently translate the query to a string too. 166 | if type(chromosome).__name__ == 'bytes': 167 | chromosome = chromosome.decode('ascii') 168 | if chromosome not in self.chain_index: 169 | return None 170 | else: 171 | return self.chain_index[chromosome].query(position) 172 | 173 | 174 | class LiftOverChain: 175 | ''' 176 | Represents a single chain from an .over.chain file. 177 | A chain basically maps a set of intervals from "source" coordinates to corresponding coordinates in "target" coordinates. 178 | The "source" and "target" are somehow referred to in the specs (http://genome.ucsc.edu/goldenPath/help/chain.html) 179 | as "target" and "query" respectively. 
180 | ''' 181 | def __init__(self, header, f): 182 | ''' 183 | Reads the chain from a stream given the first line and a file opened at all remaining lines. 184 | On error throws an exception. 185 | ''' 186 | if sys.version_info >= (3, 0): 187 | header = header.decode('ascii') # In Python 2, work with usual strings. 188 | fields = header.split() 189 | if fields[0] != 'chain' and len(fields) not in [12, 13]: 190 | raise Exception("Invalid chain format. (%s)" % header) 191 | # chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1 192 | self.score = int(fields[1]) # Alignment score 193 | self.source_name = fields[2] # E.g. chrY 194 | self.source_size = int(fields[3]) # Full length of the chromosome 195 | self.source_strand = fields[4] # Must be + 196 | if self.source_strand != '+': 197 | raise Exception("Source strand in an .over.chain file must be +. (%s)" % header) 198 | self.source_start = int(fields[5]) # Start of source region 199 | self.source_end = int(fields[6]) # End of source region 200 | self.target_name = fields[7] # E.g. chr5 201 | self.target_size = int(fields[8]) # Full length of the chromosome 202 | self.target_strand = fields[9] # + or - 203 | if self.target_strand not in ['+', '-']: 204 | raise Exception("Target strand must be - or +. (%s)" % header) 205 | self.target_start = int(fields[10]) 206 | self.target_end = int(fields[11]) 207 | self.id = None if len(fields) == 12 else fields[12].strip() 208 | 209 | # Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to) 210 | sfrom, tfrom = self.source_start, self.target_start 211 | self.blocks = [] 212 | fields = f.readline().decode('ascii').split() 213 | while len(fields) == 3: 214 | size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2]) 215 | self.blocks.append((sfrom, sfrom+size, tfrom, tfrom+size)) 216 | sfrom += size + sgap 217 | tfrom += size + tgap 218 | fields = f.readline().split() 219 | if len(fields) != 1: 220 | raise Exception("Expecting one number on the last line of alignments block. (%s)" % header) 221 | size = int(fields[0]) 222 | self.blocks.append((sfrom, sfrom+size, tfrom, tfrom+size)) 223 | if (sfrom + size) != self.source_end or (tfrom + size) != self.target_end: 224 | raise Exception("Alignment blocks do not match specified block sizes. 
(%s)" % header) 225 | -------------------------------------------------------------------------------- /pyliftover/hg17ToHg19.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/pyliftover/hg17ToHg19.over.chain.gz -------------------------------------------------------------------------------- /pyliftover/hg18ToHg19.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/pyliftover/hg18ToHg19.over.chain.gz -------------------------------------------------------------------------------- /pyliftover/hg19ToGRCh37.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/pyliftover/hg19ToGRCh37.over.chain.gz -------------------------------------------------------------------------------- /pyliftover/intervaltree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Interval Tree data structure for indexing a set of 3 | integer intervals of the form [start, end). 4 | 5 | http://en.wikipedia.org/wiki/Interval_tree 6 | 7 | Copyright 2013, Konstantin Tretyakov. 8 | http://kt.era.ee/ 9 | 10 | Licensed under MIT license. 11 | ''' 12 | 13 | 14 | class IntervalTree: 15 | ''' 16 | Interval Tree data structure for indexing a set of 17 | integer intervals of the form [start, end). 18 | 19 | See: http://en.wikipedia.org/wiki/Interval_tree 20 | 21 | The tree assumes it is covered in intervals reasonably uniformly (reasonable assumption for our liftOver purposes), 22 | and always picks its center as the middle point between the prespecified "min" and "max" values. 23 | No removal operation is implemented. 24 | 25 | >>> t = IntervalTree(0, 100) 26 | >>> t.query(2) 27 | [] 28 | >>> t.add_interval(10, 25) 29 | >>> t.add_interval(15, 27) 30 | >>> t.sort() 31 | >>> t.query(10) 32 | [(10, 25, None)] 33 | >>> t.query(24) 34 | [(10, 25, None), (15, 27, None)] 35 | >>> t.query(25) 36 | [(15, 27, None)] 37 | >>> t.query(27) 38 | [] 39 | ''' 40 | 41 | def __init__(self, min, max): 42 | ''' 43 | Creates a tree node for keeping intervals somewhere in the range [min...max). 44 | ''' 45 | self.min = int(min) 46 | self.max = int(max) 47 | assert self.min < self.max 48 | self.center = (min + max)/2 49 | self.single_interval = None # We take special care of trees which only contain a single interval 50 | self.left_subtree = None # Intervals which are all strictly to the left of center. 51 | self.right_subtree = None # Intervals which are all strictly to the right of center. 52 | self.mid_sorted_by_start = [] # Intervals which contain center, sorted by start position 53 | self.mid_sorted_by_end = [] # Same intervals, sorted by end position. 54 | 55 | def add_interval(self, start, end, data=None): 56 | ''' 57 | Inserts an interval to the tree. 58 | Note that when inserting we do not maintain appropriate sorting of the "mid" data structure. 59 | This should be done after all intervals are inserted. 60 | ''' 61 | # Ignore intervals of 0 or negative length 62 | if (end - start) <= 0: 63 | return 64 | if self.single_interval is None: 65 | # This is an empty tree and we are adding the first interval. Just record it in a field. 
66 | self.single_interval = (start, end, data) 67 | elif self.single_interval == 0: 68 | # This is a usual tree, use standard addition method 69 | self._add_interval(start, end, data) 70 | else: 71 | # This is a tree with a single interval. Convert to a usual tree. 72 | self._add_interval(*self.single_interval) 73 | self.single_interval = 0 74 | self._add_interval(start, end, data) 75 | 76 | def _add_interval(self, start, end, data=None): 77 | if end <= self.center: 78 | # Insert into left subtree 79 | if self.left_subtree is None: 80 | self.left_subtree = IntervalTree(self.min, self.center) 81 | self.left_subtree.add_interval(start, end, data) 82 | elif start > self.center: 83 | if self.right_subtree is None: 84 | self.right_subtree = IntervalTree(self.center, self.max) 85 | self.right_subtree.add_interval(start, end, data) 86 | else: 87 | self.mid_sorted_by_start.append((start, end, data)) 88 | self.mid_sorted_by_end.append((start, end, data)) 89 | 90 | def sort(self): 91 | ''' 92 | Must be invoked after all intevals have been added to sort mid_** arrays. 93 | ''' 94 | if self.single_interval is None or self.single_interval != 0: 95 | return # Nothing to do for empty and leaf trees. 96 | self.mid_sorted_by_start.sort(key = lambda x: x[0]) 97 | self.mid_sorted_by_end.sort(key = lambda x: x[1], reverse=True) 98 | if self.left_subtree is not None: 99 | self.left_subtree.sort() 100 | if self.right_subtree is not None: 101 | self.right_subtree.sort() 102 | 103 | def query(self, x): 104 | ''' 105 | Returns all intervals in the tree, which overlap given point, i.e. all (start, end, data) records, for which (start <= x < end). 106 | ''' 107 | result = [] 108 | self._query(x, result) 109 | return result 110 | 111 | def _query(self, x, result): 112 | ''' 113 | Same as self.query, but uses a provided list to accumulate results into. 114 | ''' 115 | if self.single_interval is None: # Empty 116 | return 117 | elif self.single_interval != 0: # Single interval, just check whether x is in it 118 | if self.single_interval[0] <= x < self.single_interval[1]: 119 | result.append(self.single_interval) 120 | elif x < self.center: # Normal tree, query point to the left of center 121 | if self.left_subtree is not None: 122 | self.left_subtree._query(x, result) 123 | for int in self.mid_sorted_by_start: 124 | if int[0] <= x: 125 | result.append(int) 126 | else: 127 | break 128 | else: # Normal tree, query point to the right of center 129 | for int in self.mid_sorted_by_end: 130 | if int[1] > x: 131 | result.append(int) 132 | else: 133 | break 134 | if self.right_subtree is not None: 135 | self.right_subtree._query(x, result) 136 | 137 | def __len__(self): 138 | ''' 139 | The number of intervals maintained in the tree. 140 | Note that adding zero- or negative-size intervals does not affect its size (they are not registered). 
141 | 142 | >>> t = IntervalTree(0, 100) 143 | >>> t.add_interval(1, 10) 144 | >>> t.add_interval(20, 30) 145 | >>> t.add_interval(20, 20) 146 | >>> t.add_interval(20, 19) 147 | >>> len(t) 148 | 2 149 | ''' 150 | 151 | if self.single_interval is None: 152 | return 0 153 | elif self.single_interval != 0: 154 | return 1 155 | else: 156 | size = len(self.mid_sorted_by_start) 157 | if self.left_subtree is not None: 158 | size += len(self.left_subtree) 159 | if self.right_subtree is not None: 160 | size += len(self.right_subtree) 161 | return size 162 | 163 | def __iter__(self): 164 | if self.single_interval is None: 165 | return 166 | elif self.single_interval != 0: 167 | yield self.single_interval 168 | else: 169 | if self.left_subtree is not None: 170 | for s in self.left_subtree: 171 | yield s 172 | for s in self.mid_sorted_by_start: 173 | yield s 174 | if self.right_subtree is not None: 175 | for s in self.right_subtree: 176 | yield s 177 | 178 | -------------------------------------------------------------------------------- /pyliftover/liftover.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pure-python implementation of UCSC "liftover" genome coordinate conversion. 3 | Main class, which is actually a convenience wrapper around chainfile.py's LiftOverChainFile 4 | 5 | Copyright 2013, Konstantin Tretyakov. 6 | http://kt.era.ee/ 7 | 8 | Licensed under MIT license. 9 | ''' 10 | 11 | import os.path 12 | import gzip 13 | from .chainfile import open_liftover_chain_file, LiftOverChainFile 14 | 15 | class LiftOver: 16 | def __init__(self, from_db, to_db=None, search_dir='.', cache_dir=os.path.expanduser("~/.pyliftover"), use_web=True, write_cache=True, use_gzip=None): 17 | ''' 18 | LiftOver can be initialized in multiple ways. 19 | * By providing a filename as a single argument: LiftOver("hg17ToHg18.over.chain.gz") 20 | The file may be a usual or a gzip-compressed file. The compression is automatically detected from the .gz extension. 21 | If you want to override the way this is handled (e.g. open a file with non-gz extension as gzipped file), use use_gzip=True or use_gzip=False as needed. 22 | * By providing an opened file opbject as a single argument: LiftOver(open("hg17ToHg18.over.chain")) 23 | * By providing name of from_db and to_db, e.g. LiftOver('hg18', 'hg19'). 24 | In this case, LiftOver will "intelligently" search for the best available over.chain file for converting between the assemblies. 25 | The file will be searched in local directory, cache directory, or even downloaded from the web, if possible. 26 | The exact way this is handled (as well as all the other parameters of the constructor) is documented in 27 | :see:`pyliftover.chainfile.open_liftover_chain_file`. 28 | 29 | Test providing filename: 30 | >>> lo = LiftOver('tests/data/mds42.to.mg1655.liftOver') 31 | >>> lo.convert_coordinate('AP012306.1', 16000) #doctest: +ELLIPSIS (because on 32-bit systems there's an L qualifier after the number and on 64-bit ones there's nothing. 
32 | [('Chromosome', 21175, '+', 378954552...)] 33 | 34 | Test providing from_db and to_db: 35 | >>> lo = LiftOver('hg17', 'hg18') 36 | >>> lo.convert_coordinate('chr1', 1000000) 37 | [('chr1', 949796, '+', 21057807908...)] 38 | >>> lo.convert_coordinate('chr1', 0) 39 | [('chr1', 0, '+', 21057807908...)] 40 | >>> lo.convert_coordinate('chr1', 0, '-') 41 | [('chr1', 0, '-', 21057807908...)] 42 | >>> lo.convert_coordinate('chr1', 103786442) 43 | [('chr20', 20668001, '-', 14732...)] 44 | >>> lo.convert_coordinate('chr1', 103786443, '-') 45 | [('chr20', 20668000, '+', 14732...)] 46 | >>> lo.convert_coordinate('chr1', 103786441, '+') 47 | [] 48 | ''' 49 | if to_db is None: 50 | # A file name or a file object was provided 51 | if isinstance(from_db, str): 52 | do_gzip = use_gzip if use_gzip is not None else from_db.lower().endswith('.gz') 53 | if do_gzip: 54 | f = gzip.open(from_db, 'rb') 55 | else: 56 | f = open(from_db, 'rb') 57 | else: 58 | f = from_db 59 | else: 60 | # From- and To- db names were provided. 61 | f = open_liftover_chain_file(from_db=from_db, to_db=to_db, search_dir=search_dir, cache_dir=cache_dir, use_web=use_web, write_cache=write_cache) 62 | self.chain_file = LiftOverChainFile(f) 63 | f.close() 64 | 65 | def convert_coordinate(self, chromosome, position, strand='+'): 66 | ''' 67 | Returns a *list* of possible conversions for a given chromosome position. 68 | The list may be empty (no conversion), have a single element (unique conversion), or several elements (position mapped to several chains). 69 | The list contains tuples (target_chromosome, target_position, target_strand, conversion_chain_score), 70 | where conversion_chain_score is the "alignment score" field specified at the chain used to perform conversion. If there 71 | are several possible conversions, they are sorted by decreasing conversion_chain_score. 72 | 73 | IF chromosome is completely unknown to the LiftOver, None is returned. 74 | 75 | Note that coordinates are 0-based, and even at negative strand are relative to the beginning of the genome. 76 | I.e. position 0 strand + is the first position of the genome. Position 0 strand - is also the first position of the genome 77 | (and the last position of reverse-complemented genome). 78 | ''' 79 | query_results = self.chain_file.query(chromosome, position) 80 | if query_results is None: 81 | return None 82 | else: 83 | # query_results contains intervals which contain the query point. We simply have to remap to corresponding targets. 
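            # A worked example of the remapping below (hypothetical numbers):
            # suppose a chain block maps source interval [100, 200) to target interval
            # [5100, 5200); then querying position 150 gives
            # target_start + (position - source_start) = 5100 + 50 = 5150.
            # For a '-' strand chain the value is flipped to chain.target_size - 1 - 5150,
            # i.e. counted from the end of the target chromosome, and the reported strand
            # is the opposite of the query strand.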
84 | results = [] 85 | for (source_start, source_end, data) in query_results: 86 | target_start, target_end, chain = data 87 | result_position = target_start + (position - source_start) 88 | if chain.target_strand == '-': 89 | result_position = chain.target_size - 1 - result_position 90 | result_strand = chain.target_strand if strand == '+' else ('+' if chain.target_strand == '-' else '-') 91 | results.append((chain.target_name, result_position, result_strand, chain.score)) 92 | #if len(results) > 1: 93 | results.sort(key=lambda x: x[3], reverse=True) 94 | return results 95 | -------------------------------------------------------------------------------- /qq.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import scipy.stats as sstats 8 | import argparse 9 | import pandas as pd 10 | import json 11 | 12 | # Examples: 13 | # python qq.py MSA_MSA_2016_lift_noMHC_correct_case_control.csv.gz --strata CTG_COG_2018.csv.gz --strata-num PVAL --top-as-dot 100 --weights weights.tld.txt.gz --out qq.msa_cog.top100.tld.png 14 | # python qq.py PGC_MDD_2018_no23andMe.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz --strata-cat CHR --strata-cat-ids 'chr1_10=1:2:3:4:5:6:7:8:9:10,chr11_20=11:12:13:14:15:16:17:18:19:20,chr21_22=21:22' --top-as-dot 100 --weights weights.prune.txt.gz --y-lim 7.301029995663981 --out qq.mdd.chr.top100.prune.png 15 | # python qq.py PGC_SCZ_2014_EUR_qc.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz --strata-num PVAL --top-as-dot 100 --weights weights.prune.txt.gz --out qq.scz_mdd.top100.prune.png --y-lim 7.301029995663981 16 | 17 | example_text = """Example 1: 18 | python qq.py PGC_BIP_2016_qc.csv.gz 19 | Example 2: 20 | python qq.py PGC_SCZ_2014_EUR_qc.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz \\ 21 | --strata-num PVAL --top-as-dot 100 --weights weights.prune.txt.gz \\ 22 | --out qq.scz_mdd.top100.prune.png --y-lim 15 23 | Example 3: 24 | python qq.py PGC_MDD_2018_no23andMe.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz \\ 25 | --strata-cat CHR --strata-cat-ids 'chr1_7=1:2:3:4:5:6:7,chr18_21=18:19:20:21' \\ 26 | --weights weights.tld.txt.gz --y-lim 7.301029995663981 --out qq.mdd.chr.png""" 27 | 28 | 29 | def parse_args(args): 30 | parser = argparse.ArgumentParser( 31 | formatter_class=argparse.RawDescriptionHelpFormatter, 32 | description="A tool to qq plots from sumstats.", 33 | epilog=example_text) 34 | 35 | parser.add_argument("sumstats", help="Sumstats file") 36 | parser.add_argument("--sep", default='\t', 37 | help="Column separator in sumstat file") 38 | parser.add_argument("--p", default="PVAL", 39 | help="A column with SNP p-values in sumstats file") 40 | parser.add_argument("--snp", default="SNP", 41 | help="A column with SNP ids in sumstats file") 42 | parser.add_argument("--strata", default="NA", 43 | help="A file with at least 2 columns: SNP id and SNP stratum") 44 | parser.add_argument("--strata-sep", default='\t', 45 | help="Column separator in strata file") 46 | parser.add_argument("--strata-snp", default="SNP", 47 | help="A column with SNP ids in strata file") 48 | parser.add_argument("--strata-cat", default="NA", 49 | help="A column with SNP categories. Each category represents a separate stratum in qq plot") 50 | parser.add_argument("--strata-cat-ids", default="NA", 51 | help=("Comma-separated list of categories from --strata-cat column to plot " 52 | "and corresponding names, e.g. 
'chr1_2_6=1:2:6' (defines strata chr1_2_6 " 53 | "containinfg all variants with value in --strata-cat column = 1,2 or 6). " 54 | "By default all categories are plotted with original names")) 55 | parser.add_argument("--strata-num", default="NA", 56 | help="A column with SNP numerical value (e.g. p-value)") 57 | parser.add_argument("--strata-num-intervals", type=str, 58 | default="p<10^-1=:0.1,p<10^-2=:0.01,p<10^-3=:0.001", help=("Comma-separated " 59 | "intervals defining SNP strata based on values from --strata-num column " 60 | "and corresponding names, e.g.: 'A=:-1,B=0:6' (defines stratum A " 61 | "corresponding to the interval (-inf, -1] and stratum B = (0,6]. " 62 | "If there is a '-' charecter in any of values, the whole argument value should be quoted")) 63 | parser.add_argument("--strata-bin", nargs='+', default="NA", 64 | help=("A list of columns (each column representing one stratum) with binary data " 65 | "0/1 or False/True for each variant indicatig whether the variant belongs to " 66 | "the corresponding strata")) 67 | parser.add_argument("--weights", default="NA", 68 | help=("Tab-separated file without header and with 2 columns: SNP id and SNP weight. " 69 | "Don't need to be normalized")) 70 | parser.add_argument("--top-as-dot", default=0, type=int, 71 | help="Number of top associations (lowest p-values) to mark as a separate dot") 72 | parser.add_argument("--x-lim", default=None, type=float, 73 | help="X-axis maximum limit on -log10 scale") 74 | parser.add_argument("--y-lim", default=None, type=float, 75 | help="Y-axis maximum limit on -log10 scale (e.g. gws threshold = 7.3)") 76 | parser.add_argument("--out", default="qq.png", help="Output file name") 77 | 78 | return parser.parse_args(args) 79 | 80 | 81 | def drop_duplicated_ind(df): 82 | i = df.index.duplicated(keep='first') 83 | if i.any(): 84 | print("The table contains %d duplicated ids" % sum(i)) 85 | print("Only the first row with duplicated id will be retained") 86 | df = df.loc[~i,:] 87 | return df 88 | 89 | 90 | def process_args(args): 91 | """ 92 | Check whether provided arguments are correct, change list-type arguments 93 | with single value to have a length = length of sumstats argument and process 94 | chr2use arument. 
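    For example (a sketch of the parsing performed here), --strata-num-intervals 'A=:-1,B=0:6'
    is turned into {'A': (-inf, -1.0), 'B': (0.0, 6.0)} (later used as (start, end] intervals),
    and --strata-cat-ids 'chr1_2=1:2' into {'chr1_2': frozenset({'1', '2'})}.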
95 | """ 96 | assert os.path.isfile(args.sumstats), "'%s' file doesn't exist" % args.sumstats 97 | assert os.path.isfile(args.strata) or args.strata=="NA", "'%s' file doesn't exist" % args.strata 98 | assert os.path.isfile(args.weights) or args.weights=="NA", "'%s' file doesn't exist" % args.weights 99 | 100 | # process special arguments 101 | arg_dict = vars(args) 102 | if args.strata_num != "NA": 103 | intervals = {} 104 | for name_i in arg_dict["strata_num_intervals"].split(","): 105 | name, i = name_i.split("=") 106 | name = name.strip() 107 | assert name != "", "Stratum name should not be an empty string" 108 | start,end = i.split(":") 109 | start = -np.inf if start == "" else float(start) 110 | end = np.inf if end == "" else float(end) 111 | assert not name in intervals, "Stratum name must be unique (duplicated name: %s)" % name 112 | intervals[name] = (start, end) 113 | arg_dict["strata_num_intervals"] = intervals 114 | if args.strata_cat != "NA" and args.strata_cat_ids != "NA": 115 | categories = {} 116 | for name_c in arg_dict["strata_cat_ids"].split(","): 117 | name, c = name_c.split("=") 118 | name = name.strip() 119 | assert name != "", "Stratum name should not be an empty string" 120 | c = frozenset(map(str.strip, c.split(":"))) 121 | assert not name in categories, "Strata name must be unique (duplicated name: %s)" % name 122 | categories[name] = c 123 | arg_dict["strata_cat_ids"] = categories 124 | 125 | 126 | def read_sumstats(sumstats_f, sep, snpid_col, pval_col): 127 | """ 128 | Filter original summary stats file. 129 | Args: 130 | sumstats_f: sumstats file name 131 | sep: column separator in sumstats_f 132 | snpid_col: a name of column with variant ids 133 | pval_col: a name of column with variant p-values 134 | Returns: 135 | df: filtered p-values, pd.DataFrame(index=snp_id, values=pval) 136 | """ 137 | print("Reading %s" % sumstats_f) 138 | cols2use = [snpid_col, pval_col] 139 | df = pd.read_csv(sumstats_f, usecols=cols2use, index_col=snpid_col, 140 | sep=sep) 141 | print("%d SNPs in %s" % (len(df), sumstats_f)) 142 | df = df.loc[np.isfinite(df[pval_col]),:] 143 | print("%d SNPs with defined p-value" % len(df)) 144 | df = df.loc[df[pval_col]>0,:] 145 | print("%d SNPs with non-zero p-value" % len(df)) 146 | df = drop_duplicated_ind(df) 147 | return df 148 | 149 | 150 | def read_strata_cat(strata_f, sep, snpid_col, strata_cat_col, strata_cat_ids): 151 | print("Reading strata file %s" % strata_f) 152 | cols2use = [snpid_col, strata_cat_col] 153 | df = pd.read_csv(strata_f, usecols=cols2use, index_col=snpid_col, sep=sep, 154 | dtype={strata_cat_col:str}) 155 | # make a standard name for variant strata column 156 | if strata_cat_ids == "NA": 157 | for s in df[strata_cat_col].unique(): 158 | stratum_i = (df[strata_cat_col] == s) 159 | df.loc[:,s] = stratum_i 160 | else: 161 | for name, ids_set in strata_cat_ids.items(): 162 | stratum_i = df[strata_cat_col].isin(ids_set) 163 | df.loc[:,name] = stratum_i 164 | df.drop([strata_cat_col], axis=1, inplace=True) 165 | # keep only variants which are within any stratum 166 | df = df.loc[df.any(axis=1)] 167 | assert len(df) > 0, "All strata are empty" 168 | df = drop_duplicated_ind(df) 169 | return df 170 | 171 | 172 | def read_strata_num(strata_f, sep, snpid_col, strata_num_col, strata_num_intervals): 173 | print("Reading strata file %s" % strata_f) 174 | cols2use = [snpid_col, strata_num_col] 175 | df = pd.read_csv(strata_f, usecols=cols2use, index_col=snpid_col, sep=sep, 176 | dtype={strata_num_col:float}) 177 | for name, (start, 
end) in strata_num_intervals.items(): 178 | stratum_i = (start<df[strata_num_col]) & (df[strata_num_col]<=end) 179 | df.loc[:,name] = stratum_i 180 | df.drop([strata_num_col], axis=1, inplace=True) 181 | # keep only variants which are within any stratum 182 | df = df.loc[df.any(axis=1)] 183 | assert len(df) > 0, "All strata are empty" 184 | df = drop_duplicated_ind(df) 185 | return df 186 | 187 | 188 | def read_strata_bin(strata_f, sep, snpid_col, strata_bin): 189 | print("Reading strata file %s" % strata_f) 190 | cols2use = [snpid_col] + strata_bin 191 | df = pd.read_csv(strata_f, usecols=cols2use, index_col=snpid_col, sep=sep, 192 | dtype=dict.fromkeys(strata_bin, bool)) 193 | df = df.loc[df.any(axis=1)] 194 | assert len(df) > 0, "All strata are empty" 195 | df = drop_duplicated_ind(df) 196 | return df 197 | 198 | 199 | def read_weights(weights_f): 200 | print("Reading weights file %s" % weights_f) 201 | df = pd.read_csv(weights_f, sep='\t', header=None, names=["snp", "w"], 202 | index_col="snp") 203 | # drop zero weights 204 | df = df.loc[df.w>0,:] 205 | df = drop_duplicated_ind(df) 206 | return df 207 | 208 | 209 | def get_xy_from_p(p, top_as_dot, p_weights, nbins=200): 210 | if p_weights is None: 211 | p_weights = np.ones(len(p)) 212 | p_weights /= sum(p_weights) # normalize weights 213 | 214 | i = np.argsort(p) 215 | p = p[i] 216 | p_weights = p_weights[i] 217 | p_ecdf = np.concatenate([[0], np.cumsum(p_weights)]) 218 | 219 | y = np.logspace(np.log10(p[-1]), np.log10(p[top_as_dot]), nbins) 220 | i = np.searchsorted(p, y, side='right') 221 | i[0] = len(p) # last index in p_ecdf 222 | i[-1] = top_as_dot+1 # top_as_dot index in p_ecdf 223 | # estimate standard uniform quantiles corresponding to y observed quantiles 224 | uniform_quantiles = p_ecdf[i] 225 | x = -np.log10(uniform_quantiles) 226 | y = -np.log10(y) 227 | # if top_as_dot = 0, then x_dot and y_dot are empty arrays 228 | x_dot = -np.log10(p_ecdf[1:top_as_dot+1]) 229 | y_dot = -np.log10(p[:top_as_dot]).values 230 | return x, y, x_dot, y_dot 231 | 232 | 233 | def get_ci(p, p_weights, ci_alpha=0.05, nbins=200): 234 | # TODO: the first part of this function is identical to the first part of 235 | # get_xy_from_p(), so probably should be merged?? 236 | if p_weights is None: 237 | p_weights = np.ones(len(p)) 238 | p_weights *= len(p)/sum(p_weights) # normalize weights and imitate order statistics 239 | 240 | i = np.argsort(p) 241 | p = p[i] 242 | p_weights = p_weights[i] 243 | cum_p_weights = np.cumsum(p_weights) 244 | 245 | y = np.logspace(np.log10(p[-1]), np.log10(p[0]), nbins) 246 | # the following code is inspired by: 247 | # https://genome.sph.umich.edu/wiki/Code_Sample:_Generating_QQ_Plots_in_R 248 | # beta_a is our order statistics.
For standard uniform distr (expected under null) 249 | # it follows beta distr: 250 | # https://en.wikipedia.org/wiki/Order_statistic#Order_statistics_sampled_from_a_uniform_distribution 251 | i = np.searchsorted(p, y, side='left') 252 | i[0] = len(p) - 1 253 | i[-1] = 0 254 | beta_a = cum_p_weights[i] 255 | beta_b = len(p) + 1 - beta_a 256 | lower_ci = -np.log10(sstats.beta.ppf(1-ci_alpha/2, beta_a, beta_b)) 257 | upper_ci = -np.log10(sstats.beta.ppf(ci_alpha/2, beta_a, beta_b)) 258 | x_ci = -np.log10(beta_a/len(p)) 259 | return x_ci, lower_ci, upper_ci 260 | 261 | class NumpyEncoder(json.JSONEncoder): 262 | def default(self, obj): 263 | if callable(obj): 264 | return str(obj) 265 | if isinstance(obj, np.ndarray): 266 | return obj.tolist() 267 | if isinstance(obj, pd.Series) or isinstance(obj, pd.Index): 268 | return obj.values.tolist() 269 | if isinstance(obj, np.float32): 270 | return np.float64(obj) 271 | return json.JSONEncoder.default(self, obj) 272 | 273 | 274 | if __name__ == "__main__": 275 | args = parse_args(sys.argv[1:]) 276 | process_args(args) 277 | 278 | df_sumstats = read_sumstats(args.sumstats, args.sep, args.snp, args.p) 279 | 280 | df_strata = None 281 | if args.strata_cat != "NA": 282 | df_strata = read_strata_cat(args.strata, args.strata_sep, args.strata_snp, 283 | args.strata_cat, args.strata_cat_ids) 284 | elif args.strata_num != "NA": 285 | df_strata = read_strata_num(args.strata, args.strata_sep, args.strata_snp, 286 | args.strata_num, args.strata_num_intervals) 287 | elif args.strata_bin != "NA": 288 | df_strata = read_strata_bin(args.strata, args.strata_sep, args.strata_snp, 289 | args.strata_bin) 290 | 291 | if args.weights != 'NA': 292 | df_weights = read_weights(args.weights) 293 | snps_with_weight = df_sumstats.index.intersection(df_weights.index) 294 | print("%d varints from %s have weight" % (len(snps_with_weight), args.sumstats)) 295 | assert len(snps_with_weight) > 0, ("At least one variant from %s must " 296 | "have weight in %s if weights are provided" % (args.sumstats, args.weights)) 297 | print("Only they will be plotted") 298 | df_sumstats = df_sumstats.loc[snps_with_weight,:] 299 | df_sumstats["weights"] = df_weights.loc[snps_with_weight,:] 300 | else: 301 | # if weights are not provided set equal weights to all SNPs 302 | df_sumstats["weights"] = 1. 
303 | 304 | if not df_strata is None: 305 | # drop variants which are not in df_sumstats 306 | df_strata = df_strata.loc[df_strata.index.isin(df_sumstats.index),:] 307 | # df_strata is either None or: 308 | # df_strata = DataFrame(index=snp_ids, columns=boolean_strata) 309 | 310 | x, y, x_dot, y_dot = get_xy_from_p(df_sumstats[args.p], args.top_as_dot, 311 | df_sumstats["weights"]) 312 | x_ci, lower_ci, upper_ci = get_ci(df_sumstats[args.p], df_sumstats["weights"]) 313 | 314 | # estimate axis limits 315 | max_x_lim = max(x[-1],x_ci[-1], 0 if args.top_as_dot==0 else x_dot[0]) 316 | max_y_lim = max(y[-1],upper_ci[-1], 0 if args.top_as_dot==0 else y_dot[0]) 317 | 318 | print("Making plot") 319 | json_data = {} 320 | fig, ax = plt.subplots(figsize=(6,6), dpi=200) 321 | 322 | # plot null and ci 323 | ax.fill_between(x_ci, lower_ci, upper_ci, color="0.8"); json_data['x_ci'] = x_ci; json_data['lower_ci'] = lower_ci; json_data['upper_ci'] = upper_ci 324 | ax.plot([0,x_ci[-1]], [0,x_ci[-1]], ls='--', lw=1, marker=' ', color="k") 325 | # plot all data 326 | if df_strata is None: 327 | ax.plot(x, y, ls='-', lw=1, marker=' ', label="all variants", color='C0'); json_data['x'] = x; json_data['y'] = y 328 | if args.top_as_dot > 0: 329 | ax.plot(x_dot, y_dot, ls=' ', marker='.', ms=1, color='C0'); json_data['x_dot'] = x_dot; json_data['y_dot'] = y_dot 330 | 331 | # plot strata 332 | if not df_strata is None: 333 | json_data['stratum'] = [] 334 | for j, stratum_id in enumerate(df_strata.columns): 335 | i = df_strata.index[df_strata[stratum_id]] 336 | json_stratum = {'stratum_id':stratum_id} 337 | x, y, x_dot, y_dot = get_xy_from_p(df_sumstats.loc[i,args.p], 338 | args.top_as_dot, df_sumstats.loc[i,"weights"]) 339 | color = "C%d" % ((j%9)+1); json_stratum['color'] = color 340 | ax.plot(x, y, ls='-', lw=1, marker=' ', label=stratum_id, color=color); json_stratum['x'] = x; json_stratum['y'] = y 341 | if args.top_as_dot > 0: 342 | ax.plot(x_dot, y_dot, ls=' ', marker='.', ms=1, color=color); json_stratum['x_dot'] = x_dot; json_stratum['y_dot'] = y_dot 343 | # update upper limits if needed 344 | max_x_lim = max(max_x_lim, x[-1], 0 if args.top_as_dot==0 else x_dot[0]) 345 | max_y_lim = max(max_y_lim, y[-1], 0 if args.top_as_dot==0 else y_dot[0]) 346 | json_data['stratum'].append(json_stratum) 347 | 348 | ax.set_xlabel(r"expected $\mathrm{-log_{10}(P)}$") 349 | ax.set_ylabel(r"observed $\mathrm{-log_{10}(P)}$") 350 | 351 | if not args.x_lim is None: 352 | max_x_lim = args.x_lim 353 | if not args.y_lim is None: 354 | max_y_lim = args.y_lim 355 | ax.set_xlim((-0.005*max_x_lim, 1.01*max_x_lim)) 356 | ax.set_ylim((-0.005*max_y_lim, 1.01*max_y_lim)) 357 | 358 | # configure and set title 359 | title = os.path.splitext(os.path.basename(args.sumstats))[0] 360 | if args.strata != "NA": 361 | strata = os.path.splitext(os.path.basename(args.strata))[0] 362 | title = "%s | %s" % (title, strata) 363 | ax.set_title(title, fontsize='small'); json_data['title'] = title 364 | 365 | ax.legend(loc='upper left', fontsize="small") 366 | 367 | # remove top and right spines 368 | ax.spines['right'].set_visible(False) 369 | ax.spines['top'].set_visible(False) 370 | # add offset for left spine 371 | # ax.spines['left'].set_position(('outward',1)) 372 | # ax.spines['bottom'].set_position(('outward',1)) 373 | 374 | plt.grid(True) 375 | # plt.axis('equal') 376 | plt.tight_layout() 377 | # plt.show() 378 | 379 | plt.savefig(args.out) 380 | print("%s was generated" % args.out) 381 | 382 | with open(args.out + '.json', 'w') as outfile: 383 | 
json.dump(json_data, outfile, cls=NumpyEncoder) 384 | print("%s.json was generated" % args.out) 385 | 386 | print("Done.") 387 | -------------------------------------------------------------------------------- /sumStats2ref.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | import scipy.io as sio 5 | import os, sys, argparse, time, logging, getpass 6 | import matplotlib.pyplot as plt 7 | from GWAS_IO.summary_stats_Utils import * 8 | 9 | def read_sum_dat(sumFile, logger, kargs): 10 | ''' 11 | Read give summary statistics. 12 | 13 | Input: 14 | sumFile, Path of summary file. 15 | logger, Python logger for process information. 16 | kargs, namespace object for options. 17 | 18 | Return: 19 | ------- 20 | sumDat, DataFrame of summary dataset. 21 | 22 | Note: 23 | ----- 24 | 1. Field names (if exists) will be standardize according to bellow 25 | effCol, Beta 26 | ORCol, OR 27 | effACol, A1 28 | othACol, A2 29 | posCol, POS 30 | infoCol, INFO 31 | NCol, N 32 | And, Chromosome names will be standardized 33 | Removing'CHR', 'Chr', etc --> integer 34 | recode chrX--> 23 35 | recode chrY--> 24 36 | recode chrM--> 25 37 | 2. SNPs with invalid p values removed, i.e., >1 or < 0, or NAN 38 | 3. Duplicated SNPs removed 39 | ''' 40 | 41 | if not os.access(sumFile, os.R_OK): 42 | raise ValueError("Can't read summary stats file: {}".format(sumFile)) 43 | logger.info('*** Loading summary stats ***') 44 | logger.info('Read summary data from {}'.format(sumFile)) 45 | sumDat = read_sumdata(sumFile, kargs.snpCol, kargs.pCol, kargs) 46 | logger.info('......') 47 | logger.info('Read {} SNPs'.format(sumDat.shape[0])) 48 | colnames = ['SNP', 'P', 'A1', 'CHR', 'POS', 'Beta', 'A2'] 49 | if 'P' not in sumDat.columns: 50 | raise RuntimeError('No P value provided') 51 | if 'SNP' not in sumDat.columns: 52 | raise RuntimeError('No SNP ID provided') 53 | if not kargs.effACol: 54 | warnings.warn('No effective Allele provided') 55 | logger.warning('No effective Allele provided') 56 | colnames.remove('A1') 57 | if not kargs.othACol: 58 | warnings.warn( "No other Allele information provided") 59 | logger.warn('No effective Allele provided') 60 | colnames.remove('A2') 61 | if not kargs.effCol: 62 | if not kargs.orCol: 63 | colnames.remove('Beta') 64 | logger.warn('Directionality is not checked') 65 | else: 66 | sumDat.loc[:, 'Beta'] = np.log(sumDat.loc[:, 'OR']) 67 | sumDat.drop('OR',axis=1, inplace=True) 68 | if (not kargs.effACol) and (not kargs.othACol): 69 | logger.warn('Directionality is not checked') 70 | colnames.remove('Beta') 71 | sumDat.drop('Beta', axis=1, inplace=True) 72 | if (not kargs.posCol) or (not kargs.chrCol): 73 | logger.info('Using SNP ID only for align Summary data to reference') 74 | colnames.remove('POS') 75 | colnames.remove('CHR') 76 | keys = ['SNP'] 77 | elif kargs.forceID: 78 | keys = ['SNP'] 79 | else: 80 | keys = ['CHR', 'POS'] 81 | if kargs.NCol: 82 | colnames.append('N') 83 | logger.info('Reading Summary stats done\n') 84 | logger.info('**** check P values *****') 85 | sumDat = basic_QC_P(sumDat, kargs.outdir, 'P', logger) 86 | logger.info('**** END check P values *****') 87 | logger.info('**** check duplicated SNPs *****') 88 | sumDat, dup = deduplcate_sum(sumDat, 'P', keys) 89 | print (sumDat.head()) 90 | if dup.shape[0] > 0: 91 | dupFile = os.path.join(kargs.outdir, 'Duplicated_snps.gz') 92 | logger.warning('There are {} duplicated SNPs in {}'.format( 93 | dup.shape[0], sumFile)) 
94 | logger.warning('\t The SNP with minimum p value included') 95 | logger.warning('see all duplicated SNPs in {}'.format(dupFile)) 96 | dup.to_csv(dupFile, index=False, na_rep='NA', compression='gzip', 97 | sep='\t') 98 | logger.info('**** END check duplicated SNPs *****') 99 | sumDat = sumDat.loc[:, colnames] 100 | print (sumDat.head()) 101 | logger.info('\n') 102 | return sumDat 103 | 104 | def read_ref_dat(refFile, logger): 105 | ''' 106 | Read in-house reference dataset. 107 | 108 | Input: 109 | ------ 110 | refFile, Path of reference file: 111 | CHR, SNP, GP, BP, A1, A2, complementA1, complementA2 112 | logger, Python logger for process information 113 | 114 | Return: 115 | ------ 116 | refDat, DataFrame of reference dataset. 117 | ''' 118 | 119 | if not os.access(refFile, os.R_OK): 120 | raise ValueError("Can't read reference file: {}".format(refFile)) 121 | logger.info('*** Loading reference data ***') 122 | refDat = pd.read_csv(refFile) 123 | refDat.rename(columns={'BP':'POS', 'A1':'refA1', 'A2':'refA2'}, 124 | inplace=True) 125 | logger.info('Read reference data from {}'.format(refFile)) 126 | logger.info('Read {} SNPs from reference data'.format(refDat.shape[0])) 127 | print ('*** Using reference with {} SNPs ***'.format(refDat.shape[0])) 128 | logger.info('Reading reference data done\n') 129 | logger.info('\n') 130 | return(refDat) 131 | 132 | def _qq(pvec, ax): 133 | ''' 134 | Making basic QQ plots of pvalues. 135 | 136 | ''' 137 | pvec = pvec[np.isfinite(pvec)] 138 | pvec[pvec < 1e-20] = 1e-20 139 | logpSort = -np.log10(np.sort(pvec)) 140 | n = logpSort.shape[0] 141 | logpTheo = -np.log10(np.cumsum(np.repeat(1.0/n, n))) 142 | ax.scatter(logpTheo, logpSort) 143 | x = np.linspace(*ax.get_xlim()) 144 | ax.plot(x, x) 145 | plt.xlabel('Theoretical -log10 (P)') 146 | plt.ylabel('Observed -log10 (P)') 147 | 148 | def summarize_merge(sumDat, mDat, misDat, outdir, logger): 149 | ''' 150 | Making QQ plot of original dataset, converted and missed. 151 | 152 | Input: 153 | ------ 154 | sumDat, DataFrame of Original summary stats 155 | mDat, DataFrame of Converted summary data 156 | misDat, DataFrame of SNPs in original but not in converted 157 | outdir, Where to save figure 158 | logger, Python logger for process information 159 | 160 | No return. 161 | ---------- 162 | TO-DO: 163 | Making multiple curves in one figure 164 | ''' 165 | logger.info('\n') 166 | if sumDat.shape[0] < 10: 167 | logger.error('Too few SNPs converted!! N={}'.format(sumDat.shape[0])) 168 | raise RuntimeError( 169 | 'Too few SNPs converted!! N={}'.format(sumDat.shape[0])) 170 | fig = plt.figure(facecolor='white') 171 | ax = fig.add_subplot(131) 172 | _qq(sumDat.loc[:,'P'].values, ax) 173 | plt.title('Original') 174 | ax = fig.add_subplot(132) 175 | _qq(mDat.loc[:,'P'].values, ax) 176 | plt.title("Converted") 177 | ax = fig.add_subplot(133) 178 | _qq(misDat.loc[:,'P'].values, ax) 179 | plt.title("Missed") 180 | plt.tight_layout() 181 | plt.savefig(os.path.join(outdir, 'QQ_convert.png'), format='png') 182 | plt.close() 183 | logger.info('Comparing P values in QQ_convert.png') 184 | 185 | def check_zscore(zvec, outdir, logger): 186 | ''' 187 | Check distribution of converted z-score(real not Anders') 188 | 189 | Input: 190 | ------ 191 | outdir, Where to save figure 192 | logger, Python logger for process information 193 | 194 | No return.
195 | ''' 196 | logger.info('\n') 197 | fig = plt.figure(facecolor='white') 198 | pd.Series(zvec[np.isfinite(zvec)]).hist(bins=100) 199 | plt.title('Z-Scores') 200 | plt.tight_layout() 201 | plt.savefig(os.path.join(outdir, 'Z_scores.png'), format='png') 202 | plt.close() 203 | logger.info('Check converted Z-scores at Z_scores.png') 204 | 205 | def align2ref(sumDat, refDat, logger, kargs): 206 | ''' 207 | Align given summary Data to in-house reference dataset. 208 | 209 | Input: 210 | ------ 211 | sumDat, DataFrame of summary statistics. 212 | refDat, DataFrame of in-house reference dataset. 213 | logger, Python logger for process information 214 | kargs, NameSpace object of options 215 | 216 | Return: 217 | ------- 218 | -log10 p values, and z-scores 219 | 220 | Note: 221 | ----- 222 | 1. Ambiguous SNPs removed based on in-house reference dataset. 223 | 2. effect aligned with allele coding of reference 224 | ''' 225 | if kargs.forceID: 226 | keys = ['SNP'] 227 | elif ('CHR' not in sumDat.columns) or ('POS' not in sumDat.columns): 228 | keys = ['SNP'] 229 | else: 230 | keys = ['CHR', 'POS'] 231 | mDat, misDat1 = map_snps(refDat, sumDat, keys, 'sum', False) 232 | mDat.to_csv(os.path.join(kargs.outdir, 'debug_merged.txt.gz'), 233 | sep='\t', index=False, na_rep='NA') 234 | logger.info('*** Align SNPs to reference ***') 235 | if misDat1.shape[0] > 0: 236 | outF = os.path.join(kargs.outdir, 'SNPs_not_in_sumFile.txt.gz') 237 | logger.info( 238 | 'There are {} SNPs in reference not in given summary file'.format( 239 | misDat1.shape[0])) 240 | logger.info('Details see {}'.format(outF)) 241 | misDat1.to_csv(outF, index=False, sep='\t', compression='gzip', 242 | na_rep='NA') 243 | dummy, misDat2 = map_snps(sumDat, refDat, keys, 'ref') 244 | if misDat2.shape[0] > 0: 245 | outF = os.path.join(kargs.outdir, 'SNPs_not_in_refFile.txt.gz') 246 | logger.info( 247 | 'There are {} SNPs in summary file not in reference'.format( 248 | misDat2.shape[0])) 249 | logger.info('Details see {}'.format(outF)) 250 | misDat2.to_csv(outF, index=False, sep='\t', compression='gzip', 251 | na_rep='NA') 252 | signvec = np.empty((mDat.shape[0],), dtype='float'); signvec.fill(np.nan) 253 | ambivec = (((mDat.refA1=='A')&(mDat.refA2=='T')) | 254 | ((mDat.refA2=='A')&(mDat.refA1=='T')) | 255 | ((mDat.refA1=='C')&(mDat.refA2=='G')) | 256 | ((mDat.refA2=='C')&(mDat.refA1=='G'))) 257 | ambivec = ambivec.values 258 | logger.info('{} SNPs have ambiguously coded allele in ref'. 
format( 259 | np.sum(ambivec))) 260 | logger.info('Zscores of ambiguously coded SNPs were set to NaN') 261 | ambDat = mDat.loc[ambivec,:] 262 | ambDat.to_csv(os.path.join(kargs.outdir, 'Ambiguous_data.txt.gz'), 263 | compression='gzip', sep='\t', index=False, na_rep='NA') 264 | logger.info('Save SNPs with ambiguous allele coding into {}'.format( 265 | os.path.join(kargs.outdir, 'Ambiguous_data.txt.gz'))) 266 | logpvec = -np.log10(mDat.loc[:,'P']) 267 | if 'A1' not in sumDat.columns: 268 | zvec = signvec.copy() 269 | else: 270 | if 'A2' not in sumDat.columns: 271 | idx1 = ((mDat.A1==mDat.refA1) | (mDat.A1==mDat.A1c)).values 272 | idx_1 = ((mDat.A1==mDat.refA2) | (mDat.A1==mDat.A2c)).values 273 | else: 274 | idx1 = (((mDat.A1==mDat.refA1)&(mDat.A2==mDat.refA2)) | ((mDat.A1==mDat.A1c)&(mDat.A2==mDat.A2c))).values 275 | idx_1 = (((mDat.A1==mDat.refA2)&(mDat.A2==mDat.refA1)) | ((mDat.A1==mDat.A2c)&(mDat.A2==mDat.A1c))).values 276 | signvec[idx1] = 1.0; signvec[idx_1] = -1.0; signvec[ambivec] = np.nan 277 | signvec = signvec * np.sign(mDat.loc[:,'Beta'].values) 278 | zvec = np.abs(stats.norm.ppf(mDat.loc[:,'P'].values * 0.5)) * signvec 279 | logger.info('{} SNPs have direction opposite to reference and were flipped'.format(np.sum(idx_1))) 280 | mDat.loc[:, 'newZ'] = zvec 281 | tmpMdat = mDat.loc[idx_1 ,:] 282 | tmpMdat.to_csv(os.path.join(kargs.outdir, 'flip_data.txt.gz'), 283 | index=False, sep='\t', compression='gzip',na_rep='NA') 284 | print(mDat.columns) 285 | summarize_merge(sumDat, mDat, misDat2, kargs.outdir, logger) 286 | logger.info('\n') 287 | if kargs.NCol: 288 | print('I am here') 289 | print(mDat.head()) 290 | return(logpvec.values, zvec, mDat.loc[:,'N'].values) 291 | else: 292 | return(logpvec.values, zvec, []) 293 | 294 | def save2mat(logpvec, zvec, Nvec, trait, outdir, logger): 295 | ''' 296 | Save data in Matlab dataset. 297 | 298 | Input: 299 | ----- 300 | logpvec, -log10 p value vector 301 | zvec, zscore vector 302 | trait, Name of phenotype 303 | outdir, Where to save dataset 304 | logger, Python logger for process information 305 | 306 | No return.
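    For example (illustrative), with trait='SCZ' the output SCZ.mat is expected to hold
    matlab vectors named logpvec_scz, zvec_scz and (when per-SNP sample sizes are given)
    nvec_scz; in python they can be read back with scipy.io.loadmat('SCZ.mat').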
307 | ''' 308 | outfile = os.path.join(outdir, trait) 309 | if len(Nvec) == len(logpvec): 310 | print (np.sum(np.isfinite(Nvec))) 311 | print (np.sum(np.isfinite(logpvec))) 312 | tmpdict = {'logpvec_'+trait.lower():logpvec, 313 | 'zvec_'+trait.lower():zvec, 'nvec_'+trait.lower():Nvec} 314 | else: 315 | tmpdict = {'logpvec_'+trait.lower():logpvec, 'zvec_'+trait.lower():zvec} 316 | sio.savemat(outfile, tmpdict, format='5', do_compression=False, 317 | oned_as='column') 318 | logger.info('Save converted data to {}'.format(outfile+'.mat')) 319 | 320 | def convert_sum(): 321 | parser = argparse.ArgumentParser(prog="Preprocess Summary stats", 322 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 323 | description='Preprocess summary stats for matlab') 324 | parser.add_argument('--sumFile', type=str, help='Summary stats file') 325 | parser.add_argument('--ref', type=str, help='Reference file') 326 | parser.add_argument('--trait', type=str, help='Trait Name') 327 | parser.add_argument('--outdir', type=str, help='Output DIR', default=".") 328 | parser.add_argument('--forceID', action='store_true', default=False, 329 | help='Force using SNP ID other than position') 330 | parser.add_argument('--snpCol', type=str, help='SNP ID field', 331 | default='SNP') 332 | parser.add_argument('--pCol', type=str, help='P value field', default='P') 333 | parser.add_argument('--effACol', type=str, help='Effective allele field', 334 | default=None) 335 | parser.add_argument('--othACol', type=str, help='The other allele field', 336 | default=None) 337 | parser.add_argument('--effCol', type=str, help='Effect size field', 338 | default=None) 339 | parser.add_argument('--orCol', type=str, help='Odds ratio field', 340 | default=None) 341 | parser.add_argument('--NCol', type=str, help='sample size per SNP', 342 | default=None) 343 | parser.add_argument('--posCol', type=str, 344 | help='Genomic position field',default=None) 345 | parser.add_argument('--chrCol', type=str, 346 | help='Chromosome field',default=None) 347 | args = parser.parse_args() 348 | if not os.access(args.outdir, os.F_OK): 349 | os.mkdir(args.outdir) 350 | if not os.access(args.sumFile, os.R_OK): 351 | raise ValueError("Can't read summary stats file: {}".format(args.sumFile)) 352 | if not os.access(args.ref, os.R_OK): 353 | raise ValueError("Can't read reference file: {}".format(args.ref)) 354 | logfile = os.path.join(args.outdir, 'convert_' + args.trait + '.log') 355 | logger = logging.getLogger() 356 | logger.addHandler(logging.FileHandler(logfile,mode='w')) 357 | logger.setLevel(logging.DEBUG) 358 | sumDat = read_sum_dat(args.sumFile, logger, args) 359 | refDat = read_ref_dat(args.ref, logger) 360 | logpvec, zvec, Nvec = align2ref(sumDat, refDat, logger, args) 361 | check_zscore(zvec, args.outdir, logger) 362 | save2mat(logpvec, zvec, Nvec, args.trait, args.outdir, logger) 363 | logger.info('\n**********\nFinished at {}'.format(time.ctime())) 364 | logger.info('Author: {} at {}'.format(getpass.getuser(), time.ctime())) 365 | 366 | 367 | if __name__ == "__main__": 368 | import time 369 | import numpy as np 370 | tsts = time.time() 371 | convert_sum() 372 | print() 373 | print('Finish at %s' % time.ctime()) 374 | ted = time.time() 375 | print('Time taken %d mins %d sec' % ((ted-tsts)//60, np.round(ted-tsts) % 376 | 60)) 377 | 378 | -------------------------------------------------------------------------------- /sumstats2mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 |
import argparse 4 | import pandas as pd 5 | import scipy.io as sio 6 | import numpy as np 7 | from collections import namedtuple 8 | 9 | Cols = namedtuple('Cols', ['SNP', 'PVAL', 'A1', 'A2', 'N', 'NCASE', 'NCONTROL', 'Z']) 10 | cols = Cols._make( ['RSID', 'P', 'EffectAllele', 'OtherAllele', 'N', 'CaseN', 'ControlN', 'Z']) 11 | 12 | __version__ = '1.0.0' 13 | MASTHEAD = "***********************************************************************\n" 14 | MASTHEAD += "* sumstats2mat.py: Converts summary statistics from csv to matlab format\n" 15 | MASTHEAD += "* Version {V}\n".format(V=__version__) 16 | MASTHEAD += "* (C) Norwegian Centre for Mental Disorders Research / University of Oslo\n" 17 | MASTHEAD += "* GNU General Public License v3\n" 18 | MASTHEAD += "***********************************************************************\n" 19 | 20 | _base_complement = {"A":"T", "C":"G", "G":"C", "T":"A"} 21 | def _reverse_complement_variant(variant): 22 | # variant should be a 2-elemet sequence with upper case string elements 23 | return ("".join([_base_complement[b] for b in variant[0][::-1]]), 24 | "".join([_base_complement[b] for b in variant[1][::-1]])) 25 | 26 | def check_input_file(file): 27 | if not os.path.isfile(file): 28 | raise ValueError("Input file does not exist: {f}".format(f=file)) 29 | 30 | def check_output_file(file, force=False): 31 | # Delete target file if user specifies --force option 32 | if force: 33 | try: 34 | os.remove(file) 35 | except OSError: 36 | pass 37 | 38 | # Otherwise raise an error if target file already exists 39 | if os.path.isfile(file) and not force: 40 | raise ValueError("Output file already exists: {f}".format(f=file)) 41 | 42 | # Create target folder if it doesn't exist 43 | output_dir = os.path.dirname(file) 44 | if output_dir and not os.path.isdir(output_dir): os.makedirs(output_dir) # ensure that output folder exists 45 | 46 | def make_mat(args): 47 | if args.out is None: raise ValueError('--out is required.') 48 | 49 | check_input_file(args.ref) 50 | check_input_file(args.sumstats) 51 | check_output_file(args.out, args.force) 52 | 53 | reader = pd.read_csv(args.sumstats, delim_whitespace=True, chunksize=args.chunksize, float_precision='high') 54 | df_out = None 55 | for chunk_index, ss_chunk in enumerate(reader): 56 | # (BEGIN) special handling of the first chunk 57 | if chunk_index==0: 58 | columns = list(ss_chunk.columns) 59 | 60 | required_cols = [cols.SNP, cols.A1, cols.A2, cols.Z] 61 | if (set(required_cols) - set(columns)): 62 | absent_cols = set(required_cols) - set(columns) 63 | err_msg = ("Columns {} are missing from the --sumstats file {}").format(', '.join(absent_cols), args.sumstats) 64 | raise(RuntimeError(err_msg)) 65 | 66 | n_col = cols.N if cols.N in columns else None 67 | ncase_col = cols.NCASE if cols.NCASE in columns else None 68 | ncontrol_col = cols.NCONTROL if cols.NCONTROL in columns else None 69 | if (not args.without_n) and ((n_col is None) and ((ncase_col is None) or (ncontrol_col is None))): 70 | raise(ValueError('Sample size column is not detected in {}. 
Expect either an N column, or NCASE and NCONTROL columns.'.format(args.sumstats))) 71 | 72 | print('Reading reference file {}...'.format(args.ref)) 73 | ref_reader = pd.read_csv(args.ref, sep='\t', usecols=['SNP', 'A1', 'A2'], chunksize=args.chunksize) 74 | ref_dict = {} 75 | for ref_chunk in ref_reader: 76 | ref_chunk.drop(ref_chunk.index[np.logical_not(ref_chunk['A1'].str.upper().str.match('^[ACTG]*$')) | np.logical_not(ref_chunk['A2'].str.upper().str.match('^[ACTG]*$'))], inplace=True) 77 | if ref_chunk.empty: continue 78 | gtypes = zip(ref_chunk['A1'].apply(str.upper),ref_chunk['A2'].apply(str.upper)) 79 | #TODO?: add check whether some id is already in ref_dict 80 | ref_dict.update(dict(zip(ref_chunk['SNP'], gtypes))) 81 | ref_dict = {i: (variant, _reverse_complement_variant(variant), 82 | variant[::-1], _reverse_complement_variant(variant[::-1])) 83 | for i, variant in ref_dict.items()} 84 | ref_snps = pd.read_csv(args.ref, sep='\t', usecols=['SNP'], squeeze=True) 85 | #TODO?: add check whether ref_snps contains duplicates 86 | print("Reference dict contains {d} SNPs.".format(d=len(ref_dict))) 87 | 88 | print('Reading summary statistics file {}...'.format(args.sumstats)) 89 | print('Column types: ' + ', '.join([column + ':' + str(dtype) for (column, dtype) in zip(ss_chunk.columns, ss_chunk.dtypes)])) 90 | # (END) special handling of the first chunk 91 | 92 | ss_chunk = ss_chunk.loc[ss_chunk[cols.SNP].isin(ref_dict),:] 93 | if ss_chunk.empty: continue 94 | gtypes = list(zip(ss_chunk[cols.A1].apply(str.upper),ss_chunk[cols.A2].apply(str.upper))) 95 | # index of SNPs that have the same alleles as indicated in reference 96 | ind = [gt in ref_dict[sid] for sid, gt in zip(ss_chunk[cols.SNP], gtypes)] 97 | ss_chunk = ss_chunk.loc[ind,:] 98 | gtypes = [gt for gt, j in zip(gtypes, ind) if j] 99 | log10pv = -np.log10(ss_chunk[cols.PVAL].values) 100 | # not_ref_effect = [ 101 | # 1 if effect allele in data == other allele in reference 102 | # -1 if effect allele in data == effect allele in reference ] 103 | # So zscores with positive effects will be positive and zscores with 104 | # negative effects will stay negative, since 105 | # stats.norm.ppf(ss_chunk[cols.PVAL]*0.5) is always negative (see zvect 106 | # calculation below).
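# Example: for a reference variant ('A', 'C') the ref_dict entry above holds the four
# accepted allele encodings -- ('A', 'C') direct, ('T', 'G') strand flip, ('C', 'A')
# allele swap, and ('G', 'T') swap plus strand flip. "gt in ref_dict[sid]" (above)
# therefore matches a sumstats record up to strand and allele order, while
# "gt in ref_dict[sid][:2]" (below) checks whether the data alleles are in the same
# A1/A2 order as the reference, which sets the sign applied to the z-score.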
107 | not_ref_effect = np.array([1 if gt in ref_dict[sid][:2] else -1 108 | for sid, gt in zip(ss_chunk[cols.SNP], gtypes)]) 109 | #TODO: check proportion of positive and negative effects 110 | zvect = ss_chunk[cols.Z].values*not_ref_effect 111 | ind_ambiguous = [j for j,gt in enumerate(gtypes) if gt == _reverse_complement_variant(gt)[::-1]] 112 | # set z-score of strand-ambiguous SNPs to nan 113 | zvect[ind_ambiguous] = np.nan 114 | #TODO: check whether output df contains duplicated rs-ids (warn) 115 | 116 | # reindex by SNP, add required columns and drop unnecessary columns 117 | ss_chunk.index = ss_chunk[cols.SNP] 118 | # add required columns 119 | ss_chunk["logpvec"] = log10pv 120 | ss_chunk["zvec"] = zvect 121 | if not args.without_n: 122 | if n_col is None: 123 | nvec = 4./(1./ss_chunk[ncase_col] + 1./ss_chunk[ncontrol_col]) 124 | else: 125 | nvec = ss_chunk[n_col].values 126 | ss_chunk["nvec"] = nvec 127 | 128 | cols2drop = [c for c in ss_chunk.columns if (c not in ['logpvec', 'zvec', 'nvec'])] 129 | ss_chunk.drop(cols2drop, axis=1, inplace=True) 130 | 131 | if df_out is None: 132 | df_out = ss_chunk.copy() 133 | else: 134 | df_out = df_out.append(ss_chunk) 135 | 136 | print("{f}: {n} lines processed, {m} SNPs matched with reference file".format(f=args.sumstats, n=(chunk_index+1)*args.chunksize, m=len(df_out))) 137 | 138 | if df_out.empty: raise(ValueError("No SNPs match after joining with reference data")) 139 | dup_index = df_out.index.duplicated(keep=False) 140 | if dup_index.any(): 141 | print("Duplicated SNP ids detected:") 142 | print(df_out[dup_index]) 143 | print("Keeping only the first occurrence.") 144 | df_out = df_out[~df_out.index.duplicated(keep='first')] 145 | # align index according to the order of SNPs in ref, insert NaN rows for SNPs that 146 | # are present in ref but absent from the sumstats file 147 | df_out = df_out.reindex(ref_snps) 148 | 149 | print('Writing .mat file...') 150 | save_dict = {c+args.trait: df_out[c].astype(np.float64).values for c in df_out.columns} 151 | sio.savemat(args.out, save_dict, format='5', do_compression=False, 152 | oned_as='column', appendmat=False) 153 | print("%s created" % args.out) 154 | 155 | ### ================================================================================= 156 | ### Main section 157 | ### ================================================================================= 158 | if __name__ == "__main__": 159 | parser_mat = argparse.ArgumentParser(description="Create mat files that can " 160 | "be used as an input for pleiofdr analysis (https://github.com/precimed/pleiofdr/). " 161 | "Takes a .csv file with summary statistics as input. The file can be compressed with gzip. " 162 | "Required columns: RSID, P, EffectAllele, OtherAllele, and a sample size column (either N, or CaseN and ControlN). " 163 | "Creates corresponding mat files which can be used as an input for pleiofdr analysis. " 164 | "Only SNPs from the reference file are considered. " 165 | "Z-scores of strand ambiguous SNPs are set to NA. ") 166 | 167 | parser_mat.add_argument("--sumstats", type=str, help="Input file with summary statistics. ") 168 | parser_mat.add_argument("--ref", type=str, help="[required] Tab-separated file with a list of reference SNPs.") 169 | parser_mat.add_argument("--out", type=str, help="[required] File to output the result.
File should end with .mat extension.") 170 | parser_mat.add_argument("--force", action="store_true", default=False, help="Allow sumstats.py to overwrite output file if it exists.") 171 | 172 | parser_mat.add_argument("--trait", type=str, default='', 173 | help="Trait name that will be used in mat file. Can be kept empty, in this case the variables will be named 'logpvec', 'zvec' and 'nvec'") 174 | parser_mat.add_argument("--without-n", action="store_true", default=False, 175 | help="Proceed without sample size (N or NCASE/NCONTROL)") 176 | parser_mat.add_argument("--chunksize", default=100000, type=int, 177 | help="Size of chunk to read the file.") 178 | 179 | args = parser_mat.parse_args() 180 | make_mat(args) 181 | -------------------------------------------------------------------------------- /sumstats_ldsc_helper.py: -------------------------------------------------------------------------------- 1 | # Examples: 2 | # python sumstats_ldsc_helper.py annot PGC_SCZ_2014.csv PGC_SCZ_2014_ldscores/{}.annot.gz --annot 1000G_Phase3_baseline_ldscores/baseline.{}.annot.gz 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import os.path 7 | import sys 8 | import argparse 9 | from intervaltree import IntervalTree 10 | 11 | def parse_args(args): 12 | parser = argparse.ArgumentParser(description="Miscellaneous utilities to work with statistics and LDSC regression method") 13 | subparsers = parser.add_subparsers() 14 | parser_annot = subparsers.add_parser("annot", help="Create binary annotations from 0.1, 0.01 and 0.001 p-value stratuums of the summary statistic") 15 | parser_annot.add_argument("sumstats_file", type=str, help="Input file with summary statistics") 16 | parser_annot.add_argument("output_file", type=str, help="Path to the output file to place the results") 17 | parser_annot.add_argument("--annot", type=str, help="Path to baseline.CHR.annot.gz file from 1000G_Phase3_baseline_ldscores") 18 | parser_annot.add_argument("--force", action="store_true", default=False, help="Force overwrite target files if they exist.") 19 | parser_annot.add_argument("--window", default=2, type=int, help="Window to include into the binary annotation around each SNP that pass p-value threshold.") 20 | parser_annot.set_defaults(func=make_annot) 21 | return parser.parse_args(args) 22 | 23 | ### ================================================================================= 24 | ### Implementation for parser_annot 25 | ### ================================================================================= 26 | def check_input_file(file): 27 | if not os.path.isfile(file): 28 | raise ValueError("Input file does not exist: {f}".format(f=file)) 29 | 30 | def check_output_file(file, force=False): 31 | # Delete target file if user specifies --force option 32 | if force: 33 | try: 34 | os.remove(file) 35 | except OSError: 36 | pass 37 | 38 | # Otherwise raise an error if target file already exists 39 | if os.path.isfile(file) and not force: 40 | raise ValueError("Output file already exists: {f}".format(f=file)) 41 | 42 | # Create target folder if it doesn't exist 43 | output_dir = os.path.dirname(file) 44 | if output_dir and not os.path.isdir(output_dir): os.makedirs(output_dir) # ensure that output folder exists 45 | 46 | def make_annot(args): 47 | """ 48 | Create binary annotations from 0.1, 0.01 and 0.001 p-value stratuums of the summary statistic. 
49 | """ 50 | check_input_file(args.sumstats_file) 51 | for chri in range(1, 23): check_output_file(args.output_file.format(chri), args.force) 52 | 53 | print('Reading summary statistics file {}...'.format(args.sumstats_file)) 54 | sumstats = pd.read_csv(args.sumstats_file, delim_whitespace=True, usecols=['PVAL', 'CHR', 'BP']) 55 | print('Done, read {} SNPs.'.format(sumstats.shape[0])) 56 | 57 | for chri in range(1, 23): 58 | print('Processing chromosome {}...'.format(chri)) 59 | df = pd.read_csv(args.annot.format(chri), delim_whitespace=True) 60 | df = df[['CHR', 'BP', 'SNP', 'CM']].copy() 61 | for pthresh, label in [(0.1, '.1'), (0.01, '.01'), (0.001, '.001')]: 62 | sumstatsCHR = sumstats[sumstats.CHR == chri].copy(deep=True) 63 | print('{} markers, {} of them are on chr {}, {} of them have p-value below {}'.format(sumstats.shape[0], sumstatsCHR.shape[0], chri, (sumstatsCHR.PVAL < pthresh).sum(), pthresh)) 64 | itree = IntervalTree.from_tuples(zip(sumstatsCHR[sumstatsCHR.PVAL < pthresh].BP - args.window, sumstatsCHR[sumstatsCHR.PVAL < pthresh].BP + args.window)) 65 | itree.merge_overlaps() 66 | print('Found {} intervals, average length {}'.format(len(itree), sum([i.length() for i in itree])/len(itree))) 67 | 68 | annot_binary = [int(bool(itree[p])) for p in df.BP] 69 | df['PVAL{}'.format(label)] = annot_binary 70 | print('{} markers out of {} ({}%) belongs to the annotation'.format(sum(annot_binary), len(annot_binary), 100 * sum(annot_binary) / len(annot_binary))) 71 | df.to_csv(args.output_file.format(chri), index=False, sep='\t', compression='gzip') 72 | print('Results saved to {}'.format(args.output_file.format(chri))) 73 | 74 | ### ================================================================================= 75 | ### Main section 76 | ### ================================================================================= 77 | if __name__ == "__main__": 78 | args = parse_args(sys.argv[1:]) 79 | args.func(args) 80 | print("Done") 81 | -------------------------------------------------------------------------------- /sumstats_utils.py: -------------------------------------------------------------------------------- 1 | # Misc utils to deal with summary stat files. 2 | # Some parts of the code in this file originates from https://github.com/bulik/ldsc/, 3 | # which is licensed under GNU General Public License v3.0 4 | # See https://github.com/bulik/ldsc/blob/master/LICENSE for complete license. 5 | 6 | import sys, os, re, logging, datetime 7 | import numpy as np 8 | import pandas as pd 9 | import gzip 10 | import six 11 | import itertools as it 12 | from collections import namedtuple 13 | 14 | COMPLEMENT_ALLELE = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} 15 | # bases 16 | BASES = COMPLEMENT_ALLELE.keys() 17 | # true iff strand ambiguous 18 | STRAND_AMBIGUOUS = {''.join(x): x[0] == COMPLEMENT_ALLELE[x[1]] 19 | for x in it.product(BASES, BASES) 20 | if x[0] != x[1]} 21 | # SNPS we want to keep (pairs of alleles) 22 | VALID_SNPS = {x for x in map(lambda y: ''.join(y), it.product(BASES, BASES)) 23 | if x[0] != x[1] and not STRAND_AMBIGUOUS[x]} 24 | # T iff SNP 1 has the same alleles as SNP 2 (allowing for strand or ref allele flip). 
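# Worked example for the lookup tables defined below: STRAND_AMBIGUOUS['AT'] is True
# (A/T is its own reverse complement), so A/T and C/G pairs are excluded from
# VALID_SNPS, while STRAND_AMBIGUOUS['AG'] is False and A/G is kept. The 4-character
# key 'AGGA' (A/G in one dataset, G/A in the other) is in MATCH_ALLELES via the
# "ref flip, strand match" case, and FLIP_ALLELES['AGGA'] is True, so align_alleles()
# flips the sign of the corresponding z-score.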
25 | MATCH_ALLELES = {x for x in map(lambda y: ''.join(y), it.product(VALID_SNPS, VALID_SNPS)) 26 | # strand and ref match 27 | if ((x[0] == x[2]) and (x[1] == x[3])) or 28 | # ref match, strand flip 29 | ((x[0] == COMPLEMENT_ALLELE[x[2]]) and (x[1] == COMPLEMENT_ALLELE[x[3]])) or 30 | # ref flip, strand match 31 | ((x[0] == x[3]) and (x[1] == x[2])) or 32 | ((x[0] == COMPLEMENT_ALLELE[x[3]]) and (x[1] == COMPLEMENT_ALLELE[x[2]]))} # strand and ref flip 33 | # T iff SNP 1 has the same alleles as SNP 2 w/ ref allele flip. 34 | FLIP_ALLELES = {''.join(x): 35 | ((x[0] == x[3]) and (x[1] == x[2])) or # strand match 36 | # strand flip 37 | ((x[0] == COMPLEMENT_ALLELE[x[3]]) and (x[1] == COMPLEMENT_ALLELE[x[2]])) 38 | for x in MATCH_ALLELES} 39 | 40 | def filter_alleles(alleles): 41 | '''Remove bad variants (mismatched alleles, non-SNPs, strand ambiguous).''' 42 | ii = alleles.apply(lambda y: y in MATCH_ALLELES) 43 | return ii 44 | 45 | 46 | def align_alleles(z, alleles): 47 | '''Align Z1 and Z2 to same choice of ref allele (allowing for strand flip).''' 48 | try: 49 | z *= (-1) ** alleles.apply(lambda y: FLIP_ALLELES[y]) 50 | except KeyError as e: 51 | raise KeyError('Incompatible alleles. ') 52 | return z 53 | 54 | Cols = namedtuple('Cols', ['SNP', 'CHR', 'BP', 'PVAL', 'A1', 'A2', 'EA', 'N', 'NCASE', 'NCONTROL', 'Z', 'OR', 'BETA', 'LOGODDS', 'SE', 'INFO', 'FRQ', 'NSTUDY', 'CHRPOS', 'A1A2', 'CHRPOSA1A2', 'DIRECTION', 'ORL95', 'ORU95']) 55 | cols = Cols._make( ['SNP', 'CHR', 'BP', 'PVAL', 'A1', 'A2', 'EA', 'N', 'NCASE', 'NCONTROL', 'Z', 'OR', 'BETA', 'LOGODDS', 'SE', 'INFO', 'FRQ', 'NSTUDY', 'CHRPOS', 'A1A2', 'CHRPOSA1A2', 'DIRECTION', 'ORL95', 'ORU95']) 56 | cols_type_map = {'SNP':str, 'CHR':int, 'BP':int, 'PVAL':np.float64, 'A1':str, 'A2':str, 'EA':str, 'N':float, 'NCASE':float, 'NCONTROL':float, 'Z':float, 'OR':float, 'BETA':float, 57 | 'LOGODDS':float, 'SE':float, 'INFO':float, 'FRQ':float, 'NSTUDY':int, 'CHRPOS':str, 'A1A2':str, 'CHRPOSA1A2':str, 'DIRECTION':str, 'ORL95':float, 'ORU95':float} 58 | cname_to_cleansumstats_map = { 59 | 'SNP': 'col_SNP', 60 | 'CHR': 'col_CHR', 61 | 'BP': 'col_POS', 62 | 'PVAL': 'col_P', 63 | 'A1': 'col_EffectAllele', 64 | 'A2': 'col_OtherAllele', 65 | 'N': 'col_N', 66 | 'NCASE': 'col_CaseN', 67 | 'NCONTROL': 'col_ControlN', 68 | 'Z': 'col_Z', 69 | 'OR': 'col_OR', 70 | 'BETA': 'col_BETA', 71 | 'SE': 'col_SE', 72 | 'LOGODDS': 'col_BETA', 73 | 'INFO': 'col_INFO', 74 | 'FRQ': 'col_EAF', 75 | 'DIRECTION': 'col_Direction', 76 | 'ORL95': 'col_ORL95', 77 | 'ORU95': 'col_ORU95', 78 | # 'NSTUDY': 'col_studyN' - not supported by cleansumstats 79 | # col_OAF, col_Notes - not supported by python_convert 80 | # CHRPOSA1A2, CHRPOS - require special handling (see update_cleansumstats_cols in sumstats.py) 81 | # A1A2 - incompatible 82 | } 83 | 84 | null_values = { 85 | cols.LOGODDS: 0, 86 | cols.BETA: 0, 87 | cols.OR: 1, 88 | cols.Z: 0 89 | } 90 | 91 | default_cnames = { 92 | 93 | # RS NUMBER 94 | 'SNP': cols.SNP, 95 | 'MARKERNAME': cols.SNP, 96 | 'SNPID': cols.SNP, 97 | 'SNP_ID': cols.SNP, 98 | 'RS': cols.SNP, 99 | 'RSID': cols.SNP, 100 | 'RS_NUMBER': cols.SNP, 101 | 'RS_NUMBERS': cols.SNP, 102 | # CHROMOSOME 103 | 'CHR': cols.CHR, 104 | 'CHROM': cols.CHR, 105 | 'CHROMSOME': cols.CHR, 106 | 'CHROMOSOME' : cols.CHR, 107 | # POSITION 108 | 'POS': cols.BP, 109 | 'BP': cols.BP, 110 | 'BPOS': cols.BP, 111 | 'POSITION' : cols.BP, 112 | # NUMBER OF STUDIES 113 | 'NSTUDY': cols.NSTUDY, 114 | 'N_STUDY': cols.NSTUDY, 115 | 'NSTUDIES': cols.NSTUDY, 116 | 'N_STUDIES': cols.NSTUDY, 117 | # 
P-VALUE 118 | 'P': cols.PVAL, 119 | 'PVALUE': cols.PVAL, 120 | 'P_VALUE': cols.PVAL, 121 | 'PVAL': cols.PVAL, 122 | 'P_VAL': cols.PVAL, 123 | 'GC_PVALUE': cols.PVAL, 124 | 'MTAG_PVAL': cols.PVAL, 125 | # ALLELE 1 126 | 'A1': cols.A1, 127 | 'ALLELE1': cols.A1, 128 | 'ALLELE_1': cols.A1, 129 | 'EFFECT_ALLELE': cols.A1, 130 | 'EFFECTALLELE': cols.A1, 131 | 'REFERENCE_ALLELE': cols.A1, 132 | 'INC_ALLELE': cols.A1, 133 | 'EA': cols.A1, 134 | # ALLELE 2 135 | 'A2': cols.A2, 136 | 'ALLELE2': cols.A2, 137 | 'ALLELE_2': cols.A2, 138 | 'OTHER_ALLELE': cols.A2, 139 | 'OTHERALLELE': cols.A2, 140 | 'NON_EFFECT_ALLELE': cols.A2, 141 | 'NON_EFF_ALLELE': cols.A2, 142 | 'DEC_ALLELE': cols.A2, 143 | 'NEA': cols.A2, 144 | # N 145 | 'N': cols.N, 146 | 'SAMPLESIZE': cols.N, 147 | 'WEIGHT': cols.N, # metal does this. possibly risky. 148 | # NCASE 149 | 'NCASE': cols.NCASE, 150 | 'CASES_N': cols.NCASE, 151 | 'N_CASE': cols.NCASE, 152 | 'N_CASES': cols.NCASE, 153 | 'N_CAS': cols.NCASE, 154 | 'N_CASE': cols.NCASE, 155 | 'CASEN': cols.NCASE, 156 | # NCONTROL 157 | 'N_CONTROLS': cols.NCONTROL, 158 | 'N_CON': cols.NCONTROL, 159 | 'CONTROLN': cols.NCONTROL, 160 | 'NCONTROL': cols.NCONTROL, 161 | 'CONTROLS_N': cols.NCONTROL, 162 | 'N_CONTROL': cols.NCONTROL, 163 | # SIGNED STATISTICS 164 | 'ZSCORE': cols.Z, 165 | 'Z-SCORE': cols.Z, 166 | 'GC_ZSCORE': cols.Z, 167 | 'Z': cols.Z, 168 | 'MTAG_Z': cols.Z, 169 | 'OR': cols.OR, 170 | 'ORL95': cols.ORL95, 171 | 'ORU95': cols.ORU95, 172 | 'B': cols.BETA, 173 | 'BETA': cols.BETA, 174 | 'MTAG_BETA': cols.BETA, 175 | 'LOGODDS': cols.LOGODDS, 176 | 'EFFECTS': cols.BETA, 177 | 'EFFECT': cols.BETA, 178 | 'SIGNED_SUMSTAT': 'SIGNED_SUMSTAT', 179 | # STANDARD ERROR 180 | 'SE' : cols.SE, 181 | 'STDERR' : cols.SE, 182 | 'MTAG_SE' : cols.SE, 183 | # INFO 184 | 'INFO': cols.INFO, 185 | # MAF 186 | 'EAF': cols.FRQ, 187 | 'FRQ': cols.FRQ, 188 | 'MAF': cols.FRQ, 189 | 'FRQ_U': cols.FRQ, 190 | 'F_U': cols.FRQ, 191 | 'FREQ': cols.FRQ, 192 | # DIRECTION 193 | 'DIRECTION': cols.DIRECTION, 194 | } 195 | 196 | describe_cname = { 197 | cols.SNP: 'Variant ID (e.g., rs number)', 198 | cols.CHR: 'Chromosome number', 199 | cols.BP: 'Base-pair position', 200 | cols.PVAL: 'p-Value', 201 | cols.A1: 'Allele 1, interpreted as ref allele for signed sumstat.', 202 | cols.A2: 'Allele 2, interpreted as non-ref allele for signed sumstat.', 203 | cols.EA: 'Effect Allele, interpreted as ref allele for signed sumstat (specific to MVP data to validate that A1 is the same as EA).', 204 | cols.N: 'Sample size', 205 | cols.NCASE: 'Number of cases', 206 | cols.NCONTROL: 'Number of controls', 207 | cols.Z: 'Z-score (0 --> no effect; above 0 --> A1 is trait/risk increasing)', 208 | cols.OR: 'Odds ratio (1 --> no effect; above 1 --> A1 is risk increasing)', 209 | cols.ORL95: 'Lower 95%% confidence bound of OR', 210 | cols.ORU95: 'Upper 95%% confidence bound of OR', 211 | cols.BETA: '[linear/logistic] regression coefficient (0 --> no effect; above 0 --> A1 is trait/risk increasing)', 212 | cols.LOGODDS: 'Log odds ratio (0 --> no effect; above 0 --> A1 is risk increasing)', 213 | cols.SE: 'standard error of the effect size', 214 | cols.INFO: 'INFO score (imputation quality; higher --> better imputation)', 215 | cols.FRQ: 'Allele frequency', 216 | 'SIGNED_SUMSTAT': 'Directional summary statistic as specified by --signed-sumstats.', 217 | cols.NSTUDY: 'Number of studies in which the SNP was genotyped.', 218 | 'UNKNOWN': 'Unknown column type (will be skipped).', 219 | cols.CHRPOS: 'chr:pos column with colon-separated information 
about Chromosome and Base-pair position', 220 | cols.A1A2: 'A1/A2 column with slash-separated information about marker allles', 221 | cols.CHRPOSA1A2: 'chr:pos:ref:alt column with colon-separated information about Chromosome, Base-pair position, Reference allele, Alternative allele', 222 | cols.DIRECTION: 'METAL "direction" column, one char per substudy (+ or - indicate effect direction; ? indicate failed imputation or QC)' 223 | } 224 | 225 | def clean_header(header): 226 | ''' 227 | For cleaning file headers. 228 | - convert to uppercase 229 | - replace dashes '-' with underscores '_' 230 | - replace dots '.' (as in R) with underscores '_' 231 | - remove newlines ('\n') 232 | ''' 233 | return header.upper().replace('-', '_').replace('.', '_').replace('\n', '') 234 | 235 | def format_chr(chrvec): 236 | ''' 237 | Reformat chromosome names. 238 | 239 | Input: 240 | ------ 241 | Vector of chromosome IDs 242 | 243 | Output: 244 | ------- 245 | Vector of cleaned chromosome IDs 246 | 247 | Note: 248 | * Remove "chr/Chr/CHR/MT/mt" strings in the name 249 | * Change chrX to 23, ChrY to 24, PAR to 25, MT to 26 250 | * (as in plink, https://www.cog-genomics.org/plink/1.9/input#allow_extra_chr) 251 | ''' 252 | try: 253 | tmpchrvec = chrvec.astype('str') 254 | tmpchrvec = tmpchrvec.str.lower() 255 | tmpchrvec = tmpchrvec.str.replace('chr', '') 256 | tmpchrvec[tmpchrvec=='x'] = '23' 257 | tmpchrvec[tmpchrvec=='y'] = '24' 258 | tmpchrvec[tmpchrvec=='par'] = '25' 259 | tmpchrvec[tmpchrvec=='m'] = '26' 260 | tmpchrvec[tmpchrvec=='mt'] = '26' 261 | tmpchrvec[tmpchrvec=='x_par1'] = '25' 262 | tmpchrvec[tmpchrvec=='x_par2'] = '25' 263 | tmpchrvec[tmpchrvec=='x_nonpar'] = '23' 264 | # TO-DO: Bellow is anoying 265 | tmpchrvec[tmpchrvec=='na'] = '-9' 266 | tmpchrvec[tmpchrvec.isnull()] = '-9' 267 | tmpchrvec[tmpchrvec=='nan'] = '-9' 268 | tmpchrvec[tmpchrvec==' '] = '-9' 269 | tmpchrvec = tmpchrvec.astype('float').astype('int') 270 | return tmpchrvec 271 | except: 272 | raise 273 | 274 | def get_header(fh, lines=5): 275 | (openfunc, _) = get_compression(fh) 276 | header = [] 277 | with openfunc(fh) as f: 278 | for line in it.islice(f, lines): 279 | line = line if isinstance(line, six.string_types) else line.decode('utf-8') 280 | header.append(line.rstrip('\n')) 281 | return header 282 | 283 | def get_compression_and_open(fh): 284 | (openfunc, _) = get_compression(fh) 285 | return openfunc(fh) 286 | 287 | def get_compression(fh): 288 | ''' 289 | Read filename suffixes and figure out whether it is gzipped,bzip2'ed or not compressed 290 | ''' 291 | if fh.endswith('gz'): 292 | compression = 'gzip' 293 | openfunc = gzip.open 294 | elif fh.endswith('bz2'): 295 | compression = 'bz2' 296 | openfunc = bz2.BZ2File 297 | else: 298 | openfunc = open 299 | compression = None 300 | 301 | return openfunc, compression 302 | -------------------------------------------------------------------------------- /tests/case01.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case01.mat -------------------------------------------------------------------------------- /tests/case01.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case01.txt.gz -------------------------------------------------------------------------------- /tests/case02.txt.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case02.txt.gz -------------------------------------------------------------------------------- /tests/case03.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case03.txt.gz -------------------------------------------------------------------------------- /tests/case04.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case04.mat -------------------------------------------------------------------------------- /tests/case04.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case04.txt.gz -------------------------------------------------------------------------------- /tests/test_consistent.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import scipy.io as sio 3 | import numpy as np 4 | import shutil 5 | import os.path 6 | 7 | def execute_command(command): 8 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 9 | print(process.communicate()[0].decode("utf-8")) 10 | #print(subprocess.check_output(command.split()).decode("utf-8")) 11 | 12 | def run(filename, matfile, effect): 13 | reffile = r'tests/1234_ref.bim' 14 | if os.path.isdir('TEMP_FOLDER'): shutil.rmtree('TEMP_FOLDER') 15 | execute_command(r'python sumstats.py csv {} TEMP_FOLDER/TEST.csv --auto --force'.format(filename)) 16 | execute_command(r'python sumstats.py mat {} TEMP_FOLDER/TEST.csv --force --effect {}'.format(reffile, effect)) 17 | 18 | f1 = sio.loadmat(matfile) 19 | f2 = sio.loadmat('TEMP_FOLDER/TEST.mat') 20 | assert(all(np.isfinite(f1['logpvec_test']) == np.isfinite(f2['logpvec']))) 21 | assert(all(np.isfinite(f1['zvec_test']) == np.isfinite(f2['zvec']))) 22 | assert(max(abs(f1['logpvec_test'] - f2['logpvec'])) < 1e-10) 23 | assert(max(abs(f1['zvec_test'] - f2['zvec'])) < 1e-10) 24 | shutil.rmtree('TEMP_FOLDER') 25 | 26 | def test01(): run('tests/case01.txt', 'tests/case01.mat', effect='BETA') 27 | def test01gz(): run('tests/case01.txt.gz', 'tests/case01.mat', effect='BETA') 28 | #def test02(): run('tests/case02.txt') 29 | #def test03(): run('tests/case03.txt') 30 | def test04(): run('tests/case04.txt', 'tests/case04.mat', effect='OR') 31 | def test04gz(): run('tests/case04.txt.gz', 'tests/case04.mat', effect='OR') 32 | -------------------------------------------------------------------------------- /tests/test_duplicated.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def test_func(): 5 | # test passes on pandas.__version__ == u'0.18.0' 6 | df = pd.DataFrame([[1, 2, 1, 1], 7 | [2, 2, 2, 2], 8 | [1, 2, 3, 1], 9 | [3, 2, 4, 2]], columns=['A', 'B', 'C', 'D']) 10 | 11 | assert all(df.duplicated('A', keep=False) == [True, False, True, False]) 12 | assert all(df.duplicated('B', keep=False) == [True, True, True, True]) 13 | assert all(df.duplicated('C', keep=False) == [False, False, False, False]) 14 | assert all(df.duplicated('D', keep=False) == [True, 
True, True, True]) 15 | 16 | assert all(df.duplicated('A') == [False, False, True, False]) 17 | assert all(df.duplicated('B') == [False, True, True, True]) 18 | assert all(df.duplicated('C') == [False, False, False, False]) 19 | assert all(df.duplicated('D') == [False, False, True, True]) 20 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | __version__ = '__0.1__' 2 | --------------------------------------------------------------------------------
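A minimal usage sketch (not part of the repository; the file names SCZ.csv, ref_snps.tsv and SCZ.mat are hypothetical, and --trait is left at its default so the saved variables are named 'logpvec', 'zvec' and 'nvec') showing how the output of sumstats2mat.py can be inspected:

# Convert a summary-stats csv to MATLAB format (mirrors the argparse options above):
#   python sumstats2mat.py --sumstats SCZ.csv --ref ref_snps.tsv --out SCZ.mat
import numpy as np
import scipy.io as sio

mat = sio.loadmat('SCZ.mat')  # hypothetical output file
logpvec, zvec, nvec = mat['logpvec'], mat['zvec'], mat['nvec']
# Rows follow the SNP order of --ref; SNPs absent from the sumstats file, and
# z-scores of strand-ambiguous SNPs, are NaN.
print('{} of {} reference SNPs have a defined z-score'.format(
    int(np.isfinite(zvec).sum()), zvec.shape[0]))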