├── .gitignore ├── LICENSE ├── Liftover_SNP.py ├── PLINK_file_Utils.py ├── README.md ├── __init__.py ├── annotation.py ├── aud_meta_qq_lambdas.ipynb ├── config.plotgwas.3.cfg ├── convert_cleansumstats_output_to_mixer_format.py ├── data └── biomart_GENCODE_basic.txt.gz ├── fdrmat2csv.py ├── figs └── Z3nns.png ├── lift_rs_numbers.py ├── make_ld_matrix ├── README.md ├── data │ └── EUR_subj.list ├── genotypes2ref.py ├── make_ld_matrix.py └── make_maf_vector.py ├── make_universal_variant_ids.py ├── manhattan.py ├── merge_bed_files.py ├── overCorrect.py ├── plink_utils.py ├── plotgwas.py ├── process_metal.py ├── pyliftover ├── GRCh37ToHg19.over.chain.gz ├── README ├── __init__.py ├── chainfile.py ├── hg17ToHg19.over.chain.gz ├── hg18ToHg19.over.chain.gz ├── hg19ToGRCh37.over.chain.gz ├── intervaltree.py └── liftover.py ├── qq.py ├── sumStats2ref.py ├── summary_stats_Utils.py ├── sumstats.py ├── sumstats2mat.py ├── sumstats_ldsc_helper.py ├── sumstats_utils.py ├── tests ├── 1234_ref.bim ├── case01.mat ├── case01.txt ├── case01.txt.gz ├── case02.txt ├── case02.txt.gz ├── case03.txt ├── case03.txt.gz ├── case04.mat ├── case04.txt ├── case04.txt.gz ├── test_consistent.py └── test_duplicated.py └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/LICENSE -------------------------------------------------------------------------------- /Liftover_SNP.py: -------------------------------------------------------------------------------- 1 | import os, re 2 | import numpy as np 3 | import pandas as pd 4 | import argparse 5 | import logging 6 | import random 7 | from distutils.version import StrictVersion 8 | 9 | from pyliftover import LiftOver 10 | 11 | Intro = r''' 12 | Lifting SNPs 'rs' number and genomic position across different builds. 13 | Option --find-build require biopython. Install it with "pip install biopython". 14 | Lift 'rs' number: 15 | ---------------- 16 | 1. Based on NCBI SNP merge and history files: RsMergeArch and SNPHistory 17 | 2. SNPs with ID coded as 'CHR:POS' will be left untouched 18 | Details: http://genome.sph.umich.edu/wiki/LiftOver 19 | 20 | Lift genomic position: 21 | ---------------------- 22 | 1. Only performed when the CHR and POS columns are specified 23 | 2. Based on UCSC build converting files: 24 | hg17ToHg19.over.chain.gz 25 | hg18ToHg19.over.chain.gz 26 | 3. It is possible that one position in old build maps to multiple 27 | positions in new build. In such case, 28 | The SNP is removed from cleaned data 29 | The matching with highest score were used in dup_*.txt 30 | 4. It is also possible that no new position matches the old 31 | The SNP is removed from the cleaned data 32 | The old position was kept in miss_*.txt 33 | 34 | The results after running the script will be stored in the following files, where * is the name of the original file: 35 | - lifted_* is the main result. 
It contains the original input plus new columns with the lifted data (SNP, CHR, POS) 36 | 37 | - dup_* report duplicated rs and/or CHR:BP entries after lifting 38 | - miss_* report CHR:BP entries that couldn't be lifted by LiftOver tool with given chain file 39 | - multi_* report CHR:BP entries that lift into multiple locations with given chain file 40 | 41 | Reports: 42 | - lift_pos_result reports the results of lifting BP 43 | - lift_rs_result reports the results of lifting RS# numbers 44 | - summary_lift_pos reports summary of lifting BP 45 | - summary_lift_rs reports summary of lifting RS# numbers 46 | ''' 47 | 48 | def myopen(fn): 49 | import gzip 50 | try: 51 | h = gzip.open(fn, mode='rt') 52 | ln = h.read(2) # read arbitrary bytes so check if @param fn is a gzipped file 53 | except: 54 | # cannot read in gzip format 55 | return open(fn, mode='rt') 56 | h.close() 57 | return gzip.open(fn, mode='rt') 58 | 59 | def read_rs_history(histFile): 60 | RS_HISTORY = set() # store rs 61 | 62 | logging.info("Reading '{}' file...".format(histFile)) 63 | for ln in myopen(histFile): 64 | fd = ln.strip().split('\t') 65 | if ln.lower().find('re-activ') < 0: 66 | RS_HISTORY.add(fd[0]) 67 | 68 | return RS_HISTORY 69 | 70 | 71 | def read_rs_merge(mergFile): 72 | RS_MERGE = dict() # high_rs -> (lower_rs, current_rs) 73 | 74 | logging.info("Reading '{}' file...".format(mergFile)) 75 | for ln in myopen(mergFile): 76 | fd = ln.strip().split('\t') 77 | h, l = fd[0], fd[1] 78 | c = fd[6] 79 | RS_MERGE[h] = (l, c) 80 | 81 | return RS_MERGE 82 | 83 | 84 | # Returns list of tuples, where each tuple is (rs#, build, chr, pos) 85 | def fetch_snps(snp_ids, verbose=False): 86 | from Bio import Entrez 87 | Entrez.email = "oleksandr.frei@gmail.com" 88 | 89 | def pull_var(v, line): 90 | return [x for x in line if x.startswith(v)][0].replace(v, '') 91 | 92 | def parse_snp(snp_info): 93 | 94 | snp = snp_info.split('\n') 95 | 96 | rsId = snp[0].split(" | ")[0] 97 | lineset = [x.split(' | ') for x in snp if x.startswith('CTG')] 98 | if len(lineset) == 0: 99 | return None 100 | 101 | try: 102 | build = pull_var("assembly=", lineset[0]) 103 | chr = pull_var("chr=", lineset[0]) 104 | pos = pull_var("chr-pos=", lineset[0]) 105 | except: 106 | return None 107 | 108 | return rsId, build, chr, pos 109 | 110 | logging.info('Querying dbSNP for {} SNPs...'.format(len(snp_ids))) 111 | response = Entrez.efetch(db='SNP', id=','.join(snp_ids), rettype='flt', retmode='flt').read() 112 | logging.info('Done') 113 | if verbose: 114 | print(response) 115 | 116 | snp_infos = [] 117 | for snp_info in filter(None, response.split('\n\n')): 118 | snp_infos.append(parse_snp(snp_info)) 119 | return snp_infos 120 | 121 | def lift_rs(rsvec, RS_HISTORY, RS_MERGE): 122 | RS_LIFTED = rsvec.copy(); nsnps = len(rsvec) 123 | RS_idx = np.empty((nsnps,), dtype='|S10') 124 | logging.info("Lifting rs# numbers for n={} SNPs...".format(nsnps)) 125 | for i in range(nsnps): 126 | rs = rsvec[i] 127 | if (i+1) % 200000 == 0: 128 | logging.info("{} SNPs done".format(i+1)) 129 | if rs not in RS_MERGE: 130 | RS_LIFTED[i] = rs; RS_idx[i] = 'unchanged' 131 | continue 132 | while True: 133 | if rs in RS_MERGE: 134 | rsLow, rsCurrent = RS_MERGE[rs] 135 | if rsCurrent not in RS_HISTORY and rsCurrent != '': 136 | RS_LIFTED[i] = rsCurrent; RS_idx[i] = 'lifted' 137 | break 138 | else: 139 | rs = rsLow 140 | else: 141 | RS_LIFTED[i] = rs; RS_idx[i] = 'unlifted' 142 | break 143 | logging.info("Lifting rs# numbers is finished.") 144 | return RS_LIFTED, RS_idx 145 | 146 | def 
lift_pos(posvec, chrvec, chainFile): 147 | logging.info("Lifting genomic positions...") 148 | nsnps = len(posvec) 149 | posvec = posvec -1; 150 | pos_lifted = np.empty((nsnps,), dtype='int32') 151 | chr_lifted = np.empty((nsnps,), dtype='int32') 152 | pos_indi = np.empty((nsnps,), dtype='|S10') 153 | dup_indi = np.empty((nsnps,), dtype='bool'); dup_indi.fill(False) 154 | lift = LiftOver(chainFile) 155 | for i in range(nsnps): 156 | if (i+1) % 200000 == 0: 157 | logging.info("{} SNPs done".format(i+1)) 158 | pos = posvec[i]; chr = 'chr%d' % (chrvec[i],) 159 | tmp = lift.convert_coordinate(chr, pos) 160 | if not tmp: 161 | pos_lifted[i] = pos; pos_indi[i] = 'miss'; chr_lifted[i]=chrvec[i] 162 | elif len(tmp) > 1: 163 | pos_lifted[i] = tmp[0][1]; 164 | chr_lifted[i] = re.sub('chr', '', tmp[0][0]) 165 | pos_indi[i] = 'multi' 166 | else: 167 | pos_lifted[i] = tmp[0][1] 168 | chr_lifted[i] = re.sub('chr', '', tmp[0][0]) 169 | if pos == tmp[0][1]: 170 | pos_indi[i] = 'unchanged' 171 | else: 172 | pos_indi[i] = 'lifted' 173 | return pos_lifted+1, pos_indi, chr_lifted 174 | 175 | def trim_ch_rs (sum_dat, snpCol, chrCol, with_pos): 176 | nsnps = sum_dat.shape[0] 177 | logging.info("Parsing {} rs# numbers from the input file...".format(nsnps)) 178 | chrnum_vec = np.empty((nsnps, ), dtype='int') 179 | rsvec_num = []; rsPattern = re.compile(r'rs[0-9]*') 180 | rsidx = np.empty((nsnps, ), dtype='bool'); rsidx.fill(False) 181 | for i in range(nsnps): 182 | if (i+1) % 200000 == 0: 183 | logging.info("{} SNPs done".format(i+1)) 184 | rs = sum_dat.loc[:,snpCol][i] 185 | if with_pos: 186 | chr = sum_dat.loc[:,chrCol][i] 187 | if rsPattern.match(rs): 188 | rsidx[i] = True 189 | rsvec_num.append(re.sub('rs', '', rs)) 190 | if with_pos: 191 | chrnum_vec[i] = int(re.sub('[chrCHR]', '', str(chr))) 192 | rsvec_num = np.array(rsvec_num) 193 | return rsvec_num, rsidx, chrnum_vec 194 | 195 | def try_find_build(rs, pos): 196 | snps_info = fetch_snps(rs) 197 | #snps_info = [('rs3737728', 'GRCh38.p2', '1', '1086035'), ('rs3934834', 'GRCh38.p2', '1', '1070426'), ('rs9651273', 'GRCh38.p2', '1', '1096160')] 198 | logging.info("Loading liftover chain files...") 199 | lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz') 200 | lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz') 201 | lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz') 202 | logging.info("Done") 203 | 204 | for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos): 205 | try: 206 | #if build != 'GRCh38.p2': # assume a specific build we get from Entrez.efetch(db='SNP') 207 | # continue 208 | source_pos -= 1 209 | pos_hg19 = lift38_19.convert_coordinate('chr{}'.format(true_chr), int(pos_hg38) - 1)[0][1] 210 | pos_hg18 = lift19_18.convert_coordinate('chr{}'.format(true_chr), pos_hg19)[0][1] 211 | pos_hg17 = lift19_17.convert_coordinate('chr{}'.format(true_chr), pos_hg19)[0][1] 212 | print("build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}".format( 213 | build, rsId, true_chr, source_pos, 214 | pos_hg38, '*' if pos_hg38==source_pos else '', 215 | pos_hg19, '*' if pos_hg19==source_pos else '', 216 | pos_hg18, '*' if pos_hg18==source_pos else '', 217 | pos_hg17, '*' if pos_hg17==source_pos else '')) 218 | except: 219 | pass 220 | 221 | def lift_over(sumFile, outDir, histFile, mergFile, chainFile, 222 | snpCol, chrCol, posCol, bim=False, reffile="", find_build=False): 223 | logging.info("Reading input file '{}'...".format(sumFile)) 224 | sum_dat = pd.read_csv(sumFile, sep=' +|\t', engine='python') 225 | 
logging.info("Done. Columns are: {}".format(", ".join(sum_dat.columns))) 226 | if bim: 227 | logging.info("Setting new column names based on BIM format") 228 | sum_dat.columns = ['CHR', 'SNP', 'GP', 'POS', 'A1', 'A2'] 229 | snpCol='SNP'; chrCol='CHR'; posCol='POS' 230 | 231 | if snpCol not in sum_dat.columns: 232 | raise ValueError("Input file does not have {} column".format(snpCol)) 233 | 234 | with_pos = chrCol is not None and chrCol != '-' and posCol is not None and posCol != '-' 235 | with_ref = reffile != None and reffile != "" 236 | 237 | if with_pos: 238 | if chrCol not in sum_dat.columns: 239 | raise ValueError("Input file does not have {} column".format(chrCol)) 240 | if posCol not in sum_dat.columns: 241 | raise ValueError("Input file does not have {} column".format(posCol)) 242 | 243 | if find_build: 244 | sample_size = 60 245 | sample = random.sample(range(sum_dat.shape[0]), sample_size) 246 | sum_dat_sample = sum_dat.ix[sample, :] 247 | sum_dat_sample = sum_dat_sample.sort_values(chrCol) 248 | sum_dat_sample.reset_index(inplace=True) 249 | try_find_build(sum_dat_sample[snpCol].as_matrix(), sum_dat_sample[posCol].as_matrix()) 250 | return 251 | 252 | logging.info("Checking if there are duplicates by rs# number in the input file... ") 253 | duplicated = sum_dat.duplicated(snpCol, keep=False) 254 | if any(duplicated): 255 | logging.warning("{} duplicated rs# numbers were found in the input file".format(sum(duplicated))) 256 | else: 257 | logging.info("No duplicated rs# numbers were found in the input file") 258 | 259 | rsvec_num, rsidx, chrnum_vec = trim_ch_rs(sum_dat, snpCol, chrCol, with_pos) 260 | 261 | RS_HISTORY = read_rs_history(histFile) if isinstance(histFile, str) else histFile 262 | RS_MERGE = read_rs_merge(mergFile) if isinstance(mergFile, str) else mergFile 263 | 264 | lifted_rs, lift_rs_indi = lift_rs(rsvec_num, RS_HISTORY, RS_MERGE) 265 | summary_lift_rs(sum_dat.loc[:, snpCol][rsidx], lifted_rs, lift_rs_indi, 266 | outDir) 267 | sum_dat.loc[:,'new_ID'] = sum_dat.loc[:, snpCol].copy() 268 | sum_dat.loc[rsidx, 'new_ID'] = np.array(['rs%s' % (s,) for s in lifted_rs]) 269 | # TO-DO: It is better to flag all duplicate but the function with 270 | # keep = False doesnt work ! 
So the first of multiple occurance will 271 | # sneak into the clean dataset 272 | # ofrei: keep = False seems to work well for me with pandas 0.18.0 273 | sum_dat_dup_idx = sum_dat.duplicated(subset = ('new_ID',), keep=False) 274 | if with_pos: 275 | lifted_pos, lift_pos_indi ,chr_lifted = lift_pos(sum_dat[posCol], 276 | chrnum_vec, chainFile) 277 | summary_lift_pos(sum_dat.loc[:,snpCol], sum_dat.loc[:,chrCol], 278 | sum_dat.loc[:, posCol], lifted_pos, lift_pos_indi, outDir) 279 | sum_dat.loc[:,'new_pos'] = lifted_pos 280 | sum_dat.loc[:,'new_chr'] = chr_lifted.astype('int') 281 | sum_dat.loc[:, 'postag'] = np.array(['%s:%s' % (str(c), str(p)) for 282 | c, p in zip(chr_lifted, lifted_pos)]) 283 | sum_dat_dup_idx2 = sum_dat.duplicated(subset = ('postag'), keep=False) 284 | sum_dat_dup_idx = np.logical_or(sum_dat_dup_idx, sum_dat_dup_idx2) 285 | sum_dat_miss_idx = lift_pos_indi=='miss' 286 | sum_dat_multi_idx = lift_pos_indi=='multi' 287 | if np.sum(sum_dat_multi_idx) > 0: 288 | sum_dat_multi = sum_dat.ix[sum_dat_multi_idx, :] 289 | multi_file = os.path.join(outDir, 'multi_%s' % (os.path.basename(sumFile),)) 290 | sum_dat_multi.to_csv(multi_file, index=False, sep='\t') 291 | logging.info("Created {} file with {} entries".format(multi_file, sum_dat_multi.shape[0])) 292 | 293 | elif with_ref: 294 | logging.info("Lifting with BIM reference file...") 295 | refbim = pd.read_csv(reffile,delimiter='\t', header=None) 296 | refbim.columns = ['CHR', 'SNP', 'GP', 'POS', 'A1', 'A2'] 297 | if any(duplicated): 298 | logging.warning("(!!!) pandas.merge has not been tested on how it merges duplicated entries (!!!!)") 299 | tmp = pd.merge(sum_dat, refbim, left_on='new_ID', right_on='SNP', 300 | how='left') 301 | sum_dat_miss_idx = np.isnan(tmp.POS) 302 | sum_dat.loc[~sum_dat_miss_idx, 'new_pos'] = tmp.POS[ 303 | ~sum_dat_miss_idx].astype('int') 304 | sum_dat.loc[~sum_dat_miss_idx, chrCol] = tmp.CHR[ 305 | ~sum_dat_miss_idx].astype('int') 306 | sum_dat_multi_idx = np.empty((sum_dat.shape[0],), dtype='bool') 307 | sum_dat_multi_idx.fill(False) 308 | else: 309 | sum_dat_miss_idx = np.empty((sum_dat.shape[0],), dtype='bool') 310 | sum_dat_miss_idx.fill(False) 311 | sum_dat_multi_idx = np.empty((sum_dat.shape[0],), dtype='bool') 312 | sum_dat_multi_idx.fill(False) 313 | if np.sum(sum_dat_dup_idx) > 0: 314 | sum_dat_dup = sum_dat.ix[sum_dat_dup_idx, :] 315 | dup_file = os.path.join(outDir, 'dup_%s' % (os.path.basename(sumFile),)) 316 | sum_dat_dup.to_csv(dup_file, index=False, sep='\t') 317 | logging.info("Created {} file with {} entries".format(dup_file, sum_dat_dup.shape[0])) 318 | if np.sum(sum_dat_miss_idx) > 0: 319 | sum_dat_miss = sum_dat.ix[sum_dat_miss_idx, :] 320 | miss_file = os.path.join(outDir, 'miss_%s' % (os.path.basename(sumFile),)) 321 | sum_dat_miss.to_csv(miss_file, index=False, sep='\t') 322 | logging.info("Created {} file with {} entries".format(miss_file, sum_dat_miss.shape[0])) 323 | rm_idx = (sum_dat_dup_idx.astype('int') + 324 | sum_dat_miss_idx.astype('int') + 325 | sum_dat_multi_idx.astype('int')) >=1 326 | if np.sum(rm_idx) > 0: 327 | sum_dat = sum_dat.ix[~rm_idx,:] 328 | logging.warning( 329 | "{0} entries removed from the input because of duplication, " 330 | "misses or multiple mappings between builds".format(np.sum(rm_idx))) 331 | if bim: 332 | logging.info("Updating plink files...") 333 | update_plinkfiles(outDir, sumFile, sum_dat, snpCol) 334 | else: 335 | result_file = os.path.join(outDir, 'lifted_%s' % (os.path.basename(sumFile),)) 336 | logging.info("Saving the result to 
{}...".format(result_file)) 337 | sum_dat.to_csv(result_file, index=False, sep='\t') 338 | logging.info("Done.") 339 | 340 | def update_plinkfiles(outDir, sumFile, sum_dat, snpCol): 341 | tmp_ex = os.path.join(outDir, 'tmp_extract.txt') 342 | sum_dat.to_csv(tmp_ex, index=False, sep='\t', columns=(snpCol,)) 343 | bf = re.sub('.bim','', sumFile) 344 | plink_cmd = r'''plink --bfile %s --extract %s --make-bed \ 345 | --out %s''' % (bf, tmp_ex, sumFile+'.tmp') 346 | os.system(plink_cmd) 347 | tmp_bim = pd.read_csv(sumFile+'.tmp.bim',delimiter='\t', header=None) 348 | tmp_bim.columns = ['oCHR', 'oSNP', 'oGP', 'oPOS', 'oA1', 'oA2'] 349 | tmp = pd.merge(tmp_bim, sum_dat, left_on='oSNP', right_on=snpCol, 350 | how='left') 351 | miss_idx = np.isnan(tmp.loc[:,'new_pos']) 352 | assert np.sum(miss_idx) == 0 353 | tmp_pos_file = os.path.join(outDir, 'tmp_update_pos.txt') 354 | tmp.to_csv(tmp_pos_file, index=False, sep='\t', 355 | columns=('oSNP', 'new_pos'), header=None) 356 | plink_cmd = r'''plink --bfile %s --update-map %s 2 --make-bed \ 357 | --out %s''' % (sumFile+'.tmp',tmp_pos_file, 358 | sumFile+'.tmp2') 359 | os.system(plink_cmd) 360 | tmp_bim2 = pd.read_csv(sumFile+'.tmp2.bim',sep='\t', header=None) 361 | tmp_bim2.columns = ['oCHR', 'oSNP', 'oGP', 'oPOS', 'oA1', 'oA2'] 362 | tmp = pd.merge(tmp_bim2, sum_dat, left_on='oSNP', right_on=snpCol, 363 | how='left') 364 | tmp.to_csv(sumFile+'.tmp2.bim', sep='\t', header=None, index=False, 365 | columns=('oCHR', 'new_ID', 'oGP', 'oPOS', 'oA1', 'oA2')) 366 | os.system("mv %s %s " % (sumFile+'.tmp2.bim', bf+'_lifted.bim')) 367 | os.system("mv %s %s " % (sumFile+'.tmp2.bed', bf+'_lifted.bed')) 368 | os.system("mv %s %s " % (sumFile+'.tmp2.fam', bf+'_lifted.fam')) 369 | os.system("rm %s " % (sumFile+'.tmp.bim',)) 370 | os.system("rm %s " % (sumFile+'.tmp.fam',)) 371 | os.system("rm %s " % (sumFile+'.tmp.bed',)) 372 | os.system("rm %s " % (tmp_pos_file,)) 373 | os.system("rm %s " % (tmp_ex,)) 374 | 375 | def summary_lift_rs(orig_rs, new_rs, indivec, outDir): 376 | li_idx = indivec == 'lifted' 377 | ul_idx = indivec == 'unlifted' 378 | uc_idx = indivec == 'unchanged' 379 | orig_rs = np.array(orig_rs) 380 | summary_file = os.path.join(outDir, 'summary_lift_rs.txt') 381 | logging.info("Saving lift summary to '{}'...".format(summary_file)) 382 | with open(summary_file, 'w') as f: 383 | f.write('Total number of SNPs with "rs" number: %d\n' % ( 384 | len(orig_rs,))) 385 | f.write('\t Total number of SNPs with "rs" lifted: %d\n' % ( 386 | np.sum(li_idx,))) 387 | f.write('\t Total number of SNPs with "rs" unchanged: %d\n' % ( 388 | np.sum(uc_idx,))) 389 | f.write('\t Total number of SNPs with "rs" cant lift: %d\n' % ( 390 | np.sum(ul_idx,))) 391 | 392 | results_file = os.path.join(outDir, 'lift_rs_result.txt') 393 | logging.info("Saving lifted SNPs to '{}'...".format(summary_file)) 394 | with open(results_file, 'w') as f: 395 | f.write('ORI_RS\tNEW_RS\tSTATUS\n') 396 | for i in range(len(orig_rs)): 397 | f.write('%s\trs%s\t%s\n' % (orig_rs[i], new_rs[i], indivec[i])) 398 | 399 | def summary_lift_pos(orig_snp, chrvec, posvec, new_posvec, indivec, outDir): 400 | li_idx = indivec == 'lifted' 401 | uc_idx = indivec == 'unchanged' 402 | miss_idx = indivec == 'miss' 403 | multi_idx = indivec == 'multi' 404 | orig_snp = np.array(orig_snp); chrvec = np.array(chrvec) 405 | posvec = np.array(posvec); new_posvec = np.array(new_posvec) 406 | with open(os.path.join(outDir, 'summary_lift_pos.txt'), 'w') as f: 407 | f.write('Total number of SNPs: %d\n' % ( len(orig_snp,))) 408 | 
f.write('\t Total number of SNPs lifted: %d\n' % (np.sum(li_idx,))) 409 | f.write('\t Total number of SNPs unchanged: %d\n' % (np.sum(uc_idx,))) 410 | f.write('\t Total number of SNPs missed: %d\n' % (np.sum(miss_idx,))) 411 | f.write('\t Total number of SNPs with multiple locations in new build: %d\n' % ( np.sum(multi_idx,))) 412 | with open (os.path.join(outDir, 'lift_pos_result.txt'), 'w') as f: 413 | f.write('SNP\tCHR\t\ORI_POS\tNEW_POS\tSTATUS\n') 414 | for i in range(len(orig_snp)): 415 | f.write('%s\t%s\t%d\t%d\%s\n' % (orig_snp[i], str(chrvec[i]), 416 | posvec[i], new_posvec[i], indivec[i])) 417 | 418 | if __name__ == "__main__": 419 | import time 420 | import warnings 421 | warnings.simplefilter('ignore') 422 | tsts = time.time() 423 | parser = argparse.ArgumentParser(prog="Liftover_SNPs", 424 | formatter_class=argparse.RawTextHelpFormatter, 425 | description=Intro) 426 | 427 | parser.add_argument('input_file', type=str, help='Path of the input SNPs file') 428 | 429 | parser.add_argument('-s', '--snp', type=str, required=True, help='The name of the SNP field in the input file', dest='snp_column') 430 | parser.add_argument('-c', '--chr', type=str, help='The name of the Chromosome field name in the input file', default='-', dest='chr_column') 431 | parser.add_argument('-p', '--pos', type=str, help='The name of the BP field in input file', default='-', dest='pos_column') 432 | 433 | parser.add_argument( '--output-folder', type=str, default='.', help='Output directory') 434 | parser.add_argument( '--history-file', type=str, default='pyliftover/SNPHistory.bcp.gz',help='NCBI SNP build history file') 435 | parser.add_argument( '--merge-file', type=str, default='pyliftover/RsMergeArch.bcp.gz', help='NCBI SNP merge file') 436 | parser.add_argument( '--chain-file', type=str, default='pyliftover/hg18ToHg19.over.chain.gz', help='UCSC chain file') 437 | parser.add_argument( '--find-build', action='store_true', help='Attempt to detect the build of the input file', default=False) 438 | 439 | parser.add_argument( '--bim', action='store_true', help='(experimental option) update PLINT fileset bim file', default=False) 440 | parser.add_argument( '--ref', type=str, help='(experimental option) Reference bim file', default='') 441 | 442 | parser.add_argument('-v', '--verbose', action="store_true", help="increase output verbosity") 443 | 444 | args = parser.parse_args() 445 | 446 | logging_level = logging.INFO if args.verbose else logging.WARNING 447 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging_level) 448 | 449 | if StrictVersion(pd.__version__) < StrictVersion("0.17.0"): 450 | logging.warning("Old pandas version detected. 
" 451 | "Liftover_SNP script hasn not been tested with pandas={}", pd.__version__) 452 | 453 | # Check correctness of user-provided arguments 454 | if (args.chr_column == '-') != (args.pos_column == '-'): 455 | raise ValueError("Arguments --chr-column and --pos-column must be provided together") 456 | 457 | if args.chr_column == '-' and args.find_build: 458 | raise ValueError("Unable to find build without CHR:POS information") 459 | 460 | if not os.access(args.output_folder, os.F_OK): 461 | logging.warning("Output directory {} not exists, making one for you".format(args.output_folder)) 462 | os.makedirs(args.output_folder) 463 | 464 | lift_over(args.input_file, args.output_folder, args.history_file, args.merge_file, args.chain_file, 465 | args.snp_column, args.chr_column, args.pos_column, args.bim, args.ref, args.find_build) 466 | 467 | logging.info('Finish at {}'.format(time.ctime())) 468 | ted = time.time() 469 | logging.info('Time taken {} mins {} sec'.format((ted-tsts)//60, np.round(ted-tsts) % 60)) 470 | -------------------------------------------------------------------------------- /PLINK_file_Utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os, sys, logging 4 | 5 | def read_bim(bimFile, logger=None, sep='\t'): 6 | ''' 7 | Read PLINK bim file. 8 | 9 | Input: 10 | ------ 11 | bimFile, PLINK bim file path 12 | logger, python logger for process information 13 | sep, separator of bim file 14 | 15 | Return: 16 | ------ 17 | bimDat, DataFrame with SNP information 18 | 19 | Note: 20 | ----- 21 | * Adding column names for convienience: 22 | CHR SNP GP POS A1 A2 23 | * Change ChrX->23, ChrY->24 and ChrM->25 24 | ''' 25 | if not logger: 26 | logger = logging.getLogger() 27 | logger.addHandler(logging.StreamHandler()) 28 | if not os.access(bimFile, os.R_OK): 29 | logger.error('Unable to read {}'.format(bimFile)) 30 | raise (ValueError, 'Unable to read {}'.format(bimFile)) 31 | bimDat = pd.read_csv(bimFile, sep=sep, header=None, 32 | names=['CHR', 'SNP', 'GP', 'POS', 'A1', 'A2']) 33 | bimDat.loc[:,'CHR'] = bimDat.loc[:, 'CHR'].astype('|S5') 34 | bimDat.loc[bimDat.loc[:, 'CHR']=='X', 'CHR'] = '23' 35 | bimDat.loc[bimDat.loc[:, 'CHR']=='Y', 'CHR'] = '24' 36 | bimDat.loc[bimDat.loc[:, 'CHR']=='M', 'CHR'] = '25' 37 | bimDat.loc[:, 'CHR'] = bimDat.loc[:, 'CHR'].astype('float').astype('int') 38 | bimDat.loc[:, 'POS'] = bimDat.loc[:, 'POS'].astype('int') 39 | logger.info('Read {} SNPs from {}'.format(bimDat.shape[0], bimFile)) 40 | logger.info('Columns: CHR, SNP, GP, POS, A1, A2 were used') 41 | return (bimDat) 42 | 43 | def deduplicate_bim(bimDat, outdir, logger=None): 44 | ''' 45 | Check if PLINK bim data has duplicate SNP by position. 46 | 47 | Input: 48 | ------ 49 | bimDat, DataFrame with PLINK bim data 50 | outdir, Output directory for intemediate files 51 | logger, python logger for process information 52 | 53 | Return: 54 | ------- 55 | dupIdx, Indictor Series for SNPs that should be removed 56 | 57 | Note: 58 | ----- 59 | * Save all duplicated SNPs into a gziped file. 60 | * For duplicated SNPs: 61 | * SNPs with rs-number kept, i.e., A1 is (A,T,C,G) and A2 is (A,T,C,G) and 62 | * SNP ID starts with 'rs' 63 | Otherwise, 64 | the first of the duplicates were kept. 65 | * Also save a text file containing SNP IDs that should be removed by PLINK 66 | Warning: 67 | * Extreme slow for large dataset. So better do it once and update the 68 | * corresponding PLINK bed/bim/fam. 
69 | ''' 70 | if not logger: 71 | logger = logging.getLogger() 72 | logger.addHandler(logging.StreamHandler()) 73 | dupIdx = bimDat.duplicated(subset=['CHR', 'POS'], keep=False) 74 | ndup = np.sum(dupIdx) 75 | if ndup > 0: 76 | outfile = os.path.join(outdir, 'Duplicated_SNPs_by_POS.txt.gz') 77 | logger.warn('Bim file has {} duplicated items by genomic position'.format(ndup)) 78 | logger.warn('Save all duplicated SNPs in Bim to {}'.format(outfile)) 79 | dupDat = bimDat.loc[dupIdx==True,:] 80 | dupDat.to_csv(outfile, index=False, compression='gzip', na_rep='NA', 81 | sep='\t') 82 | grouped = dupDat.groupby(by=['CHR','POS'], sort=False) 83 | for name, x in grouped: 84 | rsIdx = x.loc[:,'SNP'].str.startswith('rs') 85 | A1Idx = x.loc[:,'A1'].str.contains('[A|T|C|G]') 86 | A2Idx = x.loc[:,'A2'].str.contains('[A|T|C|G]') 87 | Idx = rsIdx & A1Idx & A2Idx 88 | if np.sum(Idx) != 1: 89 | Idx.iloc[0] = False 90 | dupIdx.values[Idx.index] = Idx.values 91 | outfile = os.path.join(outdir, 'Duplicated_SNPs_by_POS_excluded.txt.gz') 92 | logger.warn('Save removed duplicated SNPs in Bim to {}'.format(outfile)) 93 | tmpDat = dupDat.loc[dupIdx==True] 94 | tmpDat.to_csv(outfile, index=False, compression='gzip', na_rep='NA', 95 | sep='\t', columns=['SNP']) 96 | return (dupIdx) 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A collection of various utilities for GWAS summary statistics. 2 | 3 | ## sumstats.py 4 | 5 | sumstats.py is a collection of utilities that work with GWAS summary stats. 6 | ``csv`` utility reads raw summary statistics files 7 | and convert them into a standardized format: 8 | tab-separated file with standard 9 | column names, standard chromosome labels, 10 | NA label for missing data, etc. 11 | ``qc`` utility perform a set of highly customizable quality control procedures. 12 | ``mat`` utility re-saves summary stats in MATLAB format for cond/conj pleiotropy analysis. 13 | ``lift`` utility can lift genomic corredinats across genomic builds, and SNP rs numbers to a newer versions of SNPdb. 14 | 15 | Some of the steps require additional data: 16 | ``` 17 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/2558411_ref.bim 18 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/9279485_ref.bim 19 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/b149_RsMergeArch.bcp.gz 20 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/b149_SNPChrPosOnRef_105.bcp.gz 21 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/b149_SNPHistory.bcp.gz 22 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/hg18ToHg19.over.chain.gz 23 | wget https://precimed.s3-eu-west-1.amazonaws.com/python_convert/ref_1kG_phase3_EUR.tar.gz 24 | ``` 25 | 26 | ``` 27 | usage: sumstats.py [-h] 28 | {csv,qc,mat,lift,clump,rs,ls,mat-to-csv,ldsc-to-mat,frq-to-mat,ref-to-mat,ldsum,diff-mat} ... 29 | 30 | A collection of various utilities for GWAS summary statistics. 31 | 32 | positional arguments: 33 | {csv,qc,zscore,mat,lift,clump,rs,ls,mat-to-csv,ldsc-to-mat,frq-to-mat,ref-to-mat,ldsum,diff-mat,neff} 34 | csv Load raw summary statistics file and convert it into a 35 | standardized format: tab-separated file with standard 36 | column names, standard chromosome labels, NA label for 37 | missing data, etc. The conversion does not change the 38 | number of lines in the input files (e.g. no filtering 39 | is done on markers). 
Unrecognized columns are removed 40 | from the summary statistics file. The remaining 41 | utilities in sumstats.py work with summary statistics 42 | files in the standardized format. 43 | qc Miscellaneous quality control and filtering procedures 44 | zscore Calculate z-score from p-value column and effect size 45 | column 46 | mat Create mat files that can be used as an input for 47 | cond/conj FDR and for CM3 model. Takes csv files 48 | (created with the csv task of this script). Require 49 | columns: SNP, P, and one of the signed summary 50 | statistics columns (BETA, OR, Z, LOGODDS). Creates 51 | corresponding mat files which can be used as an input 52 | for the conditional fdr model. Only SNPs from the 53 | reference file are considered. Zscores of strand 54 | ambiguous SNPs are set to NA. To use CHR:POS for 55 | merging summary statistics with reference file 56 | consider 'rs' utility which auguments summary 57 | statistics with SNP column (first run 'sumstats.py rs 58 | ...', then feed the resulting file into sumstats.py 59 | mat ...) 60 | lift Lift RS numbers to a newer version of SNPdb, and/or 61 | liftover chr:pos to another genomic build using UCSC 62 | chain files. WARNING: this utility may use excessive 63 | amount of memory (up and beyong 32 GB of RAM). 64 | clump Perform LD-based clumping of summary stats. This works 65 | similar to FUMA snp2gene functionality 66 | (http://fuma.ctglab.nl/tutorial#snp2gene). Step 1. Re- 67 | save summary stats into one file for each chromosome. 68 | Step 2a Use 'plink --clump' to find independent 69 | significant SNPs (default r2=0.6) Step 2b Use 'plink 70 | --clump' to find lead SNPs, by clumping independent 71 | significant SNPs (default r2=0.1) Step 3. Use 'plink 72 | --ld' to find genomic loci around each independent 73 | significant SNP (default r2=0.6) Step 4. Merge 74 | together genomic loci which are closer than certain 75 | threshold (250 KB) Step 5. Merge together genomic loci 76 | that fall into exclusion regions, such as MHC Step 6. 77 | Output genomic loci report, indicating lead SNPs for 78 | each loci Step 7. Output candidate SNP report 79 | rs Augument summary statistic file with SNP RS number 80 | from reference file. Merging is done on chromosome and 81 | position. If SNP column already exists in --sumstats 82 | file, it will be overwritten. 83 | ls Report information about standard sumstat files, 84 | including the set of columns available, number of 85 | SNPs, etc. 86 | mat-to-csv Convert matlab .mat file with logpvec, zvec and 87 | (optionally) nvec into CSV files. 88 | ldsc-to-mat Convert .sumstats, .ldscore, .M, .M_5_50 and binary 89 | .annot files from LD score regression to .mat files. 90 | frq-to-mat Convert .frq files plink from .mat files. 91 | ref-to-mat Convert reference files to .mat files. 92 | ldsum convert plink .ld.gz files (pairwise ld r2) to ld 93 | scores 94 | diff-mat Compare two .mat files with logpvec, zvec and nvec, 95 | and report the differences. 96 | neff generate N column from NCASE and NCONTROL, as 4 / (1 / 97 | NCASE + 1 / NCONTROL) 98 | 99 | optional arguments: 100 | -h, --help show this help message and exit 101 | ``` 102 | 103 | For more information about each command call ``sumstats.py --help``. 
104 | 105 | Examples: 106 | ``` 107 | python $(python_convert)/sumstats.py csv --sumstats scz2.snp.results.txt.gz --out PGC_SCZ_2014.csv --force --auto --head 5 --chr hg19chrc 108 | python $(python_convert)/sumstats.py mat --sumstats PGC_SCZ_2014.csv --out PGC_SCZ_2014.mat --ref 2558411_ref.bim --force 109 | ``` 110 | 111 | Further examples can be found in [GWAS_SUMSTAT/Makefile](https://github.com/precimed/GWAS_SUMSTAT/blob/master/Makefile). 112 | 113 | ## sumstats.py clump 114 |
115 | ``clump`` utility determines 116 | - independent significant SNPs 117 | - lead SNPs 118 | - genomic loci 119 | - candidate SNPs 120 | using the same logic as FUMA's snp2gene. An example: 121 | 122 | ``` 123 | python sumstats.py clump \ 124 | --clump-field FDR \ 125 | --force \ 126 | --plink /home/oleksandr/plink/plink \ 127 | --sumstats cond0p01_BIP_vs_COG/result.mat.csv \ 128 | --bfile-chr /full/path/to/ref_1kG_phase3_EUR/chr@ \ 129 | --exclude-ranges ['6:25119106-33854733', '8:7200000-12500000'] \ 130 | --clump-p1 0.01 \ 131 | --out cond0p01_BIP_vs_COG/result.clump 132 | ``` 133 | 134 | Here the input file ``result.mat.csv`` was converted from cond/conj FDR results using this script: 135 |
136 | ``` 137 | # Usage: 138 | # python fdrmat2csv.py result.mat /space/syn03/1/data/GWAS/SUMSTAT/misc/9279485_ref.bim 139 | 140 | import pandas as pd 141 | import scipy.io as sio 142 | import sys 143 | import numpy as np 144 | if __name__ == '__main__': 145 | sumstats = sio.loadmat(sys.argv[1]) 146 | ref = pd.read_csv(sys.argv[2], delim_whitespace=True) 147 | ref['FDR'] = sumstats['fdrmat'] 148 | ref[['CHR', 'SNP', 'BP', 'A1', 'A2', 'FDR']].to_csv(sys.argv[1] + '.csv', index=False, sep='\t') 149 | ``` 150 |
151 | ## make_ld_matrix 152 | 153 | Make LD matrix from reference data. Output is either in MATLAB format or a dense lower triangular text file. 154 | To run this tool you need to download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur). 155 | Example: 156 | ``` 157 | python make_ld_matrix.py --ref 2558411_ref.bim --bfile g1000_eur --ld_window_r2 0.1 --savemat ldmat_p1.mat 158 | ``` 159 | For more info see [make_ld_matrix](./make_ld_matrix/README.md). 
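
The ``.mat`` file written by ``--savemat`` is turned into a sparse MATLAB matrix by the Octave snippet in [make_ld_matrix/README.md](./make_ld_matrix/README.md). For reference, a similar symmetrization can be sketched in Python with scipy. This is only a sketch under assumptions: it presumes the saved file exposes the ``id1``, ``id2`` and ``nsnp`` variables used by that Octave snippet, that the indices are 1-based, and it reuses the ``ldmat_p1.mat`` name from the example above.

```
import numpy as np
import scipy.io as sio
from scipy.sparse import coo_matrix, identity

# Load the pairwise index vectors saved by make_ld_matrix.py --savemat
# (variable names as used by the Octave snippet in make_ld_matrix/README.md).
m = sio.loadmat('ldmat_p1.mat')
id1 = m['id1'].ravel().astype(np.int64) - 1   # assuming 1-based (MATLAB-style) indices
id2 = m['id2'].ravel().astype(np.int64) - 1
nsnp = int(m['nsnp'].ravel()[0])

# Mirror the effect of the Octave step: a symmetric boolean matrix with a unit diagonal.
ld = coo_matrix((np.ones(len(id1), dtype=np.int8), (id1, id2)), shape=(nsnp, nsnp)).tocsr()
ld = (ld + ld.T + identity(nsnp, dtype=np.int8, format='csr')).astype(bool)

print('SNPs:', nsnp, 'non-zero LD entries:', ld.nnz)
```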
160 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/__init__.py -------------------------------------------------------------------------------- /annotation.py: -------------------------------------------------------------------------------- 1 | from time import localtime, strftime 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | from collections import namedtuple 6 | from multiprocessing import Pool 7 | import scipy.io as sio 8 | 9 | 10 | annotation_categories = ["transcript", "exon", "intron", "utr5", "utr3", 11 | "coding", "upstream_1kb", "downstream_1kb"] 12 | 13 | valid_chromosomes = [str(i) for i in range(1,23)] 14 | 15 | required_biomart_cols = {"Associated Gene Name": "gene_name", 16 | "Ensembl Gene ID": "gene_id", 17 | "Gene type": "gene_type", 18 | "Ensembl Transcript ID": "transcript_id", 19 | "Transcript Start (bp)": "transcript_start", 20 | "Transcript End (bp)": "transcript_end", 21 | "Chromosome Name": "chr", 22 | "Strand": "strand", 23 | "5' UTR Start": "utr5_start", 24 | "5' UTR End": "utr5_end", 25 | "3' UTR Start": "utr3_start", 26 | "3' UTR End": "utr3_end", 27 | "Ensembl Exon ID": "exon_id", 28 | "Exon Chr Start (bp)": "exon_start", 29 | "Exon Chr End (bp)": "exon_end", 30 | "Genomic coding start": "coding_start", 31 | "Genomic coding end": "coding_end", 32 | "Exon Rank in Transcript": "exon_rank"} 33 | 34 | required_bim_cols = {"SNP": "snp", 35 | "CHR": "chr", 36 | "BP": "pos"} 37 | 38 | snp_annotation = namedtuple("snp_annotation", annotation_categories) 39 | 40 | 41 | def is_in_upstream_1kb(snp, pos_chr_mart_df): 42 | forward_upstream = ( (pos_chr_mart_df.strand == 1) & 43 | (int(snp.pos) < pos_chr_mart_df.transcript_start) ).any() 44 | if not forward_upstream: 45 | reverse_upstream = ( (pos_chr_mart_df.strand == -1) & 46 | (int(snp.pos) >= pos_chr_mart_df.transcript_end) ).any() 47 | return forward_upstream or reverse_upstream 48 | 49 | 50 | def is_in_downstream_1kb(snp, pos_chr_mart_df): 51 | forward_downstream = ( (pos_chr_mart_df.strand == 1) & 52 | (int(snp.pos) >= pos_chr_mart_df.transcript_end) ).any() 53 | if not forward_downstream: 54 | reverse_downstream = ( (pos_chr_mart_df.strand == -1) & 55 | (int(snp.pos) < pos_chr_mart_df.transcript_start) ).any() 56 | return forward_downstream or reverse_downstream 57 | 58 | 59 | def is_in_transcript(snp, pos_chr_mart_df): 60 | return ( (pos_chr_mart_df.transcript_start <= snp.pos) & 61 | (int(snp.pos) < pos_chr_mart_df.transcript_end) ).any() 62 | 63 | 64 | def is_in_exon(snp, pos_chr_mart_df): 65 | return ( (pos_chr_mart_df.exon_start <= snp.pos) & 66 | (int(snp.pos) < pos_chr_mart_df.exon_end) ).any() 67 | 68 | 69 | def is_in_utr5(snp, coding_pos_chr_mart_df): 70 | return ( (coding_pos_chr_mart_df.utr5_start <= snp.pos) & 71 | (int(snp.pos) < coding_pos_chr_mart_df.utr5_end) ).any() 72 | 73 | 74 | def is_in_utr3(snp, coding_pos_chr_mart_df): 75 | return ( (coding_pos_chr_mart_df.utr3_start <= snp.pos) & 76 | (int(snp.pos) < coding_pos_chr_mart_df.utr3_end) ).any() 77 | 78 | 79 | def is_in_coding(snp, coding_pos_chr_mart_df): 80 | return ( (coding_pos_chr_mart_df.coding_start <= snp.pos) & 81 | (int(snp.pos) < coding_pos_chr_mart_df.coding_end) ).any() 82 | 83 | 84 | def annotate(arg): 85 | chr_snp_df, chr_mart_df = arg 86 | annot_df = 
pd.DataFrame(columns=annotation_categories) 87 | for snp_row in chr_snp_df.itertuples(): 88 | annot = dict.fromkeys(annotation_categories, False) 89 | i = ( (chr_mart_df.upstream_1kb <= snp_row.pos) & 90 | (int(snp_row.pos) < chr_mart_df.downstream_1kb) ) 91 | pos_chr_mart_df = chr_mart_df[i] 92 | if len(pos_chr_mart_df) != 0: 93 | annot["upstream_1kb"] = is_in_upstream_1kb(snp_row, pos_chr_mart_df) 94 | annot["downstream_1kb"] = is_in_downstream_1kb(snp_row, pos_chr_mart_df) 95 | annot["transcript"] = is_in_transcript(snp_row, pos_chr_mart_df) 96 | if annot["transcript"]: 97 | annot["exon"] = is_in_exon(snp_row, pos_chr_mart_df) 98 | annot["intron"] = not annot["exon"] 99 | coding_i = (pos_chr_mart_df.gene_type == "protein_coding") 100 | coding_pos_chr_mart_df = pos_chr_mart_df[coding_i] 101 | if annot["exon"] and len(pos_chr_mart_df) > 0: 102 | annot["utr5"] = is_in_utr5(snp_row, coding_pos_chr_mart_df) 103 | annot["utr3"] = is_in_utr3(snp_row, coding_pos_chr_mart_df) 104 | annot["coding"] = is_in_coding(snp_row, coding_pos_chr_mart_df) 105 | annot_df.loc[snp_row.snp] = snp_annotation(**annot) 106 | return annot_df 107 | 108 | 109 | def make_annotation_from_biomart(biomart_file, bim_file, out_txt, out_mat, 110 | test_run, test_n_snps, n_proc): 111 | mart_df = pd.read_csv(biomart_file, usecols=list(required_biomart_cols)) 112 | mart_df.columns = [required_biomart_cols[c] for c in mart_df.columns] 113 | print("%d exones in the input file" % len(mart_df)) 114 | 115 | # change to 0-based coordinates 116 | mart_df.transcript_start -= 1 117 | mart_df.exon_start -= 1 118 | mart_df.coding_start -= 1 119 | 120 | mart_df = mart_df[mart_df.chr.isin(valid_chromosomes)] 121 | print("%d exones on valid chromosomes" % len(mart_df)) 122 | mart_df["upstream_1kb"] = mart_df.transcript_start - 1000 123 | mart_df["downstream_1kb"] = mart_df.transcript_end + 1000 124 | 125 | #WARN: hardcoded CHR column 126 | snp_df = pd.read_csv(bim_file, usecols=list(required_bim_cols), 127 | dtype={"CHR":str}) 128 | snp_df.columns = [required_bim_cols[c] for c in snp_df.columns] 129 | print("%d snps in bim file" % len(snp_df)) 130 | snp_df = snp_df[snp_df.chr.isin(valid_chromosomes)] 131 | print("%d snps on valid chromosomes" % len(snp_df)) 132 | snp_df.drop_duplicates("snp", inplace=True) 133 | print("%d non duplicated snps on valid chromosomes" % len(snp_df)) 134 | 135 | if test_run: 136 | # Test with random subset 137 | print("Taking random %d snps for testing" % test_n_snps) 138 | random_ind = np.random.permutation(len(snp_df))[:test_n_snps] 139 | snp_df = snp_df.loc[random_ind,:] 140 | 141 | arg_gen = ( (snp_df[snp_df.chr == c], mart_df[mart_df.chr == c]) 142 | for c in valid_chromosomes ) 143 | if n_proc > 1: 144 | pool = Pool(processes=n_proc) 145 | annotation_dfs = pool.map(annotate, arg_gen) 146 | else: 147 | annotation_dfs = [annotate(arg) for arg in arg_gen] 148 | annot_df = pd.concat(annotation_dfs) 149 | print("%d SNPs were annotated" % len(annot_df)) 150 | annot_df[annotation_categories] = annot_df[annotation_categories].astype(int) 151 | annot_df = annot_df.reindex(index=snp_df.snp) 152 | if not out_txt is None: 153 | annot_df.to_csv(out_txt, sep='\t', index_label="snp") 154 | print("%s saved" % out_txt) 155 | if not out_mat is None: 156 | mat_dict = {"annomat": annot_df.values, "annonames": list(annot_df.columns)} 157 | sio.savemat(out_mat, mat_dict, format="5", appendmat=False) 158 | print("%s saved" % out_mat) 159 | 160 | 161 | 162 | if __name__ == "__main__": 163 | # Implementation notes: 164 | # - A 
file "biomart_GENCODE_basic.txt" was created using Ensembl Biomart 165 | # tool. Using "Homo sapiens genes (GRCh37.p13)" dataset. Only transcripts 166 | # included into GENCODE basic annotation were taken, i.e. the only fileter 167 | # applied was: "GENCODE basic annotation: Only". All keys from 168 | # required_biomart_cols dict presented above were taken as attributes. 169 | # - Definition of GENCODE basic annotation can be found here: 170 | # http://grch37.ensembl.org/Help/Glossary?id=500 171 | # - Biomart has 1-based coordinate system, while dbSNP has 0-based 172 | # coordinates. In this script everything is converted to 0-based. 173 | 174 | parser = argparse.ArgumentParser(description="Classify SNPs from reference " 175 | "template based on the biomart annotations.") 176 | parser.add_argument("--biomart", default="data/biomart_GENCODE_basic.txt.gz", 177 | type=str, help="File with Biomart annotations.") 178 | parser.add_argument("--ref", default="2558411_ref.bim", 179 | type=str, help="Reference template file.") 180 | parser.add_argument("--out-txt", default="annotations.txt", type=str, 181 | help="Output text file name or None.") 182 | parser.add_argument("--out-mat", default="annotations.mat", type=str, 183 | help="Output mat file name or None.") 184 | parser.add_argument("--test", action="store_true", 185 | help="Run test with randomly picked test_n_snps SNPs.") 186 | parser.add_argument("--test-n-snps", default=10000, type=int, 187 | help="Number of SNPs for testing.") 188 | parser.add_argument("--n-proc", default=1, type=int, help="Number of cores " 189 | "to use for calculation.") 190 | args = parser.parse_args() 191 | 192 | print("Started on %s" % strftime("%a, %d %b %Y %H:%M:%S", localtime())) 193 | 194 | make_annotation_from_biomart(args.biomart, args.ref, args.out_txt, 195 | args.out_mat, args.test, args.test_n_snps, args.n_proc) 196 | 197 | print("Finished on %s" % strftime("%a, %d %b %Y %H:%M:%S", localtime())) 198 | -------------------------------------------------------------------------------- /config.plotgwas.3.cfg: -------------------------------------------------------------------------------- 1 | # Path to input sumstats file 2 | sumstats = "/cluster/projects/p33/users/alexeas/aud_gwas/meta/scripts/generic-metal/AUD.EUR.PGC_MVP_UKB_FINNGEN.1.190622.metal.processed.tsv.gz.csv" 3 | 4 | # sumstats column names (p-value, chromosome, position and marker ID columns are required) 5 | p_col = "PVAL" 6 | chrom_col = "CHR" 7 | bp_col = "BP" 8 | id_col = "SNP" 9 | 10 | # Plot elements 11 | legend_label = "AUD EUR" # label to use in the legend 12 | legend_label_color = "#994455" # color hex code 13 | y_axis_label = "p-value" # y axis will be labeled -log10(y_axis_label) 14 | gws_threshold = 5E-8 15 | p_cutoff_low = 0.05 # all variants with p > p_cutoff_low are ignored 16 | p_cutoff_high = 1E-40 # all variants with p < p_cutoff_high are ignored 17 | allign_y_max = True # if True, make y max limit equal in top and bottom axis 18 | # Normal dots 19 | color1 = "#EE99AA" # color (hex code) of markers on odd chromosomes 20 | color2 = "#994455" # color of markers on even chromosomes 21 | size = 8 # size of the marker 22 | marker = "o" # shape of the marker, see matplotlib marker specification for available options 23 | alpha = 0.7 # transparency of markers ranging from 0 (fully transparent) to 1 (opaque) 24 | # Bold dots 25 | bold = "bold.test.txt" # file (path) with marker ids to plot with potentially different size/marker/color/transparency; set to empty string to ignore 26 | 
color1_bold = "#EE99AA" 27 | color2_bold = "#994455" 28 | size_bold = 48 29 | marker_bold = "o" 30 | alpha_bold = 1 31 | # Outlined dots 32 | outlined = "outlined.test.txt" # file (path) with marker ids to plot with outline; set to empty string to ignore 33 | color1_outlined = "#EE99AA" 34 | color2_outlined = "#994455" 35 | size_outlined = 144 36 | marker_outlined = "o" 37 | alpha_outlined = 1 38 | 39 | -------------------------------------------------------------------------------- /convert_cleansumstats_output_to_mixer_format.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sys 4 | 5 | # python convert_cleansumstats_output_to_mixer_format.py /cluster/projects/p697/projects/SUMSTATv3/v3.1/STD_GRCh37/CTG_COG_2018.sumstats.gz CTG_COG_2018_mixer.sumstats.gz 6 | if __name__ == "__main__": 7 | fname = sys.argv[1] 8 | fname_out = sys.argv[2] 9 | print(f'processing {fname} -> {fname_out}...') 10 | df = pd.read_csv(fname, sep='\t', dtype=str) 11 | idx = df['CHR'].astype('str').str.lower().str.replace('chr', '').isin([str(i) for i in range(1, 23)]) 12 | print(f'keep autosomes only (CHR column contains 1-22): {np.sum(~idx)} variants removed, {np.sum(idx)} variants retained') 13 | df = df[idx].copy() 14 | df['CHR'] = df['CHR'].astype(int) 15 | print('original columns: ' + ' '.join(df.columns)) 16 | if 'POS' in df.columns: df.rename(columns={'POS':'BP'}, inplace=True) 17 | if 'RSID' in df.columns: df.rename(columns={'RSID':'SNP'}, inplace=True) 18 | if 'EffectAllele' in df.columns: df.rename(columns={'EffectAllele':'A1'}, inplace=True) 19 | if 'OtherAllele' in df.columns: df.rename(columns={'OtherAllele':'A2'}, inplace=True) 20 | if 'B' in df.columns: df.rename(columns={'B':'BETA'}, inplace=True) 21 | if 'EAF' in df.columns: df.rename(columns={'EAF':'FRQ'}, inplace=True) 22 | print('renamed columns: ' + ' '.join(df.columns)) 23 | 24 | sumstats_len = len(df) 25 | df['BP'] = pd.to_numeric(df['BP'], errors='coerce') 26 | df.dropna(subset=['BP'], inplace=True) 27 | df['BP'] = df['BP'].astype(int) 28 | print(f'Drop {sumstats_len - len(df)} variants due to non-numeric or missing values in BP column') 29 | 30 | idx = (df['CHR'] == 6) & (df['BP'] >= 25e6) & (df['BP'] < 35e6) 31 | print(f'drop MHC variants (chr6:25-35): {np.sum(idx)} variants removed, {np.sum(~idx)} variants retained') 32 | df = df[~idx].copy() 33 | 34 | print(f'writing {fname_out}...') 35 | df.to_csv(fname_out, sep='\t', index=False) 36 | print('done.') 37 | -------------------------------------------------------------------------------- /data/biomart_GENCODE_basic.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/data/biomart_GENCODE_basic.txt.gz -------------------------------------------------------------------------------- /fdrmat2csv.py: -------------------------------------------------------------------------------- 1 | ### CONVERT PLEIOFDR RESULTS TO CSV ######################## 2 | 3 | # -- Modules ------------------------- 4 | 5 | import pandas as pd 6 | import scipy.io as sio 7 | import os 8 | import argparse 9 | 10 | # -- Parse arguments ------------------------- 11 | 12 | parser = argparse.ArgumentParser(description="Convert PleioFDR result.mat file to csv") 13 | requiredNamed = parser.add_argument_group('Required arguments') 14 | requiredNamed.add_argument("--mat", help="Path to result.mat file from 
PleioFDR ouput", required=True) 15 | requiredNamed.add_argument("--ref", help="Path to .ref file", required=True) 16 | parser.add_argument("--out", help="Path to file after conversion is done (default: result.mat.csv)") 17 | parser.add_argument("--head", default=5, type=int, help="Number of lines to show (default: 5)") 18 | parser.add_argument("--compress", default=False, action='store_true', help="Compress to .gz archive (default: False)") 19 | args = parser.parse_args() 20 | 21 | matfile = args.mat 22 | reffile = args.ref 23 | if args.out is not None: 24 | outname = args.out 25 | else: 26 | outname = matfile + '.csv' 27 | 28 | # -- Convert result.mat ------------------------- 29 | 30 | if __name__ == '__main__': 31 | print('Load {}'.format(matfile)) 32 | sumstats = sio.loadmat(matfile) 33 | 34 | print('Load {}'.format(reffile)) 35 | ref = pd.read_csv(reffile, delim_whitespace=True) 36 | ref['FDR'] = None 37 | ref['FDR'] = sumstats['fdrmat'] 38 | 39 | print('Write {}'.format(outname)) 40 | ref[['CHR', 'SNP', 'BP', 'A1', 'A2', 'FDR']].to_csv(outname, index=False, sep='\t') 41 | 42 | if args.head: 43 | print(ref.head(args.head)) 44 | 45 | if args.compress: 46 | print('Compress {}'.format(outname)) 47 | os.system('gzip {}'.format(outname)) 48 | 49 | print('Done!') 50 | 51 | pass 52 | -------------------------------------------------------------------------------- /figs/Z3nns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/figs/Z3nns.png -------------------------------------------------------------------------------- /lift_rs_numbers.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | 3 | class LiftRsNumbers: 4 | def _read_rs_history(self, histFile): 5 | RS_HISTORY = set() # store rs 6 | 7 | print("Reading '{}' file...".format(histFile)) 8 | for ln in gzip.open(histFile, mode='rt'): 9 | fd = ln.strip().split('\t') 10 | # Some very few entries in SNPHistory file are about 11 | # re-activation SNPs (not about deleting them). 
12 | # We just need to ignore those entries 13 | if ln.lower().find('re-activ') < 0: 14 | RS_HISTORY.add(fd[0]) 15 | print('{} entries found'.format(len(RS_HISTORY))) 16 | return RS_HISTORY 17 | 18 | def _read_rs_merge(self, mergFile): 19 | RS_MERGE = dict() # high_rs -> (lower_rs, current_rs) 20 | 21 | print("Reading '{}' file...".format(mergFile)) 22 | for ln in gzip.open(mergFile, mode='rt'): 23 | fd = ln.strip().split('\t') 24 | h, l = fd[0], fd[1] 25 | c = fd[6] 26 | RS_MERGE[h] = (l, c) 27 | 28 | print('{} entries found'.format(len(RS_MERGE))) 29 | return RS_MERGE 30 | 31 | def __init__(self, hist_file=None, merge_file=None): 32 | self._RS_HISTORY = self._read_rs_history(hist_file) 33 | self._RS_MERGE = self._read_rs_merge(merge_file) 34 | 35 | def lift(self, rsvec): 36 | unchanged = 0; lifted = 0; deleted = 0; not_rs_number = 0; 37 | RS_LIFTED = rsvec.copy(); nsnps = len(rsvec) 38 | print("Lifting rs# numbers for n={} SNPs...".format(nsnps)) 39 | is_rs_number = [x.startswith('rs') and x[2:].isdigit() for x in rsvec] 40 | rsvec = [x[2:] for x in rsvec] 41 | for i in range(nsnps): 42 | if not is_rs_number[i]: 43 | not_rs_number += 1 44 | continue 45 | rs = rsvec[i] 46 | if rs not in self._RS_MERGE and rs not in self._RS_HISTORY: 47 | unchanged += 1 48 | continue 49 | while True: 50 | if rs in self._RS_MERGE: 51 | rsLow, rsCurrent = self._RS_MERGE[rs] 52 | if rsCurrent not in self._RS_HISTORY and rsCurrent != '': 53 | RS_LIFTED[i] = 'rs' + rsCurrent; lifted += 1 54 | break 55 | else: 56 | rs = rsLow 57 | else: 58 | # Such SNPs were deleted from SNPdb, 59 | # look it up here: https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=71800898 60 | RS_LIFTED[i] = None; deleted += 1 61 | break 62 | 63 | return RS_LIFTED, {'invalid rs#':not_rs_number, 'unchanged':unchanged, 'lifted':lifted, 'deleted':deleted} 64 | -------------------------------------------------------------------------------- /make_ld_matrix/README.md: -------------------------------------------------------------------------------- 1 | # make_ld_matrix.py 2 | 3 | Make LD matrix from reference data. Output either in matlab format or as dense lower triangular text file. 4 | To run this tool you need to download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur). 5 | 6 | For info run `python make_ld_matrix.py --help`. 
7 | 8 | 9 | ## Usage 10 | 11 | ### only abel: 12 | ``` 13 | qlogin --account=nn9114k --mem-per-cpu=2000 --cpus-per-task=2 14 | 15 | module load python2 16 | module load plink2 17 | module load octave 18 | 19 | pip install --user pandas 20 | pip install --user scipy 21 | 22 | cd /work/users/$USER/ 23 | ``` 24 | 25 | ### general: 26 | 27 | Download data: 28 | ``` 29 | ( 30 | mkdir data10g 31 | cd data10g 32 | wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chr[0-9]*vcf.gz 33 | ) 34 | ``` 35 | 36 | Download code: 37 | ``` 38 | git clone https://github.com/precimed/python_convert 39 | cd python_convert/make_ld_matrix 40 | ``` 41 | 42 | Set path to your reference file: 43 | ``` 44 | reffile=/work/users/$USER/2558411_ref.bim 45 | ``` 46 | 47 | Create Matlab matrices: 48 | ``` 49 | for i in ../../data10g/*.vcf.gz; do 50 | python make_ld_matrix.py --vcf $i \ 51 | --ref $reffile --savemat tmp/$(basename $i).map \ 52 | --plink 'plink --memory 3600 --threads 2'; 53 | done 54 | ``` 55 | 56 | Convert to Matlab sparse matrices: 57 | ``` 58 | for f in tmp/*.map.mat; do 59 | # repace ".map.mat" with ".sparse.mat" 60 | outfile=${f%.map.mat}.sparse.mat 61 | # matlab script 62 | mscript=" 63 | load $f 64 | LDmat = sparse(double(id1),double(id2),true,double(nsnp),double(nsnp)); 65 | LDmat = LDmat | speye(double(nsnp)); 66 | LDmat = LDmat | (LDmat - LDmat'); 67 | save(\"$outfile\", 'LDmat', '-v7.3') 68 | " 69 | # run matlab script 70 | echo "$mscript" | octave --silent 71 | done 72 | ``` 73 | Inside the folder "./tmp/" you now have a file "*.sparse.mat" for each chromosome. 74 | -------------------------------------------------------------------------------- /make_ld_matrix/data/EUR_subj.list: -------------------------------------------------------------------------------- 1 | HG00096 HG00096 2 | HG00097 HG00097 3 | HG00099 HG00099 4 | HG00100 HG00100 5 | HG00101 HG00101 6 | HG00102 HG00102 7 | HG00103 HG00103 8 | HG00105 HG00105 9 | HG00106 HG00106 10 | HG00107 HG00107 11 | HG00108 HG00108 12 | HG00109 HG00109 13 | HG00110 HG00110 14 | HG00111 HG00111 15 | HG00112 HG00112 16 | HG00113 HG00113 17 | HG00114 HG00114 18 | HG00115 HG00115 19 | HG00116 HG00116 20 | HG00117 HG00117 21 | HG00118 HG00118 22 | HG00119 HG00119 23 | HG00120 HG00120 24 | HG00121 HG00121 25 | HG00122 HG00122 26 | HG00123 HG00123 27 | HG00125 HG00125 28 | HG00126 HG00126 29 | HG00127 HG00127 30 | HG00128 HG00128 31 | HG00129 HG00129 32 | HG00130 HG00130 33 | HG00131 HG00131 34 | HG00132 HG00132 35 | HG00133 HG00133 36 | HG00136 HG00136 37 | HG00137 HG00137 38 | HG00138 HG00138 39 | HG00139 HG00139 40 | HG00140 HG00140 41 | HG00141 HG00141 42 | HG00142 HG00142 43 | HG00143 HG00143 44 | HG00145 HG00145 45 | HG00146 HG00146 46 | HG00148 HG00148 47 | HG00149 HG00149 48 | HG00150 HG00150 49 | HG00151 HG00151 50 | HG00154 HG00154 51 | HG00155 HG00155 52 | HG00157 HG00157 53 | HG00158 HG00158 54 | HG00159 HG00159 55 | HG00160 HG00160 56 | HG00171 HG00171 57 | HG00173 HG00173 58 | HG00174 HG00174 59 | HG00176 HG00176 60 | HG00177 HG00177 61 | HG00178 HG00178 62 | HG00179 HG00179 63 | HG00180 HG00180 64 | HG00181 HG00181 65 | HG00182 HG00182 66 | HG00183 HG00183 67 | HG00185 HG00185 68 | HG00186 HG00186 69 | HG00187 HG00187 70 | HG00188 HG00188 71 | HG00189 HG00189 72 | HG00190 HG00190 73 | HG00231 HG00231 74 | HG00232 HG00232 75 | HG00233 HG00233 76 | HG00234 HG00234 77 | HG00235 HG00235 78 | HG00236 HG00236 79 | HG00237 HG00237 80 | HG00238 HG00238 81 | HG00239 HG00239 82 | HG00240 HG00240 83 | HG00242 HG00242 84 | HG00243 
HG00243 85 | HG00244 HG00244 86 | HG00245 HG00245 87 | HG00246 HG00246 88 | HG00250 HG00250 89 | HG00251 HG00251 90 | HG00252 HG00252 91 | HG00253 HG00253 92 | HG00254 HG00254 93 | HG00255 HG00255 94 | HG00256 HG00256 95 | HG00257 HG00257 96 | HG00258 HG00258 97 | HG00259 HG00259 98 | HG00260 HG00260 99 | HG00261 HG00261 100 | HG00262 HG00262 101 | HG00263 HG00263 102 | HG00264 HG00264 103 | HG00265 HG00265 104 | HG00266 HG00266 105 | HG00267 HG00267 106 | HG00268 HG00268 107 | HG00269 HG00269 108 | HG00271 HG00271 109 | HG00272 HG00272 110 | HG00273 HG00273 111 | HG00274 HG00274 112 | HG00275 HG00275 113 | HG00276 HG00276 114 | HG00277 HG00277 115 | HG00278 HG00278 116 | HG00280 HG00280 117 | HG00281 HG00281 118 | HG00282 HG00282 119 | HG00284 HG00284 120 | HG00285 HG00285 121 | HG00288 HG00288 122 | HG00290 HG00290 123 | HG00304 HG00304 124 | HG00306 HG00306 125 | HG00308 HG00308 126 | HG00309 HG00309 127 | HG00310 HG00310 128 | HG00311 HG00311 129 | HG00313 HG00313 130 | HG00315 HG00315 131 | HG00318 HG00318 132 | HG00319 HG00319 133 | HG00320 HG00320 134 | HG00321 HG00321 135 | HG00323 HG00323 136 | HG00324 HG00324 137 | HG00325 HG00325 138 | HG00326 HG00326 139 | HG00327 HG00327 140 | HG00328 HG00328 141 | HG00329 HG00329 142 | HG00330 HG00330 143 | HG00331 HG00331 144 | HG00332 HG00332 145 | HG00334 HG00334 146 | HG00335 HG00335 147 | HG00336 HG00336 148 | HG00337 HG00337 149 | HG00338 HG00338 150 | HG00339 HG00339 151 | HG00341 HG00341 152 | HG00342 HG00342 153 | HG00343 HG00343 154 | HG00344 HG00344 155 | HG00345 HG00345 156 | HG00346 HG00346 157 | HG00349 HG00349 158 | HG00350 HG00350 159 | HG00351 HG00351 160 | HG00353 HG00353 161 | HG00355 HG00355 162 | HG00356 HG00356 163 | HG00357 HG00357 164 | HG00358 HG00358 165 | HG00360 HG00360 166 | HG00361 HG00361 167 | HG00362 HG00362 168 | HG00364 HG00364 169 | HG00365 HG00365 170 | HG00366 HG00366 171 | HG00367 HG00367 172 | HG00368 HG00368 173 | HG00369 HG00369 174 | HG00371 HG00371 175 | HG00372 HG00372 176 | HG00373 HG00373 177 | HG00375 HG00375 178 | HG00376 HG00376 179 | HG00378 HG00378 180 | HG00379 HG00379 181 | HG00380 HG00380 182 | HG00381 HG00381 183 | HG00382 HG00382 184 | HG00383 HG00383 185 | HG00384 HG00384 186 | HG01334 HG01334 187 | HG01500 HG01500 188 | HG01501 HG01501 189 | HG01503 HG01503 190 | HG01504 HG01504 191 | HG01506 HG01506 192 | HG01507 HG01507 193 | HG01509 HG01509 194 | HG01510 HG01510 195 | HG01512 HG01512 196 | HG01513 HG01513 197 | HG01515 HG01515 198 | HG01516 HG01516 199 | HG01518 HG01518 200 | HG01519 HG01519 201 | HG01521 HG01521 202 | HG01522 HG01522 203 | HG01524 HG01524 204 | HG01525 HG01525 205 | HG01527 HG01527 206 | HG01528 HG01528 207 | HG01530 HG01530 208 | HG01531 HG01531 209 | HG01536 HG01536 210 | HG01537 HG01537 211 | HG01602 HG01602 212 | HG01603 HG01603 213 | HG01605 HG01605 214 | HG01606 HG01606 215 | HG01607 HG01607 216 | HG01608 HG01608 217 | HG01610 HG01610 218 | HG01612 HG01612 219 | HG01613 HG01613 220 | HG01615 HG01615 221 | HG01617 HG01617 222 | HG01618 HG01618 223 | HG01619 HG01619 224 | HG01620 HG01620 225 | HG01623 HG01623 226 | HG01624 HG01624 227 | HG01625 HG01625 228 | HG01626 HG01626 229 | HG01628 HG01628 230 | HG01630 HG01630 231 | HG01631 HG01631 232 | HG01632 HG01632 233 | HG01668 HG01668 234 | HG01669 HG01669 235 | HG01670 HG01670 236 | HG01672 HG01672 237 | HG01673 HG01673 238 | HG01675 HG01675 239 | HG01676 HG01676 240 | HG01678 HG01678 241 | HG01679 HG01679 242 | HG01680 HG01680 243 | HG01682 HG01682 244 | HG01684 HG01684 245 | HG01685 HG01685 246 | HG01686 
HG01686 247 | HG01694 HG01694 248 | HG01695 HG01695 249 | HG01697 HG01697 250 | HG01699 HG01699 251 | HG01700 HG01700 252 | HG01702 HG01702 253 | HG01704 HG01704 254 | HG01705 HG01705 255 | HG01707 HG01707 256 | HG01708 HG01708 257 | HG01709 HG01709 258 | HG01710 HG01710 259 | HG01746 HG01746 260 | HG01747 HG01747 261 | HG01756 HG01756 262 | HG01757 HG01757 263 | HG01761 HG01761 264 | HG01762 HG01762 265 | HG01765 HG01765 266 | HG01766 HG01766 267 | HG01767 HG01767 268 | HG01768 HG01768 269 | HG01770 HG01770 270 | HG01771 HG01771 271 | HG01773 HG01773 272 | HG01775 HG01775 273 | HG01776 HG01776 274 | HG01777 HG01777 275 | HG01779 HG01779 276 | HG01781 HG01781 277 | HG01783 HG01783 278 | HG01784 HG01784 279 | HG01785 HG01785 280 | HG01786 HG01786 281 | HG01789 HG01789 282 | HG01790 HG01790 283 | HG01791 HG01791 284 | HG02215 HG02215 285 | HG02219 HG02219 286 | HG02220 HG02220 287 | HG02221 HG02221 288 | HG02223 HG02223 289 | HG02224 HG02224 290 | HG02230 HG02230 291 | HG02231 HG02231 292 | HG02232 HG02232 293 | HG02233 HG02233 294 | HG02235 HG02235 295 | HG02236 HG02236 296 | HG02238 HG02238 297 | HG02239 HG02239 298 | NA06984 NA06984 299 | NA06985 NA06985 300 | NA06986 NA06986 301 | NA06989 NA06989 302 | NA06994 NA06994 303 | NA07000 NA07000 304 | NA07037 NA07037 305 | NA07048 NA07048 306 | NA07051 NA07051 307 | NA07056 NA07056 308 | NA07347 NA07347 309 | NA07357 NA07357 310 | NA10847 NA10847 311 | NA10851 NA10851 312 | NA11829 NA11829 313 | NA11830 NA11830 314 | NA11831 NA11831 315 | NA11832 NA11832 316 | NA11840 NA11840 317 | NA11843 NA11843 318 | NA11881 NA11881 319 | NA11892 NA11892 320 | NA11893 NA11893 321 | NA11894 NA11894 322 | NA11918 NA11918 323 | NA11919 NA11919 324 | NA11920 NA11920 325 | NA11930 NA11930 326 | NA11931 NA11931 327 | NA11932 NA11932 328 | NA11933 NA11933 329 | NA11992 NA11992 330 | NA11994 NA11994 331 | NA11995 NA11995 332 | NA12003 NA12003 333 | NA12004 NA12004 334 | NA12005 NA12005 335 | NA12006 NA12006 336 | NA12043 NA12043 337 | NA12044 NA12044 338 | NA12045 NA12045 339 | NA12046 NA12046 340 | NA12058 NA12058 341 | NA12144 NA12144 342 | NA12154 NA12154 343 | NA12155 NA12155 344 | NA12156 NA12156 345 | NA12234 NA12234 346 | NA12249 NA12249 347 | NA12272 NA12272 348 | NA12273 NA12273 349 | NA12275 NA12275 350 | NA12282 NA12282 351 | NA12283 NA12283 352 | NA12286 NA12286 353 | NA12287 NA12287 354 | NA12340 NA12340 355 | NA12341 NA12341 356 | NA12342 NA12342 357 | NA12347 NA12347 358 | NA12348 NA12348 359 | NA12383 NA12383 360 | NA12399 NA12399 361 | NA12400 NA12400 362 | NA12413 NA12413 363 | NA12414 NA12414 364 | NA12489 NA12489 365 | NA12546 NA12546 366 | NA12716 NA12716 367 | NA12717 NA12717 368 | NA12718 NA12718 369 | NA12748 NA12748 370 | NA12749 NA12749 371 | NA12750 NA12750 372 | NA12751 NA12751 373 | NA12760 NA12760 374 | NA12761 NA12761 375 | NA12762 NA12762 376 | NA12763 NA12763 377 | NA12775 NA12775 378 | NA12776 NA12776 379 | NA12777 NA12777 380 | NA12778 NA12778 381 | NA12812 NA12812 382 | NA12813 NA12813 383 | NA12814 NA12814 384 | NA12815 NA12815 385 | NA12827 NA12827 386 | NA12828 NA12828 387 | NA12829 NA12829 388 | NA12830 NA12830 389 | NA12842 NA12842 390 | NA12843 NA12843 391 | NA12872 NA12872 392 | NA12873 NA12873 393 | NA12874 NA12874 394 | NA12878 NA12878 395 | NA12889 NA12889 396 | NA12890 NA12890 397 | NA20502 NA20502 398 | NA20503 NA20503 399 | NA20504 NA20504 400 | NA20505 NA20505 401 | NA20506 NA20506 402 | NA20507 NA20507 403 | NA20508 NA20508 404 | NA20509 NA20509 405 | NA20510 NA20510 406 | NA20511 NA20511 407 | NA20512 NA20512 408 
| NA20513 NA20513 409 | NA20514 NA20514 410 | NA20515 NA20515 411 | NA20516 NA20516 412 | NA20517 NA20517 413 | NA20518 NA20518 414 | NA20519 NA20519 415 | NA20520 NA20520 416 | NA20521 NA20521 417 | NA20522 NA20522 418 | NA20524 NA20524 419 | NA20525 NA20525 420 | NA20527 NA20527 421 | NA20528 NA20528 422 | NA20529 NA20529 423 | NA20530 NA20530 424 | NA20531 NA20531 425 | NA20532 NA20532 426 | NA20533 NA20533 427 | NA20534 NA20534 428 | NA20535 NA20535 429 | NA20536 NA20536 430 | NA20538 NA20538 431 | NA20539 NA20539 432 | NA20540 NA20540 433 | NA20541 NA20541 434 | NA20542 NA20542 435 | NA20543 NA20543 436 | NA20544 NA20544 437 | NA20581 NA20581 438 | NA20582 NA20582 439 | NA20585 NA20585 440 | NA20586 NA20586 441 | NA20587 NA20587 442 | NA20588 NA20588 443 | NA20589 NA20589 444 | NA20752 NA20752 445 | NA20753 NA20753 446 | NA20754 NA20754 447 | NA20755 NA20755 448 | NA20756 NA20756 449 | NA20757 NA20757 450 | NA20758 NA20758 451 | NA20759 NA20759 452 | NA20760 NA20760 453 | NA20761 NA20761 454 | NA20762 NA20762 455 | NA20763 NA20763 456 | NA20764 NA20764 457 | NA20765 NA20765 458 | NA20766 NA20766 459 | NA20767 NA20767 460 | NA20768 NA20768 461 | NA20769 NA20769 462 | NA20770 NA20770 463 | NA20771 NA20771 464 | NA20772 NA20772 465 | NA20773 NA20773 466 | NA20774 NA20774 467 | NA20775 NA20775 468 | NA20778 NA20778 469 | NA20783 NA20783 470 | NA20785 NA20785 471 | NA20786 NA20786 472 | NA20787 NA20787 473 | NA20790 NA20790 474 | NA20792 NA20792 475 | NA20795 NA20795 476 | NA20796 NA20796 477 | NA20797 NA20797 478 | NA20798 NA20798 479 | NA20799 NA20799 480 | NA20800 NA20800 481 | NA20801 NA20801 482 | NA20802 NA20802 483 | NA20803 NA20803 484 | NA20804 NA20804 485 | NA20805 NA20805 486 | NA20806 NA20806 487 | NA20807 NA20807 488 | NA20808 NA20808 489 | NA20809 NA20809 490 | NA20810 NA20810 491 | NA20811 NA20811 492 | NA20812 NA20812 493 | NA20813 NA20813 494 | NA20814 NA20814 495 | NA20815 NA20815 496 | NA20818 NA20818 497 | NA20819 NA20819 498 | NA20821 NA20821 499 | NA20822 NA20822 500 | NA20826 NA20826 501 | NA20827 NA20827 502 | NA20828 NA20828 503 | NA20832 NA20832 504 | -------------------------------------------------------------------------------- /make_ld_matrix/genotypes2ref.py: -------------------------------------------------------------------------------- 1 | # Align genotypes to reference file, e.g. 
2 | # - Extract SNPs from the reference (merge by CHR:POS --- require data to be on the same genomic build) 3 | # - Extract subset of individuals (for example, european population) 4 | # - Merge SNPs together (for example if input data is split by chromosome) 5 | # 6 | # To run this tool: 7 | # - Download *.vcf.gz files from 1000 Genome project ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ 8 | # - Run the tool as follows 9 | # python genotypes2ref.py --vcf ~/1000Genome/phase3/build37_released/*.vcf.gz --ref 2558411_ref.bim 10 | 11 | import argparse 12 | import glob 13 | import itertools 14 | import os.path 15 | import os 16 | import subprocess 17 | import sys 18 | import pandas as pd 19 | 20 | def parse_args(args): 21 | parser = argparse.ArgumentParser(description="Generate LD matrix from genotype matrix") 22 | parser.add_argument("--ref", type=str, help="Reference file (for example 2558411_ref.bim or 9279485_ref.bim.") 23 | parser.add_argument("--vcf", type=str, help="Filename of input .vcf file, or pattern (for example '~/1000Genome/phase3/build37_released/*.vcf.gz')") 24 | parser.add_argument("--keep", default=r"data/EUR_subj.list", type=str, help="Extract SNPs and keep only EUR individuals") 25 | parser.add_argument("--out", default=r"tmp", type=str, help="Folder to output the result") 26 | parser.add_argument("--plink", default="plink", type=str, help="location of plink executable") 27 | return parser.parse_args(args) 28 | 29 | def execute_command(command): 30 | print("Execute command: {}".format(command)) 31 | print(subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0].decode("utf-8")) 32 | #print(subprocess.check_output(command.split()).decode("utf-8")) 33 | 34 | def process_vcf_file(vcf_file, df_ref, keep_file, output_dir, plink): 35 | [_, filename] = os.path.split(vcf_file) 36 | bfile = os.path.join(output_dir, filename) 37 | snpidlist = os.path.join(output_dir, filename + '.snpidlist.txt') 38 | join_file = os.path.join(output_dir, filename) 39 | 40 | if not os.path.exists(output_dir): 41 | os.makedirs(output_dir) 42 | 43 | # Convert vcf into bed 44 | execute_command(r'{0} --vcf {1} --make-bed --out {2}'.format(plink, vcf_file, bfile)) 45 | 46 | # Read bim file 47 | df_bim = pd.read_csv('{}.bim'.format(bfile), header=None, delim_whitespace=True) 48 | df_bim.columns=['CHR','SNP','GP','POS','A1','A2'] 49 | 50 | # Left-merge with reference file by CHR:POS, then output merged RS numberes into snpidlist.txt 51 | df_bim2 = pd.merge(df_bim, df_ref, how='left', left_on = ['CHR', 'POS'], right_on = ['CHR', 'BP']) 52 | df_bim2[df_bim2['SNP_y'].notnull()]['SNP_x'].to_csv(snpidlist, index=False) 53 | 54 | # Extract SNPs and keep only EUR individuals 55 | execute_command(r'{0} --bfile {1} --extract {2} --keep {3} --make-bed --out {4}'.format(plink, bfile, snpidlist, keep_file, join_file)) 56 | return join_file 57 | 58 | if __name__ == "__main__": 59 | args = parse_args(sys.argv[1:]) 60 | 61 | vcf_files=[file for file in glob.glob(args.vcf) 62 | if ('chrX' not in file) and ('chrY' not in file)] 63 | print(vcf_files) 64 | # Read reference file 65 | df_ref = pd.read_csv(args.ref, delim_whitespace=True) 66 | assert df_ref.duplicated(['CHR', 'BP']).sum() == 0 67 | assert df_ref.duplicated(['SNP']).sum() == 0 68 | 69 | for vcf_file in vcf_files: process_vcf_file(vcf_file, df_ref, args.keep, args.out, args.plink) 70 | 71 | print("Done.") 72 | -------------------------------------------------------------------------------- 
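To make the SNP-selection step in genotypes2ref.py above concrete, here is a toy sketch (all values hypothetical) of the CHR:POS left-merge that process_vcf_file uses to decide which variant IDs are written to snpidlist.txt:

import pandas as pd

# Toy genotype .bim and toy reference; only CHR:POS pairs present in the reference survive
df_bim = pd.DataFrame({'CHR': [1, 1, 2], 'SNP': ['rs1', '1:200', 'rs9'], 'GP': 0,
                       'POS': [100, 200, 300], 'A1': 'A', 'A2': 'G'})
df_ref = pd.DataFrame({'CHR': [1, 2], 'SNP': ['rs1', 'rs9'], 'BP': [100, 300]})

merged = pd.merge(df_bim, df_ref, how='left', left_on=['CHR', 'POS'], right_on=['CHR', 'BP'])
keep = merged.loc[merged['SNP_y'].notnull(), 'SNP_x']
print(list(keep))  # ['rs1', 'rs9'] -- these IDs would go into snpidlist.txt and on to plink --extract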
/make_ld_matrix/make_ld_matrix.py: -------------------------------------------------------------------------------- 1 | # Download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur) 2 | # Then you can run the tool as follows: 3 | # python make_ld_matrix.py --ref 2558411_ref.bim --bfile g1000_eur --ld_window_r2 0.1 --savemat ldmat_p1.mat 4 | # 5 | # Another example is for situation where you've already generated LD matrix by plink: 6 | # python make_ld_matrix.py --ref 2558411_ref.bim --ldfile tmp.ld --savemat ldmat.mat 7 | 8 | from subprocess import call, check_output 9 | import subprocess 10 | import pandas as pd 11 | import numpy as np 12 | import argparse 13 | import sys 14 | import os.path 15 | from make_maf_vector import make_maf_vector 16 | from genotypes2ref import process_vcf_file 17 | 18 | def execute_command(command): 19 | print("Execute command: {}".format(command)) 20 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 21 | print(process.communicate()[0].decode("utf-8")) 22 | #print(subprocess.check_output(command.split()).decode("utf-8")) 23 | 24 | 25 | def parse_args(args): 26 | parser = argparse.ArgumentParser(description="Generate LD matrix from genotype matrix") 27 | parser.add_argument("--ref", type=str, help="Reference file (for example 2558411_ref.bim or 9279485_ref.bim.") 28 | parser.add_argument("--bfile", type=str, help="Genotypes in plink binary format") 29 | parser.add_argument("--vcf", type=str, help="Filename of input .vcf file, or pattern (for example '~/1000Genome/phase3/build37_released/*.vcf.gz')") 30 | parser.add_argument("--keep", default=None, type=str, help="Extract SNPs and keep only EUR individuals") 31 | parser.add_argument("--ldfile", type=str, default=None, help="Path to .ld file generated by plink (takes priority over bfile, ld_window_kb and ld_window_r2") 32 | parser.add_argument("--ld_window_kb", default=10000, type=int, help="Window in KB") 33 | parser.add_argument("--ld_window_r2", default=0.1, type=float, help="LD r2 threshold") 34 | parser.add_argument("--chunksize", default=1000000, type=int, help="Chunk size when reading ld matrix") 35 | parser.add_argument("--plink", default="plink", type=str, help="location of plink executable") 36 | parser.add_argument("--savemat", default=None, type=str, help="Generate matfile for Matlab.") 37 | parser.add_argument("--saveltm", default=None, type=str, help="Generate 'ltm' --- lower triangular matrix in plain text format.") 38 | return parser.parse_args(args) 39 | 40 | def make_ld_matrix(args): 41 | if not args.savemat and not args.saveltm: 42 | raise ValueError('No output requested, use --savemat or --saveltm') 43 | if args.savemat and os.path.isfile(args.savemat): 44 | raise ValueError('Output file already exist: {}'.format(args.savemat)) 45 | if args.saveltm and os.path.isfile(args.saveltm): 46 | raise ValueError('Output file already exist: {}'.format(args.saveltm)) 47 | 48 | # Read the template 49 | print('Reading {0}...'.format(args.ref)) 50 | ref = pd.read_csv(args.ref, delim_whitespace=True, usecols=['BP', 'CHR']) 51 | nsnp = ref.shape[0] 52 | chrpos_to_id = dict([((chr, pos), index) for chr, pos, index in zip(ref['CHR'], ref['BP'], ref.index)]) 53 | if len(chrpos_to_id) != nsnp: raise ValueError("Duplicated CHR:POS pairs found in the reference file") 54 | 55 | if args.vcf is not None: 56 | args.bfile = process_vcf_file(args.vcf, ref, args.keep, 'tmp', args.plink) 57 | 58 | if args.bfile is not None: 59 | execute_command('{0} 
--bfile {1} --freq --out {1}'.format(args.plink, args.bfile)) 60 | mafvec = make_maf_vector(chrpos_to_id, nsnp, args.bfile); 61 | else: 62 | mafvec = np.empty((nsnp, 1)) 63 | mafvec[:] = np.NAN 64 | 65 | if args.ldfile is None: 66 | # Create LD file in table format 67 | execute_command('{0} --bfile {1} --r2 gz --ld-window-kb {2} --ld-window 999999 --ld-window-r2 {3} --out {1}'.format(args.plink, args.bfile, args.ld_window_kb, args.ld_window_r2)) 68 | args.ldfile = '{0}.ld.gz'.format(args.bfile) 69 | 70 | # Read resulting LD matrix 71 | reader = pd.read_csv(args.ldfile, delim_whitespace=True, chunksize=args.chunksize) 72 | 73 | print('Parsing {0}...'.format(args.ldfile)) 74 | total_df = None 75 | for i, df in enumerate(reader): 76 | df_len_original = len(df) 77 | df = df[df['R2'] >= args.ld_window_r2].copy() 78 | id1tmp = [chrpos_to_id.get((chr, pos), None) for chr, pos in zip(df['CHR_A'], df['BP_A'])] 79 | id2tmp = [chrpos_to_id.get((chr, pos), None) for chr, pos in zip(df['CHR_B'], df['BP_B'])] 80 | mask = [(i1 is not None and i2 is not None) for i1, i2 in zip(id1tmp, id2tmp)] 81 | id1 = [value for index, value in enumerate(id1tmp) if mask[index] == True] 82 | id2 = [value for index, value in enumerate(id2tmp) if mask[index] == True] 83 | val = [value for index, value in enumerate(df['R2']) if mask[index] == True] 84 | df_tmp = pd.DataFrame(data={'id1': id1, 'id2': id2, 'val': val}) 85 | total_df = df_tmp if total_df is None else total_df.append(df_tmp, ignore_index=True) 86 | print('\rFinish {0} entries ({1} after joining with ref and applying r2 threshold)'.format(i * args.chunksize + df_len_original, total_df.shape[0])) 87 | print('. Done.') 88 | 89 | print('Detecting duplicated entries...') 90 | old_size = total_df.shape[0] 91 | total_df.drop_duplicates(subset=['id1', 'id2'], keep='first', inplace=True) 92 | print('Drop {} duplicated entries'.format(old_size-total_df.shape[0])) 93 | 94 | # Output the result as lower diagonal matrix 95 | if args.saveltm: 96 | print('Save result as lower diagonal matrix to {0}...'.format(args.saveltm)) 97 | from scipy.sparse import csr_matrix 98 | id1=list(total_df['id1']); id2 = list(total_df['id2']); val = list(total_df['val']) 99 | assert(all([(i < j) for (i, j) in zip(id1, id2)])) # expect that plink output lower diagonal matrix 100 | csr = csr_matrix((val, (id2, id1)), shape=(nsnp, nsnp)) 101 | 102 | with open(args.saveltm, 'w') as result: 103 | result.write('1.0\n') 104 | for i in range(1, nsnp): 105 | values = csr[i, :].todense()[0, 0:i].A1 106 | values_str = '\t'.join(str(x) for x in values) 107 | result.write('{0}\t1.0\n'.format(values_str)) 108 | 109 | # Output the result in matlab format 110 | if args.savemat: 111 | print('Save result in matlab format to {0}...'.format(args.savemat)) 112 | import scipy.io as sio 113 | sio.savemat( 114 | args.savemat, {'id1':[i + 1 for i in total_df['id1']], 'id2':[i + 1 for i in total_df['id2']], 'val':list(total_df['val']), 'nsnp':nsnp, 'mafvec':mafvec }, 115 | format='5', do_compression=False, oned_as='column') 116 | 117 | print(""" 118 | The results are saved into {0}. 
Now you should open matlab and execute the following commands to re-save the result as matlab sparse matrix: 119 | load {0} 120 | LDmat = sparse(double(id1),double(id2),true,double(nsnp),double(nsnp)); 121 | LDmat = LDmat | speye(double(nsnp)); 122 | LDmat = LDmat | (LDmat - LDmat'); 123 | save('LDmat.mat', 'LDmat', '-v7.3') 124 | 125 | Or, to save with the actual r^2 values: 126 | load {0} 127 | i1 = [id1; id2; (1:nsnp)'];i2 = [id2; id1; (1:nsnp)']; v = [val; val; ones(nsnp, 1)]; 128 | LDmat = sparse(double(i1),double(i2),double(v),double(nsnp),double(nsnp)); 129 | save('LDmat.mat', 'LDmat', '-v7.3') 130 | """.format(args.savemat)) 131 | 132 | 133 | if __name__ == "__main__": 134 | args = parse_args(sys.argv[1:]) 135 | make_ld_matrix(args) 136 | print("Done.") 137 | -------------------------------------------------------------------------------- /make_ld_matrix/make_maf_vector.py: -------------------------------------------------------------------------------- 1 | # Download reference data from http://ctg.cncr.nl/software/magma (for example g1000_eur) 2 | # Then you can run the tool as follows: 3 | # python make_maf_vector.py --ref 2558411_ref.bim --bfile merged --savemat mafvec.mat 4 | 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import argparse 9 | import sys 10 | import scipy.io as sio 11 | 12 | 13 | def parse_args(args): 14 | parser = argparse.ArgumentParser(description="Generate LD matrix from genotype matrix") 15 | parser.add_argument("--ref", type=str, help="Reference file (for example 2558411_ref.bim or 9279485_ref.bim.") 16 | parser.add_argument("--bfile", type=str, help="Genotypes in plink binary format (only bim and frq files are required)") 17 | parser.add_argument("--savemat", default=None, type=str, help="Generate matfile for Matlab.") 18 | return parser.parse_args(args) 19 | 20 | 21 | def make_maf_vector(chrpos_to_id, nsnp, bfile): 22 | print('Reading {0}...'.format(bfile + '.bim')) 23 | df_bim = pd.read_csv(bfile + '.bim', delim_whitespace=True, header=None) 24 | df_bim.columns=['CHR','SNP','GP','POS','A1','A2'] 25 | print('Reading {0}...'.format(bfile + '.frq')) 26 | df_frq = pd.read_csv(bfile + '.frq', delim_whitespace=True) 27 | df_frq['POS'] = df_bim['POS'] # Assume that bim and frq files are aligned 28 | df_frq['INDEX'] = [chrpos_to_id.get((chr, pos), -1) for chr, pos in zip(df_frq['CHR'], df_frq['POS'])] 29 | 30 | mafvec = np.zeros((nsnp, 1)); mafvec[:] = np.NAN 31 | df = df_frq[df_frq['INDEX'] != -1] 32 | for index, value in zip(df['INDEX'], df['MAF']): 33 | mafvec[index] = value 34 | return mafvec 35 | 36 | if __name__ == "__main__": 37 | args = parse_args(sys.argv[1:]) 38 | 39 | print('Reading {0}...'.format(args.ref)) 40 | ref = pd.read_csv(args.ref, delim_whitespace=True) 41 | nsnp = ref.shape[0] 42 | chrpos_to_id = dict([((chr, pos), index) for chr, pos, index in zip(ref['CHR'], ref['BP'], ref.index)]) 43 | if len(chrpos_to_id) != nsnp: raise ValueError("Duplicated CHR:POS pairs found in the reference file") 44 | 45 | mafvec = make_maf_vector(chrpos_to_id=chrpos_to_id, nsnp=nsnp, bfile=args.bfile) 46 | 47 | print('Found {0} SNPs with non-zero MAF, {1} with zero MAF, {2} missing in genotypes --- {3} SNPs in total'.format( 48 | (~np.isnan(mafvec) & (mafvec>0)).sum(), (mafvec == 0).sum(), np.isnan(mafvec).sum(), len(mafvec))) 49 | 50 | print('Saving result to {0}...'.format(args.savemat)) 51 | sio.savemat(args.savemat, {'mafvec':mafvec}, format='5', do_compression=False, oned_as='column') 52 | 53 | print("Done.") 54 | 
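Roughly the same post-processing that the Matlab/Octave snippets above describe can also be done directly in Python with scipy.sparse, if a Matlab-free workflow is preferred. This is only a sketch, assuming the .mat file was produced by make_ld_matrix.py --savemat (so it holds 1-based id1/id2 index vectors, the r2 values in val, and nsnp); the output file name is arbitrary:

import numpy as np
import scipy.io as sio
from scipy.sparse import csr_matrix, save_npz

mat = sio.loadmat('ldmat_p1.mat')
id1 = mat['id1'].ravel().astype(int) - 1      # back to 0-based indices
id2 = mat['id2'].ravel().astype(int) - 1
val = mat['val'].ravel()
nsnp = int(mat['nsnp'])

# Mirror each pair and put 1.0 on the diagonal, as in the "actual r^2 values" Matlab variant
rows = np.concatenate([id1, id2, np.arange(nsnp)])
cols = np.concatenate([id2, id1, np.arange(nsnp)])
vals = np.concatenate([val, val, np.ones(nsnp)])
ldmat = csr_matrix((vals, (rows, cols)), shape=(nsnp, nsnp))
save_npz('LDmat.npz', ldmat)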
-------------------------------------------------------------------------------- /make_universal_variant_ids.py: -------------------------------------------------------------------------------- 1 | """ 2 | Input: whitespace-delimited csv file with CHR, BP, A1, A2 and ID columns. 3 | Output: tab-separated csv file with two columns ID and UID, where ID column is taken from the input file, 4 | UID column is constructed as CHR:BP:AA1:AA2, where CHR and BP are from the input file, AA1 is min(A1, A2, A1_complementary, A2_complementary), 5 | AA2 = A2 if AA1 == A1, 6 | AA2 = A1 if AA1 == A2, 7 | AA2 = A2_complementary if AA1 == A1_complementary, 8 | AA2 = A1_complementary if AA1 == A2_complementary, 9 | min is taken based on lexicographical order. 10 | If either A1 or A2 contains non-ATGC char, original ID is retained, i.e. UID = ID. 11 | Example: 12 | python make_universal_variant_ids.py --fname /cluster/projects/p33/users/alexeas/hrc/HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz \ 13 | --chr "#CHROM" --bp POS --a1 REF --a2 ALT --id ID --out hrc.hg19.uid.txt 14 | Resulting hrc.hg19.uid.txt file will contain two columns (no header): (1) original ID from the input file, (2) constructed univeral ID (UID) 15 | Assume you need to map different type of variant ids between two different files: 16 | FILE1 has rsid 17 | FILE2 has chr:bp 18 | and both FILE1 and FILE2 contain CHR, BP, A1, A2 columns with coordinates in the same genomic build. 19 | Then you can apply the script to each FILE1 and FILE2 separately to generate UIDs for each file. For FILE1 you will get FILE1.uid and for FILE2 you'll get FILE2.uid. 20 | Then you can get the mapping between rsid IDs from FILE1 and chr:bp IDs from FILE2 using: 21 | join -t$'\t' -1 2 -2 2 <(sort -k2,2 FILE1.uid) <(sort -k2,2 FILE2.uid) > FILE1_FILE2.uid 22 | Resulting FILE1_FILE2.uid file will have three columns: (1) univeral ID (2) rsid from FILE1, (3) chr:bp from FILE2. 23 | """ 24 | 25 | import pandas as pd 26 | import argparse 27 | 28 | COMPL_DICT = {"A":"T", "T":"A", "G":"C", "C":"G"} 29 | DEL_ACGT_TT = str.maketrans({c:"" for c in "ATGC"}) 30 | STD_COL_NAMES = ["CHR", "BP", "A1", "A2", "ID"] 31 | 32 | def std_format(df, col_names, std_col_names): 33 | # standardize format: 34 | # - retain only relevant columns (col_names) in the specified order 35 | # - rename columns to standard names, std_col_names[i] should be an std col name for col_names[i] 36 | # - A1 and A2 to uppercase 37 | df = df[col_names].copy(deep=True) 38 | col_rename_dict = dict(zip(col_names, std_col_names)) 39 | df.rename(columns=col_rename_dict, inplace=True) 40 | df["A1"] = df["A1"].str.upper() 41 | df["A2"] = df["A2"].str.upper() 42 | return df 43 | 44 | def reverse_compl(seq): 45 | # seq is an uppercase string. 46 | return COMPL_DICT[seq] if len(seq) == 1 else "".join([COMPL_DICT[b] for b in seq][::-1]) 47 | 48 | def get_uid_col(df): 49 | # df is DataFrame with standard column names with CHR, BP, A1, A2 and ID columns in the corresponding order. 50 | # A1 and A2 must be capitalized. 
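    # Worked example (hypothetical variant): CHR=1, BP=12345, A1=T, A2=C.
    # The complementary alleles are A and G, so the candidate pairs are (C, T) and (A, G);
    # (A, G) sorts first and the UID becomes "1:12345:A:G" -- the same UID as for the
    # equivalent variant reported on the opposite strand with A1=A, A2=G.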
51 | uid_col = [] 52 | for chrom, bp, a1, a2, vid in df.itertuples(index=False): 53 | if a1.translate(DEL_ACGT_TT) == "" and a2.translate(DEL_ACGT_TT) == "": 54 | min_orig, max_orig = (a1, a2) if a1 < a2 else (a2, a1) 55 | a1c, a2c = reverse_compl(a1), reverse_compl(a2) 56 | min_compl, max_compl = (a1c, a2c) if a1c < a2c else (a2c, a1c) 57 | a1u, a2u = (min_orig, max_orig) if min_orig < min_compl else (min_compl, max_compl) 58 | uid = f"{chrom}:{bp}:{a1u}:{a2u}" 59 | else: 60 | uid = vid 61 | uid_col.append(uid) 62 | return uid_col 63 | 64 | 65 | # Parse arguments -------------------------- 66 | parser = argparse.ArgumentParser(description="Constract universal variant IDs (see detailed description and example of use in the script file).") 67 | parser.add_argument("--fname", help="Path to input file with chromosome, position, allele 1, allele 2 and variant ID columns.") 68 | parser.add_argument("--chr", default="CHR", help="Chromosome column.") 69 | parser.add_argument("--bp", default="BP", help="Position column.") 70 | parser.add_argument("--a1", default="A1", help="Allele 1 column.") 71 | parser.add_argument("--a2", default="A2", help="Allele 2 column.") 72 | parser.add_argument("--id", default="ID", help="Variant ID column.") 73 | parser.add_argument("--save-all", action="store_true", help="Save all columns from the input file.") 74 | parser.add_argument("--out", help="Output file name.") 75 | args = parser.parse_args() 76 | 77 | 78 | # Main ------------------------------------- 79 | col_names = [args.chr, args.bp, args.a1, args.a2, args.id] # the order should correspond to the order in STD_COL_NAMES 80 | df = pd.read_csv(args.fname, delim_whitespace=True, usecols=None if args.save_all else col_names, dtype=str) 81 | assert not "UID" in df.columns 82 | print(f"{df.shape[0]} variants loaded from {args.fname}") 83 | df_std = std_format(df, col_names, STD_COL_NAMES) 84 | uid_col = get_uid_col(df_std) 85 | assert df.shape[0] == df_std.shape[0] 86 | df["UID"] = uid_col 87 | 88 | if args.save_all: 89 | df.to_csv(args.out, sep='\t', index=False) 90 | else: 91 | df[[args.id, "UID"]].to_csv(args.out, sep='\t', index=False, header=False) 92 | 93 | print(f"{args.out} saved.") 94 | 95 | -------------------------------------------------------------------------------- /manhattan.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib 7 | matplotlib.use("Agg") 8 | import matplotlib.pyplot as plt 9 | import matplotlib.patches as mpatches 10 | from matplotlib.collections import PatchCollection 11 | import matplotlib.patheffects as mpe 12 | 13 | # Default colors are similar to matplotlib 2.0 defaults and are taken from: 14 | # https://github.com/vega/vega/wiki/Scales#scale-range-literals 15 | DEFAULT_COLOR_NAMES = [1,3,5,7,9,11,13,15,17,19] 16 | DEFAULT_COLOR_NAMES_ANNOT = [1,3,5,7,9,11,13,15,17,19] # [2,4,6,8,10,12,14,16,18,20] 17 | # colors corresponding to even indices are lighter analogs of colors with odd indices, e.g. 
DEFAULT_COLORS[2] is a light version of DEFAULT_COLORS[1] 18 | DEFAULT_COLORS = {1:"#1f77b4", 2:"#aec7e8", 3:"#ff7f0e", 4:"#ffbb78", 19 | 5:"#2ca02c", 6:"#98df8a", 7:"#d62728", 8:"#ff9896", 20 | 9:"#9467bd", 10:"#c5b0d5", 11:"#8c564b", 12:"#c49c94", 21 | 13:"#e377c2", 14:"#f7b6d2", 15:"#7f7f7f", 16:"#c7c7c7", 22 | 17:"#bcbd22", 18:"#dbdb8d", 19:"#17becf", 20:"#9edae5"} 23 | 24 | # colors from http://mkweb.bcgsc.ca/colorblind/ 25 | CB_COLOR_NAMES = ["orange","sky_blue","bluish_green","yellow","blue", 26 | "vermillion","reddish_purple","black"] 27 | CB_COLOR_NAMES_ANNOT = ["orange","sky_blue","bluish_green","yellow","blue", 28 | "vermillion","reddish_purple","black"] 29 | CB_COLORS = {"orange":"#e69f00", 30 | "sky_blue":"#56b4e9", 31 | "bluish_green":"#009e73", 32 | "yellow":"#f0e442", 33 | "blue":"#0072b2", 34 | "vermillion":"#d55e00", 35 | "reddish_purple":"#cc79a7", 36 | "black":"#000000"} 37 | 38 | example_text = """Example: 39 | python manhattan.py result.mat.csv \\ 40 | --lead conj.result.clump.lead.csv --indep conj.result.clump.indep.csv \\ 41 | --p FDR --y-label conjFDR --color-list 1 --legend-label 'Trait1 & Trait2' \\ 42 | --legend-location 'upper right' --p-thresh 0.05 --out conjfdr_manhattan""" 43 | 44 | 45 | def parse_args(args): 46 | parser = argparse.ArgumentParser( 47 | formatter_class=argparse.RawDescriptionHelpFormatter, 48 | description="A tool to draw Manhattan plot from sumstat files.", 49 | epilog=example_text) 50 | 51 | parser.add_argument("sumstats", nargs="+", help="A list of sumstat files") 52 | parser.add_argument("--sep", nargs="+", default=['\t'], 53 | help="A list of column separators in sumstat files") 54 | parser.add_argument("--snp", nargs="+", default=["SNP"], 55 | help="A list of columns with SNP ids in sumstat files") 56 | parser.add_argument("--chr", nargs="+", default=["CHR"], 57 | help="A list of columns with SNP chromosomes in sumstat files") 58 | parser.add_argument("--bp", nargs="+", default=["BP"], 59 | help="A list of columns with SNP positions in sumstat files") 60 | parser.add_argument("--p", nargs="+", default=["PVAL"], 61 | help="A list of columns with SNP p-values in sumstat files") 62 | 63 | parser.add_argument("--outlined", nargs="+", default=["NA"], 64 | help=("A list of files with ids of SNPs to mark with outlined bold dots, 'NA' if absent. " 65 | "These files should contain a single column with SNP ids without header")) 66 | parser.add_argument("--bold", nargs="+", default=["NA"], 67 | help=("A list of files with ids of SNPs to mark with bold dots, 'NA' if absent. " 68 | "These files should contain a single column with SNP ids without header")) 69 | parser.add_argument("--annot", nargs="+", default=["NA"], 70 | help=("A list of files with ids (1st column) and labels (2nd column) of SNPs to annotate, 'NA' if absent. " 71 | "These files should contain two tab-delimited columns (1st: SNP ids, 2nd: SNP labels) without header")) 72 | # the next two options are shortcuts for --outlined and --bold to work 73 | # directly with the output of "sumstats.py clump". These options probably 74 | # should be removed in future for clarity 75 | parser.add_argument("--lead", nargs="+", default=["NA"], 76 | help=("A list of files with ids of lead SNPs, 'NA' if absent. " 77 | "These files should be the output of 'sumstats.py clump'")) 78 | parser.add_argument("--indep", nargs="+", default=["NA"], 79 | help=("A list of files with ids of independent significant SNPs, 'NA' if absent. 
" 80 | "These files should be the output of 'sumstats.py clump'")) 81 | 82 | parser.add_argument("--p-thresh", type=float, default=5.E-8, 83 | help="Significance threshold for p-values") 84 | parser.add_argument("--transparency", type=float, nargs="+", default=[1], 85 | help="Transparency level of points") 86 | parser.add_argument("--between-chr-gap", type=float, default=0.1, 87 | help="Size of the gap between chromosomes in the figure") 88 | parser.add_argument("--snps-to-keep", nargs="+", default=["NA"], 89 | help="A list of files with ids of SNPs to take for plotting, 'NA' if absent. " 90 | "These sets of SNPs are further reduced according to '--downsample-frac' argument. " 91 | "These files should contain a single column with SNP ids without header") 92 | parser.add_argument("--downsample-frac", nargs="+", type=float, 93 | default=[0.005], help="Fraction of SNPs to take for plotting") 94 | parser.add_argument("--downsample-thresh", nargs="+", type=float, 95 | default=[None], help="Only SNPs with p-values larger than the threshold are downsampled") 96 | parser.add_argument("--chr2use", type=str, default="1-22", 97 | help=("Chromosome ids to plot (e.g. 1,2,3 or 1-4,12,16-20 or 19-22,X,Y). " 98 | "The order in the figure will correspond to the order in this argument. " 99 | "Chromosomes with non-integer ids should be indicated separately")) 100 | parser.add_argument("--striped-background", action="store_true", 101 | help="Draw grey background for every second chromosome") 102 | parser.add_argument("--color-list", nargs="+", default=[], 103 | help="Use specified color list, e.g. 1 3 5 7 9 11 13 15 17 19; 2 4 6 8 10 12 14 16 18 20; orange sky_blue bluish_green yellow blue vermillion reddish_purple black, or any colors listed on https://python-graph-gallery.com/100-calling-a-color-with-seaborn") 104 | parser.add_argument("--cb-colors", action="store_true", 105 | help="Use colors designed for color-blind people") 106 | parser.add_argument("--seed", type=int, default=1, help="Random seed") 107 | parser.add_argument("--out", default="manhattan", help="Out file name") 108 | parser.add_argument("--separate-sumstats", action="store_true", 109 | help="Plot each sumstat in a separate subplot.") 110 | 111 | parser.add_argument("--y-label", default="P", 112 | help="Label of y axis. Label in the figure will be: -log10(y_label).") 113 | parser.add_argument("--y-max", type=float, default=-1, help="Upper limit of y axis. Default: autodetect.") 114 | parser.add_argument("--legend-location", default="best", 115 | help="Legend location: 'best', 'upper right', 'upper left', 'lower left', 'lower right', 'right', 'center left', 'center right', 'lower center', 'upper center', 'center'") 116 | parser.add_argument("--no-legend", action="store_true", 117 | help="Don't add legend to the figure.") 118 | parser.add_argument("--legend-labels", nargs="+", default=["NA"], 119 | help="A list of labels for sumstats to use in the legend in the corresponding order. " 120 | "If '--no-legend' is specified, this argument is ignored. If both this and " 121 | "'--no-legend' arguments are absent, corresponding file names are used in " 122 | "the legend.") 123 | 124 | return parser.parse_args(args) 125 | 126 | 127 | def process_args(args): 128 | """ 129 | Check whether provided arguments are correct, change list-type arguments 130 | with single value to have a length = length of sumstats argument and process 131 | chr2use arument. 
132 | """ 133 | for f in args.sumstats: 134 | assert os.path.isfile(f), "'%s' file doesn't exist" % f 135 | for f in args.outlined: 136 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 137 | for f in args.bold: 138 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 139 | for f in args.lead: 140 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 141 | for f in args.indep: 142 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 143 | for f in args.annot: 144 | assert os.path.isfile(f) or f=="NA", "'%s' file doesn't exist" % f 145 | 146 | n = len(args.sumstats) 147 | arg_dict = vars(args) 148 | for arg_name, arg_val in arg_dict.items(): 149 | if (type(arg_val) is list) and (len(arg_val)0,:], should be ~ 1.5x faster 231 | df.dropna(subset=[pval_col], how="all", inplace = True) 232 | print("%d SNPs with defined p-value" % len(df)) 233 | df = df.loc[df[chr_col].isin(chr2use),:] 234 | print("%d SNPs within specified chromosomes" % len(df)) 235 | # TODO: zero filtering step is very slow, should be optimized 236 | df = df.loc[df[pval_col]>0,:] 237 | print("%d SNPs with non-zero p-value" % len(df)) 238 | # TODO: drop duplicates as it is done in qq.py 239 | return df 240 | 241 | 242 | def get_df2plot(df, outlined_snps_f, bold_snps_f, lead_snps_f, indep_snps_f, 243 | annot_f, snps_to_keep_f, downsample_frac, downsample_thresh, pval_col): 244 | """ 245 | Select variants which will be plotted. Mark lead and independent significant 246 | variants if corresponding information is provided. 247 | Args: 248 | df: DataFrame for variant selection 249 | outlined_snps_f: a name of file with SNP ids to plot with outlined bold dots 250 | bold_snps_f: a name of file with SNP ids to plot with bold dots 251 | lead_snps_f: a name of file with lead variants 252 | indep_snps_f: a name of file with independent significant variants 253 | snps_to_keep_f: a list of variants to consider for plotting, only these 254 | variants are considered when downsampling take place 255 | downsample_frac: a fraction of variants which will be sampled from df 256 | for plotting 257 | downsample_thresh: only variants with p-value larger than this threshold 258 | are downsampled 259 | pval_col: a column with p-values in df 260 | Returns: 261 | df2plot: DataFrame with variants for plotting 262 | """ 263 | print("Preparing SNPs for plotting") 264 | # define a subset of variants which will be plotted: 265 | # [outlined + lead] + [bold + indep] + sample 266 | outlined_snp_ids = get_snp_ids(outlined_snps_f) 267 | bold_snp_ids = get_snp_ids(bold_snps_f) 268 | lead_snp_id = get_lead(lead_snps_f) 269 | indep_snp_id = get_indep_sig(indep_snps_f) 270 | annot_series = get_annot(annot_f) 271 | outlined_snp_ids = np.unique(np.concatenate((outlined_snp_ids, lead_snp_id))) 272 | bold_snp_ids = np.unique(np.concatenate((bold_snp_ids, indep_snp_id))) 273 | # sample variants 274 | if snps_to_keep_f != "NA": 275 | snps2keep = get_snp_ids(snps_to_keep_f) 276 | ii = df.index.intersection(snps2keep) 277 | df = df.loc[ii,:] 278 | print("%d SNPs overlap with %s" % (len(df),snps_to_keep_f)) 279 | if not downsample_thresh is None: 280 | i2downsample = df[pval_col]>downsample_thresh 281 | df2downsample = df.loc[i2downsample,:] 282 | snps2downsample = df2downsample.index 283 | snps2downsample_pvals = df2downsample[pval_col] 284 | snps2keep = df.loc[~i2downsample,:].index.values 285 | else: 286 | snps2downsample = df.index 287 | snps2downsample_pvals = df[pval_col] 288 | snps2keep = [] 289 | n = 
int(downsample_frac*len(snps2downsample)) 290 | # w = 1/df[pval_col].values 291 | w = -np.log10(snps2downsample_pvals.values) 292 | w /= sum(w) 293 | 294 | snp_sample = np.random.choice(snps2downsample,size=n,replace=False,p=w) 295 | # TODO: keep SNPs within identified loci with higher prob? 296 | # NOTE: it could be that there are snp ids in outlined_snp_ids or bold_snp_ids which 297 | # are not in df.index, therefore we should take an index.intersection first. 298 | outlined_snp_ids = df.index.intersection(outlined_snp_ids) 299 | bold_snp_ids = df.index.intersection(bold_snp_ids) 300 | annot_snp_ids = df.index.intersection(annot_series.index) 301 | snps2keep = np.unique(np.concatenate((snps2keep, outlined_snp_ids, bold_snp_ids, 302 | snp_sample, annot_snp_ids))) 303 | df2plot = df.loc[snps2keep,:] 304 | df2plot.loc[:,"outlined"] = False 305 | df2plot.loc[outlined_snp_ids,"outlined"] = True 306 | df2plot.loc[:,"bold"] = False 307 | df2plot.loc[bold_snp_ids,"bold"] = True 308 | df2plot.loc[:,"annot"] = "" 309 | df2plot.loc[annot_snp_ids,"annot"] = annot_series[annot_snp_ids] 310 | print("%d outlined SNPs" % len(outlined_snp_ids)) 311 | print("%d bold SNPs" % len(bold_snp_ids)) 312 | print("%d annotated SNPs" % len(annot_snp_ids)) 313 | print("%d SNPs will be plotted in total" % len(df2plot)) 314 | return df2plot 315 | 316 | 317 | def get_chr_df(dfs2plot, bp_cols, chr_cols, between_chr_gap, chr2use): 318 | """ 319 | Construct DataFrame with index = chromosome names and 5 columns: 320 | min: minimum coordinate on each chromosome among all dfs in dfs2plot 321 | max: maximum coordinate on each chromosome among all dfs in dfs2plot 322 | ind: index of the chromosome = 1:N, where N - nuumber of different chromosomes 323 | rel_size: size of the chromosome relative to the first chromosome (i.e. 
324 | rel_size of the first chr = 1) 325 | start: start coordinate of the chromosome on the x axis, where the first 326 | chromosome starts at x = 0 and ends at x = 1 (if its size = 1), taking 327 | into account between_chr_gap 328 | Args: 329 | dfs2plot: a list of DataFrames that will be plotted 330 | bp_cols: name of marker position on chromosome columns 331 | chr_cols: name of marker chromosome columns 332 | between_chr_gap: gap between end of chr K and start of chr K+1 333 | chr2use: chromosomes to use for plotting (other are dropped) 334 | Returns: 335 | chr_df: a DataFrame with chromosome information as described above 336 | """ 337 | unique_chr = np.unique(np.concatenate([df[chr_cols[i]].unique() for i,df in enumerate(dfs2plot)])) 338 | unique_chr = [c for c in chr2use if c in unique_chr] 339 | chr_df = pd.DataFrame(index=unique_chr, columns=["min","max","ind","start","rel_size"]) 340 | min_df = pd.DataFrame(index=unique_chr) 341 | max_df = pd.DataFrame(index=unique_chr) 342 | for i,df in enumerate(dfs2plot): 343 | chr_min = df.groupby(chr_cols[i])[bp_cols[i]].min() 344 | chr_max = df.groupby(chr_cols[i])[bp_cols[i]].max() 345 | min_df[i] = chr_min 346 | max_df[i] = chr_max 347 | chr_df["min"] = min_df.min(axis=1) 348 | chr_df["max"] = max_df.max(axis=1) 349 | chr_df["ind"] = np.arange(len(unique_chr)) 350 | # use the first chr form unique_chr as a reference unit size 351 | ref_unit_size = chr_df.loc[chr_df.index[0],"max"] - chr_df.loc[chr_df.index[0],"min"] 352 | chr_df["rel_size"] = (chr_df["max"] - chr_df["min"])/ref_unit_size 353 | chr_df["start"] = chr_df["rel_size"].cumsum() - chr_df["rel_size"] + between_chr_gap*chr_df["ind"] 354 | return chr_df 355 | 356 | 357 | def add_coords(df2plot, chr_col, bp_col, pval_col, chr_df): 358 | """ 359 | Modify provided DataFrame df2plot by adding columns with x-y coordinates for 360 | plotting to it. 361 | Args: 362 | df2plot: DataFrame with variants for plotting (produced by get_df2plot) 363 | chr_col: a column with chromosome of variants in df2plot 364 | bp_col: a column with position on chromosome of variants in df2plot 365 | pval_col: a column with variant p-values 366 | chr_df: a DataFrame with chromosome information (produced by get_chr_df) 367 | """ 368 | chr_start = chr_df.loc[df2plot[chr_col], "start"].values 369 | chr_min = chr_df.loc[df2plot[chr_col], "min"].values 370 | df2plot.loc[:,"x_coord"] = (df2plot[bp_col] - chr_min)/chr_df.loc[chr_df.index[0],"max"] + chr_start 371 | df2plot.loc[:,"log10p"] = -np.log10(df2plot[pval_col]) # y coord 372 | 373 | 374 | def add_striped_background(chr_df, ax, y_up): 375 | """ 376 | Add grey background rectagle for every second chromosome. 
377 | """ 378 | height = y_up 379 | background_rect = [] 380 | for c in chr_df.index[1::2]: 381 | x = chr_df.loc[c,"start"] 382 | y = 0 383 | width = chr_df.loc[c,"rel_size"] 384 | rect = mpatches.Rectangle((x, y), width, height) 385 | background_rect.append(rect) 386 | pc = PatchCollection(background_rect, facecolor='#AEA79F', alpha=0.3, 387 | edgecolor='None') 388 | ax.add_collection(pc) 389 | 390 | 391 | if __name__ == "__main__": 392 | args = parse_args(sys.argv[1:]) 393 | process_args(args) 394 | 395 | np.random.seed(args.seed) 396 | 397 | if args.color_list: 398 | assert len(args.sumstats) <= len(args.color_list), "%d is maximum number of sumstats to plot simultaneously with specified color scheme" % len(color_list) 399 | color_names = [int(x) if x.isdigit() else x for x in args.color_list] 400 | color_names_annot = color_names 401 | color_dict = {**DEFAULT_COLORS, **CB_COLORS} 402 | for x in args.color_list: 403 | if x not in color_dict: 404 | color_dict[x] = x 405 | elif args.cb_colors: 406 | assert len(args.sumstats) <= len(CB_COLOR_NAMES), "%d is maximum number of sumstats to plot simultaneously with color-blind color scheme" % len(CB_COLOR_NAMES) 407 | color_names = CB_COLOR_NAMES 408 | color_names_annot = CB_COLOR_NAMES_ANNOT 409 | color_dict = CB_COLORS 410 | else: 411 | # use default colors 412 | assert len(args.sumstats) <= len(DEFAULT_COLOR_NAMES), "%d is maximum number of sumstats to plot simultaneously with default color scheme" % len(DEFAULT_COLOR_NAMES) 413 | color_names = DEFAULT_COLOR_NAMES 414 | color_names_annot = DEFAULT_COLOR_NAMES_ANNOT 415 | color_dict = DEFAULT_COLORS 416 | 417 | legend_labels = [os.path.splitext(os.path.basename(args.sumstats[i]))[0] if ll == "NA" else ll 418 | for i,ll in enumerate(args.legend_labels)] 419 | legends_handles = [] 420 | 421 | sumstat_dfs = [ 422 | filter_sumstats(s, args.sep[i], args.snp[i], args.p[i], args.chr[i], args.bp[i], args.chr2use) 423 | for i,s in enumerate(args.sumstats)] 424 | 425 | dfs2plot = [get_df2plot(df, args.outlined[i], args.bold[i], args.lead[i], args.indep[i], 426 | args.annot[i], args.snps_to_keep[i], args.downsample_frac[i], 427 | args.downsample_thresh[i], args.p[i]) 428 | for i, df in enumerate(sumstat_dfs)] 429 | 430 | chr_df = get_chr_df(dfs2plot, args.bp, args.chr, args.between_chr_gap, args.chr2use) 431 | 432 | for i,df in enumerate(dfs2plot): 433 | add_coords(df, args.chr[i], args.bp[i], args.p[i], chr_df) 434 | 435 | n_subplots = len(dfs2plot) if args.separate_sumstats else 1 436 | 437 | # make plot 438 | print("Making plot") 439 | plt.rc('legend',fontsize=15) 440 | fig, axarr = plt.subplots(n_subplots, squeeze=False, figsize=(14,5*n_subplots), dpi=200) 441 | axarr = axarr[:,0] # squeeze second dimention since we don't need it here 442 | 443 | 444 | # find upper limit for Y axis 445 | if args.y_max > 0: 446 | y_up = args.y_max 447 | else: 448 | y_up = max([df["log10p"].max() for df in dfs2plot]) 449 | y_up = max(y_up, -np.log10(args.p_thresh)) 450 | y_up *= 1.05 451 | 452 | if args.striped_background: 453 | for ax in axarr: 454 | add_striped_background(chr_df, ax, y_up) 455 | 456 | for i, df in enumerate(dfs2plot): 457 | # plot normal points 458 | ax_i = i if args.separate_sumstats else 0 459 | ax = axarr[ax_i] 460 | 461 | color = color_dict[color_names[i]] 462 | ax.plot(df["x_coord"], df["log10p"], ls=' ', marker='.', ms=2, 463 | color=color, alpha=args.transparency[i]) 464 | patch = mpatches.Patch(color=color, label=legend_labels[i]) 465 | legends_handles.append(patch) 466 | for i, df in 
enumerate(dfs2plot): 467 | # plot bold significant and outlined variants "on top" of normal points 468 | ax_i = i if args.separate_sumstats else 0 469 | ax = axarr[ax_i] 470 | 471 | color = color_dict[color_names[i]] 472 | df_tmp = df.loc[df["bold"],:] 473 | ax.plot(df_tmp["x_coord"], df_tmp["log10p"], ls=' ', marker='o', ms=5, 474 | color=color) 475 | df_tmp = df.loc[df["outlined"],:] 476 | ax.plot(df_tmp["x_coord"], df_tmp["log10p"], ls=' ', marker='o', ms=8, 477 | markeredgewidth=0.6, markeredgecolor='k', color=color) 478 | df_tmp = df.loc[df["annot"]!="",["annot","x_coord", "log10p"]] 479 | pe = [mpe.Stroke(linewidth=0.8, foreground='black')] 480 | for row in df_tmp.itertuples(): 481 | color = color_dict[color_names_annot[i]] 482 | ax.annotate(row.annot, xy=(row.x_coord, row.log10p), xycoords='data', 483 | xytext=(2,2), textcoords='offset points', color=color, 484 | fontsize=12, style='italic', fontweight='heavy', 485 | # path_effects=pe, # uncomment path_effects to have a black border of the label symbols 486 | bbox={"boxstyle":"square, pad=0.02", "facecolor":"white", 487 | "edgecolor":"none","alpha":0.6}) 488 | 489 | for i,ax in enumerate(axarr): 490 | ax.hlines([-np.log10(args.p_thresh)], 0, 1, colors='k', linestyles='dotted', 491 | transform=ax.get_yaxis_transform()) 492 | 493 | ax.tick_params(axis='y', which='major', labelsize=15) 494 | ax.tick_params(axis='x', which='major', labelsize=15) 495 | x_ticks = chr_df["start"] + 0.5*chr_df["rel_size"] 496 | ax.set_xticks(x_ticks) 497 | ax.set_xticklabels(map(str, x_ticks.index), fontsize=14) 498 | 499 | ax.set_xlim((-0.1, 500 | chr_df.loc[chr_df.index[-1], "start"] + chr_df.loc[chr_df.index[-1], "rel_size"] + 0.1)) 501 | y_low = ax.get_ylim()[0] 502 | ax.set_ylim((0-0.005*y_up, y_up)) 503 | # remove top and right spines 504 | ax.spines['right'].set_visible(False) 505 | ax.spines['top'].set_visible(False) 506 | # add offset for left spine 507 | ax.spines['left'].set_position(('outward',5)) 508 | ax.spines['bottom'].set_position(('outward',5)) 509 | 510 | ax.set_xlabel("Chromosome", fontsize=20) 511 | ax.set_ylabel(r"$\mathrm{-log_{10}(%s)}$" % args.y_label, fontsize=20) 512 | 513 | if args.legend_location: 514 | handles = legends_handles[i:i+1] if args.separate_sumstats else legends_handles 515 | ax.legend(handles=handles, loc=args.legend_location) 516 | elif not args.no_legend: 517 | handles = legends_handles[i:i+1] if args.separate_sumstats else legends_handles 518 | ax.legend(handles=handles, loc='best') 519 | 520 | 521 | plt.tight_layout() 522 | 523 | # save/show 524 | # plt.savefig(args.out) 525 | plt.savefig(args.out+'.png') 526 | plt.savefig(args.out+'.pdf') 527 | plt.savefig(args.out+'.svg') 528 | # plt.show() 529 | print("%s was generated" % args.out) 530 | -------------------------------------------------------------------------------- /merge_bed_files.py: -------------------------------------------------------------------------------- 1 | # Merge together several .bed files, ignoring all potential warnings from plink. 
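# If plink aborts the merge (typically because of variants with 3+ alleles), the offending
# variant IDs end up in a .missnp file; those variants are then excluded from every input
# fileset and the merge is retried, following the plink documentation linked in the code below.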
2 | # 3 | # To run this tool: 4 | # python merge_bed_files.py --bed ~/1000Genome/phase3/build37_released/*.bed --out merged 5 | 6 | import argparse 7 | import glob 8 | import itertools 9 | import os.path 10 | import os 11 | import subprocess 12 | import sys 13 | import pandas as pd 14 | 15 | def parse_args(args): 16 | parser = argparse.ArgumentParser(description="Merge together several .bed files") 17 | parser.add_argument("--bed", type=str, help="Filename of input .bed file, or pattern (for example '~/1000Genome/phase3/build37_released/*.bed')") 18 | parser.add_argument("--out", default=r"merged", type=str, help="Filename of output .bed file (without extention") 19 | return parser.parse_args(args) 20 | 21 | def execute_command(command): 22 | print("Execute command: {}".format(command)) 23 | print(subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()[0].decode("utf-8")) 24 | #print(subprocess.check_output(command.split()).decode("utf-8")) 25 | 26 | def exclude_snps(bfile_in, snps_file, bfile_out): 27 | execute_command('plink --memory 4096 --bfile {0} --exclude {1} --make-bed --out {2}'.format(bfile_in, snps_file, bfile_out)) 28 | 29 | def merge(files, output_bfile): 30 | missnp_file = '{0}-merge.missnp'.format(output_bfile) 31 | if os.path.exists(missnp_file): 32 | os.remove(missnp_file) 33 | first = files[0] 34 | with open('mergelist.txt', 'w') as mergelist: 35 | for filename in files[1:]: 36 | mergelist.write('{0}.bed {0}.bim {0}.fam\n'.format(filename)) 37 | execute_command('plink --memory 4096 --bfile {0} --merge-list mergelist.txt --allow-no-sex --make-bed --out {1}'.format(first, output_bfile)) 38 | os.remove('mergelist.txt') 39 | 40 | if __name__ == "__main__": 41 | args = parse_args(sys.argv[1:]) 42 | 43 | # Find all .bed filenames (without extention) 44 | files = [os.path.splitext(file)[0] for file in glob.glob(args.bed)] 45 | 46 | merge(files, args.out) 47 | missnp_file = '{0}-merge.missnp'.format(args.out) 48 | if os.path.exists(missnp_file): 49 | # Handle merge failure as described here: https://www.cog-genomics.org/plink2/data#merge3 50 | for file in files: 51 | exclude_snps(file, missnp_file, '{0}.filter'.format(file)) 52 | merge(['{0}.filter'.format(file) for file in files], args.out) 53 | for file in files: map(os.remove, glob.glob('{0}.filter'.format(file))) 54 | 55 | print("Done.") 56 | -------------------------------------------------------------------------------- /overCorrect.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import scipy.stats as stats 4 | import scipy.special as special 5 | import scipy.linalg as linalg 6 | import scipy.interpolate as interp 7 | 8 | # Decorrelation in Statistics: The Mahalanobis Transformation 9 | # Added material to Data Compression: The Complete Reference 10 | # http://www.davidsalomon.name/DC2advertis/DeCorr.pdf 11 | def overCorrect(z1, z2, idx): 12 | defidx = np.isfinite(z1+z2) 13 | z11 = z1[np.logical_and(defidx, idx)]; 14 | z22 = z2[np.logical_and(defidx, idx)] 15 | if np.sum(z1 < 0) != 0 and np.sum(z2 < 0) == 0: 16 | C = np.corrcoef(np.power(z11,2), z22) 17 | print "correlation between squred Z-score: ", C[0,1] 18 | elif np.sum(z1 < 0) == 0 and np.sum(z2 < 0) != 0: 19 | C = np.corrcoef(z11, np.power(z22,2)) 20 | print "correlation between squred Z-score: ", C[0,1] 21 | else: 22 | C = np.corrcoef(z11, z22) 23 | print "correlation between Z score: ", C[0,1] 24 | Z = np.row_stack([z1, z2]) 
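    # Whitening (Mahalanobis) step: multiplying the stacked z-scores by C^(-1/2) yields two
    # decorrelated z-score vectors (their correlation, printed below, should be close to zero),
    # which are then converted back to -log10(p) values.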
25 | print Z.shape 26 | z_adj = np.dot(linalg.fractional_matrix_power(C, -1/2), Z) 27 | print np.corrcoef(z_adj[0,:], z_adj[1,:])[0,1] 28 | return z2logp(z_adj[0,:]), z2logp(z_adj[1,:]) 29 | 30 | def z2logp(zscores, tails = 2): 31 | """ 32 | compute coresponding -log10 p values of given z values. 33 | """ 34 | return -np.log10(tails * stats.norm.cdf(-np.fabs(zscores))) 35 | -------------------------------------------------------------------------------- /plink_utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def get_byte_map(): 6 | """ 7 | Construct mapping between bytes 0..255 and 4-element arrays of a1 genotypes 8 | from plink bed file. 9 | Return 256 x 4 array A, where A[i] = [a1, a2, a3, a4], each ai is from {2, -1, 1, 0}. 10 | """ 11 | genotype_codes = np.array([2, -1, 1, 0],dtype=np.int8) 12 | byte_map = np.empty((256,4), dtype=np.int8) 13 | for b in range(256): 14 | for a in range(4): 15 | byte_map[b,a] = genotype_codes[(b >> 2*a) & 3] 16 | return byte_map 17 | 18 | 19 | class Plink(object): 20 | bim = None # pd.DataFrame 21 | fam = None # pd.DataFrame 22 | bed = None # np.memmap(dtype=np.uint8), this also contain extra bits if n_samples%4 != 0 23 | 24 | byte_map = get_byte_map() 25 | 26 | def __init__(self, bfile): 27 | print(f"Loading plink {bfile}") 28 | self._load_bim(bfile) 29 | self._load_fam(bfile) 30 | self._load_bed(bfile) 31 | 32 | def _load_bim(self, bfile): 33 | self.bim = pd.read_csv(f'{bfile}.bim', sep='\t', header=None, 34 | names=["chr","snp","cm","bp","a1","a2"]) 35 | 36 | def _load_fam(self, bfile): 37 | self.fam = pd.read_csv(f'{bfile}.fam',sep='\t',header=None, 38 | names=["fid","iid","father_id","mother_id","sex","pheno"]) 39 | 40 | def _load_bed(self, bfile): 41 | if self.bim is None: 42 | self._load_bim(bfile) 43 | if self.fam is None: 44 | self._load_fam(bfile) 45 | bedf = f'{bfile}.bed' 46 | magic_bits = np.fromfile(bedf, count=3, dtype=np.uint8) 47 | if (magic_bits != [108,27,1]).any(): 48 | raise Exception(f"{bedf} file is not a valid bed file!") 49 | n_snps = self.bim.shape[0] 50 | n_samples = self.fam.shape[0] 51 | n_cols = n_samples//4 52 | if 4*n_cols != n_samples: 53 | n_cols += 1 54 | self.bed = np.memmap(bedf, dtype=np.uint8, offset=3, mode='r', shape=(n_snps,n_cols)) 55 | 56 | def get_geno(self, snp_ii=None): 57 | """ 58 | Get genotypes for SNPs with indices from snp_ii. 
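        Genotypes are returned as A1 allele counts (2, 1 or 0), with -1 for missing calls,
        following the byte map constructed above.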
59 | Args: 60 | snp_ii : np.array of SNP indices 61 | """ 62 | n_snps = self.bim.shape[0] 63 | n_samples = self.fam.shape[0] 64 | if snp_ii is None: 65 | snp_ii = np.arange(n_snps) 66 | assert max(snp_ii) < n_snps, f"SNP index cannot be > {n_snps-1}" 67 | n_cols = 4*(n_samples//4) 68 | if n_cols != n_samples: 69 | n_cols += 4 70 | samp_geno = self.byte_map[self.bed[snp_ii]].reshape((len(snp_ii),n_cols)) 71 | return samp_geno[:,:n_samples] 72 | 73 | 74 | if __name__ == '__main__': 75 | # Example: 76 | bfile = '/path/to/plink_bfile' 77 | plink = Plink(bfile) 78 | 79 | # read genotypes of 0-th and 10-th variants 80 | geno_arr = plink.get_geno([0,10]) 81 | print(geno_arr.shape) 82 | 83 | # read genotypes of all variants in the bfile 84 | geno_arr = plink.get_geno() 85 | print(geno_arr.shape) 86 | -------------------------------------------------------------------------------- /plotgwas.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import os 7 | import sys 8 | import argparse 9 | 10 | """ 11 | Run examples: 12 | (1) Manhattan plot with one sumstats: 13 | python plotgwas.py manhattan --config config.plotgwas.3.cfg --out manhattan.3.svg 14 | (2) Miami plot with two sumstats in the top panel and one sumstats in the bottom panel saved as png and as svg files: 15 | python plotgwas.py miami --config-top config.plotgwas.1.cfg config.plotgwas.2.cfg --config-bottom config.plotgwas.3.cfg --out miami.1.2.3.png miami.1.2.3.svg 16 | """ 17 | 18 | 19 | def parse_args(args): 20 | parser = argparse.ArgumentParser(description="Tools to plot GWAS summary statistic data.") 21 | subparsers = parser.add_subparsers() 22 | 23 | parser_manhattan = subparsers.add_parser("manhattan", help="Make Manhattan plot.") 24 | parser_manhattan.add_argument("--config", type=str, nargs='+', required=True, help="List of config files to plot.") 25 | parser_manhattan.add_argument("--out", type=str, nargs='+', required=True, help="List of output file names.") 26 | parser_manhattan.set_defaults(func=make_manhattan) 27 | 28 | parser_miami = subparsers.add_parser("miami", help="Make Miami plot.") 29 | parser_miami.add_argument("--config-top", type=str, nargs='+', required=True, help="List of config files to plot on top.") 30 | parser_miami.add_argument("--config-bottom", type=str, nargs='+', required=True, help="List of config files to plot on bottom.") 31 | parser_miami.add_argument("--out", type=str, nargs='+', required=True, help="List of output file names.") 32 | parser_miami.set_defaults(func=make_miami) 33 | 34 | return parser.parse_args(args) 35 | 36 | 37 | def add_coord(dfs): 38 | # df is assume to have "CHR" and "BP" columns. Chromosomes here are assumed to be integers. 
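    # A sketch of the layout this function produces (numbers are hypothetical):
    # if chr1 spans BP 1..1,000,000 and chr2 spans BP 5,000..800,000, chr1 gets
    # COORD 0..999,999, then a gap of 0.5% of the total plotted span is inserted,
    # and chr2 starts right after the gap (COORD = BP - 5,000 + next_chrom_start).
    # Using the shared per-chromosome min/max over all dfs presumably keeps the
    # top and bottom panels of a Miami plot on the same x coordinates.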
39 | min_chr_bp, max_chr_bp = get_min_max_chr_bp(dfs) 40 | next_chrom_start = 0 41 | between_chr_gap = 0.005*(sum(max_chr_bp.values()) - sum(min_chr_bp.values())) 42 | for df in dfs: 43 | next_chrom_start = 0 44 | for chrom in sorted(min_chr_bp): 45 | i_chrom = df.CHR==chrom 46 | min_bp, max_bp = min_chr_bp[chrom], max_chr_bp[chrom] 47 | coord = df.loc[i_chrom, "BP"] - min_bp + next_chrom_start 48 | df.loc[i_chrom, "COORD"] = coord 49 | next_chrom_start += max_bp - min_bp + between_chr_gap 50 | 51 | def get_min_max_chr_bp(dfs): 52 | min_max_chr_bp = [] 53 | for df in dfs: 54 | df_chr_bp = df.groupby("CHR").agg({"BP":['min', 'max']}).BP 55 | min_max_chr_bp.append(df_chr_bp) 56 | df_concat = pd.concat(min_max_chr_bp, axis=1) 57 | min_chr_bp = df_concat.min(axis=1).to_dict() 58 | max_chr_bp = df_concat.max(axis=1).to_dict() 59 | return min_chr_bp, max_chr_bp 60 | 61 | def drop_marginal_snps(df, p_cutoff_low, p_cutoff_high): 62 | # df is assume to have "P" column. 63 | # Add coordinates in the figure. 64 | # Drop non-autosomes and convert chromosomes to int. 65 | autosomes = [str(i) for i in range(1,23)] 66 | i_autosome = df.CHR.isin(autosomes) 67 | df = df.loc[i_autosome,:] 68 | i2plot = (p_cutoff_high`_. 12 | 13 | The primary usage example, supported by the library is the following:: 14 | 15 | >> from pyliftover import LiftOver 16 | >> lo = LiftOver('hg17', 'hg18') 17 | >> lo.convert_coordinate('chr1', 1000000) 18 | 19 | The first line will automatically download the hg17-to-hg18 coordinate conversion `chain file` from UCSC, 20 | unless it is already cached or available in the current directory. Alternatively, you may provide your own chain file:: 21 | 22 | >> lo = LiftOver('hg17ToHg18.over.chain.gz') 23 | >> lo.convert_coordinate('chr1', 1000000, '-') 24 | 25 | The result of ``lo.convert_coordinate`` call is either ``None`` (if the source chromosome name is unrecognized) or a list of target positions in the 26 | new assembly. The list may be empty (locus is deleted in the new assembly), have a single element (locus matched uniquely), or, in principle, 27 | have multiple elements (although this is probably a rare occasion for most default intra-species genomic conversions). 28 | 29 | Although you may try to apply the tool with arbitrary chain files, like the original ``liftOver`` tool, it makes most sense for conversion of 30 | coordinates between different assemblies of the same species. 31 | ''' 32 | 33 | from .liftover import LiftOver -------------------------------------------------------------------------------- /pyliftover/chainfile.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pure-python implementation of UCSC "liftover" genome coordinate conversion. 3 | Class for dealing with "xx.over.chain" files. 4 | 5 | Copyright 2013, Konstantin Tretyakov. 6 | http://kt.era.ee/ 7 | 8 | Licensed under MIT license. 
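Typical use of this module (a sketch; the hg18/hg19 names are only an example):

    f = open_liftover_chain_file('hg18', 'hg19')   # may download the chain file from UCSC
    chain_file = LiftOverChainFile(f)
    hits = chain_file.query('chr1', 1000000)       # overlapping chain blocks, or None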
9 | ''' 10 | 11 | import os.path 12 | import gzip 13 | import urllib 14 | import shutil 15 | import sys 16 | 17 | from .intervaltree import IntervalTree 18 | 19 | if sys.version_info >= (3, 0): 20 | import urllib.request 21 | FancyURLopener = urllib.FancyURLopener if sys.version_info < (3, 0) else urllib.request.FancyURLopener 22 | 23 | class ErrorAwareURLOpener(FancyURLopener): 24 | def http_error_default(self, url, fp, errcode, errmsg, headers): 25 | raise Exception("404") 26 | _urlopener = ErrorAwareURLOpener() 27 | 28 | def open_liftover_chain_file(from_db, to_db, search_dir='.', cache_dir=os.path.expanduser("~/.pyliftover"), use_web=True, write_cache=True): 29 | ''' 30 | A "smart" way of obtaining liftover chain files. 31 | By default acts as follows: 32 | 1. If the file ``To.over.chain.gz`` exists in , 33 | opens it for reading via gzip.open. 34 | 2. Otherwise, if the file ``To.over.chain`` exists 35 | in the opens it (as uncompressed file). 36 | Steps 1 and 2 may be disabled if search_dir is set to None. 37 | 3. Otherwise, checks whether ``/To.over.chain.gz`` exists. 38 | This step may be disabled by specifying cache_dir = None. 39 | 4. If file still not found attempts to download the file from the URL 40 | 'http://hgdownload.cse.ucsc.edu/goldenPath//liftOver/To.over.chain.gz' 41 | to a temporary location. This step may be disabled by specifying use_web=False. In this case the operation fails and 42 | the function returns None. 43 | 5. At this point, if write_cache=True and cache_dir is not None and writable, the file is copied to cache_dir and opened from there. 44 | Otherwise it is opened from the temporary location. 45 | 46 | In case of errors (e.g. URL cannot be opened), None is returned. 47 | ''' 48 | to_db = to_db[0].upper() + to_db[1:] 49 | FILE_NAME_GZ = '%sTo%s.over.chain.gz' % (from_db, to_db) 50 | FILE_NAME = '%sTo%s.over.chain' % (from_db, to_db) 51 | 52 | if search_dir is not None: 53 | FILE_GZ = os.path.join(search_dir, FILE_NAME_GZ) 54 | FILE = os.path.join(search_dir, FILE_NAME) 55 | if os.path.isfile(FILE_GZ): 56 | return gzip.open(FILE_GZ, 'rb') 57 | elif os.path.isfile(FILE): 58 | return open(FILE, 'rb') 59 | if cache_dir is not None: 60 | FILE_GZ = os.path.join(cache_dir, FILE_NAME_GZ) 61 | if os.path.isfile(FILE_GZ): 62 | return gzip.open(FILE_GZ, 'rb') 63 | if use_web: 64 | # Download file from the web. 65 | try: 66 | url = 'http://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%sTo%s.over.chain.gz' % (from_db, from_db, to_db) 67 | (filename, headers) = _urlopener.retrieve(url) 68 | except: 69 | # Download failed, exit 70 | return None 71 | # Move the file to cache? 72 | if write_cache and (cache_dir is not None): 73 | try: 74 | if not os.path.isdir(cache_dir): 75 | os.mkdir(cache_dir) 76 | shutil.move(filename, FILE_GZ) 77 | # Move successful, open from cache 78 | return gzip.open(FILE_GZ, 'rb') 79 | except: 80 | # Move failed, open file from temp location 81 | return gzip.open(filename, 'rb') 82 | else: 83 | # Open from temp location 84 | return gzip.open(filename, 'rb') 85 | # If we didn't quit before this place, all failed. 86 | return None 87 | 88 | 89 | class LiftOverChainFile: 90 | ''' 91 | The class, which loads and indexes USCS's .over.chain files. 92 | 93 | Specification of the chain format can be found here: http://genome.ucsc.edu/goldenPath/help/chain.html 94 | ''' 95 | 96 | def __init__(self, f): 97 | ''' 98 | Reads chain data from the file and initializes an interval index. 99 | f must be a file object open for reading. 
100 | If any errors are detected, an Exception is thrown. 101 | ''' 102 | self.chains = self._load_chains(f) 103 | self.chain_index = self._index_chains(self.chains) 104 | 105 | @staticmethod 106 | def _load_chains(f): 107 | ''' 108 | Loads all LiftOverChain objects from a file into an array. Returns the result. 109 | ''' 110 | chains = [] 111 | while True: 112 | line = f.readline() 113 | if not line: 114 | break 115 | if line.startswith(b'#') or line.startswith(b'\n') or line.startswith(b'\r'): 116 | continue 117 | if line.startswith(b'chain'): 118 | # Read chain 119 | chains.append(LiftOverChain(line, f)) 120 | continue 121 | return chains 122 | 123 | @staticmethod 124 | def _index_chains(chains): 125 | ''' 126 | Given a list of LiftOverChain objects, creates a 127 | dict: source_name --> 128 | IntervalTree: --> 129 | (target_from, target_to, chain) 130 | Returns the resulting dict. 131 | Throws an exception on any errors or inconsistencies among chains (e.g. different sizes specified for the same chromosome in various chains). 132 | ''' 133 | chain_index = {} 134 | source_size = {} 135 | target_size = {} 136 | for c in chains: 137 | # Verify that sizes of chromosomes are consistent over all chains 138 | source_size.setdefault(c.source_name, c.source_size) 139 | if source_size[c.source_name] != c.source_size: 140 | raise Exception("Chains have inconsistent specification of source chromosome size for %s (%d vs %d)" % (c.source_name, source_size[c.source_name], c.source_size)) 141 | target_size.setdefault(c.target_name, c.target_size) 142 | if target_size[c.target_name] != c.target_size: 143 | raise Exception("Chains have inconsistent specification of target chromosome size for %s (%d vs %d)" % (c.target_name, target_size[c.target_name], c.target_size)) 144 | chain_index.setdefault(c.source_name, IntervalTree(0, c.source_size)) 145 | # Register all blocks from the chain in the corresponding interval tree 146 | tree = chain_index[c.source_name] 147 | for (sfrom, sto, tfrom, tto) in c.blocks: 148 | tree.add_interval(sfrom, sto, (tfrom, tto, c)) 149 | 150 | # Sort all interval trees 151 | for k in chain_index: 152 | chain_index[k].sort() 153 | return chain_index 154 | 155 | def query(self, chromosome, position): 156 | ''' 157 | Given a chromosome and position, returns all matching records from the chain index. 158 | Each record is an interval (source_from, source_to, data) 159 | where data = (target_from, target_to, chain). Note that depending on chain.target_strand, the target values may need to be reversed (e.g. pos --> chain.target_size - pos). 160 | 161 | If chromosome is not found in the index, None is returned. 162 | ''' 163 | # A somewhat-ugly hack to allow both 'bytes' and 'str' objects to be used as 164 | # chromosome names in Python 3. As we store chromosome names as strings, 165 | # we'll transparently translate the query to a string too. 166 | if type(chromosome).__name__ == 'bytes': 167 | chromosome = chromosome.decode('ascii') 168 | if chromosome not in self.chain_index: 169 | return None 170 | else: 171 | return self.chain_index[chromosome].query(position) 172 | 173 | 174 | class LiftOverChain: 175 | ''' 176 | Represents a single chain from an .over.chain file. 177 | A chain basically maps a set of intervals from "source" coordinates to corresponding coordinates in "target" coordinates. 178 | The "source" and "target" are somehow referred to in the specs (http://genome.ucsc.edu/goldenPath/help/chain.html) 179 | as "target" and "query" respectively. 
180 | ''' 181 | def __init__(self, header, f): 182 | ''' 183 | Reads the chain from a stream given the first line and a file opened at all remaining lines. 184 | On error throws an exception. 185 | ''' 186 | if sys.version_info >= (3, 0): 187 | header = header.decode('ascii') # In Python 2, work with usual strings. 188 | fields = header.split() 189 | if fields[0] != 'chain' and len(fields) not in [12, 13]: 190 | raise Exception("Invalid chain format. (%s)" % header) 191 | # chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1 192 | self.score = int(fields[1]) # Alignment score 193 | self.source_name = fields[2] # E.g. chrY 194 | self.source_size = int(fields[3]) # Full length of the chromosome 195 | self.source_strand = fields[4] # Must be + 196 | if self.source_strand != '+': 197 | raise Exception("Source strand in an .over.chain file must be +. (%s)" % header) 198 | self.source_start = int(fields[5]) # Start of source region 199 | self.source_end = int(fields[6]) # End of source region 200 | self.target_name = fields[7] # E.g. chr5 201 | self.target_size = int(fields[8]) # Full length of the chromosome 202 | self.target_strand = fields[9] # + or - 203 | if self.target_strand not in ['+', '-']: 204 | raise Exception("Target strand must be - or +. (%s)" % header) 205 | self.target_start = int(fields[10]) 206 | self.target_end = int(fields[11]) 207 | self.id = None if len(fields) == 12 else fields[12].strip() 208 | 209 | # Now read the alignment chain from the file and store it as a list (source_from, source_to) -> (target_from, target_to) 210 | sfrom, tfrom = self.source_start, self.target_start 211 | self.blocks = [] 212 | fields = f.readline().decode('ascii').split() 213 | while len(fields) == 3: 214 | size, sgap, tgap = int(fields[0]), int(fields[1]), int(fields[2]) 215 | self.blocks.append((sfrom, sfrom+size, tfrom, tfrom+size)) 216 | sfrom += size + sgap 217 | tfrom += size + tgap 218 | fields = f.readline().split() 219 | if len(fields) != 1: 220 | raise Exception("Expecting one number on the last line of alignments block. (%s)" % header) 221 | size = int(fields[0]) 222 | self.blocks.append((sfrom, sfrom+size, tfrom, tfrom+size)) 223 | if (sfrom + size) != self.source_end or (tfrom + size) != self.target_end: 224 | raise Exception("Alignment blocks do not match specified block sizes. 
(%s)" % header) 225 | -------------------------------------------------------------------------------- /pyliftover/hg17ToHg19.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/pyliftover/hg17ToHg19.over.chain.gz -------------------------------------------------------------------------------- /pyliftover/hg18ToHg19.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/pyliftover/hg18ToHg19.over.chain.gz -------------------------------------------------------------------------------- /pyliftover/hg19ToGRCh37.over.chain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/pyliftover/hg19ToGRCh37.over.chain.gz -------------------------------------------------------------------------------- /pyliftover/intervaltree.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Interval Tree data structure for indexing a set of 3 | integer intervals of the form [start, end). 4 | 5 | http://en.wikipedia.org/wiki/Interval_tree 6 | 7 | Copyright 2013, Konstantin Tretyakov. 8 | http://kt.era.ee/ 9 | 10 | Licensed under MIT license. 11 | ''' 12 | 13 | 14 | class IntervalTree: 15 | ''' 16 | Interval Tree data structure for indexing a set of 17 | integer intervals of the form [start, end). 18 | 19 | See: http://en.wikipedia.org/wiki/Interval_tree 20 | 21 | The tree assumes it is covered in intervals reasonably uniformly (reasonable assumption for our liftOver purposes), 22 | and always picks its center as the middle point between the prespecified "min" and "max" values. 23 | No removal operation is implemented. 24 | 25 | >>> t = IntervalTree(0, 100) 26 | >>> t.query(2) 27 | [] 28 | >>> t.add_interval(10, 25) 29 | >>> t.add_interval(15, 27) 30 | >>> t.sort() 31 | >>> t.query(10) 32 | [(10, 25, None)] 33 | >>> t.query(24) 34 | [(10, 25, None), (15, 27, None)] 35 | >>> t.query(25) 36 | [(15, 27, None)] 37 | >>> t.query(27) 38 | [] 39 | ''' 40 | 41 | def __init__(self, min, max): 42 | ''' 43 | Creates a tree node for keeping intervals somewhere in the range [min...max). 44 | ''' 45 | self.min = int(min) 46 | self.max = int(max) 47 | assert self.min < self.max 48 | self.center = (min + max)/2 49 | self.single_interval = None # We take special care of trees which only contain a single interval 50 | self.left_subtree = None # Intervals which are all strictly to the left of center. 51 | self.right_subtree = None # Intervals which are all strictly to the right of center. 52 | self.mid_sorted_by_start = [] # Intervals which contain center, sorted by start position 53 | self.mid_sorted_by_end = [] # Same intervals, sorted by end position. 54 | 55 | def add_interval(self, start, end, data=None): 56 | ''' 57 | Inserts an interval to the tree. 58 | Note that when inserting we do not maintain appropriate sorting of the "mid" data structure. 59 | This should be done after all intervals are inserted. 60 | ''' 61 | # Ignore intervals of 0 or negative length 62 | if (end - start) <= 0: 63 | return 64 | if self.single_interval is None: 65 | # This is an empty tree and we are adding the first interval. Just record it in a field. 
66 | self.single_interval = (start, end, data) 67 | elif self.single_interval == 0: 68 | # This is a usual tree, use standard addition method 69 | self._add_interval(start, end, data) 70 | else: 71 | # This is a tree with a single interval. Convert to a usual tree. 72 | self._add_interval(*self.single_interval) 73 | self.single_interval = 0 74 | self._add_interval(start, end, data) 75 | 76 | def _add_interval(self, start, end, data=None): 77 | if end <= self.center: 78 | # Insert into left subtree 79 | if self.left_subtree is None: 80 | self.left_subtree = IntervalTree(self.min, self.center) 81 | self.left_subtree.add_interval(start, end, data) 82 | elif start > self.center: 83 | if self.right_subtree is None: 84 | self.right_subtree = IntervalTree(self.center, self.max) 85 | self.right_subtree.add_interval(start, end, data) 86 | else: 87 | self.mid_sorted_by_start.append((start, end, data)) 88 | self.mid_sorted_by_end.append((start, end, data)) 89 | 90 | def sort(self): 91 | ''' 92 | Must be invoked after all intevals have been added to sort mid_** arrays. 93 | ''' 94 | if self.single_interval is None or self.single_interval != 0: 95 | return # Nothing to do for empty and leaf trees. 96 | self.mid_sorted_by_start.sort(key = lambda x: x[0]) 97 | self.mid_sorted_by_end.sort(key = lambda x: x[1], reverse=True) 98 | if self.left_subtree is not None: 99 | self.left_subtree.sort() 100 | if self.right_subtree is not None: 101 | self.right_subtree.sort() 102 | 103 | def query(self, x): 104 | ''' 105 | Returns all intervals in the tree, which overlap given point, i.e. all (start, end, data) records, for which (start <= x < end). 106 | ''' 107 | result = [] 108 | self._query(x, result) 109 | return result 110 | 111 | def _query(self, x, result): 112 | ''' 113 | Same as self.query, but uses a provided list to accumulate results into. 114 | ''' 115 | if self.single_interval is None: # Empty 116 | return 117 | elif self.single_interval != 0: # Single interval, just check whether x is in it 118 | if self.single_interval[0] <= x < self.single_interval[1]: 119 | result.append(self.single_interval) 120 | elif x < self.center: # Normal tree, query point to the left of center 121 | if self.left_subtree is not None: 122 | self.left_subtree._query(x, result) 123 | for int in self.mid_sorted_by_start: 124 | if int[0] <= x: 125 | result.append(int) 126 | else: 127 | break 128 | else: # Normal tree, query point to the right of center 129 | for int in self.mid_sorted_by_end: 130 | if int[1] > x: 131 | result.append(int) 132 | else: 133 | break 134 | if self.right_subtree is not None: 135 | self.right_subtree._query(x, result) 136 | 137 | def __len__(self): 138 | ''' 139 | The number of intervals maintained in the tree. 140 | Note that adding zero- or negative-size intervals does not affect its size (they are not registered). 
141 | 142 | >>> t = IntervalTree(0, 100) 143 | >>> t.add_interval(1, 10) 144 | >>> t.add_interval(20, 30) 145 | >>> t.add_interval(20, 20) 146 | >>> t.add_interval(20, 19) 147 | >>> len(t) 148 | 2 149 | ''' 150 | 151 | if self.single_interval is None: 152 | return 0 153 | elif self.single_interval != 0: 154 | return 1 155 | else: 156 | size = len(self.mid_sorted_by_start) 157 | if self.left_subtree is not None: 158 | size += len(self.left_subtree) 159 | if self.right_subtree is not None: 160 | size += len(self.right_subtree) 161 | return size 162 | 163 | def __iter__(self): 164 | if self.single_interval is None: 165 | return 166 | elif self.single_interval != 0: 167 | yield self.single_interval 168 | else: 169 | if self.left_subtree is not None: 170 | for s in self.left_subtree: 171 | yield s 172 | for s in self.mid_sorted_by_start: 173 | yield s 174 | if self.right_subtree is not None: 175 | for s in self.right_subtree: 176 | yield s 177 | 178 | -------------------------------------------------------------------------------- /pyliftover/liftover.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Pure-python implementation of UCSC "liftover" genome coordinate conversion. 3 | Main class, which is actually a convenience wrapper around chainfile.py's LiftOverChainFile 4 | 5 | Copyright 2013, Konstantin Tretyakov. 6 | http://kt.era.ee/ 7 | 8 | Licensed under MIT license. 9 | ''' 10 | 11 | import os.path 12 | import gzip 13 | from .chainfile import open_liftover_chain_file, LiftOverChainFile 14 | 15 | class LiftOver: 16 | def __init__(self, from_db, to_db=None, search_dir='.', cache_dir=os.path.expanduser("~/.pyliftover"), use_web=True, write_cache=True, use_gzip=None): 17 | ''' 18 | LiftOver can be initialized in multiple ways. 19 | * By providing a filename as a single argument: LiftOver("hg17ToHg18.over.chain.gz") 20 | The file may be a usual or a gzip-compressed file. The compression is automatically detected from the .gz extension. 21 | If you want to override the way this is handled (e.g. open a file with non-gz extension as gzipped file), use use_gzip=True or use_gzip=False as needed. 22 | * By providing an opened file opbject as a single argument: LiftOver(open("hg17ToHg18.over.chain")) 23 | * By providing name of from_db and to_db, e.g. LiftOver('hg18', 'hg19'). 24 | In this case, LiftOver will "intelligently" search for the best available over.chain file for converting between the assemblies. 25 | The file will be searched in local directory, cache directory, or even downloaded from the web, if possible. 26 | The exact way this is handled (as well as all the other parameters of the constructor) is documented in 27 | :see:`pyliftover.chainfile.open_liftover_chain_file`. 28 | 29 | Test providing filename: 30 | >>> lo = LiftOver('tests/data/mds42.to.mg1655.liftOver') 31 | >>> lo.convert_coordinate('AP012306.1', 16000) #doctest: +ELLIPSIS (because on 32-bit systems there's an L qualifier after the number and on 64-bit ones there's nothing. 
32 | [('Chromosome', 21175, '+', 378954552...)] 33 | 34 | Test providing from_db and to_db: 35 | >>> lo = LiftOver('hg17', 'hg18') 36 | >>> lo.convert_coordinate('chr1', 1000000) 37 | [('chr1', 949796, '+', 21057807908...)] 38 | >>> lo.convert_coordinate('chr1', 0) 39 | [('chr1', 0, '+', 21057807908...)] 40 | >>> lo.convert_coordinate('chr1', 0, '-') 41 | [('chr1', 0, '-', 21057807908...)] 42 | >>> lo.convert_coordinate('chr1', 103786442) 43 | [('chr20', 20668001, '-', 14732...)] 44 | >>> lo.convert_coordinate('chr1', 103786443, '-') 45 | [('chr20', 20668000, '+', 14732...)] 46 | >>> lo.convert_coordinate('chr1', 103786441, '+') 47 | [] 48 | ''' 49 | if to_db is None: 50 | # A file name or a file object was provided 51 | if isinstance(from_db, str): 52 | do_gzip = use_gzip if use_gzip is not None else from_db.lower().endswith('.gz') 53 | if do_gzip: 54 | f = gzip.open(from_db, 'rb') 55 | else: 56 | f = open(from_db, 'rb') 57 | else: 58 | f = from_db 59 | else: 60 | # From- and To- db names were provided. 61 | f = open_liftover_chain_file(from_db=from_db, to_db=to_db, search_dir=search_dir, cache_dir=cache_dir, use_web=use_web, write_cache=write_cache) 62 | self.chain_file = LiftOverChainFile(f) 63 | f.close() 64 | 65 | def convert_coordinate(self, chromosome, position, strand='+'): 66 | ''' 67 | Returns a *list* of possible conversions for a given chromosome position. 68 | The list may be empty (no conversion), have a single element (unique conversion), or several elements (position mapped to several chains). 69 | The list contains tuples (target_chromosome, target_position, target_strand, conversion_chain_score), 70 | where conversion_chain_score is the "alignment score" field specified at the chain used to perform conversion. If there 71 | are several possible conversions, they are sorted by decreasing conversion_chain_score. 72 | 73 | IF chromosome is completely unknown to the LiftOver, None is returned. 74 | 75 | Note that coordinates are 0-based, and even at negative strand are relative to the beginning of the genome. 76 | I.e. position 0 strand + is the first position of the genome. Position 0 strand - is also the first position of the genome 77 | (and the last position of reverse-complemented genome). 78 | ''' 79 | query_results = self.chain_file.query(chromosome, position) 80 | if query_results is None: 81 | return None 82 | else: 83 | # query_results contains intervals which contain the query point. We simply have to remap to corresponding targets. 
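            # A worked example of the remapping below (hypothetical numbers):
            # suppose a chain block maps source interval [100, 200) to target interval
            # [5100, 5200); then querying position 150 gives
            # target_start + (position - source_start) = 5100 + 50 = 5150.
            # For a '-' strand chain the value is flipped to chain.target_size - 1 - 5150,
            # i.e. counted from the end of the target chromosome, and the reported strand
            # is the opposite of the query strand.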
84 | results = [] 85 | for (source_start, source_end, data) in query_results: 86 | target_start, target_end, chain = data 87 | result_position = target_start + (position - source_start) 88 | if chain.target_strand == '-': 89 | result_position = chain.target_size - 1 - result_position 90 | result_strand = chain.target_strand if strand == '+' else ('+' if chain.target_strand == '-' else '-') 91 | results.append((chain.target_name, result_position, result_strand, chain.score)) 92 | #if len(results) > 1: 93 | results.sort(key=lambda x: x[3], reverse=True) 94 | return results 95 | -------------------------------------------------------------------------------- /qq.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import scipy.stats as sstats 8 | import argparse 9 | import pandas as pd 10 | import json 11 | 12 | # Examples: 13 | # python qq.py MSA_MSA_2016_lift_noMHC_correct_case_control.csv.gz --strata CTG_COG_2018.csv.gz --strata-num PVAL --top-as-dot 100 --weights weights.tld.txt.gz --out qq.msa_cog.top100.tld.png 14 | # python qq.py PGC_MDD_2018_no23andMe.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz --strata-cat CHR --strata-cat-ids 'chr1_10=1:2:3:4:5:6:7:8:9:10,chr11_20=11:12:13:14:15:16:17:18:19:20,chr21_22=21:22' --top-as-dot 100 --weights weights.prune.txt.gz --y-lim 7.301029995663981 --out qq.mdd.chr.top100.prune.png 15 | # python qq.py PGC_SCZ_2014_EUR_qc.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz --strata-num PVAL --top-as-dot 100 --weights weights.prune.txt.gz --out qq.scz_mdd.top100.prune.png --y-lim 7.301029995663981 16 | 17 | example_text = """Example 1: 18 | python qq.py PGC_BIP_2016_qc.csv.gz 19 | Example 2: 20 | python qq.py PGC_SCZ_2014_EUR_qc.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz \\ 21 | --strata-num PVAL --top-as-dot 100 --weights weights.prune.txt.gz \\ 22 | --out qq.scz_mdd.top100.prune.png --y-lim 15 23 | Example 3: 24 | python qq.py PGC_MDD_2018_no23andMe.csv.gz --strata PGC_MDD_2018_no23andMe.csv.gz \\ 25 | --strata-cat CHR --strata-cat-ids 'chr1_7=1:2:3:4:5:6:7,chr18_21=18:19:20:21' \\ 26 | --weights weights.tld.txt.gz --y-lim 7.301029995663981 --out qq.mdd.chr.png""" 27 | 28 | 29 | def parse_args(args): 30 | parser = argparse.ArgumentParser( 31 | formatter_class=argparse.RawDescriptionHelpFormatter, 32 | description="A tool to qq plots from sumstats.", 33 | epilog=example_text) 34 | 35 | parser.add_argument("sumstats", help="Sumstats file") 36 | parser.add_argument("--sep", default='\t', 37 | help="Column separator in sumstat file") 38 | parser.add_argument("--p", default="PVAL", 39 | help="A column with SNP p-values in sumstats file") 40 | parser.add_argument("--snp", default="SNP", 41 | help="A column with SNP ids in sumstats file") 42 | parser.add_argument("--strata", default="NA", 43 | help="A file with at least 2 columns: SNP id and SNP stratum") 44 | parser.add_argument("--strata-sep", default='\t', 45 | help="Column separator in strata file") 46 | parser.add_argument("--strata-snp", default="SNP", 47 | help="A column with SNP ids in strata file") 48 | parser.add_argument("--strata-cat", default="NA", 49 | help="A column with SNP categories. Each category represents a separate stratum in qq plot") 50 | parser.add_argument("--strata-cat-ids", default="NA", 51 | help=("Comma-separated list of categories from --strata-cat column to plot " 52 | "and corresponding names, e.g. 
'chr1_2_6=1:2:6' (defines strata chr1_2_6 " 53 | "containinfg all variants with value in --strata-cat column = 1,2 or 6). " 54 | "By default all categories are plotted with original names")) 55 | parser.add_argument("--strata-num", default="NA", 56 | help="A column with SNP numerical value (e.g. p-value)") 57 | parser.add_argument("--strata-num-intervals", type=str, 58 | default="p<10^-1=:0.1,p<10^-2=:0.01,p<10^-3=:0.001", help=("Comma-separated " 59 | "intervals defining SNP strata based on values from --strata-num column " 60 | "and corresponding names, e.g.: 'A=:-1,B=0:6' (defines stratum A " 61 | "corresponding to the interval (-inf, -1] and stratum B = (0,6]. " 62 | "If there is a '-' charecter in any of values, the whole argument value should be quoted")) 63 | parser.add_argument("--strata-bin", nargs='+', default="NA", 64 | help=("A list of columns (each column representing one stratum) with binary data " 65 | "0/1 or False/True for each variant indicatig whether the variant belongs to " 66 | "the corresponding strata")) 67 | parser.add_argument("--weights", default="NA", 68 | help=("Tab-separated file without header and with 2 columns: SNP id and SNP weight. " 69 | "Don't need to be normalized")) 70 | parser.add_argument("--top-as-dot", default=0, type=int, 71 | help="Number of top associations (lowest p-values) to mark as a separate dot") 72 | parser.add_argument("--x-lim", default=None, type=float, 73 | help="X-axis maximum limit on -log10 scale") 74 | parser.add_argument("--y-lim", default=None, type=float, 75 | help="Y-axis maximum limit on -log10 scale (e.g. gws threshold = 7.3)") 76 | parser.add_argument("--out", default="qq.png", help="Output file name") 77 | 78 | return parser.parse_args(args) 79 | 80 | 81 | def drop_duplicated_ind(df): 82 | i = df.index.duplicated(keep='first') 83 | if i.any(): 84 | print("The table contains %d duplicated ids" % sum(i)) 85 | print("Only the first row with duplicated id will be retained") 86 | df = df.loc[~i,:] 87 | return df 88 | 89 | 90 | def process_args(args): 91 | """ 92 | Check whether provided arguments are correct, change list-type arguments 93 | with single value to have a length = length of sumstats argument and process 94 | chr2use arument. 
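    For example (a sketch of the parsing performed here), --strata-num-intervals 'A=:-1,B=0:6'
    is turned into {'A': (-inf, -1.0), 'B': (0.0, 6.0)} (later used as (start, end] intervals),
    and --strata-cat-ids 'chr1_2=1:2' into {'chr1_2': frozenset({'1', '2'})}.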
95 | """ 96 | assert os.path.isfile(args.sumstats), "'%s' file doesn't exist" % args.sumstats 97 | assert os.path.isfile(args.strata) or args.strata=="NA", "'%s' file doesn't exist" % args.strata 98 | assert os.path.isfile(args.weights) or args.weights=="NA", "'%s' file doesn't exist" % args.weights 99 | 100 | # process special arguments 101 | arg_dict = vars(args) 102 | if args.strata_num != "NA": 103 | intervals = {} 104 | for name_i in arg_dict["strata_num_intervals"].split(","): 105 | name, i = name_i.split("=") 106 | name = name.strip() 107 | assert name != "", "Stratum name should not be an empty string" 108 | start,end = i.split(":") 109 | start = -np.inf if start == "" else float(start) 110 | end = np.inf if end == "" else float(end) 111 | assert not name in intervals, "Stratum name must be unique (duplicated name: %s)" % name 112 | intervals[name] = (start, end) 113 | arg_dict["strata_num_intervals"] = intervals 114 | if args.strata_cat != "NA" and args.strata_cat_ids != "NA": 115 | categories = {} 116 | for name_c in arg_dict["strata_cat_ids"].split(","): 117 | name, c = name_c.split("=") 118 | name = name.strip() 119 | assert name != "", "Stratum name should not be an empty string" 120 | c = frozenset(map(str.strip, c.split(":"))) 121 | assert not name in categories, "Strata name must be unique (duplicated name: %s)" % name 122 | categories[name] = c 123 | arg_dict["strata_cat_ids"] = categories 124 | 125 | 126 | def read_sumstats(sumstats_f, sep, snpid_col, pval_col): 127 | """ 128 | Filter original summary stats file. 129 | Args: 130 | sumstats_f: sumstats file name 131 | sep: column separator in sumstats_f 132 | snpid_col: a name of column with variant ids 133 | pval_col: a name of column with variant p-values 134 | Returns: 135 | df: filtered p-values, pd.DataFrame(index=snp_id, values=pval) 136 | """ 137 | print("Reading %s" % sumstats_f) 138 | cols2use = [snpid_col, pval_col] 139 | df = pd.read_csv(sumstats_f, usecols=cols2use, index_col=snpid_col, 140 | sep=sep) 141 | print("%d SNPs in %s" % (len(df), sumstats_f)) 142 | df = df.loc[np.isfinite(df[pval_col]),:] 143 | print("%d SNPs with defined p-value" % len(df)) 144 | df = df.loc[df[pval_col]>0,:] 145 | print("%d SNPs with non-zero p-value" % len(df)) 146 | df = drop_duplicated_ind(df) 147 | return df 148 | 149 | 150 | def read_strata_cat(strata_f, sep, snpid_col, strata_cat_col, strata_cat_ids): 151 | print("Reading strata file %s" % strata_f) 152 | cols2use = [snpid_col, strata_cat_col] 153 | df = pd.read_csv(strata_f, usecols=cols2use, index_col=snpid_col, sep=sep, 154 | dtype={strata_cat_col:str}) 155 | # make a standard name for variant strata column 156 | if strata_cat_ids == "NA": 157 | for s in df[strata_cat_col].unique(): 158 | stratum_i = (df[strata_cat_col] == s) 159 | df.loc[:,s] = stratum_i 160 | else: 161 | for name, ids_set in strata_cat_ids.items(): 162 | stratum_i = df[strata_cat_col].isin(ids_set) 163 | df.loc[:,name] = stratum_i 164 | df.drop([strata_cat_col], axis=1, inplace=True) 165 | # keep only variants which are within any stratum 166 | df = df.loc[df.any(axis=1)] 167 | assert len(df) > 0, "All strata are empty" 168 | df = drop_duplicated_ind(df) 169 | return df 170 | 171 | 172 | def read_strata_num(strata_f, sep, snpid_col, strata_num_col, strata_num_intervals): 173 | print("Reading strata file %s" % strata_f) 174 | cols2use = [snpid_col, strata_num_col] 175 | df = pd.read_csv(strata_f, usecols=cols2use, index_col=snpid_col, sep=sep, 176 | dtype={strata_num_col:float}) 177 | for name, (start, 
end) in strata_num_intervals.items(): 178 | stratum_i = (start<df[strata_num_col]) & (df[strata_num_col]<=end) 179 | df.loc[:,name] = stratum_i 180 | df.drop([strata_num_col], axis=1, inplace=True) 181 | # keep only variants which are within any stratum 182 | df = df.loc[df.any(axis=1)] 183 | assert len(df) > 0, "All strata are empty" 184 | df = drop_duplicated_ind(df) 185 | return df 186 | 187 | 188 | def read_strata_bin(strata_f, sep, snpid_col, strata_bin): 189 | print("Reading strata file %s" % strata_f) 190 | cols2use = [snpid_col] + strata_bin 191 | df = pd.read_csv(strata_f, usecols=cols2use, index_col=snpid_col, sep=sep, 192 | dtype=dict.fromkeys(strata_bin, bool)) 193 | df = df.loc[df.any(axis=1)] 194 | assert len(df) > 0, "All strata are empty" 195 | df = drop_duplicated_ind(df) 196 | return df 197 | 198 | 199 | def read_weights(weights_f): 200 | print("Reading weights file %s" % weights_f) 201 | df = pd.read_csv(weights_f, sep='\t', header=None, names=["snp", "w"], 202 | index_col="snp") 203 | # drop zero weights 204 | df = df.loc[df.w>0,:] 205 | df = drop_duplicated_ind(df) 206 | return df 207 | 208 | 209 | def get_xy_from_p(p, top_as_dot, p_weights, nbins=200): 210 | if p_weights is None: 211 | p_weights = np.ones(len(p)) 212 | p_weights /= sum(p_weights) # normalize weights 213 | 214 | i = np.argsort(p) 215 | p = p[i] 216 | p_weights = p_weights[i] 217 | p_ecdf = np.concatenate([[0], np.cumsum(p_weights)]) 218 | 219 | y = np.logspace(np.log10(p[-1]), np.log10(p[top_as_dot]), nbins) 220 | i = np.searchsorted(p, y, side='right') 221 | i[0] = len(p) # last index in p_ecdf 222 | i[-1] = top_as_dot+1 # top_as_dot index in p_ecdf 223 | # estimate standard uniform quantiles corresponding to y observed quantiles 224 | uniform_quantiles = p_ecdf[i] 225 | x = -np.log10(uniform_quantiles) 226 | y = -np.log10(y) 227 | # if top_as_dot = 0, then x_dot and y_dot are empty arrays 228 | x_dot = -np.log10(p_ecdf[1:top_as_dot+1]) 229 | y_dot = -np.log10(p[:top_as_dot]).values 230 | return x, y, x_dot, y_dot 231 | 232 | 233 | def get_ci(p, p_weights, ci_alpha=0.05, nbins=200): 234 | # TODO: the first part of this function is identical to the first part of 235 | # get_xy_from_p(), so probably should be merged?? 236 | if p_weights is None: 237 | p_weights = np.ones(len(p)) 238 | p_weights *= len(p)/sum(p_weights) # normalize weights and imitate order statistics 239 | 240 | i = np.argsort(p) 241 | p = p[i] 242 | p_weights = p_weights[i] 243 | cum_p_weights = np.cumsum(p_weights) 244 | 245 | y = np.logspace(np.log10(p[-1]), np.log10(p[0]), nbins) 246 | # the following code is inspired by: 247 | # https://genome.sph.umich.edu/wiki/Code_Sample:_Generating_QQ_Plots_in_R 248 | # beta_a is our order statistics.
For standard uniform distr (expected under null) 249 | # it follows beta distr: 250 | # https://en.wikipedia.org/wiki/Order_statistic#Order_statistics_sampled_from_a_uniform_distribution 251 | i = np.searchsorted(p, y, side='left') 252 | i[0] = len(p) - 1 253 | i[-1] = 0 254 | beta_a = cum_p_weights[i] 255 | beta_b = len(p) + 1 - beta_a 256 | lower_ci = -np.log10(sstats.beta.ppf(1-ci_alpha/2, beta_a, beta_b)) 257 | upper_ci = -np.log10(sstats.beta.ppf(ci_alpha/2, beta_a, beta_b)) 258 | x_ci = -np.log10(beta_a/len(p)) 259 | return x_ci, lower_ci, upper_ci 260 | 261 | class NumpyEncoder(json.JSONEncoder): 262 | def default(self, obj): 263 | if callable(obj): 264 | return str(obj) 265 | if isinstance(obj, np.ndarray): 266 | return obj.tolist() 267 | if isinstance(obj, pd.Series) or isinstance(obj, pd.Index): 268 | return obj.values.tolist() 269 | if isinstance(obj, np.float32): 270 | return np.float64(obj) 271 | return json.JSONEncoder.default(self, obj) 272 | 273 | 274 | if __name__ == "__main__": 275 | args = parse_args(sys.argv[1:]) 276 | process_args(args) 277 | 278 | df_sumstats = read_sumstats(args.sumstats, args.sep, args.snp, args.p) 279 | 280 | df_strata = None 281 | if args.strata_cat != "NA": 282 | df_strata = read_strata_cat(args.strata, args.strata_sep, args.strata_snp, 283 | args.strata_cat, args.strata_cat_ids) 284 | elif args.strata_num != "NA": 285 | df_strata = read_strata_num(args.strata, args.strata_sep, args.strata_snp, 286 | args.strata_num, args.strata_num_intervals) 287 | elif args.strata_bin != "NA": 288 | df_strata = read_strata_bin(args.strata, args.strata_sep, args.strata_snp, 289 | args.strata_bin) 290 | 291 | if args.weights != 'NA': 292 | df_weights = read_weights(args.weights) 293 | snps_with_weight = df_sumstats.index.intersection(df_weights.index) 294 | print("%d varints from %s have weight" % (len(snps_with_weight), args.sumstats)) 295 | assert len(snps_with_weight) > 0, ("At least one variant from %s must " 296 | "have weight in %s if weights are provided" % (args.sumstats, args.weights)) 297 | print("Only they will be plotted") 298 | df_sumstats = df_sumstats.loc[snps_with_weight,:] 299 | df_sumstats["weights"] = df_weights.loc[snps_with_weight,:] 300 | else: 301 | # if weights are not provided set equal weights to all SNPs 302 | df_sumstats["weights"] = 1. 
303 | 304 | if not df_strata is None: 305 | # drop variants which are not in df_sumstats 306 | df_strata = df_strata.loc[df_strata.index.isin(df_sumstats.index),:] 307 | # df_strata is either None or: 308 | # df_strata = DataFrame(index=snp_ids, columns=boolean_strata) 309 | 310 | x, y, x_dot, y_dot = get_xy_from_p(df_sumstats[args.p], args.top_as_dot, 311 | df_sumstats["weights"]) 312 | x_ci, lower_ci, upper_ci = get_ci(df_sumstats[args.p], df_sumstats["weights"]) 313 | 314 | # estimate axis limits 315 | max_x_lim = max(x[-1],x_ci[-1], 0 if args.top_as_dot==0 else x_dot[0]) 316 | max_y_lim = max(y[-1],upper_ci[-1], 0 if args.top_as_dot==0 else y_dot[0]) 317 | 318 | print("Making plot") 319 | json_data = {} 320 | fig, ax = plt.subplots(figsize=(6,6), dpi=200) 321 | 322 | # plot null and ci 323 | ax.fill_between(x_ci, lower_ci, upper_ci, color="0.8"); json_data['x_ci'] = x_ci; json_data['lower_ci'] = lower_ci; json_data['upper_ci'] = upper_ci 324 | ax.plot([0,x_ci[-1]], [0,x_ci[-1]], ls='--', lw=1, marker=' ', color="k") 325 | # plot all data 326 | if df_strata is None: 327 | ax.plot(x, y, ls='-', lw=1, marker=' ', label="all variants", color='C0'); json_data['x'] = x; json_data['y'] = y 328 | if args.top_as_dot > 0: 329 | ax.plot(x_dot, y_dot, ls=' ', marker='.', ms=1, color='C0'); json_data['x_dot'] = x_dot; json_data['y_dot'] = y_dot 330 | 331 | # plot strata 332 | if not df_strata is None: 333 | json_data['stratum'] = [] 334 | for j, stratum_id in enumerate(df_strata.columns): 335 | i = df_strata.index[df_strata[stratum_id]] 336 | json_stratum = {'stratum_id':stratum_id} 337 | x, y, x_dot, y_dot = get_xy_from_p(df_sumstats.loc[i,args.p], 338 | args.top_as_dot, df_sumstats.loc[i,"weights"]) 339 | color = "C%d" % ((j%9)+1); json_stratum['color'] = color 340 | ax.plot(x, y, ls='-', lw=1, marker=' ', label=stratum_id, color=color); json_stratum['x'] = x; json_stratum['y'] = y 341 | if args.top_as_dot > 0: 342 | ax.plot(x_dot, y_dot, ls=' ', marker='.', ms=1, color=color); json_stratum['x_dot'] = x_dot; json_stratum['y_dot'] = y_dot 343 | # update upper limits if needed 344 | max_x_lim = max(max_x_lim, x[-1], 0 if args.top_as_dot==0 else x_dot[0]) 345 | max_y_lim = max(max_y_lim, y[-1], 0 if args.top_as_dot==0 else y_dot[0]) 346 | json_data['stratum'].append(json_stratum) 347 | 348 | ax.set_xlabel(r"expected $\mathrm{-log_{10}(P)}$") 349 | ax.set_ylabel(r"observed $\mathrm{-log_{10}(P)}$") 350 | 351 | if not args.x_lim is None: 352 | max_x_lim = args.x_lim 353 | if not args.y_lim is None: 354 | max_y_lim = args.y_lim 355 | ax.set_xlim((-0.005*max_x_lim, 1.01*max_x_lim)) 356 | ax.set_ylim((-0.005*max_y_lim, 1.01*max_y_lim)) 357 | 358 | # configure and set title 359 | title = os.path.splitext(os.path.basename(args.sumstats))[0] 360 | if args.strata != "NA": 361 | strata = os.path.splitext(os.path.basename(args.strata))[0] 362 | title = "%s | %s" % (title, strata) 363 | ax.set_title(title, fontsize='small'); json_data['title'] = title 364 | 365 | ax.legend(loc='upper left', fontsize="small") 366 | 367 | # remove top and right spines 368 | ax.spines['right'].set_visible(False) 369 | ax.spines['top'].set_visible(False) 370 | # add offset for left spine 371 | # ax.spines['left'].set_position(('outward',1)) 372 | # ax.spines['bottom'].set_position(('outward',1)) 373 | 374 | plt.grid(True) 375 | # plt.axis('equal') 376 | plt.tight_layout() 377 | # plt.show() 378 | 379 | plt.savefig(args.out) 380 | print("%s was generated" % args.out) 381 | 382 | with open(args.out + '.json', 'w') as outfile: 383 | 
json.dump(json_data, outfile, cls=NumpyEncoder) 384 | print("%s.json was generated" % args.out) 385 | 386 | print("Done.") 387 | -------------------------------------------------------------------------------- /sumStats2ref.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import scipy.stats as stats 4 | import scipy.io as sio 5 | import os, sys, argparse, time, logging, getpass 6 | import matplotlib.pyplot as plt 7 | from GWAS_IO.summary_stats_Utils import * 8 | 9 | def read_sum_dat(sumFile, logger, kargs): 10 | ''' 11 | Read give summary statistics. 12 | 13 | Input: 14 | sumFile, Path of summary file. 15 | logger, Python logger for process information. 16 | kargs, namespace object for options. 17 | 18 | Return: 19 | ------- 20 | sumDat, DataFrame of summary dataset. 21 | 22 | Note: 23 | ----- 24 | 1. Field names (if exists) will be standardize according to bellow 25 | effCol, Beta 26 | ORCol, OR 27 | effACol, A1 28 | othACol, A2 29 | posCol, POS 30 | infoCol, INFO 31 | NCol, N 32 | And, Chromosome names will be standardized 33 | Removing'CHR', 'Chr', etc --> integer 34 | recode chrX--> 23 35 | recode chrY--> 24 36 | recode chrM--> 25 37 | 2. SNPs with invalid p values removed, i.e., >1 or < 0, or NAN 38 | 3. Duplicated SNPs removed 39 | ''' 40 | 41 | if not os.access(sumFile, os.R_OK): 42 | raise ValueError("Can't read summary stats file: {}".format(sumFile)) 43 | logger.info('*** Loading summary stats ***') 44 | logger.info('Read summary data from {}'.format(sumFile)) 45 | sumDat = read_sumdata(sumFile, kargs.snpCol, kargs.pCol, kargs) 46 | logger.info('......') 47 | logger.info('Read {} SNPs'.format(sumDat.shape[0])) 48 | colnames = ['SNP', 'P', 'A1', 'CHR', 'POS', 'Beta', 'A2'] 49 | if 'P' not in sumDat.columns: 50 | raise RuntimeError('No P value provided') 51 | if 'SNP' not in sumDat.columns: 52 | raise RuntimeError('No SNP ID provided') 53 | if not kargs.effACol: 54 | warnings.warn('No effective Allele provided') 55 | logger.warning('No effective Allele provided') 56 | colnames.remove('A1') 57 | if not kargs.othACol: 58 | warnings.warn( "No other Allele information provided") 59 | logger.warn('No effective Allele provided') 60 | colnames.remove('A2') 61 | if not kargs.effCol: 62 | if not kargs.orCol: 63 | colnames.remove('Beta') 64 | logger.warn('Directionality is not checked') 65 | else: 66 | sumDat.loc[:, 'Beta'] = np.log(sumDat.loc[:, 'OR']) 67 | sumDat.drop('OR',axis=1, inplace=True) 68 | if (not kargs.effACol) and (not kargs.othACol): 69 | logger.warn('Directionality is not checked') 70 | colnames.remove('Beta') 71 | sumDat.drop('Beta', axis=1, inplace=True) 72 | if (not kargs.posCol) or (not kargs.chrCol): 73 | logger.info('Using SNP ID only for align Summary data to reference') 74 | colnames.remove('POS') 75 | colnames.remove('CHR') 76 | keys = ['SNP'] 77 | elif kargs.forceID: 78 | keys = ['SNP'] 79 | else: 80 | keys = ['CHR', 'POS'] 81 | if kargs.NCol: 82 | colnames.append('N') 83 | logger.info('Reading Summary stats done\n') 84 | logger.info('**** check P values *****') 85 | sumDat = basic_QC_P(sumDat, kargs.outdir, 'P', logger) 86 | logger.info('**** END check P values *****') 87 | logger.info('**** check duplicated SNPs *****') 88 | sumDat, dup = deduplcate_sum(sumDat, 'P', keys) 89 | print (sumDat.head()) 90 | if dup.shape[0] > 0: 91 | dupFile = os.path.join(kargs.outdir, 'Duplicated_snps.gz') 92 | logger.warning('There are {} duplicated SNPs in {}'.format( 93 | dup.shape[0], sumFile)) 
94 | logger.warning('\t The SNP with minimum p value included') 95 | logger.warning('see all duplicated SNPs in {}'.format(dupFile)) 96 | dup.to_csv(dupFile, index=False, na_rep='NA', compression='gzip', 97 | sep='\t') 98 | logger.info('**** END check duplicated SNPs *****') 99 | sumDat = sumDat.loc[:, colnames] 100 | print (sumDat.head()) 101 | logger.info('\n') 102 | return sumDat 103 | 104 | def read_ref_dat(refFile, logger): 105 | ''' 106 | Read in-house reference dataset. 107 | 108 | Input: 109 | ------ 110 | refFile, Path of reference file: 111 | CHR, SNP, GP, BP, A1, A2, complementA1, complementA2 112 | logger, Python logger for process information 113 | 114 | Return: 115 | ------ 116 | refDat, DataFrame of reference dataset. 117 | ''' 118 | 119 | if not os.access(refFile, os.R_OK): 120 | raise ValueError("Can't read reference file: {}".format(refFile)) 121 | logger.info('*** Loading reference data ***') 122 | refDat = pd.read_csv(refFile) 123 | refDat.rename(columns={'BP':'POS', 'A1':'refA1', 'A2':'refA2'}, 124 | inplace=True) 125 | logger.info('Read reference data from {}'.format(refFile)) 126 | logger.info('Read {} SNPs from reference data'.format(refDat.shape[0])) 127 | print ('*** Using reference with {} SNPs ***'.format(refDat.shape[0])) 128 | logger.info('Reading reference data done\n') 129 | logger.info('\n') 130 | return(refDat) 131 | 132 | def _qq(pvec, ax): 133 | ''' 134 | Making basic QQ plots of pvalues. 135 | 136 | ''' 137 | pvec = pvec[np.isfinite(pvec)] 138 | pvec[pvec < 1e-20] = 1e-20 139 | logpSort = -np.log10(np.sort(pvec)) 140 | n = logpSort.shape[0] 141 | logpTheo = -np.log10(np.cumsum(np.repeat(1.0/n, n))) 142 | ax.scatter(logpTheo, logpSort) 143 | x = np.linspace(*ax.get_xlim()) 144 | ax.plot(x, x) 145 | plt.xlabel('Theoretical -log10 (P)') 146 | plt.ylabel('Observed -log10 (P)') 147 | 148 | def summarize_merge(sumDat, mDat, misDat, outdir, logger): 149 | ''' 150 | Making QQ plot of original dataset, converted and missed. 151 | 152 | Input: 153 | ------ 154 | sumDat, DataFrame of Original summary stats 155 | mDat, DataFrame of Converted summary data 156 | misDat, DataFrame of SNPs in original but not in converted 157 | outdir, Where to save figure 158 | logger, Python logger for process information 159 | 160 | No return. 161 | ---------- 162 | TO-DO: 163 | Making multiple curves in one figure 164 | ''' 165 | logger.info('\n') 166 | if sumDat.shape[0] < 10: 167 | logger.error('Too few SNPs converted!! N={}'.format(sumDat.shape[0])) 168 | raise RuntimeError( 169 | 'Too few SNPs converted!! N={}'.format(sumDat.shape[0])) 170 | fig = plt.figure(facecolor='white') 171 | ax = fig.add_subplot(131) 172 | _qq(sumDat.loc[:,'P'].values, ax) 173 | plt.title('Original') 174 | ax = fig.add_subplot(132) 175 | _qq(mDat.loc[:,'P'].values, ax) 176 | plt.title("Converted") 177 | ax = fig.add_subplot(133) 178 | _qq(misDat.loc[:,'P'].values, ax) 179 | plt.title("Missed") 180 | plt.tight_layout() 181 | plt.savefig(os.path.join(outdir, 'QQ_convert.png'), format='png') 182 | plt.close() 183 | logger.info('Comparing P values in QQ_convert.png') 184 | 185 | def check_zscore(zvec, outdir, logger): 186 | ''' 187 | Check distribution of converted z-score(real not Anders') 188 | 189 | Input: 190 | ------ 191 | outdir, Where to save figure 192 | logger, Python logger for process information 193 | 194 | No return.
195 | ''' 196 | logger.info('\n') 197 | fig = plt.figure(facecolor='white') 198 | pd.Series(zvec[np.isfinite(zvec)]).hist(bins=100) 199 | plt.title('Z-Scores') 200 | plt.tight_layout() 201 | plt.savefig(os.path.join(outdir, 'Z_scores.png'), format='png') 202 | plt.close() 203 | logger.info('Check converted Z-scores at Z_scores.png') 204 | 205 | def align2ref(sumDat, refDat, logger, kargs): 206 | ''' 207 | Align given summary Data to in-house reference dataset. 208 | 209 | Input: 210 | ------ 211 | sumDat, DataFrame of summary statistics. 212 | refDat, DataFrame of in-house reference dataset. 213 | logger, Python logger for process information 214 | kargs, NameSpace object of options 215 | 216 | Return: 217 | ------- 218 | -log10 p values, and z-scores 219 | 220 | Note: 221 | ----- 222 | 1. Ambiguous SNPs removed based on in-house reference dataset. 223 | 2. effect aligned with allele coding of reference 224 | ''' 225 | if kargs.forceID: 226 | keys = ['SNP'] 227 | elif ('CHR' not in sumDat.columns) or ('POS' not in sumDat.columns): 228 | keys = ['SNP'] 229 | else: 230 | keys = ['CHR', 'POS'] 231 | mDat, misDat1 = map_snps(refDat, sumDat, keys, 'sum', False) 232 | mDat.to_csv(os.path.join(kargs.outdir, 'debug_merged.txt.gz'), 233 | sep='\t', index=False, na_rep='NA') 234 | logger.info('*** Align SNPs to reference ***') 235 | if misDat1.shape[0] > 0: 236 | outF = os.path.join(kargs.outdir, 'SNPs_not_in_sumFile.txt.gz') 237 | logger.info( 238 | 'There are {} SNPs in reference not in given summary file'.format( 239 | misDat1.shape[0])) 240 | logger.info('Details see {}'.format(outF)) 241 | misDat1.to_csv(outF, index=False, sep='\t', compression='gzip', 242 | na_rep='NA') 243 | dummy, misDat2 = map_snps(sumDat, refDat, keys, 'ref') 244 | if misDat2.shape[0] > 0: 245 | outF = os.path.join(kargs.outdir, 'SNPs_not_in_refFile.txt.gz') 246 | logger.info( 247 | 'There are {} SNPs in summary file not in reference'.format( 248 | misDat2.shape[0])) 249 | logger.info('Details see {}'.format(outF)) 250 | misDat2.to_csv(outF, index=False, sep='\t', compression='gzip', 251 | na_rep='NA') 252 | signvec = np.empty((mDat.shape[0],), dtype='float'); signvec.fill(np.nan) 253 | ambivec = (((mDat.refA1=='A')&(mDat.refA2=='T')) | 254 | ((mDat.refA2=='A')&(mDat.refA1=='T')) | 255 | ((mDat.refA1=='C')&(mDat.refA2=='G')) | 256 | ((mDat.refA2=='C')&(mDat.refA1=='G'))) 257 | ambivec = ambivec.values 258 | logger.info('{} SNPs have ambiguously coded allele in ref'. 
format( 259 | np.sum(ambivec))) 260 | logger.info('Zscores of ambiguously coded SNPs were set to NaN') 261 | ambDat = mDat.loc[ambivec,:] 262 | ambDat.to_csv(os.path.join(kargs.outdir, 'Ambiguous_data.txt.gz'), 263 | compression='gzip', sep='\t', index=False, na_rep='NA') 264 | logger.info('Save SNPs with ambiguous allele coding into {}'.format( 265 | os.path.join(kargs.outdir, 'Ambiguous_data.txt.gz'))) 266 | logpvec = -np.log10(mDat.loc[:,'P']) 267 | if 'A1' not in sumDat.columns: 268 | zvec = signvec.copy() 269 | else: 270 | if 'A2' not in sumDat.columns: 271 | idx1 = ((mDat.A1==mDat.refA1) | (mDat.A1==mDat.A1c)).values 272 | idx_1 = ((mDat.A1==mDat.refA2) | (mDat.A1==mDat.A2c)).values 273 | else: 274 | idx1 = (((mDat.A1==mDat.refA1)&(mDat.A2==mDat.refA2)) | ((mDat.A1==mDat.A1c)&(mDat.A2==mDat.A2c))).values 275 | idx_1 = (((mDat.A1==mDat.refA2)&(mDat.A2==mDat.refA1)) | ((mDat.A1==mDat.A2c)&(mDat.A2==mDat.A1c))).values 276 | signvec[idx1] = 1.0; signvec[idx_1] = -1.0; signvec[ambivec] = np.nan 277 | signvec = signvec * np.sign(mDat.loc[:,'Beta'].values) 278 | zvec = np.abs(stats.norm.ppf(mDat.loc[:,'P'].values * 0.5)) * signvec 279 | logger.info('{} SNPs have direction opposite to reference and were flipped'.format(np.sum(idx_1))) 280 | mDat.loc[:, 'newZ'] = zvec 281 | tmpMdat = mDat.loc[idx_1 ,:] 282 | tmpMdat.to_csv(os.path.join(kargs.outdir, 'flip_data.txt.gz'), 283 | index=False, sep='\t', compression='gzip',na_rep='NA') 284 | print(mDat.columns) 285 | summarize_merge(sumDat, mDat, misDat2, kargs.outdir, logger) 286 | logger.info('\n') 287 | if kargs.NCol: 288 | print('I am here') 289 | print(mDat.head()) 290 | return(logpvec.values, zvec, mDat.loc[:,'N'].values) 291 | else: 292 | return(logpvec.values, zvec, []) 293 | 294 | def save2mat(logpvec, zvec, Nvec, trait, outdir, logger): 295 | ''' 296 | Save data in Matlab dataset. 297 | 298 | Input: 299 | ----- 300 | logpvec, -log10 p value vector 301 | zvec, zscore vector 302 | trait, Name of phenotype 303 | outdir, Where to save dataset 304 | logger, Python logger for process information 305 | 306 | No return.
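    For example (illustrative), with trait='SCZ' the output SCZ.mat is expected to hold
    matlab vectors named logpvec_scz, zvec_scz and (when per-SNP sample sizes are given)
    nvec_scz; in python they can be read back with scipy.io.loadmat('SCZ.mat').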
307 | ''' 308 | outfile = os.path.join(outdir, trait) 309 | if len(Nvec) == len(logpvec): 310 | print (np.sum(np.isfinite(Nvec))) 311 | print (np.sum(np.isfinite(logpvec))) 312 | tmpdict = {'logpvec_'+trait.lower():logpvec, 313 | 'zvec_'+trait.lower():zvec, 'nvec_'+trait.lower():Nvec} 314 | else: 315 | tmpdict = {'logpvec_'+trait.lower():logpvec, 'zvec_'+trait.lower():zvec} 316 | sio.savemat(outfile, tmpdict, format='5', do_compression=False, 317 | oned_as='column') 318 | logger.info('Save converted data to {}'.format(outfile+'.mat')) 319 | 320 | def convert_sum(): 321 | parser = argparse.ArgumentParser(prog="Preprocess Summary stats", 322 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 323 | description='Preprocess summary stats for matlab') 324 | parser.add_argument('--sumFile', type=str, help='Summary stats file') 325 | parser.add_argument('--ref', type=str, help='Reference file') 326 | parser.add_argument('--trait', type=str, help='Trait Name') 327 | parser.add_argument('--outdir', type=str, help='Output DIR', default=".") 328 | parser.add_argument('--forceID', action='store_true', default=False, 329 | help='Force using SNP ID other than position') 330 | parser.add_argument('--snpCol', type=str, help='SNP ID field', 331 | default='SNP') 332 | parser.add_argument('--pCol', type=str, help='P value field', default='P') 333 | parser.add_argument('--effACol', type=str, help='Effective allele field', 334 | default=None) 335 | parser.add_argument('--othACol', type=str, help='The other allele field', 336 | default=None) 337 | parser.add_argument('--effCol', type=str, help='Effect size field', 338 | default=None) 339 | parser.add_argument('--orCol', type=str, help='Odds ratio field', 340 | default=None) 341 | parser.add_argument('--NCol', type=str, help='sample size per SNP', 342 | default=None) 343 | parser.add_argument('--posCol', type=str, 344 | help='Genomic position field',default=None) 345 | parser.add_argument('--chrCol', type=str, 346 | help='Chromosome field',default=None) 347 | args = parser.parse_args() 348 | if not os.access(args.outdir, os.F_OK): 349 | os.mkdir(args.outdir) 350 | if not os.access(args.sumFile, os.R_OK): 351 | raise ValueError("Can't read summary stats file: {}".format(args.sumFile)) 352 | if not os.access(args.ref, os.R_OK): 353 | raise ValueError("Can't read reference file: {}".format(args.ref)) 354 | logfile = os.path.join(args.outdir, 'convert_' + args.trait + '.log') 355 | logger = logging.getLogger() 356 | logger.addHandler(logging.FileHandler(logfile,mode='w')) 357 | logger.setLevel(logging.DEBUG) 358 | sumDat = read_sum_dat(args.sumFile, logger, args) 359 | refDat = read_ref_dat(args.ref, logger) 360 | logpvec, zvec, Nvec = align2ref(sumDat, refDat, logger, args) 361 | check_zscore(zvec, args.outdir, logger) 362 | save2mat(logpvec, zvec, Nvec, args.trait, args.outdir, logger) 363 | logger.info('\n**********\nFinished at {}'.format(time.ctime())) 364 | logger.info('Author: {} at {}'.format(getpass.getuser(), time.ctime())) 365 | 366 | 367 | if __name__ == "__main__": 368 | import time 369 | import numpy as np 370 | tsts = time.time() 371 | convert_sum() 372 | print() 373 | print('Finish at %s' % time.ctime()) 374 | ted = time.time() 375 | print('Time taken %d mins %d sec' % ((ted-tsts)//60, np.round(ted-tsts) % 376 | 60)) 377 | 378 | -------------------------------------------------------------------------------- /sumstats2mat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 |
import argparse 4 | import pandas as pd 5 | import scipy.io as sio 6 | import numpy as np 7 | from collections import namedtuple 8 | 9 | Cols = namedtuple('Cols', ['SNP', 'PVAL', 'A1', 'A2', 'N', 'NCASE', 'NCONTROL', 'Z']) 10 | cols = Cols._make( ['RSID', 'P', 'EffectAllele', 'OtherAllele', 'N', 'CaseN', 'ControlN', 'Z']) 11 | 12 | __version__ = '1.0.0' 13 | MASTHEAD = "***********************************************************************\n" 14 | MASTHEAD += "* sumstats2mat.py: Converts summary statistics from csv to matlab format\n" 15 | MASTHEAD += "* Version {V}\n".format(V=__version__) 16 | MASTHEAD += "* (C) Norwegian Centre for Mental Disorders Research / University of Oslo\n" 17 | MASTHEAD += "* GNU General Public License v3\n" 18 | MASTHEAD += "***********************************************************************\n" 19 | 20 | _base_complement = {"A":"T", "C":"G", "G":"C", "T":"A"} 21 | def _reverse_complement_variant(variant): 22 | # variant should be a 2-elemet sequence with upper case string elements 23 | return ("".join([_base_complement[b] for b in variant[0][::-1]]), 24 | "".join([_base_complement[b] for b in variant[1][::-1]])) 25 | 26 | def check_input_file(file): 27 | if not os.path.isfile(file): 28 | raise ValueError("Input file does not exist: {f}".format(f=file)) 29 | 30 | def check_output_file(file, force=False): 31 | # Delete target file if user specifies --force option 32 | if force: 33 | try: 34 | os.remove(file) 35 | except OSError: 36 | pass 37 | 38 | # Otherwise raise an error if target file already exists 39 | if os.path.isfile(file) and not force: 40 | raise ValueError("Output file already exists: {f}".format(f=file)) 41 | 42 | # Create target folder if it doesn't exist 43 | output_dir = os.path.dirname(file) 44 | if output_dir and not os.path.isdir(output_dir): os.makedirs(output_dir) # ensure that output folder exists 45 | 46 | def make_mat(args): 47 | if args.out is None: raise ValueError('--out is required.') 48 | 49 | check_input_file(args.ref) 50 | check_input_file(args.sumstats) 51 | check_output_file(args.out, args.force) 52 | 53 | reader = pd.read_csv(args.sumstats, delim_whitespace=True, chunksize=args.chunksize, float_precision='high') 54 | df_out = None 55 | for chunk_index, ss_chunk in enumerate(reader): 56 | # (BEGIN) special handling of the first chunk 57 | if chunk_index==0: 58 | columns = list(ss_chunk.columns) 59 | 60 | required_cols = [cols.SNP, cols.A1, cols.A2, cols.Z] 61 | if (set(required_cols) - set(columns)): 62 | absent_cols = set(required_cols) - set(columns) 63 | err_msg = ("Columns {} are missing from the --sumstats file {}").format(', '.join(absent_cols), args.sumstats) 64 | raise(RuntimeError(err_msg)) 65 | 66 | n_col = cols.N if cols.N in columns else None 67 | ncase_col = cols.NCASE if cols.NCASE in columns else None 68 | ncontrol_col = cols.NCONTROL if cols.NCONTROL in columns else None 69 | if (not args.without_n) and ((n_col is None) and ((ncase_col is None) or (ncontrol_col is None))): 70 | raise(ValueError('Sample size column is not detected in {}. 
Expect either an N column, or NCASE and NCONTROL columns.'.format(args.sumstats))) 71 | 72 | print('Reading reference file {}...'.format(args.ref)) 73 | ref_reader = pd.read_csv(args.ref, sep='\t', usecols=['SNP', 'A1', 'A2'], chunksize=args.chunksize) 74 | ref_dict = {} 75 | for ref_chunk in ref_reader: 76 | ref_chunk.drop(ref_chunk.index[np.logical_not(ref_chunk['A1'].str.upper().str.match('^[ACTG]*$')) | np.logical_not(ref_chunk['A2'].str.upper().str.match('^[ACTG]*$'))], inplace=True) 77 | if ref_chunk.empty: continue 78 | gtypes = zip(ref_chunk['A1'].apply(str.upper),ref_chunk['A2'].apply(str.upper)) 79 | #TODO?: add check whether some id is already in ref_dict 80 | ref_dict.update(dict(zip(ref_chunk['SNP'], gtypes))) 81 | ref_dict = {i: (variant, _reverse_complement_variant(variant), 82 | variant[::-1], _reverse_complement_variant(variant[::-1])) 83 | for i, variant in ref_dict.items()} 84 | ref_snps = pd.read_csv(args.ref, sep='\t', usecols=['SNP'], squeeze=True) 85 | #TODO?: add check whether ref_snps contains duplicates 86 | print("Reference dict contains {d} SNPs.".format(d=len(ref_dict))) 87 | 88 | print('Reading summary statistics file {}...'.format(args.sumstats)) 89 | print('Column types: ' + ', '.join([column + ':' + str(dtype) for (column, dtype) in zip(ss_chunk.columns, ss_chunk.dtypes)])) 90 | # (END) special handling of the first chunk 91 | 92 | ss_chunk = ss_chunk.loc[ss_chunk[cols.SNP].isin(ref_dict),:] 93 | if ss_chunk.empty: continue 94 | gtypes = list(zip(ss_chunk[cols.A1].apply(str.upper),ss_chunk[cols.A2].apply(str.upper))) 95 | # index of SNPs that have the same alleles as indicated in reference 96 | ind = [gt in ref_dict[sid] for sid, gt in zip(ss_chunk[cols.SNP], gtypes)] 97 | ss_chunk = ss_chunk.loc[ind,:] 98 | gtypes = [gt for gt, j in zip(gtypes, ind) if j] 99 | log10pv = -np.log10(ss_chunk[cols.PVAL].values) 100 | # not_ref_effect = [ 101 | # 1 if effect allele in data == other allele in reference 102 | # -1 if effect allele in data == effect allele in reference ] 103 | # So zscores with positive effects will be positive and zscores with 104 | # negative effects will stay negative, since 105 | # stats.norm.ppf(ss_chunk[cols.PVAL]*0.5) is always negative (see zvect 106 | # calculation below).
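# Example: for a reference variant ('A', 'C') the ref_dict entry above holds the four
# accepted allele encodings -- ('A', 'C') direct, ('T', 'G') strand flip, ('C', 'A')
# allele swap, and ('G', 'T') swap plus strand flip. "gt in ref_dict[sid]" (above)
# therefore matches a sumstats record up to strand and allele order, while
# "gt in ref_dict[sid][:2]" (below) checks whether the data alleles are in the same
# A1/A2 order as the reference, which sets the sign applied to the z-score.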
107 | not_ref_effect = np.array([1 if gt in ref_dict[sid][:2] else -1 108 | for sid, gt in zip(ss_chunk[cols.SNP], gtypes)]) 109 | #TODO: check proportion of positive and negative effects 110 | zvect = ss_chunk[cols.Z].values*not_ref_effect 111 | ind_ambiguous = [j for j,gt in enumerate(gtypes) if gt == _reverse_complement_variant(gt)[::-1]] 112 | # set z-score of strand-ambiguous SNPs to nan 113 | zvect[ind_ambiguous] = np.nan 114 | #TODO: check whether output df contains duplicated rs-ids (warn) 115 | 116 | # reindex by SNP, add required columns and drop unnecessary columns 117 | ss_chunk.index = ss_chunk[cols.SNP] 118 | # add required columns 119 | ss_chunk["logpvec"] = log10pv 120 | ss_chunk["zvec"] = zvect 121 | if not args.without_n: 122 | if n_col is None: 123 | nvec = 4./(1./ss_chunk[ncase_col] + 1./ss_chunk[ncontrol_col]) 124 | else: 125 | nvec = ss_chunk[n_col].values 126 | ss_chunk["nvec"] = nvec 127 | 128 | cols2drop = [c for c in ss_chunk.columns if (c not in ['logpvec', 'zvec', 'nvec'])] 129 | ss_chunk.drop(cols2drop, axis=1, inplace=True) 130 | 131 | if df_out is None: 132 | df_out = ss_chunk.copy() 133 | else: 134 | df_out = df_out.append(ss_chunk) 135 | 136 | print("{f}: {n} lines processed, {m} SNPs matched with reference file".format(f=args.sumstats, n=(chunk_index+1)*args.chunksize, m=len(df_out))) 137 | 138 | if df_out.empty: raise(ValueError("No SNPs match after joining with reference data")) 139 | dup_index = df_out.index.duplicated(keep=False) 140 | if dup_index.any(): 141 | print("Duplicated SNP ids detected:") 142 | print(df_out[dup_index]) 143 | print("Keeping only the first occurrence.") 144 | df_out = df_out[~df_out.index.duplicated(keep='first')] 145 | # align index according to the order of SNPs in ref, insert NaN rows for SNPs that 146 | # are present in ref but absent from the sumstats file 147 | df_out = df_out.reindex(ref_snps) 148 | 149 | print('Writing .mat file...') 150 | save_dict = {c+args.trait: df_out[c].astype(np.float64).values for c in df_out.columns} 151 | sio.savemat(args.out, save_dict, format='5', do_compression=False, 152 | oned_as='column', appendmat=False) 153 | print("%s created" % args.out) 154 | 155 | ### ================================================================================= 156 | ### Main section 157 | ### ================================================================================= 158 | if __name__ == "__main__": 159 | parser_mat = argparse.ArgumentParser(description="Create mat files that can " 160 | "be used as an input for pleiofdr analysis (https://github.com/precimed/pleiofdr/). " 161 | "Takes a .csv file with summary statistics as input. The file can be compressed with gzip. " 162 | "Required columns: RSID, P, EffectAllele, OtherAllele, and a sample size column (either N, or CaseN and ControlN). " 163 | "Creates corresponding mat files which can be used as an input for pleiofdr analysis. " 164 | "Only SNPs from the reference file are considered. " 165 | "Z-scores of strand ambiguous SNPs are set to NA. ") 166 | 167 | parser_mat.add_argument("--sumstats", type=str, help="Input file with summary statistics. ") 168 | parser_mat.add_argument("--ref", type=str, help="[required] Tab-separated file with a list of reference SNPs.") 169 | parser_mat.add_argument("--out", type=str, help="[required] File to output the result.
File should end with .mat extension.") 170 | parser_mat.add_argument("--force", action="store_true", default=False, help="Allow sumstats.py to overwrite output file if it exists.") 171 | 172 | parser_mat.add_argument("--trait", type=str, default='', 173 | help="Trait name that will be used in mat file. Can be kept empty, in this case the variables will be named 'logpvec', 'zvec' and 'nvec'") 174 | parser_mat.add_argument("--without-n", action="store_true", default=False, 175 | help="Proceed without sample size (N or NCASE/NCONTROL)") 176 | parser_mat.add_argument("--chunksize", default=100000, type=int, 177 | help="Size of chunk to read the file.") 178 | 179 | args = parser_mat.parse_args() 180 | make_mat(args) 181 | -------------------------------------------------------------------------------- /sumstats_ldsc_helper.py: -------------------------------------------------------------------------------- 1 | # Examples: 2 | # python sumstats_ldsc_helper.py annot PGC_SCZ_2014.csv PGC_SCZ_2014_ldscores/{}.annot.gz --annot 1000G_Phase3_baseline_ldscores/baseline.{}.annot.gz 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import os.path 7 | import sys 8 | import argparse 9 | from intervaltree import IntervalTree 10 | 11 | def parse_args(args): 12 | parser = argparse.ArgumentParser(description="Miscellaneous utilities to work with statistics and LDSC regression method") 13 | subparsers = parser.add_subparsers() 14 | parser_annot = subparsers.add_parser("annot", help="Create binary annotations from 0.1, 0.01 and 0.001 p-value stratuums of the summary statistic") 15 | parser_annot.add_argument("sumstats_file", type=str, help="Input file with summary statistics") 16 | parser_annot.add_argument("output_file", type=str, help="Path to the output file to place the results") 17 | parser_annot.add_argument("--annot", type=str, help="Path to baseline.CHR.annot.gz file from 1000G_Phase3_baseline_ldscores") 18 | parser_annot.add_argument("--force", action="store_true", default=False, help="Force overwrite target files if they exist.") 19 | parser_annot.add_argument("--window", default=2, type=int, help="Window to include into the binary annotation around each SNP that pass p-value threshold.") 20 | parser_annot.set_defaults(func=make_annot) 21 | return parser.parse_args(args) 22 | 23 | ### ================================================================================= 24 | ### Implementation for parser_annot 25 | ### ================================================================================= 26 | def check_input_file(file): 27 | if not os.path.isfile(file): 28 | raise ValueError("Input file does not exist: {f}".format(f=file)) 29 | 30 | def check_output_file(file, force=False): 31 | # Delete target file if user specifies --force option 32 | if force: 33 | try: 34 | os.remove(file) 35 | except OSError: 36 | pass 37 | 38 | # Otherwise raise an error if target file already exists 39 | if os.path.isfile(file) and not force: 40 | raise ValueError("Output file already exists: {f}".format(f=file)) 41 | 42 | # Create target folder if it doesn't exist 43 | output_dir = os.path.dirname(file) 44 | if output_dir and not os.path.isdir(output_dir): os.makedirs(output_dir) # ensure that output folder exists 45 | 46 | def make_annot(args): 47 | """ 48 | Create binary annotations from 0.1, 0.01 and 0.001 p-value stratuums of the summary statistic. 
49 | """ 50 | check_input_file(args.sumstats_file) 51 | for chri in range(1, 23): check_output_file(args.output_file.format(chri), args.force) 52 | 53 | print('Reading summary statistics file {}...'.format(args.sumstats_file)) 54 | sumstats = pd.read_csv(args.sumstats_file, delim_whitespace=True, usecols=['PVAL', 'CHR', 'BP']) 55 | print('Done, read {} SNPs.'.format(sumstats.shape[0])) 56 | 57 | for chri in range(1, 23): 58 | print('Processing chromosome {}...'.format(chri)) 59 | df = pd.read_csv(args.annot.format(chri), delim_whitespace=True) 60 | df = df[['CHR', 'BP', 'SNP', 'CM']].copy() 61 | for pthresh, label in [(0.1, '.1'), (0.01, '.01'), (0.001, '.001')]: 62 | sumstatsCHR = sumstats[sumstats.CHR == chri].copy(deep=True) 63 | print('{} markers, {} of them are on chr {}, {} of them have p-value below {}'.format(sumstats.shape[0], sumstatsCHR.shape[0], chri, (sumstatsCHR.PVAL < pthresh).sum(), pthresh)) 64 | itree = IntervalTree.from_tuples(zip(sumstatsCHR[sumstatsCHR.PVAL < pthresh].BP - args.window, sumstatsCHR[sumstatsCHR.PVAL < pthresh].BP + args.window)) 65 | itree.merge_overlaps() 66 | print('Found {} intervals, average length {}'.format(len(itree), sum([i.length() for i in itree])/len(itree))) 67 | 68 | annot_binary = [int(bool(itree[p])) for p in df.BP] 69 | df['PVAL{}'.format(label)] = annot_binary 70 | print('{} markers out of {} ({}%) belongs to the annotation'.format(sum(annot_binary), len(annot_binary), 100 * sum(annot_binary) / len(annot_binary))) 71 | df.to_csv(args.output_file.format(chri), index=False, sep='\t', compression='gzip') 72 | print('Results saved to {}'.format(args.output_file.format(chri))) 73 | 74 | ### ================================================================================= 75 | ### Main section 76 | ### ================================================================================= 77 | if __name__ == "__main__": 78 | args = parse_args(sys.argv[1:]) 79 | args.func(args) 80 | print("Done") 81 | -------------------------------------------------------------------------------- /sumstats_utils.py: -------------------------------------------------------------------------------- 1 | # Misc utils to deal with summary stat files. 2 | # Some parts of the code in this file originates from https://github.com/bulik/ldsc/, 3 | # which is licensed under GNU General Public License v3.0 4 | # See https://github.com/bulik/ldsc/blob/master/LICENSE for complete license. 5 | 6 | import sys, os, re, logging, datetime 7 | import numpy as np 8 | import pandas as pd 9 | import gzip 10 | import six 11 | import itertools as it 12 | from collections import namedtuple 13 | 14 | COMPLEMENT_ALLELE = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} 15 | # bases 16 | BASES = COMPLEMENT_ALLELE.keys() 17 | # true iff strand ambiguous 18 | STRAND_AMBIGUOUS = {''.join(x): x[0] == COMPLEMENT_ALLELE[x[1]] 19 | for x in it.product(BASES, BASES) 20 | if x[0] != x[1]} 21 | # SNPS we want to keep (pairs of alleles) 22 | VALID_SNPS = {x for x in map(lambda y: ''.join(y), it.product(BASES, BASES)) 23 | if x[0] != x[1] and not STRAND_AMBIGUOUS[x]} 24 | # T iff SNP 1 has the same alleles as SNP 2 (allowing for strand or ref allele flip). 
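# Worked example for the lookup tables defined below: STRAND_AMBIGUOUS['AT'] is True
# (A/T is its own reverse complement), so A/T and C/G pairs are excluded from
# VALID_SNPS, while STRAND_AMBIGUOUS['AG'] is False and A/G is kept. The 4-character
# key 'AGGA' (A/G in one dataset, G/A in the other) is in MATCH_ALLELES via the
# "ref flip, strand match" case, and FLIP_ALLELES['AGGA'] is True, so align_alleles()
# flips the sign of the corresponding z-score.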
25 | MATCH_ALLELES = {x for x in map(lambda y: ''.join(y), it.product(VALID_SNPS, VALID_SNPS)) 26 | # strand and ref match 27 | if ((x[0] == x[2]) and (x[1] == x[3])) or 28 | # ref match, strand flip 29 | ((x[0] == COMPLEMENT_ALLELE[x[2]]) and (x[1] == COMPLEMENT_ALLELE[x[3]])) or 30 | # ref flip, strand match 31 | ((x[0] == x[3]) and (x[1] == x[2])) or 32 | ((x[0] == COMPLEMENT_ALLELE[x[3]]) and (x[1] == COMPLEMENT_ALLELE[x[2]]))} # strand and ref flip 33 | # T iff SNP 1 has the same alleles as SNP 2 w/ ref allele flip. 34 | FLIP_ALLELES = {''.join(x): 35 | ((x[0] == x[3]) and (x[1] == x[2])) or # strand match 36 | # strand flip 37 | ((x[0] == COMPLEMENT_ALLELE[x[3]]) and (x[1] == COMPLEMENT_ALLELE[x[2]])) 38 | for x in MATCH_ALLELES} 39 | 40 | def filter_alleles(alleles): 41 | '''Remove bad variants (mismatched alleles, non-SNPs, strand ambiguous).''' 42 | ii = alleles.apply(lambda y: y in MATCH_ALLELES) 43 | return ii 44 | 45 | 46 | def align_alleles(z, alleles): 47 | '''Align Z1 and Z2 to same choice of ref allele (allowing for strand flip).''' 48 | try: 49 | z *= (-1) ** alleles.apply(lambda y: FLIP_ALLELES[y]) 50 | except KeyError as e: 51 | raise KeyError('Incompatible alleles. ') 52 | return z 53 | 54 | Cols = namedtuple('Cols', ['SNP', 'CHR', 'BP', 'PVAL', 'A1', 'A2', 'EA', 'N', 'NCASE', 'NCONTROL', 'Z', 'OR', 'BETA', 'LOGODDS', 'SE', 'INFO', 'FRQ', 'NSTUDY', 'CHRPOS', 'A1A2', 'CHRPOSA1A2', 'DIRECTION', 'ORL95', 'ORU95']) 55 | cols = Cols._make( ['SNP', 'CHR', 'BP', 'PVAL', 'A1', 'A2', 'EA', 'N', 'NCASE', 'NCONTROL', 'Z', 'OR', 'BETA', 'LOGODDS', 'SE', 'INFO', 'FRQ', 'NSTUDY', 'CHRPOS', 'A1A2', 'CHRPOSA1A2', 'DIRECTION', 'ORL95', 'ORU95']) 56 | cols_type_map = {'SNP':str, 'CHR':int, 'BP':int, 'PVAL':np.float64, 'A1':str, 'A2':str, 'EA':str, 'N':float, 'NCASE':float, 'NCONTROL':float, 'Z':float, 'OR':float, 'BETA':float, 57 | 'LOGODDS':float, 'SE':float, 'INFO':float, 'FRQ':float, 'NSTUDY':int, 'CHRPOS':str, 'A1A2':str, 'CHRPOSA1A2':str, 'DIRECTION':str, 'ORL95':float, 'ORU95':float} 58 | cname_to_cleansumstats_map = { 59 | 'SNP': 'col_SNP', 60 | 'CHR': 'col_CHR', 61 | 'BP': 'col_POS', 62 | 'PVAL': 'col_P', 63 | 'A1': 'col_EffectAllele', 64 | 'A2': 'col_OtherAllele', 65 | 'N': 'col_N', 66 | 'NCASE': 'col_CaseN', 67 | 'NCONTROL': 'col_ControlN', 68 | 'Z': 'col_Z', 69 | 'OR': 'col_OR', 70 | 'BETA': 'col_BETA', 71 | 'SE': 'col_SE', 72 | 'LOGODDS': 'col_BETA', 73 | 'INFO': 'col_INFO', 74 | 'FRQ': 'col_EAF', 75 | 'DIRECTION': 'col_Direction', 76 | 'ORL95': 'col_ORL95', 77 | 'ORU95': 'col_ORU95', 78 | # 'NSTUDY': 'col_studyN' - not supported by cleansumstats 79 | # col_OAF, col_Notes - not supported by python_convert 80 | # CHRPOSA1A2, CHRPOS - require special handling (see update_cleansumstats_cols in sumstats.py) 81 | # A1A2 - incompatible 82 | } 83 | 84 | null_values = { 85 | cols.LOGODDS: 0, 86 | cols.BETA: 0, 87 | cols.OR: 1, 88 | cols.Z: 0 89 | } 90 | 91 | default_cnames = { 92 | 93 | # RS NUMBER 94 | 'SNP': cols.SNP, 95 | 'MARKERNAME': cols.SNP, 96 | 'SNPID': cols.SNP, 97 | 'SNP_ID': cols.SNP, 98 | 'RS': cols.SNP, 99 | 'RSID': cols.SNP, 100 | 'RS_NUMBER': cols.SNP, 101 | 'RS_NUMBERS': cols.SNP, 102 | # CHROMOSOME 103 | 'CHR': cols.CHR, 104 | 'CHROM': cols.CHR, 105 | 'CHROMSOME': cols.CHR, 106 | 'CHROMOSOME' : cols.CHR, 107 | # POSITION 108 | 'POS': cols.BP, 109 | 'BP': cols.BP, 110 | 'BPOS': cols.BP, 111 | 'POSITION' : cols.BP, 112 | # NUMBER OF STUDIES 113 | 'NSTUDY': cols.NSTUDY, 114 | 'N_STUDY': cols.NSTUDY, 115 | 'NSTUDIES': cols.NSTUDY, 116 | 'N_STUDIES': cols.NSTUDY, 117 | # 
P-VALUE 118 | 'P': cols.PVAL, 119 | 'PVALUE': cols.PVAL, 120 | 'P_VALUE': cols.PVAL, 121 | 'PVAL': cols.PVAL, 122 | 'P_VAL': cols.PVAL, 123 | 'GC_PVALUE': cols.PVAL, 124 | 'MTAG_PVAL': cols.PVAL, 125 | # ALLELE 1 126 | 'A1': cols.A1, 127 | 'ALLELE1': cols.A1, 128 | 'ALLELE_1': cols.A1, 129 | 'EFFECT_ALLELE': cols.A1, 130 | 'EFFECTALLELE': cols.A1, 131 | 'REFERENCE_ALLELE': cols.A1, 132 | 'INC_ALLELE': cols.A1, 133 | 'EA': cols.A1, 134 | # ALLELE 2 135 | 'A2': cols.A2, 136 | 'ALLELE2': cols.A2, 137 | 'ALLELE_2': cols.A2, 138 | 'OTHER_ALLELE': cols.A2, 139 | 'OTHERALLELE': cols.A2, 140 | 'NON_EFFECT_ALLELE': cols.A2, 141 | 'NON_EFF_ALLELE': cols.A2, 142 | 'DEC_ALLELE': cols.A2, 143 | 'NEA': cols.A2, 144 | # N 145 | 'N': cols.N, 146 | 'SAMPLESIZE': cols.N, 147 | 'WEIGHT': cols.N, # metal does this. possibly risky. 148 | # NCASE 149 | 'NCASE': cols.NCASE, 150 | 'CASES_N': cols.NCASE, 151 | 'N_CASE': cols.NCASE, 152 | 'N_CASES': cols.NCASE, 153 | 'N_CAS': cols.NCASE, 154 | 'N_CASE': cols.NCASE, 155 | 'CASEN': cols.NCASE, 156 | # NCONTROL 157 | 'N_CONTROLS': cols.NCONTROL, 158 | 'N_CON': cols.NCONTROL, 159 | 'CONTROLN': cols.NCONTROL, 160 | 'NCONTROL': cols.NCONTROL, 161 | 'CONTROLS_N': cols.NCONTROL, 162 | 'N_CONTROL': cols.NCONTROL, 163 | # SIGNED STATISTICS 164 | 'ZSCORE': cols.Z, 165 | 'Z-SCORE': cols.Z, 166 | 'GC_ZSCORE': cols.Z, 167 | 'Z': cols.Z, 168 | 'MTAG_Z': cols.Z, 169 | 'OR': cols.OR, 170 | 'ORL95': cols.ORL95, 171 | 'ORU95': cols.ORU95, 172 | 'B': cols.BETA, 173 | 'BETA': cols.BETA, 174 | 'MTAG_BETA': cols.BETA, 175 | 'LOGODDS': cols.LOGODDS, 176 | 'EFFECTS': cols.BETA, 177 | 'EFFECT': cols.BETA, 178 | 'SIGNED_SUMSTAT': 'SIGNED_SUMSTAT', 179 | # STANDARD ERROR 180 | 'SE' : cols.SE, 181 | 'STDERR' : cols.SE, 182 | 'MTAG_SE' : cols.SE, 183 | # INFO 184 | 'INFO': cols.INFO, 185 | # MAF 186 | 'EAF': cols.FRQ, 187 | 'FRQ': cols.FRQ, 188 | 'MAF': cols.FRQ, 189 | 'FRQ_U': cols.FRQ, 190 | 'F_U': cols.FRQ, 191 | 'FREQ': cols.FRQ, 192 | # DIRECTION 193 | 'DIRECTION': cols.DIRECTION, 194 | } 195 | 196 | describe_cname = { 197 | cols.SNP: 'Variant ID (e.g., rs number)', 198 | cols.CHR: 'Chromosome number', 199 | cols.BP: 'Base-pair position', 200 | cols.PVAL: 'p-Value', 201 | cols.A1: 'Allele 1, interpreted as ref allele for signed sumstat.', 202 | cols.A2: 'Allele 2, interpreted as non-ref allele for signed sumstat.', 203 | cols.EA: 'Effect Allele, interpreted as ref allele for signed sumstat (specific to MVP data to validate that A1 is the same as EA).', 204 | cols.N: 'Sample size', 205 | cols.NCASE: 'Number of cases', 206 | cols.NCONTROL: 'Number of controls', 207 | cols.Z: 'Z-score (0 --> no effect; above 0 --> A1 is trait/risk increasing)', 208 | cols.OR: 'Odds ratio (1 --> no effect; above 1 --> A1 is risk increasing)', 209 | cols.ORL95: 'Lower 95%% confidence bound of OR', 210 | cols.ORU95: 'Upper 95%% confidence bound of OR', 211 | cols.BETA: '[linear/logistic] regression coefficient (0 --> no effect; above 0 --> A1 is trait/risk increasing)', 212 | cols.LOGODDS: 'Log odds ratio (0 --> no effect; above 0 --> A1 is risk increasing)', 213 | cols.SE: 'standard error of the effect size', 214 | cols.INFO: 'INFO score (imputation quality; higher --> better imputation)', 215 | cols.FRQ: 'Allele frequency', 216 | 'SIGNED_SUMSTAT': 'Directional summary statistic as specified by --signed-sumstats.', 217 | cols.NSTUDY: 'Number of studies in which the SNP was genotyped.', 218 | 'UNKNOWN': 'Unknown column type (will be skipped).', 219 | cols.CHRPOS: 'chr:pos column with colon-separated information 
about Chromosome and Base-pair position', 220 | cols.A1A2: 'A1/A2 column with slash-separated information about marker allles', 221 | cols.CHRPOSA1A2: 'chr:pos:ref:alt column with colon-separated information about Chromosome, Base-pair position, Reference allele, Alternative allele', 222 | cols.DIRECTION: 'METAL "direction" column, one char per substudy (+ or - indicate effect direction; ? indicate failed imputation or QC)' 223 | } 224 | 225 | def clean_header(header): 226 | ''' 227 | For cleaning file headers. 228 | - convert to uppercase 229 | - replace dashes '-' with underscores '_' 230 | - replace dots '.' (as in R) with underscores '_' 231 | - remove newlines ('\n') 232 | ''' 233 | return header.upper().replace('-', '_').replace('.', '_').replace('\n', '') 234 | 235 | def format_chr(chrvec): 236 | ''' 237 | Reformat chromosome names. 238 | 239 | Input: 240 | ------ 241 | Vector of chromosome IDs 242 | 243 | Output: 244 | ------- 245 | Vector of cleaned chromosome IDs 246 | 247 | Note: 248 | * Remove "chr/Chr/CHR/MT/mt" strings in the name 249 | * Change chrX to 23, ChrY to 24, PAR to 25, MT to 26 250 | * (as in plink, https://www.cog-genomics.org/plink/1.9/input#allow_extra_chr) 251 | ''' 252 | try: 253 | tmpchrvec = chrvec.astype('str') 254 | tmpchrvec = tmpchrvec.str.lower() 255 | tmpchrvec = tmpchrvec.str.replace('chr', '') 256 | tmpchrvec[tmpchrvec=='x'] = '23' 257 | tmpchrvec[tmpchrvec=='y'] = '24' 258 | tmpchrvec[tmpchrvec=='par'] = '25' 259 | tmpchrvec[tmpchrvec=='m'] = '26' 260 | tmpchrvec[tmpchrvec=='mt'] = '26' 261 | tmpchrvec[tmpchrvec=='x_par1'] = '25' 262 | tmpchrvec[tmpchrvec=='x_par2'] = '25' 263 | tmpchrvec[tmpchrvec=='x_nonpar'] = '23' 264 | # TO-DO: Bellow is anoying 265 | tmpchrvec[tmpchrvec=='na'] = '-9' 266 | tmpchrvec[tmpchrvec.isnull()] = '-9' 267 | tmpchrvec[tmpchrvec=='nan'] = '-9' 268 | tmpchrvec[tmpchrvec==' '] = '-9' 269 | tmpchrvec = tmpchrvec.astype('float').astype('int') 270 | return tmpchrvec 271 | except: 272 | raise 273 | 274 | def get_header(fh, lines=5): 275 | (openfunc, _) = get_compression(fh) 276 | header = [] 277 | with openfunc(fh) as f: 278 | for line in it.islice(f, lines): 279 | line = line if isinstance(line, six.string_types) else line.decode('utf-8') 280 | header.append(line.rstrip('\n')) 281 | return header 282 | 283 | def get_compression_and_open(fh): 284 | (openfunc, _) = get_compression(fh) 285 | return openfunc(fh) 286 | 287 | def get_compression(fh): 288 | ''' 289 | Read filename suffixes and figure out whether it is gzipped,bzip2'ed or not compressed 290 | ''' 291 | if fh.endswith('gz'): 292 | compression = 'gzip' 293 | openfunc = gzip.open 294 | elif fh.endswith('bz2'): 295 | compression = 'bz2' 296 | openfunc = bz2.BZ2File 297 | else: 298 | openfunc = open 299 | compression = None 300 | 301 | return openfunc, compression 302 | -------------------------------------------------------------------------------- /tests/case01.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case01.mat -------------------------------------------------------------------------------- /tests/case01.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case01.txt.gz -------------------------------------------------------------------------------- /tests/case02.txt.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case02.txt.gz -------------------------------------------------------------------------------- /tests/case03.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case03.txt.gz -------------------------------------------------------------------------------- /tests/case04.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case04.mat -------------------------------------------------------------------------------- /tests/case04.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/precimed/python_convert/e46ebdfafd495c1420c7f8a4740a0da75c94d84d/tests/case04.txt.gz -------------------------------------------------------------------------------- /tests/test_consistent.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import scipy.io as sio 3 | import numpy as np 4 | import shutil 5 | import os.path 6 | 7 | def execute_command(command): 8 | process = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 9 | print(process.communicate()[0].decode("utf-8")) 10 | #print(subprocess.check_output(command.split()).decode("utf-8")) 11 | 12 | def run(filename, matfile, effect): 13 | reffile = r'tests/1234_ref.bim' 14 | if os.path.isdir('TEMP_FOLDER'): shutil.rmtree('TEMP_FOLDER') 15 | execute_command(r'python sumstats.py csv {} TEMP_FOLDER/TEST.csv --auto --force'.format(filename)) 16 | execute_command(r'python sumstats.py mat {} TEMP_FOLDER/TEST.csv --force --effect {}'.format(reffile, effect)) 17 | 18 | f1 = sio.loadmat(matfile) 19 | f2 = sio.loadmat('TEMP_FOLDER/TEST.mat') 20 | assert(all(np.isfinite(f1['logpvec_test']) == np.isfinite(f2['logpvec']))) 21 | assert(all(np.isfinite(f1['zvec_test']) == np.isfinite(f2['zvec']))) 22 | assert(max(abs(f1['logpvec_test'] - f2['logpvec'])) < 1e-10) 23 | assert(max(abs(f1['zvec_test'] - f2['zvec'])) < 1e-10) 24 | shutil.rmtree('TEMP_FOLDER') 25 | 26 | def test01(): run('tests/case01.txt', 'tests/case01.mat', effect='BETA') 27 | def test01gz(): run('tests/case01.txt.gz', 'tests/case01.mat', effect='BETA') 28 | #def test02(): run('tests/case02.txt') 29 | #def test03(): run('tests/case03.txt') 30 | def test04(): run('tests/case04.txt', 'tests/case04.mat', effect='OR') 31 | def test04gz(): run('tests/case04.txt.gz', 'tests/case04.mat', effect='OR') 32 | -------------------------------------------------------------------------------- /tests/test_duplicated.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def test_func(): 5 | # test passes on pandas.__version__ == u'0.18.0' 6 | df = pd.DataFrame([[1, 2, 1, 1], 7 | [2, 2, 2, 2], 8 | [1, 2, 3, 1], 9 | [3, 2, 4, 2]], columns=['A', 'B', 'C', 'D']) 10 | 11 | assert all(df.duplicated('A', keep=False) == [True, False, True, False]) 12 | assert all(df.duplicated('B', keep=False) == [True, True, True, True]) 13 | assert all(df.duplicated('C', keep=False) == [False, False, False, False]) 14 | assert all(df.duplicated('D', keep=False) == [True, 
True, True, True]) 15 | 16 | assert all(df.duplicated('A') == [False, False, True, False]) 17 | assert all(df.duplicated('B') == [False, True, True, True]) 18 | assert all(df.duplicated('C') == [False, False, False, False]) 19 | assert all(df.duplicated('D') == [False, False, True, True]) 20 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | __version__ = '__0.1__' 2 | --------------------------------------------------------------------------------
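A minimal usage sketch (not part of the repository; the file names SCZ.csv, ref_snps.tsv and SCZ.mat are hypothetical, and --trait is left at its default so the saved variables are named 'logpvec', 'zvec' and 'nvec') showing how the output of sumstats2mat.py can be inspected:

# Convert a summary-stats csv to MATLAB format (mirrors the argparse options above):
#   python sumstats2mat.py --sumstats SCZ.csv --ref ref_snps.tsv --out SCZ.mat
import numpy as np
import scipy.io as sio

mat = sio.loadmat('SCZ.mat')  # hypothetical output file
logpvec, zvec, nvec = mat['logpvec'], mat['zvec'], mat['nvec']
# Rows follow the SNP order of --ref; SNPs absent from the sumstats file, and
# z-scores of strand-ambiguous SNPs, are NaN.
print('{} of {} reference SNPs have a defined z-score'.format(
    int(np.isfinite(zvec).sum()), zvec.shape[0]))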