├── LDSC
│   ├── LDSC.nf
│   └── nextflow.config
├── README.md
├── SumHer
│   ├── SumHer.nf
│   ├── bin
│   │   ├── format.py
│   │   ├── intersect.py
│   │   ├── merge.py
│   │   └── mhc.py
│   └── nextflow.config
├── pheno-list.json
└── tab2pheno-list.py
/LDSC/LDSC.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | import groovy.json.JsonSlurper 4 | 5 | phenotypes_json = (new JsonSlurper()).parseText(file('pheno-list.json').text) 6 | 7 | munge_params = [] 8 | phenotypes_json.eachWithIndex{ val, num -> munge_params.add( [num, val.phenocode.replaceAll('\\.', '__'), val.assoc_files[0], val.num_cases, val.num_controls] ) } 9 | 10 | 11 | munge_params = Channel.from(munge_params) 12 | 13 | munge_exec = params.LDSC + "/munge_sumstats.py" 14 | ldsc_exec = params.LDSC + "/ldsc.py" 15 | 16 | process munge { 17 | 18 | label "small_mem" 19 | 20 | input: 21 | set val(num), val(phenocode), val(filename), val(n_cases), val(n_controls) from munge_params 22 | 23 | output: 24 | set val(num), val(phenocode), file("${phenocode}.sumstats.gz") into munged 25 | 26 | """ 27 | ${munge_exec} --chunksize 100000 --sumstats ${filename} --merge-alleles ${params.LDSC_snplist} --N-cas ${n_cases} --N-con ${n_controls} --p ${params.columns.pvalue} --signed-sumstats ${params.columns.effect},${params.no_effect} --snp ${params.columns.snp} --a1 ${params.columns.effect_allele} --a2 ${params.columns.other_allele} --out ${phenocode} 28 | """ 29 | 30 | } 31 | 32 | 33 | munged_a = Channel.create() 34 | munged_b = Channel.create() 35 | munged.into(munged_a, munged_b) 36 | 37 | 38 | process pair_corr { 39 | 40 | label "small_mem" 41 | 42 | input: 43 | set val(num1), val(phenocode1), file(munged1), val(num2), val(phenocode2), file(munged2) from munged_a.combine(munged_b).filter{ it[0] < it[3] } 44 | 45 | output: 46 | file "${phenocode1}.${phenocode2}.log" into pair_corr 47 | 48 | """ 49 | ${ldsc_exec} --rg ${munged1},${munged2} --ref-ld-chr ${params.LDSC_scores} --w-ld-chr ${params.LDSC_scores} --out ${phenocode1}.${phenocode2} 50 | """ 51 | 52 | } 53 | 54 | 55 | process merge { 56 | 57 | label "big_mem" 58 | 59 | publishDir 'results' 60 | 61 | input: 62 | val files from pair_corr.collect{ "'" + it + "'" } 63 | 64 | output: 65 | file "ALL.RG.txt" into merged 66 | 67 | """ 68 | #!/usr/bin/env python 69 | import re 70 | import os 71 | def read_pair_corr(filename): 72 | with open(filename, 'r') as f_in: 73 | while True: 74 | line = f_in.readline() 75 | if not line: break 76 | if line.startswith('Summary of Genetic Correlation Results'): break 77 | header = f_in.readline() 78 | if not header: return {} 79 | header = header.rstrip().split() 80 | record = f_in.readline() 81 | if not record: return {} 82 | record = dict(zip(header, record.rstrip().split())) 83 | for p in ['p1', 'p2']: 84 | record[p] = re.sub(r'\\.sumstats\\.gz\$', '', os.path.split(record[p])[-1]).replace('__', '.') 85 | return { 'header': header, 'record': record } 86 | def write_merged(data, filename): 87 | with open(filename, 'w') as f_out: 88 | header = data[0]['header'] 89 | f_out.write('{}\\n'.format('\\t'.join(header))) 90 | for d in data: 91 | f_out.write('{}\\n'.format('\\t'.join([d['record'][h] for h in header]))) 92 | merged = [] 93 | for f in $files: 94 | corr = read_pair_corr(f) 95 | if not corr: continue 96 | merged.append(corr) 97 | write_merged(merged, "ALL.RG.txt") 98 | """ 99 | } 100 | 101 | -------------------------------------------------------------------------------- /LDSC/nextflow.config: 
-------------------------------------------------------------------------------- 1 | params { 2 | LDSC = "/exports/dtaliun/PheWeb/ldsc" 3 | LDSC_snplist = "/exports/dtaliun/PheWeb/ldsc/w_hm3.snplist" 4 | LDSC_scores = "/exports/dtaliun/PheWeb/ldsc/eur_w_ld_chr/" 5 | columns { 6 | pvalue = "pval" 7 | effect = "beta" 8 | snp = "ID" 9 | effect_allele = "ALT" 10 | other_allele = "REF" 11 | } 12 | no_effect = 0 13 | } 14 | 15 | process { 16 | // uncomment "slurm" when running on SLURM cluster. Change "queue" as needed. 17 | // executor = "slurm" 18 | executor = "local" 19 | queue = "main" 20 | withLabel: "small_mem" { 21 | cpus = 1 22 | time = "1d" 23 | memory = "8GB" 24 | } 25 | withLabel: "big_mem" { 26 | cpus = 1 27 | time = "1d" 28 | memory = "16GB" 29 | } 30 | } 31 | 32 | executor { 33 | $slurm { 34 | queueSize = 1000 35 | } 36 | $local { 37 | cpus = 4 // set number of CPUs to use when running on a single machine 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pheweb-rg-pipeline 2 | 3 | Pipeline for calculating genetic correlations between >1,000 phenotypes in PheWeb using summary statistics. 4 | Genetic correlation is on the observed scale (i.e. not the liability scale). 5 | 6 | The pipeline allows you to choose between the following tools: 7 | - LDSC (https://github.com/bulik/ldsc) 8 | - SumHer (http://dougspeed.com/sumher/) 9 | 10 | The pipeline can be run locally or on a SLURM cluster. 11 | 12 | ## Required tools 13 | - Python 3 (recommended) 14 | - Nextflow (https://www.nextflow.io) 15 | * can be installed as a standalone tool (https://www.nextflow.io/docs/latest/getstarted.html#installation), or 16 | * using Miniconda (https://anaconda.org/bioconda/nextflow) 17 | - LDSC (https://github.com/bulik/ldsc), when computing genetic correlations via LDSC 18 | - SumHer (http://dougspeed.com/sumher/), when computing genetic correlations via SumHer 19 | 20 | ## Required data 21 | - Summary statistics are derived from association analyses run in primarily European-ancestry samples. 22 | - Summary statistics contain separate columns for effect size, p-value, effect allele (a1), and non-effect allele (a2). 23 | - N cases and N controls (for binary traits) are provided. 24 | - Sample size N > 3,000. 25 | - When using LDSC: 26 | - w_hm3.snplist #HapMap SNPs to extract for LDSC analyses (see https://github.com/bulik/ldsc/wiki/Heritability-and-Genetic-Correlation) 27 | - eur_w_ld_chr/ #pre-computed LD scores using 1000G Eur (see https://github.com/bulik/ldsc/wiki/Heritability-and-Genetic-Correlation) 28 | - When using SumHer: 29 | - 1000 Genomes based reference panel in PLINK binary format (i.e. can use the 1000 Genomes EUR Phase 3 files provided by LDSC at https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_plinkfiles.tgz) 30 | 31 | ## How to run 32 | 33 | ### Input 34 | 35 | The input file must be named `pheno-list.json`. It has the same format as in the PheWeb data import pipeline. 36 | ```json 37 | [ 38 | { 39 | "assoc_files": ["/home/watman/ear-length.epacts.gz"], 40 | "phenocode": "ear-length", 41 | "num_cases": 10000, 42 | "num_controls": 100 43 | }, 44 | { 45 | "assoc_files": ["/home/watman/eats-kimchi.autosomal.epacts.gz"], 46 | "phenocode": "eats-kimchi", 47 | "num_cases": 14000, 48 | "num_controls": 100 49 | } 50 | ] 51 | ``` 52 | 53 | The key differences are: 54 | - Fields `num_cases` and `num_controls` are required 55 | - Only one file per trait is allowed in `assoc_files` (i.e. 
cannot split the summary statistics by chromosome) 56 | 57 | If you have a tab-delimited file (no header) with the following columns: full path to summary statistics file, phenocode, number of cases, number of controls, use `tab2pheno-list.py -i [file.tsv]` to create `pheno-list.json`. 58 | 59 | Further details on how to create the input file are at https://github.com/statgen/pheweb. 60 | 61 | ### Run LDSC 62 | 63 | #### - Configuration 64 | 65 | Before running the pipeline you may need to change your `LDSC/nextflow.config` file: 66 | - Specify the path to the directory where LDSC is installed in the `LDSC` field. 67 | - Specify the path to the HapMap SNP list (i.e. w_hm3.snplist) in the `LDSC_snplist` field. 68 | - Specify the path to the LD scores directory (i.e. eur_w_ld_chr/) in the `LDSC_scores` field. 69 | - Provide column names inside the `columns` configuration scope. 70 | - Set `no_effect` to 0 if analyzing regression coefficients or 1 if analyzing odds ratios. 71 | 72 | #### - Locally 73 | 74 | Inside the `LDSC/nextflow.config` file, set the number of CPUs you want to use via the `cpus` parameter: 75 | ``` 76 | ... 77 | $local { 78 | cpus = 4 79 | } 80 | ... 81 | ``` 82 | 83 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). Then, in the same directory run: 84 | ``` 85 | nextflow run /path/to/LDSC.nf 86 | ``` 87 | 88 | #### - With SLURM 89 | 90 | Inside the `LDSC/nextflow.config` file, uncomment the `executor = "slurm"` line and comment out the `executor = "local"` line: 91 | ``` 92 | executor = "slurm" 93 | // executor = "local" 94 | ``` 95 | Set the SLURM queue name via the `queue` parameter. 96 | 97 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). Then, in the same directory run: 98 | ``` 99 | nextflow run /path/to/LDSC.nf 100 | ``` 101 | 102 | ### Run SumHer 103 | 104 | #### - Configuration 105 | 106 | Before running the pipeline you may need to change your `SumHer/nextflow.config` file: 107 | - Specify the path to the LDAK executable in the `LDAK` field. 108 | - Specify the path to the directory with the reference panel (in PLINK binary format, i.e. bed/bim/fam files; may be split by chromosome) in the `ref_panel` field. 109 | 110 | #### - Locally 111 | 112 | Inside the `SumHer/nextflow.config` file, set the number of CPUs you want to use via the `cpus` parameter, e.g.: 113 | ``` 114 | ... 115 | $local { 116 | cpus = 4 117 | } 118 | ... 119 | ``` 120 | 121 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). Then, in the same directory run: 122 | ``` 123 | nextflow run /path/to/SumHer.nf 124 | ``` 125 | 126 | #### - With SLURM 127 | 128 | Inside the `SumHer/nextflow.config` file: 129 | 1. Uncomment the `executor = "slurm"` line and comment out the `executor = "local"` line, e.g.: 130 | ``` 131 | executor = "slurm" 132 | // executor = "local" 133 | ``` 134 | 2. Set the SLURM queue name via the `queue` parameter. 135 | 3. Set the maximum number of parallel SLURM jobs via the `queueSize` parameter, e.g.: 136 | ``` 137 | $slurm { 138 | queueSize = 1000 139 | } 140 | ``` 141 | 142 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). 
Then, in the same directory run: 143 | ``` 144 | nextflow run /path/to/SumHer.nf 145 | ``` 146 | 147 | 148 | ### Output 149 | 150 | Your final output (the matrix of correlations among all the traits) is written to `results/ALL.RG.txt` inside the directory from which you launched the pipeline. 151 | 152 | The pipeline creates the following directories: 153 | - `work`: `Nextflow` working directory with output files from all steps. 154 | - `results`: directory with the final merged result 155 | -------------------------------------------------------------------------------- /SumHer/SumHer.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | import groovy.json.JsonSlurper 4 | 5 | phenotypes_json = (new JsonSlurper()).parseText(file('pheno-list.json').text) 6 | 7 | format_params = [] 8 | phenotypes_json.eachWithIndex{ val, num -> format_params.add( [num, val.phenocode.replaceAll('\\.', '__'), val.assoc_files[0]] ) } 9 | 10 | bim = Channel.fromPath(params.ref_panel + "/*.bim").collect() 11 | bed = Channel.fromPath(params.ref_panel + "/*.bed").collect() 12 | fam = Channel.fromPath(params.ref_panel + "/*.fam").collect() 13 | 14 | ldak_exec = params.LDAK 15 | 16 | 17 | process mhc { 18 | 19 | label "small_mem" 20 | errorStrategy "retry" 21 | maxRetries 3 22 | 23 | input: 24 | file bim from bim 25 | 26 | output: 27 | file "mhc.variants" into mhc 28 | 29 | """ 30 | mhc.py -i ${bim} -b GRCh37 -o mhc.variants 31 | """ 32 | 33 | } 34 | 35 | 36 | process format { 37 | 38 | label "small_mem" 39 | errorStrategy "retry" 40 | maxRetries 3 41 | 42 | input: 43 | set val(num), val(phenocode), val(filename) from format_params 44 | file bed from bed 45 | file bim from bim 46 | file fam from fam 47 | file mhc from mhc 48 | 49 | output: 50 | set val(num), val(phenocode), file("${phenocode}.stats"), file("${phenocode}.nonamb"), file("${phenocode}.exclude") into formatted 51 | 52 | """ 53 | format.py -i ${filename} -o ${phenocode} 54 | if [ -s ${phenocode}.big ]; then 55 | i=0 56 | for f in ${bed}; do 57 | i=\$((i+1)) 58 | ${ldak_exec} --remove-tags ${phenocode}.\${i} --bfile \${f%*.bed} --top-preds ${phenocode}.big --window-kb 1000 --min-cor 0.1 59 | done 60 | cat ${phenocode}.*.out > ${phenocode}.out 61 | else 62 | touch ${phenocode}.out 63 | fi 64 | cat ${mhc} ${phenocode}.out > ${phenocode}.exclude 65 | """ 66 | 67 | } 68 | 69 | 70 | formatted2intersection = Channel.create() 71 | formatted2unique_a = Channel.create() 72 | formatted2unique_b = Channel.create() 73 | formatted2pair_corr_a = Channel.create() 74 | formatted2pair_corr_b = Channel.create() 75 | 76 | 77 | formatted.separate(formatted2intersection, formatted2unique_a, formatted2unique_b, formatted2pair_corr_a, formatted2pair_corr_b) { 78 | it -> [ it[3], [it[0], it[4]], [it[0], it[4]], [it[0], it[1], it[2], it[4]], [it[0], it[1], it[2], it[4]] ] 79 | } 80 | 81 | 82 | process intersection { 83 | 84 | label "small_mem" 85 | errorStrategy "retry" 86 | maxRetries 3 87 | 88 | input: 89 | file nonambs from formatted2intersection.collect() 90 | 91 | output: 92 | file ("intersection.nonamb") into intersected 93 | 94 | """ 95 | intersect.py -i ${nonambs} -o "intersection.nonamb" 96 | """ 97 | 98 | } 99 | 100 | 101 | process unique { 102 | 103 | executor 'local' 104 | 105 | input: 106 | set file(exclude1), file(exclude2) from formatted2unique_a.combine(formatted2unique_b).filter{ it[0] < it[2] }.map { [it[1], it[3]] } 107 | 108 | output: 109 | set file(exclude1), file(exclude2), stdout into unique 110 | 111 | """ 112 | printf `cat ${exclude1} ${exclude2} 
| sort | uniq | md5sum | awk '{print \$1;}'` 113 | """ 114 | 115 | } 116 | 117 | 118 | process tagging_chr { 119 | 120 | label "small_mem" 121 | errorStrategy "retry" 122 | maxRetries 3 123 | 124 | input: 125 | file intersected from intersected 126 | set file(exclude1), file(exclude2), val(mdsum) from unique.unique { it[2] } 127 | file bed from bed 128 | file bim from bim 129 | file fam from fam 130 | each chr from Channel.from(1..22) 131 | 132 | output: 133 | set val(mdsum), file("sumldak_${mdsum}_${chr}.tagging") into tagged_chr 134 | 135 | """ 136 | cat ${exclude1} ${exclude2} | sort | uniq > combined.exclude 137 | bim_file=`grep -l "^${chr}\\s" *.bim` 138 | [ -z \$bim_file ] && echo "BIM file for chromosome ${chr} was not found!" && exit 1 139 | ${ldak_exec} --cut-weights weights_${mdsum}_${chr} --bfile \${bim_file%*.bim} --extract ${intersected} --exclude combined.exclude --chr ${chr} 140 | ${ldak_exec} --calc-weights-all weights_${mdsum}_${chr} --bfile \${bim_file%*.bim} --extract ${intersected} --exclude combined.exclude --chr ${chr} 141 | ${ldak_exec} --calc-tagging sumldak_${mdsum}_${chr} --bfile \${bim_file%*.bim} --weights weights_${mdsum}_${chr}/weights.short --power -0.25 --extract ${intersected} --exclude combined.exclude --window-kb 1000 --chr ${chr} 142 | """ 143 | 144 | } 145 | 146 | 147 | process tagging_merge { 148 | 149 | label "small_mem" 150 | errorStrategy "retry" 151 | maxRetries 3 152 | 153 | input: 154 | set val(mdsum), file(tagged_chr) from tagged_chr.groupTuple() 155 | 156 | output: 157 | file "sumldak_${mdsum}.tagging" into tagged 158 | 159 | """ 160 | find . -maxdepth 1 -name "sumldak_${mdsum}_*.tagging" -printf "%f\n" | sort -V > list.txt 161 | ${ldak_exec} --join-tagging sumldak_${mdsum} --taglist list.txt 162 | """ 163 | 164 | } 165 | 166 | 167 | process pair_corr { 168 | 169 | label "small_mem" 170 | errorStrategy "retry" 171 | maxRetries 3 172 | 173 | input: 174 | set val(phenocode1), file(stats1), file(exclude1), val(phenocode2), file(stats2), file(exclude2) from formatted2pair_corr_a.combine(formatted2pair_corr_b).filter{ it[0] < it[4] }.map { [it[1], it[2], it[3], it[5], it[6], it[7]] } 175 | file tagged from tagged.collect() 176 | 177 | output: 178 | file "${phenocode1}.${phenocode2}.cors" into pair_corr 179 | 180 | """ 181 | mdsum=`cat ${exclude1} ${exclude2} | sort | uniq | md5sum | awk '{print \$1;}'` 182 | tagfile="sumldak_\${mdsum}.tagging" 183 | ${ldak_exec} --sum-cors ${phenocode1}.${phenocode2} --tagfile \${tagfile} --summary ${stats1} --summary2 ${stats2} --genomic-control YES --check-sums NO 184 | """ 185 | 186 | } 187 | 188 | 189 | process merge { 190 | 191 | label "small_mem" 192 | errorStrategy "retry" 193 | maxRetries 3 194 | 195 | publishDir "results" 196 | 197 | input: 198 | file files from pair_corr.collectFile() { item -> ["list.txt", "${item}\n"] } 199 | 200 | output: 201 | file "ALL.RG.txt" into merged 202 | 203 | """ 204 | merge.py -i ${files} -o ALL.RG.txt 205 | """ 206 | } 207 | 208 | -------------------------------------------------------------------------------- /SumHer/bin/format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import gzip 5 | 6 | argparser = argparse.ArgumentParser(description = 'Formats summary statistics') 7 | argparser.add_argument('-i', '--in', metavar = 'file', dest = 'in_file', required = True, help = 'Input file with summary statistics, compressed using gzip/bgzip.') 8 | argparser.add_argument('-o', '--out', 
metavar = 'prefix', dest = 'out_prefix', required = True, help = 'Prefix for output files: .stats, .nonamb, .big.') 9 | argparser.add_argument('-s', '--snp', metavar = 'column', dest = 'snp', default = 'ID', help = 'Column name for SNP.') 10 | argparser.add_argument('-ea', '--effect-allele', metavar = 'column', dest = 'ea', default = 'ALT', help = 'Column name for effect allele.') 11 | argparser.add_argument('-oa', '--other-allele', metavar = 'column', dest = 'oa', default = 'REF', help = 'Column name for other allele.') 12 | argparser.add_argument('-e', '--effect-size', metavar = 'column', dest = 'beta', default = 'beta', help = 'Column name for effect size (log odds).') 13 | argparser.add_argument('-se', '--se-effect', metavar = 'column', dest = 'se', default = 'sebeta', help = 'Column name for standard error of effect size.') 14 | argparser.add_argument('-n', '--num-samples', metavar = 'column', nargs = '+', dest = 'n', default = ['num_cases', 'num_controls'], help = 'Column name(s) for number of samples. If multiple columns are specified, they are summed up to get the total sample size.') 15 | 16 | 17 | if __name__ == '__main__': 18 | args = argparser.parse_args() 19 | required_columns = [ args.snp, args.ea, args.oa, args.beta, args.se ] + args.n 20 | stats_filename = f'{args.out_prefix}.stats' 21 | pred_filename = f'{args.out_prefix}.nonamb' 22 | le_pred_filename = f'{args.out_prefix}.big' 23 | unique_names = set() 24 | with gzip.open(args.in_file, 'rt') as ifile, \ 25 | open(stats_filename, 'w') as stats_ofile, \ 26 | open(pred_filename, 'w') as pred_ofile, \ 27 | open(le_pred_filename, 'w') as le_pred_ofile: 28 | iheader = ifile.readline() 29 | if not iheader: 30 | raise Exception(f'Empty header in {args.in_file}.') 31 | iheader = iheader.rstrip().split() 32 | for c in required_columns: 33 | if c not in iheader: 34 | raise Exception(f'Missing {c} column in {args.in_file}.') 35 | stats_ofile.write('Predictor A1 A2 Direction Stat n\n') 36 | for i, iline in enumerate(ifile, 2): 37 | iline = iline.rstrip().split() 38 | if len(iline) != len(iheader): 39 | raise Exception(f'Number of columns on line {i} does not match header.') 40 | record = dict(zip(iheader, iline)) 41 | if record[args.snp] in unique_names: # skip if ID is duplicated 42 | continue 43 | unique_names.add(record[args.snp]) 44 | a1 = record[args.ea].upper() 45 | a2 = record[args.oa].upper() 46 | try: 47 | n = sum(map(int, (record[c] for c in args.n))) 48 | stat = (float(record[args.beta]) / float(record[args.se])) ** 2 49 | except (ValueError, ZeroDivisionError): # catch possible type conversion or division by zero errors 50 | continue 51 | if stat > n / 99.0: 52 | le_pred_ofile.write('{}\n'.format(record[args.snp])) 53 | if not (a1 in {'A', 'T'} and a2 in {'A', 'T'}) and not (a1 in {'C', 'G'} and a2 in {'C', 'G'}): 54 | pred_ofile.write('{}\n'.format(record[args.snp])) 55 | stats_ofile.write('{} {} {} {} {} {}\n'.format(record[args.snp], a1, a2, record[args.beta], stat, n)) 56 | -------------------------------------------------------------------------------- /SumHer/bin/intersect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | from collections import Counter 5 | 6 | argparser = argparse.ArgumentParser(description = 'Intersects files preserving the order. Note: suitable only for very short lines (e.g. 
rsId).') 7 | argparser.add_argument('-i', '--input', metavar = 'file', dest = 'in_files', nargs = '+', required = True, help = 'Input files.') 8 | argparser.add_argument('-o', '--out', metavar = 'file', dest = 'out_file', required = True, help = 'Output file.') 9 | 10 | 11 | if __name__ == '__main__': 12 | args = argparser.parse_args() 13 | n_files = len(args.in_files) 14 | names = Counter() 15 | for in_file in args.in_files: 16 | with open(in_file, 'rb') as ifile: 17 | names.update(ifile) 18 | with open(args.out_file, 'wb') as ofile: 19 | with open(args.in_files[0], 'rb') as ifile: 20 | for line in ifile: 21 | if names[line] == n_files: 22 | ofile.write(line) 23 | -------------------------------------------------------------------------------- /SumHer/bin/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | 6 | argparser = argparse.ArgumentParser(description = 'Merges SumHer output files.') 7 | argparser.add_argument('-i', '--input', metavar = 'file', dest = 'in_list_file', required = True, help = 'Input file with a list of SumHer output files (one per line).') 8 | argparser.add_argument('-o', '--out', metavar = 'file', dest = 'out_file', required = True, help = 'Output file.') 9 | 10 | 11 | def read_sumher_output(in_file): 12 | pheno1, pheno2, suffix = os.path.split(in_file)[-1].split('.') 13 | pheno1, pheno2 = map(lambda x: x.replace('__', '.'), [pheno1, pheno2]) 14 | with open(in_file, 'rt') as ifile: 15 | header = ifile.readline() 16 | if not header: return {} 17 | header = header.rstrip().split() 18 | if not all(h in header for h in ['Component', 'Value', 'SD']): return {} 19 | t_header = ['Pheno1', 'Pheno2'] 20 | t_row = [pheno1, pheno2] 21 | for line in ifile: 22 | line = line.rstrip() 23 | if not line: continue 24 | columns = line.split() 25 | if len(columns) != len(header): continue 26 | columns = dict(zip(header, columns)) 27 | t_header.append(columns['Component']) 28 | t_row.append(columns['Value']) 29 | t_header.append(columns['Component'] + '_SD') 30 | t_row.append(columns['SD']) 31 | return { 'header': t_header, 'record': t_row } 32 | 33 | 34 | if __name__ == '__main__': 35 | args = argparser.parse_args() 36 | header = False 37 | in_files = [] 38 | with open(args.in_list_file, 'rt') as ifile: 39 | for line in ifile: 40 | line = line.strip() 41 | if not line: continue 42 | in_files.append(line) 43 | with open(args.out_file, 'wt') as ofile: 44 | for in_file in in_files: 45 | corr = read_sumher_output(in_file) 46 | if not corr: continue 47 | if not header: 48 | ofile.write('{}\n'.format('\t'.join(corr['header']))) 49 | header = True 50 | ofile.write('{}\n'.format('\t'.join(corr['record']))) 51 | -------------------------------------------------------------------------------- /SumHer/bin/mhc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import gzip 5 | 6 | argparser = argparse.ArgumentParser(description = 'Lists MHC variants in *.bim files.') 7 | argparser.add_argument('-i', '--in', metavar = 'file', dest = 'in_files', nargs = '+', required = True, help = 'Input *.bim file(s).') 8 | argparser.add_argument('-b', '--genome-build', metavar = 'name', dest = 'build', choices = ['GRCh37', 'GRCh38'], required = True, help = 'Human genome build version: GRCh37, GRCh38.') 9 | argparser.add_argument('-o', '--out', metavar = 'file', dest = 'out_file', required = True, help = 'Output file.') 10 
| 11 | mhc = { 12 | 'GRCh37': { 'chrom': '6', 'start': 28477797, 'stop': 33448354 }, 13 | 'GRCh38': { 'chrom': '6', 'start': 28510120, 'stop': 33480577 } 14 | } 15 | 16 | if __name__ == '__main__': 17 | args = argparser.parse_args() 18 | mhc_chrom = mhc[args.build]['chrom'] 19 | mhc_start = mhc[args.build]['start'] 20 | mhc_stop = mhc[args.build]['stop'] 21 | with open(args.out_file, 'wt') as ofile: 22 | for in_file in args.in_files: 23 | with open(in_file, 'rt') as ifile: 24 | for line in ifile: 25 | chrom, name, morgans, bp, a1, a2 = line.rstrip().split() 26 | if chrom == mhc_chrom: 27 | bp = int(bp) 28 | if bp >= mhc_start and bp <= mhc_stop: 29 | ofile.write('{}\n'.format(name)) 30 | -------------------------------------------------------------------------------- /SumHer/nextflow.config: -------------------------------------------------------------------------------- 1 | params { 2 | LDAK = "/exports/dtaliun/PheWeb/ldak5.linux" 3 | ref_panel = "/exports/dtaliun/PheWeb/ldsc/1000G_EUR_Phase3_plink" 4 | columns { 5 | pvalue = "pval" 6 | effect = "beta" 7 | snp = "ID" 8 | effect_allele = "ALT" 9 | other_allele = "REF" 10 | } 11 | no_effect = 0 12 | } 13 | 14 | process { 15 | // uncomment "slurm" when running on SLURM cluster. Change "queue" as needed. 16 | // executor = "slurm" 17 | executor = "local" 18 | queue = "main" 19 | withLabel: "small_mem" { 20 | cpus = 1 21 | time = "2d" 22 | memory = "8GB" 23 | } 24 | withLabel: "big_mem" { 25 | cpus = 1 26 | time = "2d" 27 | memory = "16GB" 28 | } 29 | } 30 | 31 | executor { 32 | $slurm { 33 | queueSize = 1000 34 | } 35 | $local { 36 | cpus = 22 // set number of CPUs to use when running on a single machine 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pheno-list.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "assoc_files": ["/home/data/PheCode_990_SAIGE_MACge20.txt.vcf.gz"], 4 | "phenocode": "990", 5 | "num_cases": 10000, 6 | "num_controls": 10000 7 | }, 8 | { 9 | "assoc_files": ["/home/data/PheCode_989_SAIGE_MACge20.txt.vcf.gz"], 10 | "phenocode": "989", 11 | "num_cases": 100000, 12 | "num_controls": 100000 13 | }, 14 | { 15 | "assoc_files": ["/home/data/PheCode_974_SAIGE_MACge20.txt.vcf.gz"], 16 | "phenocode": "974", 17 | "num_cases": 100000, 18 | "num_controls": 100000 19 | }, 20 | { 21 | "assoc_files": ["/home/data/PheCode_964_SAIGE_MACge20.txt.vcf.gz"], 22 | "phenocode": "964", 23 | "num_cases": 100000, 24 | "num_controls": 100000 25 | } 26 | ] 27 | 28 | -------------------------------------------------------------------------------- /tab2pheno-list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | argparser = argparse.ArgumentParser(description = 'Generates pheno-list.json file from tab-delimited file. Warning: will overwrite existing pheno-list.json file.') 5 | argparser.add_argument('-i', '--in-tsv', metavar = 'file', dest = 'in_tsv', required = True, help = 'Input tab-delimited file. No header. 
Column order: full path to summary stat file, phenocode, number of cases, number of controls.') 6 | 7 | if __name__ == '__main__': 8 | args = argparser.parse_args() 9 | data = [] 10 | with open(args.in_tsv, 'r') as f_in: 11 | for line in f_in: 12 | columns = line.rstrip().split('\t') 13 | if any(len(c.strip()) == 0 for c in columns): 14 | continue 15 | if len(columns) < 4: 16 | continue 17 | path = columns[0] 18 | phenocode = columns[1] 19 | cases = columns[2] 20 | controls = columns[3] 21 | try: 22 | cases_int = int(cases) 23 | except: 24 | raise Exception('Non integer value \'{}\' found in cases column.'.format(cases)) 25 | try: 26 | controls_int = int(controls) 27 | except: 28 | raise Exception('Non integer value \'{}\' found in controls column.'.format(controls)) 29 | data.append({ 30 | 'assoc_files': [path], 31 | 'phenocode': phenocode, 32 | 'num_cases': cases_int, 33 | 'num_controls': controls_int 34 | }) 35 | with open('pheno-list.json', 'w') as f_out: 36 | json.dump(data, f_out, indent = 4) 37 | --------------------------------------------------------------------------------
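
A minimal usage sketch for `tab2pheno-list.py`, following the column order documented in the README (the file name `phenotypes.tsv` and the path inside it are hypothetical):
```
# phenotypes.tsv: tab-delimited, no header, one trait per line, e.g.
# /home/data/PheCode_990_SAIGE_MACge20.txt.vcf.gz	990	10000	10000
python tab2pheno-list.py -i phenotypes.tsv
# writes pheno-list.json to the current directory, overwriting any existing copy
```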