├── LDSC
│   ├── LDSC.nf
│   └── nextflow.config
├── README.md
├── SumHer
│   ├── SumHer.nf
│   ├── bin
│   │   ├── format.py
│   │   ├── intersect.py
│   │   ├── merge.py
│   │   └── mhc.py
│   └── nextflow.config
├── pheno-list.json
└── tab2pheno-list.py
/LDSC/LDSC.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | import groovy.json.JsonSlurper 4 | 5 | phenotypes_json = (new JsonSlurper()).parseText(file('pheno-list.json').text) 6 | 7 | munge_params = [] 8 | phenotypes_json.eachWithIndex{ val, num -> munge_params.add( [num, val.phenocode.replaceAll('\\.', '__'), val.assoc_files[0], val.num_cases, val.num_controls] ) } 9 | 10 | 11 | munge_params = Channel.from(munge_params) 12 | 13 | munge_exec = params.LDSC + "/munge_sumstats.py" 14 | ldsc_exec = params.LDSC + "/ldsc.py" 15 | 16 | process munge { 17 | 18 | label "small_mem" 19 | 20 | input: 21 | set val(num), val(phenocode), val(filename), val(n_cases), val(n_controls) from munge_params 22 | 23 | output: 24 | set val(num), val(phenocode), file("${phenocode}.sumstats.gz") into munged 25 | 26 | """ 27 | ${munge_exec} --chunksize 100000 --sumstats ${filename} --merge-alleles ${params.LDSC_snplist} --N-cas ${n_cases} --N-con ${n_controls} --p ${params.columns.pvalue} --signed-sumstats ${params.columns.effect},${params.no_effect} --snp ${params.columns.snp} --a1 ${params.columns.effect_allele} --a2 ${params.columns.other_allele} --out ${phenocode} 28 | """ 29 | 30 | } 31 | 32 | 33 | munged_a = Channel.create() 34 | munged_b = Channel.create() 35 | munged.into(munged_a, munged_b) 36 | 37 | 38 | process pair_corr { 39 | 40 | label "small_mem" 41 | 42 | input: 43 | set val(num1), val(phenocode1), file(munged1), val(num2), val(phenocode2), file(munged2) from munged_a.combine(munged_b).filter{ it[0] < it[3] } 44 | 45 | output: 46 | file "${phenocode1}.${phenocode2}.log" into pair_corr 47 | 48 | """ 49 | ${ldsc_exec} --rg ${munged1},${munged2} --ref-ld-chr ${params.LDSC_scores} --w-ld-chr ${params.LDSC_scores} --out ${phenocode1}.${phenocode2} 50 | """ 51 | 52 | } 53 | 54 | 55 | process merge { 56 | 57 | label "big_mem" 58 | 59 | publishDir 'results' 60 | 61 | input: 62 | val files from pair_corr.collect{ "'" + it + "'" } 63 | 64 | output: 65 | file "ALL.RG.txt" into merged 66 | 67 | """ 68 | #!/usr/bin/env python 69 | import re 70 | import os 71 | def read_pair_corr(filename): 72 | with open(filename, 'r') as f_in: 73 | while True: 74 | line = f_in.readline() 75 | if not line: break 76 | if line.startswith('Summary of Genetic Correlation Results'): break 77 | header = f_in.readline() 78 | if not header: return {} 79 | header = header.rstrip().split() 80 | record = f_in.readline() 81 | if not record: return {} 82 | record = dict(zip(header, record.rstrip().split())) 83 | for p in ['p1', 'p2']: 84 | record[p] = re.sub(r'\\.sumstats\\.gz\$', '', os.path.split(record[p])[-1]).replace('__', '.') 85 | return { 'header': header, 'record': record } 86 | def write_merged(data, filename): 87 | with open(filename, 'w') as f_out: 88 | header = data[0]['header'] 89 | f_out.write('{}\\n'.format('\\t'.join(header))) 90 | for d in data: 91 | f_out.write('{}\\n'.format('\\t'.join([d['record'][h] for h in header]))) 92 | merged = [] 93 | for f in $files: 94 | corr = read_pair_corr(f) 95 | if not corr: continue 96 | merged.append(corr) 97 | write_merged(merged, "ALL.RG.txt") 98 | """ 99 | } 100 | 101 | -------------------------------------------------------------------------------- /LDSC/nextflow.config: 
-------------------------------------------------------------------------------- 1 | params { 2 | LDSC = "/exports/dtaliun/PheWeb/ldsc" 3 | LDSC_snplist = "/exports/dtaliun/PheWeb/ldsc/w_hm3.snplist" 4 | LDSC_scores = "/exports/dtaliun/PheWeb/ldsc/eur_w_ld_chr/" 5 | columns { 6 | pvalue = "pval" 7 | effect = "beta" 8 | snp = "ID" 9 | effect_allele = "ALT" 10 | other_allele = "REF" 11 | } 12 | no_effect = 0 13 | } 14 | 15 | process { 16 | // uncomment "slurm" when running on SLURM cluster. Change "queue" as needed. 17 | // executor = "slurm" 18 | executor = "local" 19 | queue = "main" 20 | withLabel: "small_mem" { 21 | cpus = 1 22 | time = "1d" 23 | memory = "8GB" 24 | } 25 | withLabel: "big_mem" { 26 | cpus = 1 27 | time = "1d" 28 | memory = "16GB" 29 | } 30 | } 31 | 32 | executor { 33 | $slurm { 34 | queueSize = 1000 35 | } 36 | $local { 37 | cpus = 4 // set number of CPUs to use when running on a single machine 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pheweb-rg-pipeline 2 | 3 | Pipeline for calculating genetic correlations between >1,000 phenotypes in PheWeb using summary statistics. 4 | Genetic correlation is on the observed scale (i.e. not the liability scale). 5 | 6 | The pipeline allows you to choose between the following tools: 7 | - LDSC (https://github.com/bulik/ldsc) 8 | - SumHer (http://dougspeed.com/sumher/) 9 | 10 | The pipeline can be run locally or on a SLURM cluster. 11 | 12 | ## Required tools 13 | - Python 3 (recommended) 14 | - Nextflow (https://www.nextflow.io) 15 | * can be installed as a standalone tool (https://www.nextflow.io/docs/latest/getstarted.html#installation), or 16 | * using Miniconda (https://anaconda.org/bioconda/nextflow) 17 | - LDSC (https://github.com/bulik/ldsc), when computing genetic correlations via LDSC 18 | - SumHer (http://dougspeed.com/sumher/), when computing genetic correlations via SumHer 19 | 20 | ## Required data 21 | - Summary statistics are derived from association analyses run in primarily European-ancestry samples. 22 | - Summary statistics contain separate columns for effect size, p-value, effect allele (a1), and non-effect allele (a2). 23 | - N cases and N controls (for binary traits) are provided. 24 | - Sample size N > 3,000. 25 | - When using LDSC: 26 | - w_hm3.snplist #HapMap SNPs to extract for LDSC analyses (see https://github.com/bulik/ldsc/wiki/Heritability-and-Genetic-Correlation) 27 | - eur_w_ld_chr/ #pre-computed LD scores using 1000G Eur (see https://github.com/bulik/ldsc/wiki/Heritability-and-Genetic-Correlation) 28 | - When using SumHer: 29 | - 1000 Genomes based reference panel in PLINK binary format (i.e. can use the 1000 Genomes EUR Phase 3 files provided by LDSC at https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_plinkfiles.tgz) 30 | 31 | ## How to run 32 | 33 | ### Input 34 | 35 | The input file must be named `pheno-list.json`. It has the same format as in the PheWeb data import pipeline. 36 | ```json 37 | [ 38 | { 39 | "assoc_files": ["/home/watman/ear-length.epacts.gz"], 40 | "phenocode": "ear-length", 41 | "num_cases": 10000, 42 | "num_controls": 100 43 | }, 44 | { 45 | "assoc_files": ["/home/watman/eats-kimchi.autosomal.epacts.gz"], 46 | "phenocode": "eats-kimchi", 47 | "num_cases": 14000, 48 | "num_controls": 100 49 | } 50 | ] 51 | ``` 52 | 53 | The key differences are: 54 | - Fields `num_cases` and `num_controls` are required 55 | - Only one file per trait is allowed in `assoc_files` (i.e. 
cannot split the summary statistics by chromosome) 56 | 57 | If you have a tab-delimited file (no header) with the following columns: full path to summary statistics file, phenocode, number of cases, number of controls, use `tab2pheno-list.py -i [file.tsv]` to create `pheno-list.json`. 58 | 59 | Further details on how to create the input file are at https://github.com/statgen/pheweb. 60 | 61 | ### Run LDSC 62 | 63 | #### - Configuration 64 | 65 | Before running the pipeline you may need to change your `LDSC/nextflow.config` file: 66 | - Specify the path to the directory where LDSC is installed in the `LDSC` field. 67 | - Specify the path to the HapMap SNP list (i.e. w_hm3.snplist) in the `LDSC_snplist` field. 68 | - Specify the path to the LD scores directory (i.e. eur_w_ld_chr/) in the `LDSC_scores` field. 69 | - Provide column names inside the `columns` configuration scope. 70 | - Set `no_effect` to 0 if analyzing regression coefficients or 1 if analyzing odds ratios. 71 | 72 | #### - Locally 73 | 74 | Inside the `LDSC/nextflow.config` file, set the number of CPUs you want to use via the `cpus` parameter: 75 | ``` 76 | ... 77 | $local { 78 | cpus = 4 79 | } 80 | ... 81 | ``` 82 | 83 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). Then, in the same directory run: 84 | ``` 85 | nextflow run /path/to/LDSC.nf 86 | ``` 87 | 88 | #### - With SLURM 89 | 90 | Inside the `LDSC/nextflow.config` file, uncomment the `executor = "slurm"` line and comment out the `executor = "local"` line: 91 | ``` 92 | executor = "slurm" 93 | // executor = "local" 94 | ``` 95 | Set the SLURM queue name via the `queue` parameter. 96 | 97 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). Then, in the same directory run: 98 | ``` 99 | nextflow run /path/to/LDSC.nf 100 | ``` 101 | 102 | ### Run SumHer 103 | 104 | #### - Configuration 105 | 106 | Before running the pipeline you may need to change your `SumHer/nextflow.config` file: 107 | - Specify the path to the LDAK executable in the `LDAK` field. 108 | - Specify the path to the directory with the reference panel (in PLINK binary format, i.e. bed/bim/fam files; may be split by chromosome) in the `ref_panel` field. 109 | 110 | #### - Locally 111 | 112 | Inside the `SumHer/nextflow.config` file, set the number of CPUs you want to use via the `cpus` parameter, e.g.: 113 | ``` 114 | ... 115 | $local { 116 | cpus = 4 117 | } 118 | ... 119 | ``` 120 | 121 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). Then, in the same directory run: 122 | ``` 123 | nextflow run /path/to/SumHer.nf 124 | ``` 125 | 126 | #### - With SLURM 127 | 128 | Inside the `SumHer/nextflow.config` file: 129 | 1. Uncomment the `executor = "slurm"` line and comment out the `executor = "local"` line, e.g.: 130 | ``` 131 | executor = "slurm" 132 | // executor = "local" 133 | ``` 134 | 2. Set the SLURM queue name via the `queue` parameter. 135 | 3. Set the maximum number of parallel SLURM jobs via the `queueSize` parameter, e.g.: 136 | ``` 137 | $slurm { 138 | queueSize = 1000 139 | } 140 | ``` 141 | 142 | Place your input file `pheno-list.json` inside the directory where you want to save results (this will also be the working directory for all intermediate files). 
Then, in the same directory run: 143 | ``` 144 | nextflow run /path/to/SumHer.nf 145 | ``` 146 | 147 | 148 | ### Output 149 | 150 | Your final output (the matrix of correlations among all the traits) is written to `results/ALL.RG.txt` inside the directory from which you launched the pipeline. 151 | 152 | The pipeline creates the following directories: 153 | - `work`: `Nextflow` working directory with output files from all steps. 154 | - `results`: directory with the final merged result 155 | -------------------------------------------------------------------------------- /SumHer/SumHer.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | 3 | import groovy.json.JsonSlurper 4 | 5 | phenotypes_json = (new JsonSlurper()).parseText(file('pheno-list.json').text) 6 | 7 | format_params = [] 8 | phenotypes_json.eachWithIndex{ val, num -> format_params.add( [num, val.phenocode.replaceAll('\\.', '__'), val.assoc_files[0]] ) } 9 | 10 | bim = Channel.fromPath(params.ref_panel + "/*.bim").collect() 11 | bed = Channel.fromPath(params.ref_panel + "/*.bed").collect() 12 | fam = Channel.fromPath(params.ref_panel + "/*.fam").collect() 13 | 14 | ldak_exec = params.LDAK 15 | 16 | 17 | process mhc { 18 | 19 | label "small_mem" 20 | errorStrategy "retry" 21 | maxRetries 3 22 | 23 | input: 24 | file bim from bim 25 | 26 | output: 27 | file "mhc.variants" into mhc 28 | 29 | """ 30 | mhc.py -i ${bim} -b GRCh37 -o mhc.variants 31 | """ 32 | 33 | } 34 | 35 | 36 | process format { 37 | 38 | label "small_mem" 39 | errorStrategy "retry" 40 | maxRetries 3 41 | 42 | input: 43 | set val(num), val(phenocode), val(filename) from format_params 44 | file bed from bed 45 | file bim from bim 46 | file fam from fam 47 | file mhc from mhc 48 | 49 | output: 50 | set val(num), val(phenocode), file("${phenocode}.stats"), file("${phenocode}.nonamb"), file("${phenocode}.exclude") into formatted 51 | 52 | """ 53 | format.py -i ${filename} -o ${phenocode} 54 | if [ -s ${phenocode}.big ]; then 55 | i=0 56 | for f in ${bed}; do 57 | i=\$((i+1)) 58 | ${ldak_exec} --remove-tags ${phenocode}.\${i} --bfile \${f%*.bed} --top-preds ${phenocode}.big --window-kb 1000 --min-cor 0.1 59 | done 60 | cat ${phenocode}.*.out > ${phenocode}.out 61 | else 62 | touch ${phenocode}.out 63 | fi 64 | cat ${mhc} ${phenocode}.out > ${phenocode}.exclude 65 | """ 66 | 67 | } 68 | 69 | 70 | formatted2intersection = Channel.create() 71 | formatted2unique_a = Channel.create() 72 | formatted2unique_b = Channel.create() 73 | formatted2pair_corr_a = Channel.create() 74 | formatted2pair_corr_b = Channel.create() 75 | 76 | 77 | formatted.separate(formatted2intersection, formatted2unique_a, formatted2unique_b, formatted2pair_corr_a, formatted2pair_corr_b) { 78 | it -> [ it[3], [it[0], it[4]], [it[0], it[4]], [it[0], it[1], it[2], it[4]], [it[0], it[1], it[2], it[4]] ] 79 | } 80 | 81 | 82 | process intersection { 83 | 84 | label "small_mem" 85 | errorStrategy "retry" 86 | maxRetries 3 87 | 88 | input: 89 | file nonambs from formatted2intersection.collect() 90 | 91 | output: 92 | file ("intersection.nonamb") into intersected 93 | 94 | """ 95 | intersect.py -i ${nonambs} -o "intersection.nonamb" 96 | """ 97 | 98 | } 99 | 100 | 101 | process unique { 102 | 103 | executor 'local' 104 | 105 | input: 106 | set file(exclude1), file(exclude2) from formatted2unique_a.combine(formatted2unique_b).filter{ it[0] < it[2] }.map { [it[1], it[3]] } 107 | 108 | output: 109 | set file(exclude1), file(exclude2), stdout into unique 110 | 111 | """ 112 | printf `cat ${exclude1} ${exclude2} 
| sort | uniq | md5sum | awk '{print \$1;}'` 113 | """ 114 | 115 | } 116 | 117 | 118 | process tagging_chr { 119 | 120 | label "small_mem" 121 | errorStrategy "retry" 122 | maxRetries 3 123 | 124 | input: 125 | file intersected from intersected 126 | set file(exclude1), file(exclude2), val(mdsum) from unique.unique { it[2] } 127 | file bed from bed 128 | file bim from bim 129 | file fam from fam 130 | each chr from Channel.from(1..22) 131 | 132 | output: 133 | set val(mdsum), file("sumldak_${mdsum}_${chr}.tagging") into tagged_chr 134 | 135 | """ 136 | cat ${exclude1} ${exclude2} | sort | uniq > combined.exclude 137 | bim_file=`grep -l "^${chr}\\s" *.bim` 138 | [ -z \$bim_file ] && echo "BIM file for chromosome ${chr} was not found!" && exit 1 139 | ${ldak_exec} --cut-weights weights_${mdsum}_${chr} --bfile \${bim_file%*.bim} --extract ${intersected} --exclude combined.exclude --chr ${chr} 140 | ${ldak_exec} --calc-weights-all weights_${mdsum}_${chr} --bfile \${bim_file%*.bim} --extract ${intersected} --exclude combined.exclude --chr ${chr} 141 | ${ldak_exec} --calc-tagging sumldak_${mdsum}_${chr} --bfile \${bim_file%*.bim} --weights weights_${mdsum}_${chr}/weights.short --power -0.25 --extract ${intersected} --exclude combined.exclude --window-kb 1000 --chr ${chr} 142 | """ 143 | 144 | } 145 | 146 | 147 | process tagging_merge { 148 | 149 | label "small_mem" 150 | errorStrategy "retry" 151 | maxRetries 3 152 | 153 | input: 154 | set val(mdsum), file(tagged_chr) from tagged_chr.groupTuple() 155 | 156 | output: 157 | file "sumldak_${mdsum}.tagging" into tagged 158 | 159 | """ 160 | find . -maxdepth 1 -name "sumldak_${mdsum}_*.tagging" -printf "%f\n" | sort -V > list.txt 161 | ${ldak_exec} --join-tagging sumldak_${mdsum} --taglist list.txt 162 | """ 163 | 164 | } 165 | 166 | 167 | process pair_corr { 168 | 169 | label "small_mem" 170 | errorStrategy "retry" 171 | maxRetries 3 172 | 173 | input: 174 | set val(phenocode1), file(stats1), file(exclude1), val(phenocode2), file(stats2), file(exclude2) from formatted2pair_corr_a.combine(formatted2pair_corr_b).filter{ it[0] < it[4] }.map { [it[1], it[2], it[3], it[5], it[6], it[7]] } 175 | file tagged from tagged.collect() 176 | 177 | output: 178 | file "${phenocode1}.${phenocode2}.cors" into pair_corr 179 | 180 | """ 181 | mdsum=`cat ${exclude1} ${exclude2} | sort | uniq | md5sum | awk '{print \$1;}'` 182 | tagfile="sumldak_\${mdsum}.tagging" 183 | ${ldak_exec} --sum-cors ${phenocode1}.${phenocode2} --tagfile \${tagfile} --summary ${stats1} --summary2 ${stats2} --genomic-control YES --check-sums NO 184 | """ 185 | 186 | } 187 | 188 | 189 | process merge { 190 | 191 | label "small_mem" 192 | errorStrategy "retry" 193 | maxRetries 3 194 | 195 | publishDir "results" 196 | 197 | input: 198 | file files from pair_corr.collectFile() { item -> ["list.txt", "${item}\n"] } 199 | 200 | output: 201 | file "ALL.RG.txt" into merged 202 | 203 | """ 204 | merge.py -i ${files} -o ALL.RG.txt 205 | """ 206 | } 207 | 208 | -------------------------------------------------------------------------------- /SumHer/bin/format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import gzip 5 | 6 | argparser = argparse.ArgumentParser(description = 'Formats summary statistics') 7 | argparser.add_argument('-i', '--in', metavar = 'file', dest = 'in_file', required = True, help = 'Input file with summary statistics, compressed using gzip/bgzip.') 8 | argparser.add_argument('-o', '--out', 
metavar = 'prefix', dest = 'out_prefix', required = True, help = 'Prefix for output files: .stats, .nonamb, .big.') 9 | argparser.add_argument('-s', '--snp', metavar = 'column', dest = 'snp', default = 'ID', help = 'Column name for SNP.') 10 | argparser.add_argument('-ea', '--effect-allele', metavar = 'column', dest = 'ea', default = 'ALT', help = 'Column name for effect allele.') 11 | argparser.add_argument('-oa', '--other-allele', metavar = 'column', dest = 'oa', default = 'REF', help = 'Column name for other allele.') 12 | argparser.add_argument('-e', '--effect-size', metavar = 'column', dest = 'beta', default = 'beta', help = 'Column name for effect size (log odds).') 13 | argparser.add_argument('-se', '--se-effect', metavar = 'column', dest = 'se', default = 'sebeta', help = 'Column name for standard error of effect size.') 14 | argparser.add_argument('-n', '--num-samples', metavar = 'column', nargs = '+', dest = 'n', default = ['num_cases', 'num_controls'], help = 'Column name(s) for number of samples. If multiple columns are specified, they are summed up to get the total sample size.') 15 | 16 | 17 | if __name__ == '__main__': 18 | args = argparser.parse_args() 19 | required_columns = [ args.snp, args.ea, args.oa, args.beta, args.se ] + args.n 20 | stats_filename = f'{args.out_prefix}.stats' 21 | pred_filename = f'{args.out_prefix}.nonamb' 22 | le_pred_filename = f'{args.out_prefix}.big' 23 | unique_names = set() 24 | with gzip.open(args.in_file, 'rt') as ifile, \ 25 | open(stats_filename, 'w') as stats_ofile, \ 26 | open(pred_filename, 'w') as pred_ofile, \ 27 | open(le_pred_filename, 'w') as le_pred_ofile: 28 | iheader = ifile.readline() 29 | if not iheader: 30 | raise Exception(f'Empty header in {args.in_file}.') 31 | iheader = iheader.rstrip().split() 32 | for c in required_columns: 33 | if c not in iheader: 34 | raise Exception(f'Missing {c} column in {args.in_file}.') 35 | stats_ofile.write('Predictor A1 A2 Direction Stat n\n') 36 | for i, iline in enumerate(ifile, 2): 37 | iline = iline.rstrip().split() 38 | if len(iline) != len(iheader): 39 | raise Exception(f'Number of columns on line {i} does not match header.') 40 | record = dict(zip(iheader, iline)) 41 | if record[args.snp] in unique_names: # skip if ID is duplicated 42 | continue 43 | unique_names.add(record[args.snp]) 44 | a1 = record[args.ea].upper() 45 | a2 = record[args.oa].upper() 46 | try: 47 | n = sum(map(int, (record[c] for c in args.n))) 48 | stat = (float(record[args.beta]) / float(record[args.se])) ** 2 49 | except (ValueError, ZeroDivisionError): # catch possible type conversion or division by zero errors 50 | continue 51 | if stat > n / 99.0: 52 | le_pred_ofile.write('{}\n'.format(record[args.snp])) 53 | if not (a1 in {'A', 'T'} and a2 in {'A', 'T'}) and not (a1 in {'C', 'G'} and a2 in {'C', 'G'}): 54 | pred_ofile.write('{}\n'.format(record[args.snp])) 55 | stats_ofile.write('{} {} {} {} {} {}\n'.format(record[args.snp], a1, a2, record[args.beta], stat, n)) 56 | -------------------------------------------------------------------------------- /SumHer/bin/intersect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | from collections import Counter 5 | 6 | argparser = argparse.ArgumentParser(description = 'Intersects files preserving the order. Note: suitable only for very short lines (e.g. 
rsId).') 7 | argparser.add_argument('-i', '--input', metavar = 'file', dest = 'in_files', nargs = '+', required = True, help = 'Input files.') 8 | argparser.add_argument('-o', '--out', metavar = 'file', dest = 'out_file', required = True, help = 'Output file.') 9 | 10 | 11 | if __name__ == '__main__': 12 | args = argparser.parse_args() 13 | n_files = len(args.in_files) 14 | names = Counter() 15 | for in_file in args.in_files: 16 | with open(in_file, 'rb') as ifile: 17 | names.update(ifile) 18 | with open(args.out_file, 'wb') as ofile: 19 | with open(args.in_files[0], 'rb') as ifile: 20 | for line in ifile: 21 | if names[line] == n_files: 22 | ofile.write(line) 23 | -------------------------------------------------------------------------------- /SumHer/bin/merge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | 6 | argparser = argparse.ArgumentParser(description = 'Merges SumHer output files.') 7 | argparser.add_argument('-i', '--input', metavar = 'file', dest = 'in_list_file', required = True, help = 'Input file with a list of SumHer output files (one per line).') 8 | argparser.add_argument('-o', '--out', metavar = 'file', dest = 'out_file', required = True, help = 'Output file.') 9 | 10 | 11 | def read_sumher_output(in_file): 12 | pheno1, pheno2, suffix = os.path.split(in_file)[-1].split('.') 13 | pheno1, pheno2 = map(lambda x: x.replace('__', '.'), [pheno1, pheno2]) 14 | with open(in_file, 'rt') as ifile: 15 | header = ifile.readline() 16 | if not header: return {} 17 | header = header.rstrip().split() 18 | if not all(h in header for h in ['Component', 'Value', 'SD']): return {} 19 | t_header = ['Pheno1', 'Pheno2'] 20 | t_row = [pheno1, pheno2] 21 | for line in ifile: 22 | line = line.rstrip() 23 | if not line: continue 24 | columns = line.split() 25 | if len(columns) != len(header): continue 26 | columns = dict(zip(header, columns)) 27 | t_header.append(columns['Component']) 28 | t_row.append(columns['Value']) 29 | t_header.append(columns['Component'] + '_SD') 30 | t_row.append(columns['SD']) 31 | return { 'header': t_header, 'record': t_row } 32 | 33 | 34 | if __name__ == '__main__': 35 | args = argparser.parse_args() 36 | header = False 37 | in_files = [] 38 | with open(args.in_list_file, 'rt') as ifile: 39 | for line in ifile: 40 | line = line.strip() 41 | if not line: continue 42 | in_files.append(line) 43 | with open(args.out_file, 'wt') as ofile: 44 | for in_file in in_files: 45 | corr = read_sumher_output(in_file) 46 | if not corr: continue 47 | if not header: 48 | ofile.write('{}\n'.format('\t'.join(corr['header']))) 49 | header = True 50 | ofile.write('{}\n'.format('\t'.join(corr['record']))) 51 | -------------------------------------------------------------------------------- /SumHer/bin/mhc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import gzip 5 | 6 | argparser = argparse.ArgumentParser(description = 'Lists MHC variants in *.bim files.') 7 | argparser.add_argument('-i', '--in', metavar = 'file', dest = 'in_files', nargs = '+', required = True, help = 'Input *.bim file(s).') 8 | argparser.add_argument('-b', '--genome-build', metavar = 'name', dest = 'build', choices = ['GRCh37', 'GRCh38'], required = True, help = 'Human genome build version: GRCh37, GRCh38.') 9 | argparser.add_argument('-o', '--out', metavar = 'file', dest = 'out_file', required = True, help = 'Output file.') 10 
| 11 | mhc = { 12 | 'GRCh37': { 'chrom': '6', 'start': 28477797, 'stop': 33448354 }, 13 | 'GRCh38': { 'chrom': '6', 'start': 28510120, 'stop': 33480577 } 14 | } 15 | 16 | if __name__ == '__main__': 17 | args = argparser.parse_args() 18 | mhc_chrom = mhc[args.build]['chrom'] 19 | mhc_start = mhc[args.build]['start'] 20 | mhc_stop = mhc[args.build]['stop'] 21 | with open(args.out_file, 'wt') as ofile: 22 | for in_file in args.in_files: 23 | with open(in_file, 'rt') as ifile: 24 | for line in ifile: 25 | chrom, name, morgans, bp, a1, a2 = line.rstrip().split() 26 | if chrom == mhc_chrom: 27 | bp = int(bp) 28 | if bp >= mhc_start and bp <= mhc_stop: 29 | ofile.write('{}\n'.format(name)) 30 | -------------------------------------------------------------------------------- /SumHer/nextflow.config: -------------------------------------------------------------------------------- 1 | params { 2 | LDAK = "/exports/dtaliun/PheWeb/ldak5.linux" 3 | ref_panel = "/exports/dtaliun/PheWeb/ldsc/1000G_EUR_Phase3_plink" 4 | columns { 5 | pvalue = "pval" 6 | effect = "beta" 7 | snp = "ID" 8 | effect_allele = "ALT" 9 | other_allele = "REF" 10 | } 11 | no_effect = 0 12 | } 13 | 14 | process { 15 | // uncomment "slurm" when running on SLURM cluster. Change "queue" as needed. 16 | // executor = "slurm" 17 | executor = "local" 18 | queue = "main" 19 | withLabel: "small_mem" { 20 | cpus = 1 21 | time = "2d" 22 | memory = "8GB" 23 | } 24 | withLabel: "big_mem" { 25 | cpus = 1 26 | time = "2d" 27 | memory = "16GB" 28 | } 29 | } 30 | 31 | executor { 32 | $slurm { 33 | queueSize = 1000 34 | } 35 | $local { 36 | cpus = 22 // set number of CPUs to use when running on a single machine 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pheno-list.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "assoc_files": ["/home/data/PheCode_990_SAIGE_MACge20.txt.vcf.gz"], 4 | "phenocode": "990", 5 | "num_cases": 10000, 6 | "num_controls": 10000 7 | }, 8 | { 9 | "assoc_files": ["/home/data/PheCode_989_SAIGE_MACge20.txt.vcf.gz"], 10 | "phenocode": "989", 11 | "num_cases": 100000, 12 | "num_controls": 100000 13 | }, 14 | { 15 | "assoc_files": ["/home/data/PheCode_974_SAIGE_MACge20.txt.vcf.gz"], 16 | "phenocode": "974", 17 | "num_cases": 100000, 18 | "num_controls": 100000 19 | }, 20 | { 21 | "assoc_files": ["/home/data/PheCode_964_SAIGE_MACge20.txt.vcf.gz"], 22 | "phenocode": "964", 23 | "num_cases": 100000, 24 | "num_controls": 100000 25 | } 26 | ] 27 | 28 | -------------------------------------------------------------------------------- /tab2pheno-list.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | argparser = argparse.ArgumentParser(description = 'Generates pheno-list.json file from tab-delimited file. Warning: will overwrite existing pheno-list.json file.') 5 | argparser.add_argument('-i', '--in-tsv', metavar = 'file', dest = 'in_tsv', required = True, help = 'Input tab-delimited file. No header. 
Column order: full path to summary stat file, phenocode, number of cases, number of controls.') 6 | 7 | if __name__ == '__main__': 8 | args = argparser.parse_args() 9 | data = [] 10 | with open(args.in_tsv, 'r') as f_in: 11 | for line in f_in: 12 | columns = line.rstrip().split('\t') 13 | if any(len(c.strip()) == 0 for c in columns): 14 | continue 15 | if len(columns) < 4: 16 | continue 17 | path = columns[0] 18 | phenocode = columns[1] 19 | cases = columns[2] 20 | controls = columns[3] 21 | try: 22 | cases_int = int(cases) 23 | except: 24 | raise Exception('Non integer value \'{}\' found in cases column.'.format(cases)) 25 | try: 26 | controls_int = int(controls) 27 | except: 28 | raise Exception('Non integer value \'{}\' found in controls column.'.format(controls)) 29 | data.append({ 30 | 'assoc_files': [path], 31 | 'phenocode': phenocode, 32 | 'num_cases': cases_int, 33 | 'num_controls': controls_int 34 | }) 35 | with open('pheno-list.json', 'w') as f_out: 36 | json.dump(data, f_out, indent = 4) 37 | --------------------------------------------------------------------------------
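
A minimal usage sketch for `tab2pheno-list.py`, following the column order documented in the README (the file name `phenotypes.tsv` and the path inside it are hypothetical):
```
# phenotypes.tsv: tab-delimited, no header, one trait per line, e.g.
# /home/data/PheCode_990_SAIGE_MACge20.txt.vcf.gz	990	10000	10000
python tab2pheno-list.py -i phenotypes.tsv
# writes pheno-list.json to the current directory, overwriting any existing copy
```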