├── .gitignore
├── LICENSE.txt
├── README.md
├── TOPMed_RNAseq_pipeline.md
├── gene_model
│   ├── README.md
│   ├── collapse_annotation.py
│   ├── gencode19_unannotated_readthrough_blacklist.txt
│   └── gencode24-25_unannotated_readthrough_blacklist.txt
├── genotype
│   ├── compute_genotype_pcs.py
│   ├── participant_vcfs.wdl
│   └── shapeit2
│       ├── Dockerfile
│       ├── extract_PIRs.wdl
│       ├── hg38chrXPAR.bed
│       ├── shapeit.wdl
│       ├── shapeit_postprocess.wdl
│       └── src
│           ├── aggregate_pirs.py
│           ├── bam_to_pir.py
│           ├── run_shapeit.py
│           └── shapeit_postprocess.py
├── phASER
│   ├── Dockerfile
│   ├── phaser_scatter.wdl
│   └── wrapper.py
├── qtl
│   ├── Dockerfile
│   ├── README.md
│   ├── R_peer_source_1.3.tgz
│   ├── aFC.wdl
│   ├── ase_aggregate_by_individual.wdl
│   ├── ase_gatk_readcounter.wdl
│   ├── dapars.wdl
│   ├── eqtl_prepare_expression.wdl
│   ├── leafcutter
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── leafcutter_bam_to_junc.wdl
│   │   ├── leafcutter_cluster.wdl
│   │   └── src
│   │       └── cluster_prepare_fastqtl.py
│   ├── metasoft.wdl
│   ├── peer_factors.wdl
│   ├── src
│   │   ├── ase_aggregate_by_individual.py
│   │   ├── ase_calculate_lamp.py
│   │   ├── combine_covariates.py
│   │   ├── combine_signif_pairs.py
│   │   ├── convert_vep.py
│   │   ├── eqtl_prepare_expression.py
│   │   ├── metasoft_postprocess.py
│   │   ├── metasoft_prepare_input.py
│   │   ├── run_GATK_ASEReadCounter.py
│   │   ├── run_PEER.R
│   │   └── run_metasoft.py
│   ├── tensorqtl_cis_independent.wdl
│   ├── tensorqtl_cis_nominal.wdl
│   ├── tensorqtl_cis_permutations.wdl
│   ├── tensorqtl_cis_susie.wdl
│   └── torus
│       ├── Dockerfile
│       ├── src
│       │   └── run_torus.py
│       └── torus.wdl
└── rnaseq
    ├── Dockerfile
    ├── README.md
    ├── bam2coverage.wdl
    ├── bamsync.wdl
    ├── bamsync
    │   ├── Makefile
    │   ├── README.md
    │   ├── bamsync.cpp
    │   └── gpl-3.0.txt
    ├── fastqc.wdl
    ├── markduplicates.wdl
    ├── references
    │   ├── ERCC92.chrsizes
    │   ├── ERCC92.fa
    │   ├── ERCC92.gtf
    │   ├── ERCC92.patched.fa
    │   └── ERCC92.patched.gtf
    ├── rnaseq_pipeline_bam.wdl
    ├── rnaseq_pipeline_fastq.wdl
    ├── rnaseqc2.wdl
    ├── rnaseqc2_aggregate.wdl
    ├── rsem.wdl
    ├── rsem_aggregate.wdl
    ├── rsem_preprocessing.wdl
    ├── rsem_reference.wdl
    ├── samtofastq.wdl
    ├── samtools_view.wdl
    ├── src
    │   ├── aggregate_rnaseqc_metrics.py
    │   ├── aggregate_rsem_results.py
    │   ├── bam2coverage.py
    │   ├── combine_GCTs.py
    │   ├── dapars.py
    │   ├── mpileup.py
    │   ├── process_star_junctions.py
    │   ├── run_MarkDuplicates.py
    │   ├── run_RSEM.py
    │   ├── run_STAR.py
    │   ├── run_SamToFastq.py
    │   ├── run_bamsync.sh
    │   ├── run_remove_IDS_reads.sh
    │   └── run_rnaseqc.py
    ├── star.wdl
    ├── star_fastq_list.wdl
    └── star_index.wdl
/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .DS_Store -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-2018, Broad Institute, Inc. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this 7 | list of conditions and the following disclaimer. 8 | 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | * Neither the name Broad Institute, Inc. nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission.
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.md: 1 | # Analysis pipelines for the [GTEx Consortium](https://www.gtexportal.org) and [TOPMed](https://www.nhlbi.nih.gov/science/trans-omics-precision-medicine-topmed-program) 2 | 3 | This repository contains analysis pipelines for: 4 | 5 | * RNA-seq alignment, quantification, and quality control 6 | * eQTL mapping and annotation 7 | * Allele-specific expression quantification 8 | * Generation of the collapsed annotation used for gene-level expression quantification 9 | 10 | Pipeline components are available as Docker images, and execution scripts are provided in [WDL](https://github.com/broadinstitute/wdl). The pipelines are available on [FireCloud](http://firecloud.org), in the namespace *broadinstitute_gtex*. All Python scripts are written in Python 3. 11 | 12 | A detailed description of the RNA-seq pipeline settings used for TOPMed is provided in [TOPMed_RNAseq_pipeline.md](https://github.com/broadinstitute/gtex-pipeline/blob/master/TOPMed_RNAseq_pipeline.md). 13 | -------------------------------------------------------------------------------- /gene_model/README.md: 1 | 2 | 3 | This repository contains utilities for the generation of gene models and annotations used in the RNA-seq and eQTL pipelines. 4 | 5 | ## Collapsed gene model 6 | 7 | Gene-level expression and eQTLs from the GTEx project are calculated based on a collapsed gene model (i.e., combining all isoforms of a gene into a single transcript), according to the following rules: 8 | 9 | 1. Transcripts annotated as “retained_intron” or “read_through” are excluded. Additionally, transcripts that overlap with annotated read-through transcripts may be blacklisted (blacklists for GENCODE v19, 24 & 25 are provided in this repository; no transcripts were blacklisted for v26, v39 and v47). 10 | 2. The union of all exon intervals of each gene is calculated. 11 | 3. Overlapping intervals between genes are excluded from all genes. 12 | 13 | The purpose of step 3 is primarily to exclude overlapping regions from genes annotated on both strands, which can't be unambiguously quantified from unstranded RNA-seq (GTEx samples were sequenced using an unstranded protocol). For stranded protocols, the `--stranded` flag only excludes overlapping regions on the same strand. The `--collapse_only` flag does not remove any overlaps between collapsed transcripts.
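The interval union in step 2 can be illustrated with a minimal sketch (this is a simplified illustration, not the pipeline's implementation; see `collapse_annotation.py` for the actual logic, which also handles the transcript filtering in step 1 and the cross-gene exclusions in step 3):

```python
def merge_intervals(intervals):
    """Merge overlapping or adjacent (start, end) exon intervals (1-based, inclusive) into their union."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1] + 1:  # overlaps or directly abuts the previous interval
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [tuple(i) for i in merged]

# exon intervals pooled across all isoforms of a hypothetical gene:
print(merge_intervals([(100, 200), (150, 300), (401, 500)]))  # [(100, 300), (401, 500)]
```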
14 | 15 | Command for unstranded protocols: 16 | ```bash 17 | python3 collapse_annotation.py gencode.v47.GRCh38.annotation.gtf gencode.v47.GRCh38.genes.gtf 18 | ``` 19 | where `gencode.v47.GRCh38.annotation.gtf` is the GTF from [GENCODE](https://www.gencodegenes.org/human/). 20 | 21 | For stranded protocols: 22 | ```bash 23 | python3 collapse_annotation.py --stranded gencode.v47.GRCh38.annotation.gtf gencode.v47.GRCh38.genes.stranded.gtf 24 | ``` 25 | 26 | Further documentation is available on the [GTEx Portal](https://gtexportal.org/home/documentationPage#staticTextAnalysisMethods). 27 | -------------------------------------------------------------------------------- /gene_model/gencode19_unannotated_readthrough_blacklist.txt: -------------------------------------------------------------------------------- 1 | unannotated_transcript_id matching_readthrough_transcript_id 2 | ENST00000556931.1 ENST00000505139.1 3 | ENST00000567140.1 ENST00000368276.4 4 | ENST00000567140.1 ENST00000490491.1 5 | ENST00000565805.1 ENST00000320139.5 6 | ENST00000608310.1 ENST00000556710.1 7 | ENST00000600945.1 ENST00000532967.1 8 | ENST00000556601.1 ENST00000505973.1 9 | ENST00000352846.3 ENST00000406625.2 10 | ENST00000439940.2 ENST00000604011.1 11 | ENST00000554112.1 ENST00000504764.1 12 | ENST00000553424.1 ENST00000505052.1 13 | ENST00000554017.1 ENST00000513963.1 14 | ENST00000609637.1 ENST00000354728.4 15 | ENST00000608381.1 ENST00000373414.3 16 | ENST00000609767.1 ENST00000482026.1 17 | ENST00000608383.1 ENST00000305208.5 18 | ENST00000458031.2 ENST00000463721.1 19 | ENST00000539752.1 ENST00000509377.1 20 | ENST00000408996.4 ENST00000282382.4 21 | ENST00000408996.4 ENST00000333314.3 22 | ENST00000609654.1 ENST00000520515.1 23 | ENST00000609383.1 ENST00000518409.1 24 | ENST00000297183.6 ENST00000532219.1 25 | ENST00000543878.1 ENST00000480294.1 26 | ENST00000556581.1 ENST00000503322.1 27 | ENST00000534153.4 ENST00000493662.2 28 | ENST00000437001.2 ENST00000585246.1 29 | ENST00000556132.1 ENST00000481848.2 30 | ENST00000434909.2 ENST00000509504.1 31 | ENST00000162863.6 ENST00000579700.1 32 | ENST00000555673.1 ENST00000413834.1 33 | ENST00000610247.1 ENST00000310771.4 34 | ENST00000591000.1 ENST00000476151.1 35 | ENST00000553390.1 ENST00000554914.1 36 | ENST00000603284.1 ENST00000603837.1 37 | ENST00000541829.1 ENST00000541804.1 38 | ENST00000313339.3 ENST00000566717.2 39 | ENST00000607974.1 ENST00000357072.5 40 | ENST00000375205.2 ENST00000375204.2 41 | ENST00000357054.1 ENST00000375206.2 42 | ENST00000334943.6 ENST00000502978.1 43 | ENST00000556107.1 ENST00000374294.3 44 | ENST00000603152.1 ENST00000447839.2 45 | ENST00000371198.2 ENST00000475252.2 46 | ENST00000555577.1 ENST00000370649.3 47 | ENST00000535773.1 ENST00000527595.1 48 | ENST00000531258.1 ENST00000557395.1 49 | ENST00000419365.2 ENST00000555531.1 50 | ENST00000514361.3 ENST00000412278.2 51 | ENST00000503028.2 ENST00000421355.1 52 | ENST00000537382.1 ENST00000527616.1 53 | ENST00000553473.1 ENST00000540229.1 54 | ENST00000554957.1 ENST00000381541.3 55 | ENST00000541275.1 ENST00000547717.1 56 | ENST00000609999.1 ENST00000448979.3 57 | ENST00000413953.2 ENST00000548184.1 58 | ENST00000556010.1 ENST00000539652.1 59 | ENST00000413530.1 ENST00000547474.1 60 | ENST00000413530.1 ENST00000548729.1 61 | ENST00000416488.1 ENST00000540096.2 62 | ENST00000610205.1 ENST00000335950.4 63 | ENST00000304704.4 ENST00000553909.1 64 | ENST00000556387.1 ENST00000530611.1 65 | ENST00000530222.1 ENST00000528804.1 66 | ENST00000533536.1 ENST00000528804.1 67 | 
ENST00000532742.1 ENST00000528804.1 68 | ENST00000542227.1 ENST00000552941.1 69 | ENST00000447296.2 ENST00000549987.1 70 | ENST00000441369.1 ENST00000558658.1 71 | ENST00000441369.1 ENST00000559936.1 72 | ENST00000453867.1 ENST00000558965.1 73 | ENST00000542534.2 ENST00000382448.4 74 | ENST00000356316.3 ENST00000549793.1 75 | ENST00000448014.2 ENST00000605794.1 76 | ENST00000535392.1 ENST00000568224.1 77 | ENST00000609618.1 ENST00000320330.6 78 | ENST00000354723.6 ENST00000565342.1 79 | ENST00000355544.5 ENST00000567520.1 80 | ENST00000355544.5 ENST00000568997.1 81 | ENST00000355544.5 ENST00000569959.1 82 | ENST00000395137.2 ENST00000566712.1 83 | ENST00000535210.1 ENST00000569360.1 84 | ENST00000534692.1 ENST00000532838.1 85 | ENST00000518603.1 ENST00000523360.1 86 | ENST00000535512.1 ENST00000573331.1 87 | ENST00000557233.1 ENST00000293826.4 88 | ENST00000519970.1 ENST00000557349.1 89 | ENST00000593105.1 ENST00000412988.2 90 | ENST00000509083.1 ENST00000591482.1 91 | ENST00000438614.1 ENST00000555059.2 92 | ENST00000536149.1 ENST00000495214.1 93 | ENST00000435993.2 ENST00000560629.1 94 | ENST00000490216.2 ENST00000577647.1 95 | ENST00000409963.3 ENST00000590947.1 96 | ENST00000541223.1 ENST00000571730.1 97 | ENST00000452201.2 ENST00000590722.2 98 | ENST00000252675.5 ENST00000592091.1 99 | ENST00000602738.1 ENST00000541595.2 100 | ENST00000556468.1 ENST00000393796.4 101 | ENST00000556468.1 ENST00000428358.1 102 | ENST00000538752.1 ENST00000590798.1 103 | ENST00000542938.1 ENST00000434822.1 104 | ENST00000602424.2 ENST00000444486.3 105 | ENST00000162023.5 ENST00000602804.1 106 | ENST00000512771.3 ENST00000586674.1 107 | ENST00000608404.1 ENST00000555938.1 108 | ENST00000376832.4 ENST00000250366.6 109 | ENST00000222107.4 ENST00000599649.1 110 | ENST00000608843.1 ENST00000196548.5 111 | ENST00000555685.1 ENST00000504988.1 112 | ENST00000608209.1 ENST00000381151.3 113 | ENST00000553851.1 ENST00000376358.3 114 | ENST00000541269.1 ENST00000422081.2 115 | ENST00000539489.1 ENST00000306641.1 116 | -------------------------------------------------------------------------------- /gene_model/gencode24-25_unannotated_readthrough_blacklist.txt: -------------------------------------------------------------------------------- 1 | unannotated_transcript_id matching_readthrough_transcript_id 2 | ENST00000535193.3 ENST00000536108.5 3 | -------------------------------------------------------------------------------- /genotype/compute_genotype_pcs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import argparse 6 | import os 7 | import subprocess 8 | import gzip 9 | 10 | 11 | def has_dependency(name): 12 | return subprocess.call('which '+name, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) == 0 13 | 14 | 15 | def patch_bim_ids(bim_in, bim_out): 16 | """Modifies long indel variant IDs in BIM file for compatibility with eigensoft""" 17 | with open(bim_in) as f_in, open(bim_out, 'w') as f_out: 18 | for line in f_in: 19 | line = line.strip().split('\t') 20 | line[1] = line[0]+':'+line[1].split('_')[1] 21 | if len(line[4])>20: 22 | line[4] = 'INDEL' 23 | if len(line[5])>20: 24 | line[5] = 'INDEL' 25 | f_out.write('\t'.join(line)+'\n') 26 | 27 | 28 | if __name__ == '__main__': 29 | 30 | parser = argparse.ArgumentParser(description='Calculates genotype PCs') 31 | parser.add_argument('vcf', help="VCF file. 
To reproduce GTEx results, this VCF should only include biallelic sites (i.e., if the input is a GTEx analysis freeze VCF, sites marked as 'wasSplit' should be filtered out first. This can be done with the command: bcftools annotate -x INFO,QUAL,^FORMAT/GT -e 'INFO/wasSplit=1' -Oz -o $out_vcf $in_vcf)") 32 | parser.add_argument('--keep', action='store_true', help='Keep intermediary files') 33 | # parser.add_argument('--prefix', default=None, help='Prefix for output file names. Default: VCF file name.') 34 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory. Default: current directory.') 35 | args = parser.parse_args() 36 | 37 | has_plink2 = has_dependency('plink2') 38 | has_plink = has_dependency('plink') 39 | if not (has_plink2 or has_plink): 40 | raise ValueError('PLINK v1.9 or v2 must be installed and in $PATH.') 41 | if not has_dependency('smartpca.perl'): 42 | raise ValueError('EIGENSOFT must be installed and in $PATH.') 43 | 44 | # 1) convert to PLINK format if input is VCF 45 | if args.vcf.endswith('.vcf.gz') or args.vcf.endswith('.vcf'): 46 | plink_prefix_path = os.path.join(args.output_dir, os.path.split(args.vcf)[1]).replace('.vcf.gz','').replace('.vcf','') 47 | if has_plink2: 48 | cmd = 'plink2 --vcf {} --out {} --make-bed --max-alleles 2 --output-chr chrM'.format(args.vcf, plink_prefix_path) 49 | else: 50 | cmd = 'plink --vcf {} --out {} --biallelic-only strict --output-chr chrM --keep-allele-order'.format(args.vcf, plink_prefix_path) 51 | subprocess.check_call(cmd, shell=True) 52 | else: 53 | if not np.all([os.path.exists(args.vcf+ext) for ext in ['.bed', '.bim', '.fam']]): 54 | raise ValueError('Unsupported input format. Must be VCF or PLINK BED prefix (with .bed, .bim, and .fam files).') 55 | plink_prefix_path = args.vcf 56 | plink_filtered_path = os.path.join(args.output_dir, os.path.split(plink_prefix_path)[1])+'.maf05_geno01' 57 | 58 | # 2) filter by minor allele frequency and missingness 59 | # 3) prune with --indep-pairwise 200 100 0.1 60 | if has_plink2: # use PLINK 2 if available, since much faster 61 | cmds = [ 62 | 'plink2 --make-bed --output-chr chrM --bfile {} --maf 0.05 --geno 0.01 --out {}'.format(plink_prefix_path, plink_filtered_path), 63 | 'plink2 --bfile {0} --indep-pairwise 200 100 0.1 --out {0}'.format(plink_filtered_path), 64 | # output to pgen first since --sort-vars is not yet supported for bed in PLINK 2: 65 | 'plink2 --bfile {0} --output-chr chrM --extract {0}.prune.in --out {0}.pruned --sort-vars --make-pgen'.format(plink_filtered_path), 66 | 'plink2 --pfile {0} --output-chr chrM --make-bed --out {0}'.format(plink_filtered_path+'.pruned'), 67 | ] 68 | else: # use PLINK 1.9 69 | cmds = [ 70 | 'plink --bfile {} --maf 0.05 --geno 0.01 --make-bed --output-chr chrM --keep-allele-order --out {}'.format(plink_prefix_path, plink_filtered_path), 71 | 'plink --bfile {0} --indep-pairwise 200 100 0.1 --out {0}'.format(plink_filtered_path), 72 | 'plink --bfile {0} --extract {0}.prune.in --out {0}.pruned --make-bed'.format(plink_filtered_path), 73 | ] 74 | for cmd in cmds: 75 | subprocess.check_call(cmd, shell=True) 76 | 77 | # 4) patch BIM 78 | bim_file = plink_filtered_path+'.pruned.bim' 79 | os.rename(bim_file, bim_file+'.orig') 80 | patch_bim_ids(bim_file+'.orig', bim_file) 81 | 82 | # 5) run smartpca (EIGENSOFT) 83 | # -i: genotypes (plink bed) 84 | # -a: snps (plink bim) 85 | # -b: individual (plink fam) 86 | # -k: num PCs 87 | # -m: num outliers 88 | # -e: eigenvalues 89 | # -p: plot 90 | # -l: log 91 |
subprocess.check_call('smartpca.perl -i {0}.bed -a {0}.bim -b {0}.fam -k 20 -m 0 -o {0}.pca -e {0}.eval -p {0}.plot -l {0}.log'.format(plink_filtered_path+'.pruned'), shell=True) 92 | 93 | # 6) delete intermediate files 94 | if not args.keep: 95 | for i in ['prune.in', 'prune.out', 'log', 'bed', 'bim', 'fam']: 96 | os.remove(plink_filtered_path+'.'+i) 97 | os.remove(plink_filtered_path+'.pruned.bim.orig') 98 | if os.path.exists(plink_filtered_path+'.pruned.pgen'): 99 | os.remove(plink_filtered_path+'.pruned.pgen') 100 | os.remove(plink_filtered_path+'.pruned.psam') 101 | os.remove(plink_filtered_path+'.pruned.pvar') 102 | for i in ['bed', 'bim', 'fam']: 103 | os.remove(plink_filtered_path+'.pruned.'+i) 104 | os.remove(plink_filtered_path+'.pruned.pca.par') 105 | os.remove(plink_filtered_path+'.pruned.plot.ps') 106 | os.remove(plink_filtered_path+'.pruned.plot.xtxt') 107 | -------------------------------------------------------------------------------- /genotype/participant_vcfs.wdl: -------------------------------------------------------------------------------- 1 | task participant_vcfs { 2 | 3 | File vcf_file 4 | String participant_id 5 | 6 | Int memory 7 | Int disk_space 8 | Int num_threads 9 | Int num_preempt 10 | 11 | command { 12 | set -euo pipefail 13 | date +"[%b %d %H:%M:%S] Generating participant VCF (SNPs only)" 14 | # select SNPs, filter out missing sites 15 | bcftools view --no-update -s ${participant_id} -v snps ${vcf_file} | bcftools view --no-update -e 'GT=".|."' -Oz -o ${participant_id}.snps.vcf.gz 16 | tabix ${participant_id}.snps.vcf.gz 17 | 18 | date +"[%b %d %H:%M:%S] Subsetting biallelic het sites for ASE" 19 | bcftools view --no-update -i 'GT="het"' ${participant_id}.snps.vcf.gz | bcftools norm -m+ | bcftools view -m2 -M2 -Oz -o ${participant_id}.snps.het.vcf.gz 20 | tabix ${participant_id}.snps.het.vcf.gz 21 | 22 | date +"[%b %d %H:%M:%S] Done" 23 | } 24 | 25 | output { 26 | File snps_vcf = "${participant_id}.snps.vcf.gz" 27 | File snps_vcf_index = "${participant_id}.snps.vcf.gz.tbi" 28 | File snps_het_vcf = "${participant_id}.snps.het.vcf.gz" 29 | File snps_het_vcf_index = "${participant_id}.snps.het.vcf.gz.tbi" 30 | } 31 | 32 | runtime { 33 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V10" 34 | memory: "${memory}GB" 35 | disks: "local-disk ${disk_space} HDD" 36 | cpu: "${num_threads}" 37 | preemptible: "${num_preempt}" 38 | } 39 | 40 | meta { 41 | author: "Francois Aguet" 42 | } 43 | } 44 | 45 | 46 | workflow participant_vcfs_workflow { 47 | call participant_vcfs 48 | } 49 | -------------------------------------------------------------------------------- /genotype/shapeit2/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | MAINTAINER Francois Aguet 3 | 4 | RUN apt-get update && apt-get install -y software-properties-common && \ 5 | apt-get update && apt-get install -y \ 6 | build-essential \ 7 | cmake \ 8 | curl \ 9 | libboost-all-dev \ 10 | libbz2-dev \ 11 | libcurl3-dev \ 12 | liblzma-dev \ 13 | libncurses5-dev \ 14 | libssl-dev \ 15 | python3 \ 16 | python3-pip \ 17 | sudo \ 18 | unzip \ 19 | wget \ 20 | zlib1g-dev \ 21 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 22 | apt-get clean && \ 23 | apt-get autoremove -y && \ 24 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 25 | 26 | # htslib 27 | RUN cd /opt && \ 28 | wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.11/htslib-1.11.tar.bz2 && \ 29 | tar xf htslib-1.11.tar.bz2 && rm htslib-1.11.tar.bz2 && 
cd htslib-1.11 && \ 30 | ./configure --enable-libcurl --enable-s3 --enable-plugins --enable-gcs && \ 31 | make && make install && make clean 32 | 33 | # samtools 34 | RUN cd /opt && \ 35 | wget --no-check-certificate https://github.com/samtools/samtools/releases/download/1.11/samtools-1.11.tar.bz2 && \ 36 | tar -xf samtools-1.11.tar.bz2 && rm samtools-1.11.tar.bz2 && cd samtools-1.11 && \ 37 | ./configure --with-htslib=/opt/htslib-1.11 && make && make install && make clean 38 | 39 | # bcftools 40 | RUN cd /opt && \ 41 | wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/1.11/bcftools-1.11.tar.bz2 && \ 42 | tar -xf bcftools-1.11.tar.bz2 && rm bcftools-1.11.tar.bz2 && cd bcftools-1.11 && \ 43 | ./configure --with-htslib=/opt/htslib-1.11 && make && make install && make clean 44 | 45 | # python3 46 | RUN pip3 install --upgrade pip && pip3 install pandas numpy 47 | 48 | # SHAPEIT2 49 | RUN cd /opt && \ 50 | wget https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.v2.r904.glibcv2.17.linux.tar.gz && \ 51 | tar -xf shapeit.v2.r904.glibcv2.17.linux.tar.gz && rm shapeit.v2.r904.glibcv2.17.linux.tar.gz 52 | ENV PATH /opt/shapeit.v2.904.3.10.0-693.11.6.el7.x86_64/bin:$PATH 53 | 54 | # extractPIRs 55 | RUN cd /opt && \ 56 | wget https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/files/extractPIRs.v1.r68.x86_64.tgz && \ 57 | tar xf extractPIRs.v1.r68.x86_64.tgz && rm extractPIRs.v1.r68.x86_64.tgz 58 | ENV PATH /opt/extractPIRs.v1.r68.x86_64:$PATH 59 | 60 | # scripts 61 | COPY src src/ 62 | ENV PATH /src:$PATH 63 | -------------------------------------------------------------------------------- /genotype/shapeit2/extract_PIRs.wdl: -------------------------------------------------------------------------------- 1 | task extract_PIRs { 2 | 3 | File bam_file # also works with CRAM 4 | File bam_index 5 | File vcf 6 | File vcf_index 7 | String participant_id 8 | String chr 9 | 10 | File? reference_fasta 11 | File? reference_fasta_index 12 | 13 | Int memory 14 | Int disk_space 15 | Int num_threads 16 | Int num_preempt 17 | 18 | command { 19 | set -euo pipefail 20 | touch ${vcf_index} 21 | python3 /src/bam_to_pir.py --vcf ${vcf} --bam ${bam_file} --participant_id ${participant_id} ${"--fasta " + reference_fasta} --chr ${chr} --output_dir . 22 | } 23 | 24 | runtime { 25 | docker: "gcr.io/broad-cga-francois-gtex/shapeit2:latest" 26 | memory: "${memory}GB" 27 | disks: "local-disk ${disk_space} HDD" 28 | cpu: "${num_threads}" 29 | preemptible: "${num_preempt}" 30 | } 31 | 32 | output { 33 | File pir_file = "${participant_id}.${chr}.pir" 34 | } 35 | 36 | meta { 37 | author: "Francois Aguet" 38 | } 39 | } 40 | 41 | 42 | workflow extract_PIRs_workflow { 43 | File chr_list_file 44 | 45 | Array[String] chr_list = read_lines(chr_list_file) 46 | scatter (c in chr_list) { 47 | call extract_PIRs { input: chr=c } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /genotype/shapeit2/hg38chrXPAR.bed: -------------------------------------------------------------------------------- 1 | chrX 10000 2781479 PAR1 2 | chrX 2781479 155701382 NONPAR 3 | chrX 155701382 156030895 PAR2 4 | -------------------------------------------------------------------------------- /genotype/shapeit2/shapeit.wdl: -------------------------------------------------------------------------------- 1 | task shapeit { 2 | 3 | File vcf 4 | File vcf_index 5 | File pir_file 6 | String prefix 7 | String chrom 8 | 9 | File? sex 10 | File? 
par_bed 11 | 12 | Int memory 13 | Int disk_space 14 | Int num_threads 15 | Int num_preempt 16 | 17 | command { 18 | set -euo pipefail 19 | touch ${vcf_index} 20 | 21 | echo $(date +"[%b %d %H:%M:%S] Extracting ${chrom} from VCF") 22 | chr_vcf=$PWD/${chrom}.vcf.gz 23 | tabix -h ${vcf} ${chrom} | bgzip -c > $chr_vcf 24 | tabix $chr_vcf 25 | 26 | python3 /src/run_shapeit.py $chr_vcf ${pir_file} ${prefix} --output_dir . --num_threads ${num_threads} ${"--sex " + sex} ${"--par_bed " + par_bed} 27 | cat *.snp.mm > ${prefix}.${chrom}.snp.mm 28 | cat *.ind.mm > ${prefix}.${chrom}.ind.mm 29 | } 30 | 31 | output { 32 | File phased_vcf = "${prefix}.${chrom}.phased.vcf.gz" 33 | File snp_log = "${prefix}.${chrom}.snp.mm" 34 | File ind_log = "${prefix}.${chrom}.ind.mm" 35 | } 36 | 37 | runtime { 38 | docker: "gcr.io/broad-cga-francois-gtex/shapeit2:latest" 39 | memory: "${memory}GB" 40 | disks: "local-disk ${disk_space} HDD" 41 | cpu: "${num_threads}" 42 | preemptible: "${num_preempt}" 43 | } 44 | 45 | meta { 46 | author: "Francois Aguet" 47 | } 48 | } 49 | 50 | 51 | workflow shapeit_workflow { 52 | call shapeit 53 | } 54 | 55 | # workflow shapeit_workflow { 56 | # 57 | # Array[File] chr_pir_files 58 | # File chr_list_file 59 | # Array[String] chr_list = read_lines(chr_list_file) 60 | # 61 | # scatter (i in range(len(chr_list))) { 62 | # call shapeit { input pir_file=chr_pir_files[i], chrom=chr_list[i] } 63 | # } 64 | # } 65 | -------------------------------------------------------------------------------- /genotype/shapeit2/shapeit_postprocess.wdl: -------------------------------------------------------------------------------- 1 | task shapeit_postprocess { 2 | 3 | File vcf 4 | File vcf_index 5 | File phased_vcf 6 | 7 | String prefix 8 | String chrom 9 | 10 | Int memory 11 | Int disk_space 12 | Int num_threads 13 | Int num_preempt 14 | 15 | command { 16 | set -euo pipefail 17 | touch ${vcf_index} 18 | 19 | echo $(date +"[%b %d %H:%M:%S] Extracting ${chrom} from VCF") 20 | chr_vcf=$PWD/${chrom}.vcf.gz 21 | tabix -h ${vcf} ${chrom} | bgzip -c > $chr_vcf 22 | 23 | python3 /src/shapeit_postprocess.py $chr_vcf ${phased_vcf} $PWD/${prefix}.${chrom}.phased.patched.vcf.gz 24 | } 25 | 26 | output { 27 | File patched_vcf = "${prefix}.${chrom}.phased.patched.vcf.gz" 28 | File patched_vcf_index = "${prefix}.${chrom}.phased.patched.vcf.gz.tbi" 29 | File log = "${prefix}.${chrom}.phased.patched.log" 30 | } 31 | 32 | runtime { 33 | docker: "gcr.io/broad-cga-francois-gtex/shapeit2:latest" 34 | memory: "${memory}GB" 35 | disks: "local-disk ${disk_space} HDD" 36 | cpu: "${num_threads}" 37 | preemptible: "${num_preempt}" 38 | } 39 | 40 | meta { 41 | author: "Francois Aguet" 42 | } 43 | } 44 | 45 | 46 | workflow shapeit_postprocess_workflow { 47 | call shapeit_postprocess 48 | } 49 | -------------------------------------------------------------------------------- /genotype/shapeit2/src/aggregate_pirs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import argparse 6 | 7 | 8 | def get_header_size(pir_file): 9 | """ 10 | Get size of header (lines) 11 | """ 12 | with open (pir_file, 'r') as f: 13 | line = f.readline() 14 | return int(line.split(' ')[1])+1 15 | 16 | 17 | def get_header(pir_file): 18 | """ 19 | Get header from PIR file 20 | """ 21 | n = get_header_size(pir_file) 22 | with open(pir_file) as f: 23 | header = ''.join([next(f) for _ in range(n)]) 24 | return header, n 25 | 26 | 27 | def 
check_headers(pir_files): 28 | """ 29 | Check whether headers have the same size 30 | """ 31 | s = [get_header_size(f) for f in pir_files] 32 | if len(np.unique(s))!=1: 33 | raise ValueError('Header sizes do not match: {}'.format(s)) 34 | 35 | 36 | def concatenate_pir_files(pir_files, output_pir): 37 | """ 38 | Concatenate PIR files to 'output_pir'. All files must have the same header 39 | """ 40 | check_headers(pir_files) 41 | header, header_lines = get_header(pir_files[0]) 42 | 43 | with open(output_pir, 'w') as out: 44 | out.write(header) # write header once 45 | 46 | for pir_file in pir_files: 47 | with open(pir_file) as f: 48 | for _ in range(header_lines): # skip header 49 | next(f) 50 | for line in f: 51 | out.write(line) 52 | 53 | 54 | if __name__=='__main__': 55 | 56 | parser = argparse.ArgumentParser(description='Combine output from extractPIRs run on individual samples. Equivalent to running extractPIRs on multiple samples.') 57 | parser.add_argument('pir_files_tsv', 58 | help='TSV containing paths to PIR files, in the format: [[bam1_chr1, ..., bam1_chrN], ..., [bamM_chr1, ..., bamM_chrN]].\ 59 | \nPIR file names must be in the format <participant_id>.<chr>.pir') 60 | parser.add_argument('chr_list', help='File listing chromosomes to process.') 61 | parser.add_argument('prefix', help='Prefix for output files: <prefix>.<chr>.pir') 62 | parser.add_argument('-o', '--output_dir', help='Output directory') 63 | args = parser.parse_args() 64 | 65 | with open(args.chr_list) as f: 66 | chr_order = f.read().strip().split('\n') 67 | 68 | print('Sorting PIR files by chromosome.', flush=True) 69 | pir_files_df = pd.read_csv(args.pir_files_tsv, header=None, sep='\t') 70 | pir_files = pir_files_df.values.tolist() 71 | 72 | # sort by chromosome order 73 | sorted_pir_files = [] 74 | for p in pir_files: 75 | pir_dict = {os.path.split(i)[1].split('.')[1]:i for i in p} 76 | sorted_pir_files.append([pir_dict[i] for i in chr_order]) 77 | pir_files_df = pd.DataFrame(sorted_pir_files, columns=chr_order) 78 | 79 | print('Starting PIR aggregation for {} samples.'.format(pir_files_df.shape[0]), flush=True) 80 | for c in pir_files_df: 81 | print(' * processing chromosome {}'.format(c)) 82 | chr_files = pir_files_df[c] 83 | 84 | chr_name = np.unique([os.path.split(i)[1].split('.')[1] for i in chr_files]) 85 | if len(chr_name)!=1 or chr_name[0]!=c: 86 | raise ValueError('Chromosome names do not match for {}: {}'.format(c, chr_name)) 87 | 88 | concatenate_pir_files(chr_files, os.path.join(args.output_dir, args.prefix+'.'+c+'.pir')) 89 | print('Done.', flush=True) 90 | -------------------------------------------------------------------------------- /genotype/shapeit2/src/bam_to_pir.py: 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import argparse 4 | import subprocess 5 | from datetime import datetime 6 | import os 7 | 8 | parser = argparse.ArgumentParser(description="Extract Phase Informative Reads (PIR) from a BAM or CRAM file.") 9 | parser.add_argument('--vcf', required=True, help='VCF file') 10 | parser.add_argument('--bam', required=True, help='BAM or CRAM file') 11 | parser.add_argument('--participant_id', required=True, help='Participant/sample ID in the VCF corresponding to BAM') 12 | parser.add_argument('--fasta', default=None, help='Reference genome, required for CRAM input') 13 | parser.add_argument('--chr', default=None, help='Chromosome to process') 14 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 15 | args =
parser.parse_args() 16 | 17 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running bam_to_pir', flush=True) 18 | 19 | # convert CRAM to BAM 20 | if args.bam.endswith('.cram'): 21 | if args.fasta is None: 22 | raise ValueError('A reference FASTA must be provided for CRAM files.') 23 | bam_file = os.path.join(args.output_dir, os.path.split(args.bam)[1].replace('.cram', '.bam')) 24 | cmd = 'samtools view -bh -T '+args.fasta+' -o '+bam_file+' '+args.bam 25 | if args.chr is not None: 26 | cmd += ' '+args.chr 27 | print('Converting CRAM to BAM ({})'.format(args.chr), flush=True) 28 | else: 29 | print('Converting CRAM to BAM', flush=True) 30 | subprocess.check_call(cmd, shell=True) 31 | subprocess.check_call('samtools index '+bam_file, shell=True) 32 | else: 33 | assert args.bam.endswith('.bam') 34 | bam_file = args.bam 35 | 36 | if args.chr is None: 37 | # get chromosomes, check against reference sequence IDs in BAM header 38 | chrs = subprocess.check_output('tabix --list-chroms {}'.format(args.vcf), shell=True).decode().strip().split('\n') 39 | sq = set(subprocess.check_output('samtools view -H '+bam_file+' | grep "@SQ" | cut -f2 | awk -F":" \'{print $2}\'', shell=True).decode().strip().split('\n')) 40 | if not np.all([i in sq for i in chrs]): 41 | raise ValueError('Reference sequence IDs in BAM do not match VCF contig names.') 42 | else: 43 | chrs = [args.chr] 44 | 45 | # split each chromosome 46 | chr_vcf = os.path.join(args.output_dir, 'tmp.vcf.gz') 47 | for c in chrs: 48 | print('Processing chromosome {}'.format(c), flush=True) 49 | bam_list = os.path.join(args.output_dir, 'bam_list.txt') 50 | with open(bam_list, 'w') as f: 51 | f.write(args.participant_id+' '+bam_file+' '+c+'\n') 52 | 53 | # generate VCF for current chr 54 | print(' * subsetting VCF', flush=True) 55 | subprocess.check_call('tabix -h '+args.vcf+' '+c+' | bgzip > '+chr_vcf, shell=True) 56 | print(' * indexing VCF', flush=True) 57 | subprocess.check_call('tabix '+chr_vcf, shell=True) 58 | print(' * extracting PIRs', flush=True) 59 | subprocess.check_call('extractPIRs --vcf '+chr_vcf+' --bam '+bam_list+' --out '+os.path.join(args.output_dir, args.participant_id+'.'+c+'.pir'), shell=True) 60 | 61 | os.remove(bam_list) 62 | os.remove(chr_vcf) 63 | 64 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] done.', flush=True) 65 | -------------------------------------------------------------------------------- /genotype/shapeit2/src/run_shapeit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import sys 4 | import subprocess 5 | import os 6 | import pandas as pd 7 | from datetime import datetime 8 | 9 | def shapeit(vcf, pir, prefix, seed=1, num_threads=1, start=None, end=None, sex=None, force=True): 10 | """ 11 | Wrapper for SHAPEIT 12 | 13 | start, end: limit to this interval if provided 14 | sex: required for NONPAR region 15 | 16 | https://mathgen.stats.ox.ac.uk/genetics_software/shapeit/shapeit.html 17 | """ 18 | # shapeit assemble 19 | cmd = f'shapeit -assemble --input-vcf {vcf} --input-pir {pir} -O {prefix} -T {num_threads} --seed {seed}' 20 | if start is not None and end is not None: 21 | cmd += f' --input-from {start} --input-to {end}' 22 | if sex is not None: 23 | cmd += f' --chrX --input-sex {sex}' 24 | if force: 25 | cmd += ' --force' 26 | subprocess.check_call(cmd, shell=True) 27 | 28 | # shapeit convert: convert to VCF 29 | phased_vcf = prefix+'.phased.vcf.gz' 30 | subprocess.check_call(f'shapeit -convert --input-haps 
{prefix} --output-vcf {phased_vcf} --seed {seed}', shell=True) 31 | 32 | 33 | if __name__ == '__main__': 34 | parser = argparse.ArgumentParser(description = "Run SHAPEIT") 35 | parser.add_argument('vcf', help='VCF file, must contain one chromosome only.') 36 | parser.add_argument('pir', help='PIR file (output from extractPIRs)') 37 | parser.add_argument('prefix', help='Prefix for output files, e.g., <prefix>.<chrom>.phased.vcf.gz') 38 | parser.add_argument('--sex', default=None, help='Sex annotation for the samples. TSV with sample ID: 1 (male) or 2 (female); no header') 39 | parser.add_argument('--par_bed', default=None, help='BED file with PAR1, PAR2 and NONPAR regions for chrX') 40 | parser.add_argument('--output_dir', default='.', help='Output directory') 41 | parser.add_argument('--num_threads', default=1, help='Number of threads') 42 | parser.add_argument('--seed', default=1, help='Seed for MCMC') 43 | args = parser.parse_args() 44 | 45 | # get chromosome name from VCF 46 | chrom = subprocess.check_output(f'tabix --list-chroms {args.vcf}', shell=True).decode().strip().split('\n') 47 | if len(chrom)!=1: 48 | raise ValueError('Input VCF must contain a single chromosome.') 49 | else: 50 | chrom = chrom[0] 51 | 52 | print(f"[{datetime.now().strftime('%b %d %H:%M:%S')}] Running SHAPEIT on chromosome {chrom}", flush=True) 53 | prefix = os.path.join(args.output_dir, args.prefix+'.'+chrom) 54 | 55 | if chrom.endswith('X'): 56 | if args.sex is None: 57 | raise ValueError('Sex annotation must be provided for chrX.') 58 | if args.par_bed is None: 59 | raise ValueError('BED file with PAR1, NONPAR, and PAR2 intervals must be provided for chrX.') 60 | 61 | par_df = pd.read_csv(args.par_bed, sep='\t', header=None, names=['chr', 'start', 'end', 'name'], index_col='name') 62 | par_df['start'] += 1 # convert BED intervals from 0-based [) to 1-based [) coordinates 63 | par_df['end'] += 1 64 | 65 | print(' * Processing PAR1', flush=True) 66 | shapeit(args.vcf, args.pir, prefix+'.PAR1', start=par_df.loc['PAR1', 'start'], 67 | end=par_df.loc['PAR1', 'end'], num_threads=args.num_threads, seed=args.seed, force=True) 68 | print(' * Processing NONPAR', flush=True) 69 | shapeit(args.vcf, args.pir, prefix+'.NONPAR', start=par_df.loc['NONPAR', 'start'], 70 | end=par_df.loc['NONPAR', 'end'], sex=args.sex, num_threads=args.num_threads, seed=args.seed, force=True) 71 | print(' * Processing PAR2', flush=True) 72 | shapeit(args.vcf, args.pir, prefix+'.PAR2', start=par_df.loc['PAR2', 'start'], 73 | end=par_df.loc['PAR2', 'end'], num_threads=args.num_threads, seed=args.seed, force=True) 74 | 75 | # concatenate VCFs 76 | print(' * Concatenating PAR1, NONPAR, PAR2', flush=True) 77 | # SHAPEIT outputs gzipped VCFs --> convert to bgzip, index 78 | subprocess.check_call('zcat {0}.PAR1.phased.vcf.gz | bgzip -c > {0}.PAR1.vcf.gz && tabix {0}.PAR1.vcf.gz'.format(prefix), shell=True) 79 | subprocess.check_call('zcat {0}.NONPAR.phased.vcf.gz | bgzip -c > {0}.NONPAR.vcf.gz && tabix {0}.NONPAR.vcf.gz'.format(prefix), shell=True) 80 | subprocess.check_call('zcat {0}.PAR2.phased.vcf.gz | bgzip -c > {0}.PAR2.vcf.gz && tabix {0}.PAR2.vcf.gz'.format(prefix), shell=True) 81 | # concatenate and index 82 | subprocess.check_call('bcftools concat {0}.PAR1.vcf.gz {0}.NONPAR.vcf.gz {0}.PAR2.vcf.gz --output-type z --output {0}.phased.vcf.gz'.format(prefix), shell=True) 83 | subprocess.check_call('tabix {0}.phased.vcf.gz'.format(prefix), shell=True) 84 | subprocess.check_call('rm {0}.PAR1.* {0}.NONPAR.* {0}.PAR2.*'.format(prefix), shell=True) 85 | else: 86 | shapeit(args.vcf, args.pir,
prefix, num_threads=args.num_threads, seed=args.seed, force=True) 87 | 88 | print('[' + datetime.now().strftime("%b %d %H:%M:%S") + '] done.', flush=True) 89 | -------------------------------------------------------------------------------- /genotype/shapeit2/src/shapeit_postprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import subprocess 4 | import gzip 5 | from datetime import datetime 6 | 7 | 8 | def gt_dosage(gt): 9 | """Convert unphased genotype to dosage""" 10 | x = gt.split(b'/') 11 | return int(x[0])+int(x[1]) 12 | 13 | 14 | def pgt_dosage(gt): 15 | """Convert phased genotype to dosage""" 16 | x = gt.split(b'|') 17 | return int(x[0])+int(x[1]) 18 | 19 | 20 | def calculate_missingness(miss_buf): 21 | if len(miss_buf)==1: 22 | pct_missing = sum(miss_buf[0]) / len(miss_buf[0]) 23 | else: 24 | pct_missing = np.all(miss_buf, axis=0).sum() / len(miss_buf[0]) 25 | return pct_missing 26 | 27 | 28 | def patch_phased_vcf(vcf_file, phased_vcf_file, patched_vcf_file, missingness_threshold=0.15): 29 | """ 30 | Patches phased VCF produced by SHAPEIT2 31 | - if the dosage is different between the phased and unphased VCF, assigns to missing 32 | - for split biallelic sites, assigns to missing if ALT site is different (e.g., 0|2) 33 | - filters out sites with missingness > missingness_threshold 34 | """ 35 | assert patched_vcf_file.endswith('.vcf.gz') 36 | log_file = patched_vcf_file.replace('.vcf.gz', '.log') 37 | 38 | pgt_set = set([b'.|.', b'0|0', b'0|1', b'1|0', b'1|1']) 39 | 40 | num_var = int(subprocess.check_output('zcat {} | grep -v "#" | wc -l'.format(phased_vcf_file), shell=True).decode()) 41 | print(' * parsing {} sites'.format(num_var)) 42 | 43 | bgzip = subprocess.Popen('bgzip -c > '+patched_vcf_file, stdin=subprocess.PIPE, shell=True) 44 | nwritten = 0 45 | ndropped = 0 46 | with gzip.open(vcf_file) as vcf, gzip.open(phased_vcf_file) as phased_vcf, open(log_file, 'w') as log: 47 | # skip header of unphased VCF 48 | for line in vcf: 49 | if line[:6]==b'#CHROM': 50 | break 51 | # copy header of phased VCF 52 | for pline in phased_vcf: 53 | if pline[:6]==b'#CHROM': 54 | bgzip.stdin.write(b'##Note=Processed using shapeit_postprocess.py\n') 55 | bgzip.stdin.write(pline) 56 | break 57 | bgzip.stdin.write(pline) 58 | assert line==pline 59 | 60 | # iterate through variants 61 | buf = [] 62 | miss_buf = [] 63 | previous_pos = None 64 | for k, (line, pline) in enumerate(zip(vcf, phased_vcf), 1): 65 | line = line.strip().split(b'\t') 66 | s = pline.strip().split(b'\t') 67 | assert line[2]==s[2] # same variant 68 | 69 | pos = line[0]+b'_'+line[1] 70 | 71 | # if new site, write previous 72 | if pos != previous_pos and previous_pos is not None: 73 | # calculate missingness and write site 74 | pct_missing = calculate_missingness(miss_buf) 75 | if pct_missing <= missingness_threshold: 76 | for b in buf: 77 | bgzip.stdin.write(b'\t'.join(b)+b'\n') 78 | nwritten += len(buf) 79 | else: 80 | log.write('{}: missingness {:.4f}, removed (n = {}).\n'.format(previous_pos.decode(), pct_missing, len(buf))) 81 | ndropped += len(buf) 82 | 83 | # reset buffers 84 | buf = [] 85 | miss_buf = [] 86 | 87 | # parse and process current site 88 | gt = line[9:] 89 | pgt = s[9:] 90 | 91 | # check for dosage differences at non-imputed sites, set to missing 92 | ix = [i!=b'./.' and gt_dosage(i)!=pgt_dosage(j) for i,j in zip(gt, pgt)] 93 | if any(ix): # dosage mismatch, set to missing 94 | pgt = [b'.|.' 
if i else g for i,g in zip(ix,pgt)] 95 | log.write('{}: mismatched dosage for {} samples (set to missing).\n'.format(s[2].decode(), sum(ix))) 96 | 97 | # reset split sites 98 | if b'wasSplit' in line[7]: 99 | # split multi-allelic sites: a sample can only have genotype at one split site; others must be set to missing 100 | # Since all are imputed, set back to missing (if all were originally missing, imputed call is ambiguous). 101 | ix = [g==b'./.' for g in gt] 102 | if any(ix): 103 | pgt = [b'.|.' if i else g for i,g in zip(ix,pgt)] 104 | log.write('{}: split site; {} imputed sites reverted to missing.\n'.format(s[2].decode(), sum(ix))) 105 | 106 | buf.append(s[:9]+pgt) 107 | miss_buf.append([i==b'./.' or j==b'.|.' for i,j in zip(gt, pgt)]) 108 | previous_pos = pos 109 | 110 | if np.mod(k, 10000)==0: 111 | print('\r * variants processed: {}'.format(k), end='', flush=True) 112 | print('\r * variants processed: {}'.format(k), end='', flush=True) 113 | print() 114 | 115 | # last site: calculate missingness and write site 116 | pct_missing = calculate_missingness(miss_buf) 117 | if pct_missing <= missingness_threshold: 118 | for b in buf: 119 | bgzip.stdin.write(b'\t'.join(b)+b'\n') 120 | nwritten += len(buf) 121 | else: 122 | log.write('{}: missingness {:.4f}, removed (n = {}).\n'.format(previous_pos.decode(), pct_missing, len(buf))) 123 | ndropped += len(buf) 124 | 125 | stdout, stderr = bgzip.communicate() 126 | print(' * wrote {} sites'.format(nwritten)) 127 | print(' * dropped {} sites'.format(ndropped)) 128 | 129 | 130 | if __name__=='__main__': 131 | 132 | parser = argparse.ArgumentParser(description='Run post-processing for SHAPEIT2 phasing') 133 | parser.add_argument('vcf_file', type=str, help='Unphased VCF') 134 | parser.add_argument('phased_vcf_file', type=str, help='SHAPEIT2 output') 135 | parser.add_argument('patched_vcf_file', help='Output VCF') 136 | parser.add_argument('--missingness_threshold', default=0.15, type=np.float64, help='Missingness threshold') 137 | args = parser.parse_args() 138 | 139 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Patching VCF', flush=True) 140 | patch_phased_vcf(args.vcf_file, args.phased_vcf_file, args.patched_vcf_file, missingness_threshold=args.missingness_threshold) 141 | 142 | # index 143 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Indexing patched VCF', flush=True) 144 | subprocess.check_call('tabix '+args.patched_vcf_file, shell=True) 145 | n = int(subprocess.check_output('bcftools index -n {}'.format(args.patched_vcf_file), shell=True).decode().strip()) 146 | print(' * {} sites remaining'.format(n)) 147 | -------------------------------------------------------------------------------- /phASER/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for GTEx RNA-seq pipeline dependencies 2 | FROM ubuntu:18.04 3 | MAINTAINER Aaron Graubert 4 | 5 | RUN apt-get update && apt-get install -y \ 6 | software-properties-common \ 7 | bcftools \ 8 | bedtools \ 9 | build-essential \ 10 | cmake \ 11 | curl \ 12 | git \ 13 | libboost-all-dev \ 14 | libbz2-dev \ 15 | libcurl3-dev \ 16 | liblzma-dev \ 17 | libncurses5-dev \ 18 | libssl-dev \ 19 | python2.7 \ 20 | python-pip \ 21 | python3-pip \ 22 | unzip \ 23 | wget \ 24 | zlib1g-dev \ 25 | && rm -rf /var/lib/apt/lists/* 26 | 27 | 28 | #----------------------------- 29 | # Pipeline components 30 | #----------------------------- 31 | 32 | # Samtools (phaser) 33 | RUN cd /opt && \ 34 | wget --no-check-certificate 
https://github.com/samtools/samtools/releases/download/1.5/samtools-1.5.tar.bz2 && \ 35 | tar -xjf samtools-1.5.tar.bz2 && rm samtools-1.5.tar.bz2 && cd samtools-1.5 && \ 36 | ./configure && make && make install && cd htslib-1.5 && make && make install 37 | 38 | # Get Cython (phaser) 39 | RUN python2.7 -m pip install cython pandas scipy pysam intervaltree 40 | 41 | # exon map dependencies 42 | # phASER 43 | # f15e83a : Latest commit at time of writing 44 | #RUN cd /opt && \ 45 | #git clone https://github.com/secastel/phaser.git && cd phaser && \ 46 | #git checkout f15e83a && cd phaser && python2.7 setup.py build_ext --inplace 47 | RUN cd /opt && \ 48 | git clone https://github.com/secastel/phaser.git && cd phaser && \ 49 | git fetch origin pull/36/head:subprocess && git checkout subprocess && \ 50 | cd phaser && python2.7 setup.py build_ext --inplace 51 | 52 | COPY wrapper.py /opt/phaser/ 53 | 54 | # clean up 55 | RUN apt-get clean && \ 56 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 57 | apt-get autoclean && \ 58 | apt-get autoremove -y && \ 59 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 60 | -------------------------------------------------------------------------------- /phASER/phaser_scatter.wdl: -------------------------------------------------------------------------------- 1 | task phaser { 2 | 3 | String individual_id 4 | String chromosome 5 | Array[File]+ bam_files 6 | Array[File]+ bam_indices 7 | File genotype_vcf 8 | File genotype_index 9 | File gene_model_bed 10 | 11 | File? haplotype_blacklist_bed 12 | File? phaser_blacklist_bed 13 | 14 | Int memory 15 | Int disk_space 16 | Int num_threads 17 | Int num_preempt 18 | 19 | command { 20 | export TMP=$HOME/ 21 | set -euo pipefail 22 | 23 | echo $(date +"[%b %d %H:%M:%S] Preparing indices") 24 | for index in ${sep=" " bam_indices}; do 25 | touch $index 26 | done 27 | 28 | echo $(date +"[%b %d %H:%M:%S] Preparing bam files") 29 | mkdir ./bam_staging 30 | for bam_file in ${sep=" " bam_files}; do 31 | samtools view -h $bam_file ${chromosome} | \ 32 | grep -v "vW:i:[2-7]" | \ 33 | samtools view -h1 | samtools sort > ./bam_staging/$(basename $bam_file) 34 | samtools index -@ ${num_threads} ./bam_staging/$(basename $bam_file) 35 | done 36 | 37 | touch ${genotype_index} 38 | echo $(date +"[%b %d %H:%M:%S] Running phASER") 39 | python2.7 /opt/phaser/wrapper.py phase ${individual_id} ./bam_staging/*.bam \ 40 | ${genotype_vcf} ${gene_model_bed} . 
\ 41 | ${"--haplo-count-blacklist=" + haplotype_blacklist_bed} \ 42 | ${"--blacklist=" + phaser_blacklist_bed} \ 43 | --chr ${chromosome} 44 | } 45 | 46 | output { 47 | File allele_config = "${individual_id}.${chromosome}.allele_config.txt" 48 | File allelic_counts = "${individual_id}.${chromosome}.allelic_counts.txt" 49 | File haplotypes = "${individual_id}.${chromosome}.haplotypes.txt" 50 | File haplo_counts = "${individual_id}.${chromosome}.haplotypic_counts.txt" 51 | File variant_connections = "${individual_id}.${chromosome}.variant_connections.txt" 52 | File phase_vcf = "${individual_id}.${chromosome}.vcf.gz" 53 | File vcf_index = "${individual_id}.${chromosome}.vcf.gz.tbi" 54 | } 55 | 56 | runtime { 57 | docker: "us.gcr.io/just-episode-184015/rnaseqc:latest" 58 | memory: "${memory}GB" 59 | disks: "local-disk ${disk_space} HDD" 60 | cpu: "${num_threads}" 61 | preemptible: "${num_preempt}" 62 | } 63 | 64 | meta { 65 | author: "Aaron Graubert" 66 | } 67 | } 68 | 69 | task phaser_postprocess { 70 | 71 | String individual_id 72 | Array[File]+ chromosome_vcfs 73 | Array[File]+ chromosome_haplotype_counts 74 | File gene_model_bed 75 | 76 | Int memory 77 | Int disk_space 78 | Int num_threads 79 | Int num_preempt 80 | 81 | command { 82 | set -euo pipefail 83 | 84 | echo $(date +"[%b %d %H:%M:%S] Processing phASER Files") 85 | python2.7 /opt/phaser/wrapper.py postprocess ${individual_id} \ 86 | ${write_lines(chromosome_vcfs)} \ 87 | ${write_lines(chromosome_haplotype_counts)} \ 88 | ${gene_model_bed} \ 89 | . 90 | } 91 | 92 | output { 93 | File haplo_counts = "${individual_id}.haplotypic_counts.txt.gz" 94 | File phase_vcf = "${individual_id}.vcf.gz" 95 | File vcf_index = "${individual_id}.vcf.gz.tbi" 96 | File expression_counts = "${individual_id}.gene_ae.txt.gz" 97 | } 98 | 99 | runtime { 100 | docker: "us.gcr.io/just-episode-184015/rnaseqc:latest" 101 | memory: "${memory}GB" 102 | disks: "local-disk ${disk_space} HDD" 103 | cpu: "${num_threads}" 104 | preemptible: "${num_preempt}" 105 | } 106 | 107 | meta { 108 | author: "Aaron Graubert" 109 | } 110 | } 111 | 112 | 113 | workflow phaser_workflow { 114 | String individual_id 115 | File gene_model_bed 116 | File contig_list 117 | 118 | scatter(chr in read_lines(contig_list)) 119 | { 120 | call phaser { input: individual_id=individual_id, gene_model_bed=gene_model_bed, chromosome=chr} 121 | } 122 | 123 | call phaser_postprocess { 124 | input: individual_id=individual_id, gene_model_bed=gene_model_bed, chromosome_vcfs=phaser.phase_vcf, chromosome_haplotype_counts=phaser.haplo_counts 125 | } 126 | 127 | output { 128 | File haplo_counts = phaser_postprocess.haplo_counts 129 | File phase_vcf = phaser_postprocess.phase_vcf 130 | File vcf_index = phaser_postprocess.vcf_index 131 | File expression_counts = phaser_postprocess.expression_counts 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /qtl/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for GTEx QTL pipeline 2 | FROM ubuntu:18.04 3 | MAINTAINER Francois Aguet 4 | ENV DEBIAN_FRONTEND noninteractive 5 | 6 | RUN apt-get update && apt-get install -y software-properties-common && \ 7 | apt-get update && apt-get install -y \ 8 | build-essential \ 9 | curl \ 10 | lbzip2 \ 11 | libboost-all-dev \ 12 | libcurl3-dev \ 13 | libgsl-dev \ 14 | libhdf5-serial-dev \ 15 | openjdk-17-jdk \ 16 | python3 \ 17 | python3-pip \ 18 | r-base-core \ 19 | unzip \ 20 | vim-common \ 21 | wget \ 22 | 
zlib1g-dev \ 23 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 24 | apt-get clean && \ 25 | apt-get autoremove -y && \ 26 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 27 | 28 | 29 | # workaround for PEER, see https://github.com/mz2/peer/issues/4 30 | RUN apt-get update \ 31 | && apt-get install -y \ 32 | gcc-5 \ 33 | g++-5 \ 34 | gfortran-5 \ 35 | cmake \ 36 | && rm -rf /var/lib/apt/lists/* \ 37 | && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 50 --slave /usr/bin/g++ g++ /usr/bin/g++-5 38 | 39 | # R 40 | RUN wget https://raw.githubusercontent.com/broadinstitute/gtex-pipeline/master/qtl/R_peer_source_1.3.tgz && \ 41 | R CMD INSTALL R_peer_source_1.3.tgz && \ 42 | rm R_peer_source_1.3.tgz && \ 43 | echo "r <- getOption('repos'); r['CRAN'] <- 'http://cran.us.r-project.org'; options(repos = r);" > ~/.Rprofile && \ 44 | Rscript -e "install.packages(c('argparser'), dependencies=TRUE)" && \ 45 | Rscript -e 'source("http://bioconductor.org/biocLite.R"); biocLite("qvalue"); biocLite("sva"); biocLite("edgeR");' 46 | 47 | # htslib 48 | RUN cd /opt && \ 49 | wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.15/htslib-1.15.tar.bz2 && \ 50 | tar -xf htslib-1.15.tar.bz2 && rm htslib-1.15.tar.bz2 && cd htslib-1.15 && make && make install && make clean 51 | 52 | # bcftools 53 | RUN cd /opt && \ 54 | wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/1.15/bcftools-1.15.tar.bz2 && \ 55 | tar -xf bcftools-1.15.tar.bz2 && rm bcftools-1.15.tar.bz2 && cd bcftools-1.15 && \ 56 | ./configure --with-htslib=/opt/htslib-1.15 && make && make install && make clean 57 | 58 | # samtools 59 | RUN cd /opt && \ 60 | wget --no-check-certificate https://github.com/samtools/samtools/releases/download/1.15/samtools-1.15.tar.bz2 && \ 61 | tar -xf samtools-1.15.tar.bz2 && rm samtools-1.15.tar.bz2 && cd samtools-1.15 && \ 62 | ./configure --with-htslib=/opt/htslib-1.15 && make && make install && make clean 63 | 64 | # PLINK 1.9 65 | RUN mkdir /opt/plink && cd /opt/plink && \ 66 | wget --no-check-certificate https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20220305.zip && \ 67 | unzip plink_linux_x86_64_20220305.zip && rm plink_linux_x86_64_20220305.zip 68 | ENV PATH $PATH:/opt/plink 69 | 70 | # METASOFT 71 | RUN mkdir /opt/metasoft && cd /opt/metasoft && \ 72 | wget http://genetics.cs.ucla.edu/meta/repository/2.0.1/Metasoft.zip && \ 73 | unzip Metasoft.zip && rm Metasoft.zip 74 | 75 | # Python 76 | RUN pip3 install --upgrade pip setuptools 77 | RUN pip3 install numpy tables pyarrow pandas scipy matplotlib h5py pysam statsmodels scikits.bootstrap qtl 78 | # numpy dependencies: 79 | RUN pip3 install pyBigWig 80 | 81 | # aFC 82 | RUN cd /opt && \ 83 | wget https://github.com/francois-a/aFC/archive/2189fbf403b3d1ced54da21421d00b2d4bf44310.tar.gz && \ 84 | tar -xf 2189fbf403b3d1ced54da21421d00b2d4bf44310.tar.gz && mv aFC-2189fbf403b3d1ced54da21421d00b2d4bf44310 aFC && \ 85 | rm 2189fbf403b3d1ced54da21421d00b2d4bf44310.tar.gz 86 | 87 | # copy scripts 88 | COPY src src/ 89 | -------------------------------------------------------------------------------- /qtl/README.md: 1 | 2 | # eQTL discovery pipeline for the [GTEx Consortium](https://www.gtexportal.org) 3 | 4 | This repository contains all components of the eQTL discovery pipeline used by the GTEx Consortium, including data normalization, QTL mapping, and annotation steps.
This document describes the pipeline used for the V7 and V8 data releases; for settings specific to the V6p analyses presented in [[GTEx Consortium, 2017](https://www.nature.com/articles/nature24277)], please see the last section. 5 | 6 | ## Docker image 7 | The GTEx eQTL pipeline components are provided in a Docker image, available at https://hub.docker.com/r/broadinstitute/gtex_eqtl/ 8 | 9 | To download the image, run: 10 | ```bash 11 | docker pull broadinstitute/gtex_eqtl:V8 12 | ``` 13 | 14 | #### Image contents and pipeline components 15 | The following tools are included in the Docker image: 16 | 17 | 1. [FastQTL](https://github.com/francois-a/fastqtl): QTL mapping software ([Ongen et al., Bioinformatics, 2016](http://bioinformatics.oxfordjournals.org/content/32/10/1479.abstract)) 18 | 2. R 3.2 19 | 3. Python 3.5 20 | 21 | ## Prerequisites 22 | The following input files are needed: 23 | 24 | * VCF file with genotype information. Must be bgzip compressed and indexed with tabix. 25 | * Expression tables in GCT format. Two tables are needed: read counts and normalized (FPKM or TPM). 26 | * Gene annotation in GTF format. 27 | 28 | 29 | ## Running the pipeline 30 | Additional [documentation](http://gtexportal.org/home/documentationPage#staticTextAnalysisMethods) and details about parameter choices are provided on the [GTEx Portal](https://gtexportal.org). 31 | 32 | This pipeline requires gene-level expression data. A collapsed reference GTF can be generated for this purpose using the [`collapse_annotation.py`](https://github.com/broadinstitute/gtex-pipeline/blob/master/gene_model/collapse_annotation.py) script available in the [gene model](https://github.com/broadinstitute/gtex-pipeline/tree/master/gene_model) directory. In the code below, it is assumed that `${annotation_gtf}` was generated using this script. 33 | 34 | #### 1) Generate normalized expression in BED format 35 | The expression data are normalized as follows: 36 | 1. Read counts are normalized between samples using TMM ([Robinson & Oshlack, Genome Biology, 2010](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25)) 37 | 2. Genes are selected based on the following expression thresholds (see the sketch below): 38 | - ≥0.1 TPM in ≥20% samples AND 39 | - ≥6 reads (unnormalized) in ≥20% samples 40 | 3. Each gene is inverse normal transformed across samples. 41 | ```bash 42 | eqtl_prepare_expression.py ${tpm_gct} ${counts_gct} ${annotation_gtf} \ 43 | ${sample_participant_lookup} ${vcf_chr_list} ${prefix} \ 44 | --tpm_threshold 0.1 \ 45 | --count_threshold 6 \ 46 | --sample_frac_threshold 0.2 \ 47 | --normalization_method tmm 48 | ``` 49 | The file `${vcf_chr_list}` lists the chromosomes in the VCF, and can be generated using 50 | ``` 51 | tabix --list-chroms ${vcf} > ${vcf_chr_list} 52 | ``` 53 | The file `${sample_participant_lookup}` must contain two columns, `sample_id` and `participant_id`, mapping IDs in the expression files to IDs in the VCF (these can be the same).
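The gene selection in step 2 above amounts to a boolean mask over the TPM and read-count matrices. A minimal pandas sketch (genes x samples matrices; function and variable names are illustrative, not the pipeline's API; the actual implementation, including TMM normalization and the inverse normal transform, is in `eqtl_prepare_expression.py`):

```python
import pandas as pd

def select_genes(tpm_df: pd.DataFrame, counts_df: pd.DataFrame,
                 tpm_threshold: float = 0.1, count_threshold: int = 6,
                 sample_frac_threshold: float = 0.2) -> pd.Index:
    """Return genes passing both expression thresholds in a sufficient fraction of samples."""
    n_samples = tpm_df.shape[1]
    mask = ((tpm_df >= tpm_threshold).sum(axis=1) >= sample_frac_threshold * n_samples) & \
           ((counts_df >= count_threshold).sum(axis=1) >= sample_frac_threshold * n_samples)
    return tpm_df.index[mask]
```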
54 | 55 | This step generates the following BED file and index: 56 | ```bash 57 | ${prefix}.expression.bed.gz 58 | ${prefix}.expression.bed.gz.tbi 59 | ``` 60 | 61 | #### 2) Calculate PEER factors 62 | ```bash 63 | Rscript run_PEER.R ${prefix}.expression.bed.gz ${prefix} ${num_peer} 64 | ``` 65 | The number of PEER factors was selected as a function of sample size (N): 66 | - 15 factors for N < 150 67 | - 30 factors for 150 ≤ N < 250 68 | - 45 factors for 250 ≤ N < 350 69 | - 60 factors for N ≥ 350 70 | 71 | For information on how these thresholds were determined, please see the [Supplementary Information](https://media.nature.com/original/nature-assets/nature/journal/v550/n7675/extref/nature24277-s1.pdf) of [[GTEx Consortium, 2017](https://www.nature.com/articles/nature24277)]. 72 | 73 | This step will generate 3 files: 74 | ```bash 75 | ${prefix}.PEER_residuals.txt 76 | ${prefix}.PEER_alpha.txt 77 | ${prefix}.PEER_covariates.txt 78 | ``` 79 | 80 | #### 3) Combine covariates 81 | This step generates a combined covariates file, containing genotype PCs, PEER factors, and additional explicit covariates (e.g., genotyping platform). 82 | ```bash 83 | combine_covariates.py ${prefix}.PEER_covariates.txt ${prefix} \ 84 | --genotype_pcs ${genotype_pcs} \ 85 | --add_covariates ${add_covariates} 86 | ``` 87 | The covariate files should have one covariate per row, with an identifier in the first column, and a header line with sample identifiers. This step will generate the file `${prefix}.combined_covariates.txt`. 88 | 89 | #### 4) Run FastQTL 90 | A wrapper script for multithreaded execution is provided in the Docker image (`/opt/fastqtl/python/run_FastQTL_threaded.py`) and at https://github.com/francois-a/fastqtl 91 | ```bash 92 | # nominal pass 93 | run_FastQTL_threaded.py ${vcf} ${prefix}.expression.bed.gz ${prefix} \ 94 | --covariates ${prefix}.combined_covariates.txt \ 95 | --window 1e6 --chunks 100 --threads 16 96 | 97 | # permutation pass 98 | run_FastQTL_threaded.py ${vcf} ${prefix}.expression.bed.gz ${prefix} \ 99 | --covariates ${prefix}.combined_covariates.txt \ 100 | --window 1e6 --chunks 100 --threads 16 \ 101 | --permute 1000 10000 102 | ``` 103 | The following files will be generated: 104 | ```bash 105 | ${prefix}.allpairs.txt.gz 106 | ${prefix}.egenes.txt.gz 107 | ``` 108 | 109 | ### Using Docker 110 | The steps described above can be run using Docker. This assumes that the `$path_to_data` directory contains all required input files. 111 | ```bash 112 | # Docker command for step 1: 113 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_eqtl:V8 /bin/bash \ 114 | -c "/src/eqtl_prepare_expression.py /data/${tpm_gct} /data/${counts_gct} \ 115 | /data/${annotation_gtf} /data/${sample_participant_lookup} /data/${vcf_chr_list} ${prefix} \ 116 | --tpm_threshold 0.1 --count_threshold 6 --sample_frac_threshold 0.2 --normalization_method tmm" 117 | ``` 118 | 119 | ### V6p pipeline settings 120 | 121 | #### Expression normalization 122 | The expression data were normalized as follows: 123 | 1. Genes were selected based on the following expression thresholds: 124 | * >0.1 RPKM in ≥10 samples AND 125 | * ≥6 reads (unnormalized) in ≥10 samples 126 | 2. RPKMs were normalized between samples using quantile normalization (see the sketch below) 127 | 3. Each gene was inverse normal transformed across samples.
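For reference, quantile normalization maps every sample onto a shared reference distribution (the mean of the per-sample sorted distributions). A simplified sketch is shown below, illustrative only; the pipeline's implementation (`qtl.norm.normalize_quantiles`) additionally handles ties and missing values:
```python
import numpy as np

def normalize_quantiles_simple(M):
    """Quantile normalization of a genes x samples array (no tie/NaN handling):
    each sample (column) is mapped onto the average of the per-sample sorted
    expression distributions, so all samples share the same distribution."""
    mean_quantiles = np.sort(M, axis=0).mean(axis=1)    # reference distribution
    ranks = np.argsort(np.argsort(M, axis=0), axis=0)   # per-column ranks (0-based)
    return mean_quantiles[ranks]
```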
128 | -------------------------------------------------------------------------------- /qtl/R_peer_source_1.3.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/gtex-pipeline/0e54b42decbc65de9991f199b199822fd7aec46c/qtl/R_peer_source_1.3.tgz -------------------------------------------------------------------------------- /qtl/aFC.wdl: -------------------------------------------------------------------------------- 1 | task aFC { 2 | 3 | File vcf_file 4 | File vcf_index 5 | File expression_bed 6 | File expression_bed_index 7 | File covariates_file 8 | File afc_qtl_file 9 | String prefix 10 | 11 | Int memory 12 | Int disk_space 13 | Int num_threads 14 | Int num_preempt 15 | 16 | command { 17 | set -euo pipefail 18 | python3 /opt/aFC/aFC.py \ 19 | --vcf ${vcf_file} \ 20 | --pheno ${expression_bed} \ 21 | --qtl ${afc_qtl_file} \ 22 | --cov ${covariates_file} \ 23 | --log_xform 1 \ 24 | --log_base 2 \ 25 | --o ${prefix}.aFC.txt 26 | gzip ${prefix}.aFC.txt 27 | } 28 | 29 | runtime { 30 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V8" 31 | memory: "${memory}GB" 32 | disks: "local-disk ${disk_space} HDD" 33 | cpu: "${num_threads}" 34 | preemptible: "${num_preempt}" 35 | } 36 | 37 | output { 38 | File afc_file="${prefix}.aFC.txt.gz" 39 | } 40 | 41 | meta { 42 | author: "Francois Aguet" 43 | } 44 | } 45 | 46 | workflow aFC_workflow { 47 | call aFC 48 | } 49 | -------------------------------------------------------------------------------- /qtl/ase_aggregate_by_individual.wdl: -------------------------------------------------------------------------------- 1 | task ase_aggregate_by_individual { 2 | 3 | Array[File] ase_readcount_files 4 | Array[String] sample_ids 5 | Array[String] tissue_site_details 6 | File het_vcf 7 | File vep_dict 8 | File simulation_bias 9 | File mappability_bigwig 10 | File tissue_abbreviations 11 | File lamp_values 12 | String individual_id 13 | 14 | Int? coverage_cutoff 15 | Float? other_ratio_cutoff 16 | Float? 
mono_cutoff 17 | 18 | Int memory 19 | Int disk_space 20 | Int num_threads 21 | Int num_preempt 22 | 23 | command { 24 | set -euo pipefail 25 | # workaround for broken 'write_lines' 26 | python3 < filtered.bam 25 | samtools index filtered.bam 26 | python3 /src/run_GATK_ASEReadCounter.py ${gatk_jar} ${genome_fasta} ${het_vcf} filtered.bam ${prefix} 27 | else 28 | python3 /src/run_GATK_ASEReadCounter.py ${gatk_jar} ${genome_fasta} ${het_vcf} ${bam_file} ${prefix} 29 | fi 30 | 31 | # filter out chrX 32 | mv ${prefix}.readcounts.txt.gz ${prefix}.readcounts.all.txt.gz 33 | zcat ${prefix}.readcounts.all.txt.gz | awk '$1!="chrX" && $1!="X" {print $0}' | gzip -c > ${prefix}.readcounts.txt.gz 34 | zcat ${prefix}.readcounts.all.txt.gz | awk '$1=="contig" || $1=="chrX" || $1=="X" {print $0}' | gzip -c > ${prefix}.readcounts.chrX.txt.gz 35 | >>> 36 | 37 | output { 38 | File ase_read_counts = "${prefix}.readcounts.txt.gz" 39 | File ase_read_counts_chrX = "${prefix}.readcounts.chrX.txt.gz" 40 | } 41 | 42 | runtime { 43 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V10" 44 | memory: "${memory}GB" 45 | disks: "local-disk ${disk_space} HDD" 46 | cpu: "${num_threads}" 47 | preemptible: "${num_preempt}" 48 | } 49 | 50 | meta { 51 | author: "Francois Aguet" 52 | } 53 | } 54 | 55 | 56 | workflow ase_gatk_readcounter_workflow { 57 | call ase_gatk_readcounter 58 | } 59 | -------------------------------------------------------------------------------- /qtl/dapars.wdl: -------------------------------------------------------------------------------- 1 | task dapars { 2 | 3 | Array[File] bigwig_files 4 | Array[String] sample_ids 5 | String prefix 6 | File utr_annotation 7 | File size_factors 8 | File? sample_participant_lookup 9 | File? expression_bed 10 | Int? coverage_threshold = 10 11 | 12 | Int memory 13 | Int disk_space 14 | Int num_threads 15 | Int num_preempt 16 | 17 | command { 18 | set -euo pipefail 19 | python3 < ~/.Rprofile && \ 32 | Rscript -e "install.packages(c('argparser', 'devtools', 'foreach'), dependencies=TRUE)" && \ 33 | Rscript -e 'BiocManager::install(c("qvalue", "sva", "edgeR", "dplyr"));' 34 | 35 | # htslib 36 | RUN cd /opt && \ 37 | wget --no-check-certificate https://github.com/samtools/htslib/releases/download/1.19.1/htslib-1.19.1.tar.bz2 && \ 38 | tar -xf htslib-1.19.1.tar.bz2 && rm htslib-1.19.1.tar.bz2 && cd htslib-1.19.1 && \ 39 | ./configure --enable-libcurl --enable-s3 --enable-plugins --enable-gcs && \ 40 | make && make install && make clean 41 | 42 | # samtools 43 | RUN cd /opt && \ 44 | wget --no-check-certificate https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 && \ 45 | tar -xf samtools-1.19.2.tar.bz2 && rm samtools-1.19.2.tar.bz2 && cd samtools-1.19.2 && \ 46 | ./configure --with-htslib=/opt/htslib-1.19.1 && make && make install && make clean 47 | 48 | # bcftools 49 | RUN cd /opt && \ 50 | wget --no-check-certificate https://github.com/samtools/bcftools/releases/download/1.19/bcftools-1.19.tar.bz2 && \ 51 | tar -xf bcftools-1.19.tar.bz2 && rm bcftools-1.19.tar.bz2 && cd bcftools-1.19 && \ 52 | ./configure --with-htslib=/opt/htslib-1.19.1 && make && make install && make clean 53 | 54 | # Python 3 55 | RUN pip3 install --upgrade pip setuptools 56 | RUN pip3 install numpy pyarrow pandas scipy scikit-learn matplotlib qtl 57 | 58 | # LeafCutter 59 | RUN cd /opt && \ 60 | wget https://github.com/francois-a/leafcutter/archive/2488118d377baff3354dab85de1f31b03a813c92.tar.gz && \ 61 | tar -xf 2488118d377baff3354dab85de1f31b03a813c92.tar.gz && \ 
62 | rm 2488118d377baff3354dab85de1f31b03a813c92.tar.gz && \ 63 | ln -s leafcutter-2488118d377baff3354dab85de1f31b03a813c92 leafcutter 64 | 65 | # regtools 66 | RUN cd /opt && \ 67 | wget https://github.com/griffithlab/regtools/archive/refs/tags/0.5.2.zip && \ 68 | unzip 0.5.2.zip && rm 0.5.2.zip && cd regtools-0.5.2 && mkdir build && cd build && \ 69 | cmake .. && make && mv regtools .. && make clean 70 | ENV PATH /opt/regtools-0.5.2:$PATH 71 | 72 | # copy scripts 73 | COPY src src/ 74 | -------------------------------------------------------------------------------- /qtl/leafcutter/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## sQTL mapping pipeline 3 | 4 | This document describes the sQTL mapping pipeline used by the GTEx Consortium. For additional details, please see Section 4.3 of the Supplementary Materials for [GTEx Consortium, Science, 2020](https://www.science.org/doi/suppl/10.1126/science.aaz1776/suppl_file/aaz1776_aguet_sm.pdf). 5 | 6 | ### Docker image 7 | The pipeline components described below are available in a [Docker image](https://hub.docker.com/r/francois4/leafcutter/). To download the image, run: 8 | ```bash 9 | docker pull francois4/leafcutter:latest 10 | ``` 11 | 12 | #### Image contents and pipeline components 13 | The following tools are included in the image: 14 | 15 | 1. [regtools](https://regtools.readthedocs.io/en/latest/) for extracting exon-exon junctions from RNA-seq BAM files. 16 | 2. [LeafCutter](https://davidaknowles.github.io/leafcutter/) for generating intron excision ratios across samples. 17 | 18 | ### Running the pipeline 19 | This pipeline consists of four steps, summarized below: variant-aware alignment to correct for allelic mapping bias, extraction of exon-exon junctions, generation of phenotype tables with intron excision ratios, and QTL mapping. 20 | 21 | #### 1) Generating WASP-corrected alignments with STAR 22 | Allelic mapping bias can be a significant confounder for sQTL mapping and quantifying allele-specific expression ([Castel et al., 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0762-6)). This can be mitigated using variant-aware alignment, e.g., with the [WASP correction](https://www.nature.com/articles/nmeth.3582) implemented in [STAR](https://github.com/alexdobin/STAR). With the [STAR wrapper script](../../rnaseq/src/run_STAR.py) in this repository, this correction is applied by supplying a VCF with the participant's SNPs via the `--varVCFfile` option. The participant VCFs can be generated using [this WDL](../../genotype/participant_vcfs.wdl). 23 | 24 | #### 2) Generating exon-exon junction counts with regtools 25 | Next, exon-exon junction counts are extracted from the BAM files using: 26 | ```bash 27 | samtools view -h -q 255 $bam_file | grep -v "vW:i:[2-7]" | samtools view -b > $filtered_bam 28 | regtools junctions extract -a 8 -m 50 -M 500000 -s 0 $filtered_bam | gzip -c > ${sample_id}.regtools_junc.txt.gz 29 | ``` 30 | The first step filters out multi-mapping reads and reads that do not pass WASP filtering. With the default options in the [STAR wrapper script](../../rnaseq/src/run_STAR.py), spliced reads are tagged with the `XS` strand attribute, which is parsed in regtools using the `-s 0` option. 31 | A wrapper is provided in [`leafcutter_bam_to_junc.wdl`](leafcutter_bam_to_junc.wdl).
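The same filter can be expressed in Python with pysam, which may be easier to adapt than the `samtools`/`grep` pipeline above (a minimal sketch; `sample.bam` and `filtered.bam` are placeholder file names):
```python
import pysam

# Keep uniquely mapped reads (STAR assigns MAPQ 255) that pass WASP filtering:
# vW:i:1 marks reads that passed, vW:i:2-7 mark the failure modes, and reads
# that do not overlap any variant carry no vW tag and are kept.
with pysam.AlignmentFile('sample.bam', 'rb') as bam, \
     pysam.AlignmentFile('filtered.bam', 'wb', template=bam) as out:
    for read in bam:
        if read.mapping_quality != 255:
            continue
        if read.has_tag('vW') and read.get_tag('vW') != 1:
            continue
        out.write(read)
```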
32 | 33 | #### 3) Generating intron excision ratios with LeafCutter 34 | The `cluster_prepare_fastqtl.py` script wraps LeafCutter's `leafcutter_cluster_regtools.py` script, applies filters to remove introns with low counts or low complexity, and generates input files formatted for QTL mappers. 35 | ```bash 36 | python3 cluster_prepare_fastqtl.py \ 37 | ${junc_files_list} \ 38 | ${exons_list} \ 39 | ${collapsed_annotation} \ 40 | ${prefix} \ 41 | ${sample_participant_map} 42 | ``` 43 | where `${junc_files_list}` is a text file containing paths to the `*.regtools_junc.txt.gz` files generated in the previous step. The list of exons in `${exons_list}` can be generated from the [collapsed](https://github.com/broadinstitute/gtex-pipeline/tree/master/gene_model) reference annotation `${collapsed_annotation}` using, e.g.: 44 | ```python 45 | import pandas as pd 46 | import qtl.annotation 47 | 48 | annot = qtl.annotation.Annotation('gencode.v26.GRCh38.genes.gtf') 49 | exon_df = pd.DataFrame([[g.chr, e.start_pos, e.end_pos, g.strand, g.id, g.name] 50 | for g in annot.genes for e in g.transcripts[0].exons], 51 | columns=['chr', 'start', 'end', 'strand', 'gene_id', 'gene_name']) 52 | exon_df.to_csv('gencode.v26.GRCh38.genes.exons.txt.gz', sep='\t', index=False) 53 | ``` 54 | `${sample_participant_map}` is a headerless tab-delimited file linking sample IDs to participant IDs (used for renaming files, such that IDs in the output files match those in the VCF). 55 | 56 | This step generates a BED file and index (with the BED start/end coordinates corresponding to the TSS), as well as a file mapping phenotype IDs (in the form `${chr}:${intron_start}:${intron_end}:${cluster_id}_${strand}:${gene_id}`) to gene IDs: 57 | ```bash 58 | ${prefix}.leafcutter.bed.gz 59 | ${prefix}.leafcutter.bed.gz.tbi 60 | ${prefix}.leafcutter.phenotype_groups.txt 61 | ``` 62 | 63 | #### 4) Mapping sQTLs with tensorQTL 64 | The files generated above can be provided to QTL mappers such as [tensorQTL](https://github.com/broadinstitute/tensorqtl) along with covariates including PEER factors and genotype PCs (see [eQTL pipeline](https://github.com/broadinstitute/gtex-pipeline/tree/master/qtl)). 65 | 66 | The commands for running tensorQTL with the phenotype groups defined in `${prefix}.leafcutter.phenotype_groups.txt` are given below, where `plink_prefix_path` points to the PLINK genotype files: 67 | ```python 68 | import pandas as pd 69 | import tensorqtl 70 | from tensorqtl import genotypeio, cis, post 71 | 72 | # load genotypes and variant positions from PLINK bed/bim/fam files 73 | pr = genotypeio.PlinkReader(plink_prefix_path) 74 | genotype_df = pr.load_genotypes() 75 | variant_df = pr.bim.set_index('snp')[['chrom', 'pos']] 76 | 77 | phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(f'{prefix}.leafcutter.bed.gz') 78 | covariates_df = pd.read_csv(f'{prefix}.combined_covariates.txt', sep='\t', index_col=0).T 79 | group_s = pd.read_csv(f'{prefix}.leafcutter.phenotype_groups.txt', 80 | sep='\t', header=None, index_col=0).squeeze('columns') 81 | cis_df = cis.map_cis(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 82 | covariates_df, group_s=group_s) 83 | post.calculate_qvalues(cis_df, fdr=0.05, qvalue_lambda=0.85) 84 | ``` 85 | -------------------------------------------------------------------------------- /qtl/leafcutter/leafcutter_bam_to_junc.wdl: -------------------------------------------------------------------------------- 1 | task leafcutter_bam_to_junc { 2 | 3 | File bam_file 4 | String sample_id 5 | Int?
strand_specificity = 0 6 | 7 | Int memory 8 | Int disk_space 9 | Int num_threads 10 | Int num_preempt 11 | 12 | command { 13 | set -euo pipefail 14 | echo $(date +"[%b %d %H:%M:%S] Extracting junctions for sample ${sample_id}") 15 | # select uniquely mapped reads that pass WASP filters 16 | filtered_bam=${bam_file}.filtered.bam 17 | samtools view -h -q 255 ${bam_file} | grep -v "vW:i:[2-7]" | samtools view -b > $filtered_bam 18 | samtools index $filtered_bam 19 | regtools junctions extract -a 8 -m 50 -M 500000 ${"-s " + strand_specificity} $filtered_bam | gzip -c > ${sample_id}.regtools_junc.txt.gz 20 | echo $(date +"[%b %d %H:%M:%S] Done") 21 | } 22 | 23 | runtime { 24 | docker: "gcr.io/broad-cga-francois-gtex/leafcutter:latest" 25 | memory: "${memory}GB" 26 | disks: "local-disk ${disk_space} HDD" 27 | cpu: "${num_threads}" 28 | preemptible: "${num_preempt}" 29 | } 30 | 31 | output { 32 | File junc_file="${sample_id}.regtools_junc.txt.gz" 33 | } 34 | 35 | meta { 36 | author: "Francois Aguet" 37 | } 38 | } 39 | 40 | workflow leafcutter_bam_to_junc_workflow { 41 | call leafcutter_bam_to_junc 42 | } 43 | -------------------------------------------------------------------------------- /qtl/leafcutter/leafcutter_cluster.wdl: -------------------------------------------------------------------------------- 1 | task leafcutter_cluster { 2 | 3 | Array[File] junc_files 4 | File exon_list 5 | File genes_gtf 6 | String prefix 7 | File sample_participant_lookup 8 | 9 | Int? min_clu_reads 10 | Float? min_clu_ratio 11 | Int? max_intron_len 12 | Int? num_pcs 13 | 14 | Int memory 15 | Int disk_space 16 | Int num_threads 17 | Int num_preempt 18 | 19 | command { 20 | set -euo pipefail 21 | python3 /src/cluster_prepare_fastqtl.py \ 22 | ${write_lines(junc_files)} \ 23 | ${exon_list} \ 24 | ${genes_gtf} \ 25 | ${prefix} \ 26 | ${sample_participant_lookup} \ 27 | ${"--min_clu_reads " + min_clu_reads} \ 28 | ${"--min_clu_ratio " + min_clu_ratio} \ 29 | ${"--max_intron_len " + max_intron_len} \ 30 | ${"--num_pcs " + num_pcs} 31 | } 32 | 33 | runtime { 34 | docker: "gcr.io/broad-cga-francois-gtex/leafcutter:latest" 35 | memory: "${memory}GB" 36 | disks: "local-disk ${disk_space} HDD" 37 | cpu: "${num_threads}" 38 | preemptible: "${num_preempt}" 39 | } 40 | 41 | output { 42 | File counts="${prefix}_perind.counts.gz" 43 | File counts_numers="${prefix}_perind_numers.counts.gz" 44 | File clusters_pooled="${prefix}_pooled.gz" 45 | File clusters_refined="${prefix}_refined.gz" 46 | File phenotype_groups="${prefix}.leafcutter.phenotype_groups.txt" 47 | File leafcutter_bed_parquet="${prefix}.leafcutter.bed.parquet" 48 | File leafcutter_bed="${prefix}.leafcutter.bed.gz" 49 | File leafcutter_bed_index="${prefix}.leafcutter.bed.gz.tbi" 50 | File leafcutter_pcs="${prefix}.leafcutter.PCs.txt" 51 | } 52 | 53 | meta { 54 | author: "Francois Aguet" 55 | } 56 | } 57 | 58 | workflow leafcutter_cluster_workflow { 59 | call leafcutter_cluster 60 | } 61 | -------------------------------------------------------------------------------- /qtl/metasoft.wdl: -------------------------------------------------------------------------------- 1 | task combine_signif_pairs { 2 | 3 | Array[File] signifpairs 4 | String prefix 5 | 6 | Int memory 7 | Int disk_space 8 | Int num_threads 9 | Int num_preempt 10 | 11 | command { 12 | set -euo pipefail 13 | /src/combine_signif_pairs.py ${write_lines(signifpairs)} ${prefix} 14 | } 15 | 16 | runtime { 17 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V10" 18 | memory: "${memory}GB" 19 | disks: 
"local-disk ${disk_space} HDD" 20 | cpu: "${num_threads}" 21 | preemptible: "${num_preempt}" 22 | } 23 | 24 | output { 25 | File combined_signifpairs="${prefix}.combined_signifpairs.txt.gz" 26 | } 27 | 28 | meta { 29 | author: "Francois Aguet" 30 | } 31 | } 32 | 33 | 34 | task extract_pairs { 35 | 36 | File input_pairs 37 | File extract_pairs 38 | String prefix 39 | 40 | Int memory 41 | Int disk_space 42 | Int num_threads 43 | Int num_preempt 44 | 45 | command { 46 | set -euo pipefail 47 | /src/extract_pairs.py ${input_pairs} ${extract_pairs} ${prefix} --parquet 48 | } 49 | 50 | runtime { 51 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V8" 52 | memory: "${memory}GB" 53 | disks: "local-disk ${disk_space} HDD" 54 | cpu: "${num_threads}" 55 | preemptible: "${num_preempt}" 56 | } 57 | 58 | output { 59 | File extracted_pairs="${prefix}.extracted_pairs.parquet" 60 | } 61 | 62 | meta { 63 | author: "Francois Aguet" 64 | } 65 | } 66 | 67 | 68 | task metasoft_prepare_input { 69 | 70 | Array[File] pair_files 71 | String prefix 72 | Int? chunks 73 | 74 | Int memory 75 | Int disk_space 76 | Int num_threads 77 | Int num_preempt 78 | 79 | command { 80 | set -euo pipefail 81 | /src/metasoft_prepare_input.py ${write_lines(pair_files)} ${prefix} ${"--chunks " + chunks} --write_full 82 | } 83 | 84 | runtime { 85 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V8" 86 | memory: "${memory}GB" 87 | disks: "local-disk ${disk_space} HDD" 88 | cpu: "${num_threads}" 89 | preemptible: "${num_preempt}" 90 | } 91 | 92 | output { 93 | File metasoft_input = "${prefix}.metasoft_input.txt.gz" 94 | Array[File] metasoft_input_chunks = glob("${prefix}.metasoft_input.chunk*.txt.gz") 95 | } 96 | 97 | meta { 98 | author: "Francois Aguet" 99 | } 100 | } 101 | 102 | 103 | 104 | task metasoft_scatter { 105 | 106 | File metasoft_input 107 | String prefix 108 | 109 | Int memory 110 | Int disk_space 111 | Int num_threads 112 | Int num_preempt 113 | 114 | command { 115 | set -euo pipefail 116 | /src/run_metasoft.py /opt/metasoft/Metasoft.jar ${metasoft_input} ${prefix} 117 | } 118 | 119 | runtime { 120 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V8" 121 | memory: "${memory}GB" 122 | disks: "local-disk ${disk_space} HDD" 123 | cpu: "${num_threads}" 124 | preemptible: "${num_preempt}" 125 | } 126 | 127 | output { 128 | File metasoft_output="${prefix}.metasoft.txt.gz" 129 | File metasoft_log="${prefix}.metasoft.log" 130 | } 131 | 132 | meta { 133 | author: "Francois Aguet" 134 | } 135 | } 136 | 137 | 138 | task metasoft_postprocess { 139 | 140 | Array[File] metasoft_chunks 141 | Array[File] pair_files 142 | String prefix 143 | 144 | Int memory 145 | Int disk_space 146 | Int num_threads 147 | Int num_preempt 148 | 149 | command { 150 | set -euo pipefail 151 | /src/metasoft_postprocess.py ${write_lines(metasoft_chunks)} ${write_lines(pair_files)} ${prefix} 152 | } 153 | 154 | runtime { 155 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V8" 156 | memory: "${memory}GB" 157 | disks: "local-disk ${disk_space} HDD" 158 | cpu: "${num_threads}" 159 | preemptible: "${num_preempt}" 160 | } 161 | 162 | output { 163 | File metasoft_output="${prefix}.metasoft.txt.gz" 164 | } 165 | 166 | meta { 167 | author: "Francois Aguet" 168 | } 169 | } 170 | 171 | 172 | workflow metasoft_workflow { 173 | 174 | Array[File] signifpairs 175 | Array[File] allpairs 176 | Array[String] sample_ids 177 | String prefix 178 | 179 | call combine_signif_pairs { input: signifpairs=signifpairs, prefix=prefix } 180 | 181 | # for each sample set, extract 
same set of pairs 182 | scatter(i in range(length(allpairs))) { 183 | call extract_pairs { input: input_pairs=allpairs[i], extract_pairs=combine_signif_pairs.combined_signifpairs, prefix=sample_ids[i]} 184 | } 185 | 186 | call metasoft_prepare_input { input: pair_files=extract_pairs.extracted_pairs, prefix=prefix} 187 | 188 | scatter(chunk in metasoft_prepare_input.metasoft_input_chunks) { 189 | call metasoft_scatter { 190 | input: metasoft_input=chunk, prefix=prefix 191 | } 192 | } 193 | 194 | call metasoft_postprocess { 195 | input: metasoft_chunks=metasoft_scatter.metasoft_output, pair_files=extract_pairs.extracted_pairs, prefix=prefix 196 | } 197 | 198 | } 199 | -------------------------------------------------------------------------------- /qtl/peer_factors.wdl: -------------------------------------------------------------------------------- 1 | task peer_factors { 2 | 3 | File phenotype_file 4 | String prefix 5 | Int num_peer 6 | 7 | File? genotype_pcs 8 | File? add_covariates 9 | 10 | Int memory 11 | Int disk_space 12 | Int num_threads 13 | Int num_preempt 14 | 15 | command { 16 | set -euo pipefail 17 | Rscript /src/run_PEER.R ${phenotype_file} ${prefix} ${num_peer} 18 | /src/combine_covariates.py ${prefix}.PEER_covariates.txt ${prefix} ${"--genotype_pcs " + genotype_pcs} ${"--add_covariates " + add_covariates} 19 | gzip *.PEER_residuals.txt 20 | } 21 | 22 | runtime { 23 | docker: "gcr.io/broad-cga-francois-gtex/gtex_eqtl:V10" 24 | memory: "${memory}GB" 25 | disks: "local-disk ${disk_space} HDD" 26 | cpu: "${num_threads}" 27 | preemptible: "${num_preempt}" 28 | } 29 | 30 | output { 31 | File combined_covariates="${prefix}.combined_covariates.txt" 32 | File alpha="${prefix}.PEER_alpha.txt" 33 | File residuals="${prefix}.PEER_residuals.txt.gz" 34 | } 35 | 36 | meta { 37 | author: "Francois Aguet" 38 | } 39 | } 40 | 41 | workflow peer_factors_workflow { 42 | call peer_factors 43 | } 44 | -------------------------------------------------------------------------------- /qtl/src/ase_aggregate_by_individual.py: -------------------------------------------------------------------------------- 1 | # Author: Francois Aguet 2 | import numpy as np 3 | import scipy.stats 4 | import pandas as pd 5 | import argparse 6 | import pyBigWig 7 | import os 8 | import subprocess 9 | import io 10 | import gzip 11 | import pickle 12 | import qtl.stats 13 | 14 | 15 | parser = argparse.ArgumentParser(description='ASE') 16 | parser.add_argument('read_count_file_list', help='Read count file list (one per sample); [sample_id, tissue_site_detail, file_path]') 17 | parser.add_argument('het_vcf') 18 | parser.add_argument('vep_dict') 19 | parser.add_argument('simulation_bias_file', help='?') 20 | parser.add_argument('mappability_bigwig', help='Mappability track in bigWig format') 21 | parser.add_argument('tissue_abbreviations', help='File mapping tissue_site_detail to abbreviation') 22 | parser.add_argument('lamp_values', help='Table with foreign allele frequency per individual') 23 | parser.add_argument('individual_id', help='individual_id') 24 | parser.add_argument('--coverage_cutoff', default=8, type=int, help='') 25 | parser.add_argument('--other_ratio_cutoff', default=0.05, type=float, help='') 26 | parser.add_argument('--mono_cutoff', default=0.01, type=float, help='') 27 | parser.add_argument('-o', '--output_dir', default='.') 28 | args = parser.parse_args() 29 | 30 | 31 | print('Parsing inputs') 32 | tissue2abrv = pd.read_csv(args.tissue_abbreviations, sep='\t', index_col=0).squeeze('columns').to_dict() 33 | 
readcount_file_df = pd.read_csv(args.read_count_file_list, sep='\t', index_col=0) 34 | df = pd.read_csv(args.simulation_bias_file, sep='\t', header=None, dtype=str) 35 | simulation_bias_set = set(df[0]+':'+df[1]) 36 | 37 | 38 | print('Parsing read count files') 39 | readcount_df_list = [] 40 | for i,rfile in enumerate(readcount_file_df['ase_readcount_file']): 41 | readcount_df = pd.read_csv(rfile, sep='\t', index_col=2) 42 | readcount_df = readcount_df[['contig', 'position', 'refAllele', 'altAllele', 'refCount', 'altCount', 'totalCount', 'otherBases']] 43 | readcount_df = readcount_df.rename(columns={'contig':'chr', 'position':'coord', 'refAllele':'ref', 'altAllele':'alt', 44 | 'refCount':'refcount', 'altCount':'altcount', 'totalCount':'totalcount', 'otherBases':'othercount'}) 45 | readcount_df = readcount_df[readcount_df['totalcount']>=args.coverage_cutoff] 46 | 47 | readcount_df['refratio'] = readcount_df['refcount']/readcount_df['totalcount'] 48 | readcount_df['otherratio'] = readcount_df['othercount'] / (readcount_df['othercount'] + readcount_df['totalcount']) 49 | readcount_df['otherflag'] = (readcount_df['otherratio']>=args.other_ratio_cutoff)*1 50 | readcount_df['allcount'] = readcount_df['totalcount'] + readcount_df['othercount'] 51 | sample_id = readcount_file_df.index[i] 52 | readcount_df['sampid'] = sample_id 53 | readcount_df['subjid'] = '-'.join(sample_id.split('-')[:2]) 54 | readcount_df['tissue'] = readcount_file_df.loc[sample_id, 'tissue_site_detail'] 55 | readcount_df['tissueabrv'] = tissue2abrv[readcount_file_df.loc[sample_id, 'tissue_site_detail']] 56 | readcount_df['covflag'] = 0 # covflag is never 1, since filtered above (coverage_cutoff) 57 | 58 | readcount_df_list.append(readcount_df) 59 | 60 | 61 | print('Loading VCF') 62 | vcf_df = pd.read_csv(args.het_vcf, sep='\t', comment='#', header=None, 63 | names=['chr', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info', 'format', 'genotype'], dtype=str, 64 | usecols=['chr', 'pos', 'id', 'info','format', 'genotype'], index_col=2) 65 | 66 | vcf_snp_id_df = pd.DataFrame(index=vcf_df.index, columns=['chr', 'coord', 'genotype', 'ensg', 'vtype', 'mapbias', 'mapflag', 'monoflag', 'mono_refcount', 'mono_altcount', 'mono_totalcount', 'mono_othercount']) 67 | vcf_snp_id_df[['chr', 'coord']] = vcf_df[['chr', 'pos']] 68 | vcf_snp_id_df['genotype'] = vcf_df['format']+';'+vcf_df['genotype'] 69 | 70 | print('Adding VEP annotation') 71 | with open(args.vep_dict, 'rb') as f: 72 | vep_dict = pickle.load(f) 73 | 74 | ensg = [] 75 | vtype = [] 76 | for i in vcf_df.index: 77 | gene_name, vep = vep_dict.get(i, ('NA','NA')) 78 | ensg.append(gene_name) 79 | vtype.append(vep) 80 | vcf_snp_id_df['ensg'] = ensg 81 | vcf_snp_id_df['vtype'] = vtype 82 | vep_dict = None 83 | 84 | print('Adding mappability') 85 | mp = [] 86 | bw = pyBigWig.open(args.mappability_bigwig) 87 | for c,p in zip(vcf_df['chr'], vcf_df['pos']): 88 | mp.append((bw.stats(c, int(p)-1, int(p), exact=True)[0]!=1) * 1) # BED coordinates, 0-indexed; input must be int (not numpy) 89 | bw.close() 90 | 91 | vcf_snp_id_df['mapbias'] = [1 if i in simulation_bias_set else 0 for i in vcf_snp_id_df['chr']+':'+vcf_snp_id_df['coord']] 92 | vcf_snp_id_df['mapflag'] = mp 93 | vcf_snp_id_df['monoflag'] = 0 94 | vcf_snp_id_df['mono_refcount'] = 0 95 | vcf_snp_id_df['mono_altcount'] = 0 96 | vcf_snp_id_df['mono_totalcount'] = 0 97 | vcf_snp_id_df['mono_othercount'] = 0 98 | for readcount_df in readcount_df_list: 99 | # combine read counts for each variant 100 | vcf_snp_id_df.loc[readcount_df.index, 
'mono_refcount'] += readcount_df['refcount'] 101 | vcf_snp_id_df.loc[readcount_df.index, 'mono_altcount'] += readcount_df['altcount'] 102 | vcf_snp_id_df.loc[readcount_df.index, 'mono_totalcount'] += readcount_df['totalcount'] 103 | vcf_snp_id_df.loc[readcount_df.index, 'mono_othercount'] += readcount_df['othercount'] 104 | 105 | 106 | print('Calculating statistics') 107 | lamp = pd.read_csv(args.lamp_values, sep='\t', index_col=0).squeeze('columns').median() 108 | ref = vcf_snp_id_df['mono_refcount'] 109 | tot = vcf_snp_id_df['mono_totalcount'] 110 | monop_list = scipy.stats.binom.cdf(tot-ref, tot, 1-lamp) + scipy.stats.binom.cdf(ref, tot, 1-lamp) # monoallelic_p 111 | monop_adj_list = qtl.stats.padjust_bh(monop_list) 112 | vcf_snp_id_df['monoflag'] = (monop_adj_list > args.mono_cutoff) * 1 113 | 114 | indiv_cov75_counts = [] 115 | for readcount_df in readcount_df_list: 116 | readcount_df['GENOTYPE_WARNING'] = vcf_snp_id_df.loc[readcount_df.index, 'monoflag'] 117 | idx = (vcf_snp_id_df.loc[readcount_df.index, ['monoflag', 'mapbias', 'mapflag']].sum(axis=1)==0) & (readcount_df['otherflag']==0) 118 | indiv_cov75_counts.extend(list(readcount_df.loc[idx, 'totalcount'])) 119 | cov75 = np.percentile(indiv_cov75_counts, 75) 120 | 121 | 122 | print('Calculating bias') 123 | genomewide_bias = [0.0, 0.0, 0] 124 | for readcount_df in readcount_df_list: 125 | idx = (readcount_df[['covflag', 'otherflag']].sum(axis=1) + vcf_snp_id_df.loc[readcount_df.index, ['mapbias', 'mapflag', 'monoflag']].sum(axis=1)) == 0 126 | refcountcov = readcount_df.loc[idx, 'refcount'] 127 | altcountcov = readcount_df.loc[idx, 'altcount'] 128 | totcountcov = refcountcov + altcountcov 129 | 130 | bias_keys = readcount_df.loc[idx, 'ref']+'/'+readcount_df.loc[idx, 'alt'] 131 | 132 | idx2 = (refcountcov+altcountcov) > cov75 133 | refcountcov[idx2] = cov75*(refcountcov[idx2]/totcountcov[idx2]) 134 | altcountcov[idx2] = cov75 - refcountcov[idx2] 135 | totcountcov[idx2] = cov75 136 | 137 | genomewide_bias[0] += refcountcov.sum() 138 | genomewide_bias[1] += totcountcov.sum() 139 | genomewide_bias[2] += refcountcov.shape[0] 140 | 141 | genomewide_bias_value = float(genomewide_bias[0]) / genomewide_bias[1] 142 | 143 | 144 | print('Calculating binomial tests, adjusted p-values') 145 | for readcount_df in readcount_df_list: 146 | readcount_df['binom_p'] = [scipy.stats.binom_test(i, j, genomewide_bias_value) for i,j in zip(readcount_df['refcount'], readcount_df['totalcount'])] 147 | readcount_df['nullratio'] = genomewide_bias_value 148 | idx = (readcount_df[['covflag', 'otherflag']].sum(axis=1) + vcf_snp_id_df.loc[readcount_df.index, ['mapbias', 'mapflag', 'monoflag']].sum(axis=1))==0 149 | readcount_df.loc[idx, 'binom_p_adj'] = qtl.stats.padjust_bh(readcount_df.loc[idx, 'binom_p']) 150 | readcount_df.loc[~idx, 'binom_p_adj'] = 'NA' 151 | 152 | 153 | print('Writing output') 154 | with gzip.open(os.path.join(args.output_dir, args.individual_id+'.ase_table.tsv.gz'), 'wt') as f: 155 | f.write('\t'.join([ 156 | 'CHR', 157 | 'POS', 158 | 'VARIANT_ID', 159 | 'REF_ALLELE', 160 | 'ALT_ALLELE', 161 | 'SAMPLE_ID', 162 | 'SUBJECT_ID', 163 | 'TISSUE_ID', 164 | 'REF_COUNT', 165 | 'ALT_COUNT', 166 | 'TOTAL_COUNT', 167 | 'REF_RATIO', 168 | 'OTHER_ALLELE_COUNT', 169 | 'NULL_RATIO', 170 | 'BINOM_P', 171 | 'BINOM_P_ADJUSTED', 172 | 'MAMBA_POST_SINGLETIS', 173 | 'MAMBA_POST_MULTITIS', 174 | 'GENOTYPE', 175 | 'VARIANT_ANNOTATION', 176 | 'GENE_ID', 177 | 'LOW_MAPABILITY', 178 | 'MAPPING_BIAS_SIM', 179 | 'GENOTYPE_WARNING'])+'\n') 180 | 181 | merged_df = [] 
182 | for readcount_df in readcount_df_list: 183 | readcount_df['id'] = readcount_df.index 184 | readcount_df['blank'] = 'NA' 185 | out_df = readcount_df[['chr', 'coord', 'id', 'ref', 'alt', 'sampid', 'subjid', 'tissueabrv', 'refcount', 'altcount', 'totalcount', 'refratio', 'othercount', 'nullratio', 186 | 'binom_p', 'binom_p_adj', 'blank', 'blank']] 187 | merged_df.append(pd.concat([out_df, vcf_snp_id_df.loc[readcount_df.index, ['genotype', 'vtype', 'ensg', 'mapflag', 'mapbias']], readcount_df['GENOTYPE_WARNING']], axis=1)) 188 | merged_df = pd.concat(merged_df, axis=0) 189 | merged_df = merged_df.sort_values(['chr', 'coord', 'tissueabrv']) 190 | merged_df.to_csv(f, sep='\t', index=False, header=False, float_format='%.6g') 191 | 192 | print('Done') 193 | -------------------------------------------------------------------------------- /qtl/src/ase_calculate_lamp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | import matplotlib 4 | matplotlib.use('Agg') 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | import numpy as np 8 | import argparse 9 | import os 10 | 11 | 12 | def lamp_from_readcount_list(readcount_files, coverage_cutoff=8, other_ratio_cutoff=0.05): 13 | """ 14 | Determine ratio of foreign alleles across all samples from an individual 15 | """ 16 | # concatenate samples 17 | merged_readcount_df = [] 18 | for i,rfile in enumerate(readcount_files): 19 | readcount_df = pd.read_csv(rfile, sep='\t', usecols=['variantID', 'totalCount', 'otherBases'], index_col=0) 20 | readcount_df = readcount_df[readcount_df['totalCount']>=coverage_cutoff] 21 | merged_readcount_df.append(readcount_df) 22 | merged_readcount_df = pd.concat(merged_readcount_df, axis=0) 23 | 24 | merged_readcount_df['allCount'] = merged_readcount_df['totalCount'] + merged_readcount_df['otherBases'] 25 | merged_readcount_df['otherRatio'] = merged_readcount_df['otherBases'] / merged_readcount_df['allCount'] 26 | idx = merged_readcount_df['otherRatio'] < other_ratio_cutoff 27 | sumother = merged_readcount_df.loc[idx, 'otherBases'].sum() 28 | sumtotal = merged_readcount_df.loc[idx, 'allCount'].sum() 29 | 30 | return sumother / sumtotal / 2.0 31 | 32 | 33 | if __name__=="__main__": 34 | parser = argparse.ArgumentParser(description='Calculate lamp') 35 | parser.add_argument('readcount_files', help='Output from ASE pipeline') 36 | parser.add_argument('prefix', help='Prefix for output files') 37 | parser.add_argument('-o', '--output_dir', default='.') 38 | args = parser.parse_args() 39 | 40 | with open(args.readcount_files) as f: 41 | file_list = f.read().strip().split('\n') 42 | sample_ids = [os.path.split(i)[1].split('.')[0] for i in file_list] 43 | individual_ids = ['-'.join(i.split('-')[:2]) for i in sample_ids] 44 | 45 | readcounts_df = pd.DataFrame(np.array([individual_ids, file_list]).T, index=sample_ids, columns=['individual_id', 'ase_read_count_file']) 46 | dfg = readcounts_df.groupby('individual_id') 47 | 48 | lamp_s = pd.Series(index=np.unique(individual_ids)) 49 | for k,i in enumerate(lamp_s.index): 50 | print('Processing individual {0:d}/{1:d}'.format(k+1, lamp_s.shape[0]), end='\r') 51 | g = dfg.get_group(i)['ase_read_count_file'] 52 | lamp_s.loc[i] = lamp_from_readcount_list(g.values) 53 | lamp_s.name = 'lamp' 54 | lamp_s = pd.DataFrame(lamp_s) 55 | lamp_s.index.name = 'individual_id' 56 | lamp_s.to_csv(os.path.join(args.output_dir, args.prefix+'.lamp_values.txt'), sep='\t', float_format='%.6g') 57 | 58 | # 
plot distribution 59 | fig = plt.figure() 60 | ax = fig.add_subplot(111) 61 | v = ax.hist(lamp_s['lamp'].values, 20, edgecolor='k', lw=0.5) 62 | ax.plot([lamp_s['lamp'].mean()]*2, [0, np.max(v[0])], 'k--') 63 | ax.plot([lamp_s['lamp'].median()]*2, [0, np.max(v[0])], '--', color=[0.5,0.5,0.5]) 64 | ax.set_xlabel('Foreign allele read frequency ($\epsilon$)', fontsize=16) 65 | ax.text(lamp_s['lamp'].mean(), np.max(v[0]), r' $\bar\epsilon$ = {0:.6e}'.format(lamp_s['lamp'].mean()), fontsize=12, va='top') 66 | ax.text(lamp_s['lamp'].median(), 0.9*np.max(v[0]), r' median($\epsilon$) = {0:.6e}'.format(lamp_s['lamp'].median()), fontsize=12, va='top', color=[0.5,0.5,0.5]) 67 | ax.set_ylabel('Frequency', fontsize=16) 68 | plt.savefig(os.path.join(args.output_dir, args.prefix+'.lamp_distribution.pdf')) 69 | -------------------------------------------------------------------------------- /qtl/src/combine_covariates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | import pandas as pd 4 | import numpy as np 5 | import argparse 6 | import os 7 | 8 | parser = argparse.ArgumentParser(description='Combine covariates into a single matrix') 9 | parser.add_argument('expression_covariates', help='') 10 | parser.add_argument('prefix', help='') 11 | parser.add_argument('--genotype_pcs', default=None, help='Genotype PCs') 12 | parser.add_argument('--add_covariates', default=[], nargs='+', help='Additional covariates') 13 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 14 | args = parser.parse_args() 15 | 16 | print('Combining covariates ... ', end='', flush=True) 17 | expression_df = pd.read_csv(args.expression_covariates, sep='\t', index_col=0, dtype=str) 18 | if args.genotype_pcs is not None: 19 | genotype_df = pd.read_csv(args.genotype_pcs, sep='\t', index_col=0, dtype=str) 20 | combined_df = pd.concat([genotype_df[expression_df.columns], expression_df], axis=0) 21 | else: 22 | combined_df = expression_df 23 | for c in args.add_covariates: 24 | additional_df = pd.read_csv(c, sep='\t', index_col=0, dtype=str) 25 | combined_df = pd.concat([combined_df, additional_df[expression_df.columns]], axis=0) 26 | 27 | # identify and drop colinear covariates 28 | C = combined_df.astype(np.float64).T 29 | Q,R = np.linalg.qr(C-np.mean(C, axis=0)) 30 | colinear_ix = np.abs(np.diag(R)) < np.finfo(np.float64).eps * C.shape[1] 31 | if np.any(colinear_ix): 32 | print('Colinear covariates detected:') 33 | for i in C.columns[colinear_ix]: 34 | print(" * dropped '{}'".format(i)) 35 | combined_df = combined_df.loc[~colinear_ix] 36 | 37 | combined_df.to_csv(os.path.join(args.output_dir, args.prefix+'.combined_covariates.txt'), sep='\t')#, float_format='%.6g') 38 | print('done.') 39 | -------------------------------------------------------------------------------- /qtl/src/combine_signif_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import argparse 4 | import os 5 | import gzip 6 | 7 | parser = argparse.ArgumentParser(description='Combine significant pairs from multiple eQTL mapping runs.') 8 | parser.add_argument('signifpair_list_file', help="File listing of 'signifpairs' outputs from eQTL pipeline.") 9 | parser.add_argument('prefix', help='Prefix for output file: .combined_signifpairs.txt.gz') 10 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 11 | args = parser.parse_args() 
12 | 13 | with open(args.signifpair_list_file) as f: 14 | file_paths = f.read().strip().split('\n') 15 | 16 | print('Loading significant pairs.') 17 | dfs = [] 18 | for f in file_paths: 19 | dfs.append(pd.read_csv(f, sep='\t', usecols=['variant_id', 'gene_id'])) 20 | dfs = pd.concat(dfs, axis=0) 21 | dfs = dfs.drop_duplicates() 22 | 23 | print('Sorting significant pairs.') 24 | dfs['chr'] = dfs['variant_id'].apply(lambda x: x.split('_',1)[0]) 25 | dfs['pos'] = dfs['variant_id'].apply(lambda x: int(x.split('_',2)[1])) 26 | dfs = dfs.sort_values(['chr', 'pos', 'gene_id']) 27 | 28 | print('Writing output.') 29 | with gzip.open(os.path.join(args.output_dir, args.prefix+'.combined_signifpairs.txt.gz'), 'wt', compresslevel=6) as f: 30 | dfs.to_csv(f, sep='\t', index=False) 31 | -------------------------------------------------------------------------------- /qtl/src/convert_vep.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import gzip 5 | import pickle 6 | import subprocess 7 | import argparse 8 | import os 9 | 10 | 11 | def choose_variant_annotation(csq_string, variant_annotation_rank_dict, gene_ix, conseq_ix): 12 | """Parse lowest ranking consequence and gene ID from CSQ string""" 13 | minrank = len(variant_annotation_rank_dict) 14 | gene_id = 'NA' 15 | conseq = 'NA' 16 | 17 | for tr in csq_string.split(','): 18 | annots = tr.split('|') 19 | for v in annots[conseq_ix].split('&'): 20 | if v in variant_annotation_rank_dict: 21 | r = variant_annotation_rank_dict[v] 22 | if r < minrank: 23 | minrank = r 24 | gene_id = annots[gene_ix] 25 | conseq = v 26 | return gene_id, conseq 27 | 28 | 29 | def get_vep_format(vep_vcf): 30 | """Parse the CSQ annotation format from the VEP VCF header""" 31 | fmt = None 32 | with gzip.open(vep_vcf, 'rt') as f: 33 | for line in f: 34 | if line.startswith('##INFO=<ID=CSQ'): 35 | # the annotation fields are listed after 'Format: ' in the header line 36 | fmt = line.strip().split('Format: ')[1].replace('">','').split('|') 37 | break 38 | elif not line.startswith('##'): 39 | break 40 | if fmt is None: 41 | raise ValueError('CSQ format not found in VCF header') 42 | return fmt 43 | 44 | 45 | # from http://useast.ensembl.org/info/genome/variation/predicted_data.html 46 | variant_annotation_rank_list = [ 47 | 'transcript_ablation', 48 | 'splice_acceptor_variant', 49 | 'splice_donor_variant', 50 | 'stop_gained', 51 | 'frameshift_variant', 52 | 'stop_lost', 53 | 'start_lost', 54 | 'transcript_amplification', 55 | 'inframe_insertion', 56 | 'inframe_deletion', 57 | 'missense_variant', 58 | 'protein_altering_variant', 59 | 'splice_region_variant', 60 | 'incomplete_terminal_codon_variant', 61 | 'stop_retained_variant', 62 | 'synonymous_variant', 63 | 'coding_sequence_variant', 64 | 'mature_miRNA_variant', 65 | '5_prime_UTR_variant', 66 | '3_prime_UTR_variant', 67 | 'non_coding_transcript_exon_variant', 68 | 'intron_variant'] 69 | 70 | variant_annotation_rank_dict = {j:i for i,j in enumerate(variant_annotation_rank_list)} 71 | 72 | 73 | if __name__=='__main__': 74 | 75 | parser = argparse.ArgumentParser(description='Extract single effect from VEP VCF') 76 | parser.add_argument('vcf', help='VCF') 77 | parser.add_argument('vep_vcf', help='Variant Effect Predictor VCF') 78 | parser.add_argument('-o', '--output_dir', default='.') 79 | args = parser.parse_args() 80 | 81 | #------------------------------------------------ 82 | # Pre-processing: extract variant set from VCF 83 | #------------------------------------------------ 84 | prefix = os.path.basename(args.vcf).split('.vcf')[0] 85 | variant_id_file = os.path.join(args.output_dir, prefix+'.variant_ids.txt.gz') 86 | if not os.path.exists(variant_id_file): 87 | print('Extracting variant IDs to {}'.format(variant_id_file)) 88 | subprocess.check_call('zcat '+args.vcf+' | grep -v "#" | cut -f3 | gzip -c -1 > '+variant_id_file, shell=True) 89 | 90 | print('Loading variant IDs') 91 | with gzip.open(variant_id_file) as f: 92 | v = f.read() 93 |
variant_set = set(v.decode().strip().split('\n')) 94 | 95 | # get position of Gene and Consequence fields in CSQ string (changes with VEP versions) 96 | fmt = get_vep_format(args.vep_vcf) 97 | gene_ix = np.where(np.array(fmt)=='Gene')[0][0] 98 | conseq_ix = np.where(np.array(fmt)=='Consequence')[0][0] 99 | 100 | # parse VEP VCF and write variant_id, gene, consequence to file 101 | # use pandas parser for speed/process by chunks to reduce memory usage 102 | print('Parsing VEP VCF') 103 | vep_file = os.path.join(args.output_dir, prefix+'.vep.txt.gz') 104 | with gzip.open(vep_file, 'wt') as f: 105 | f.write('variant_id\tgene_id\tvep\n') 106 | for k,chunk in enumerate(pd.read_csv(args.vep_vcf, sep='\t', comment='#', header=None, 107 | names=['chr', 'pos', 'id', 'ref', 'alt', 'qual', 'filter', 'info'], dtype=bytes, usecols=['id', 'info'], 108 | iterator=True, chunksize=100000)): 109 | 110 | print('\rProcessing chunk {0:d}'.format(k+1), end='') 111 | for i,c in zip(chunk['id'], chunk['info']): 112 | if len(c.split('|')) > 10: 113 | cs = [i for i in c.split(';') if i.startswith('CSQ')][0] 114 | gene_id, conseq = choose_variant_annotation(cs, variant_annotation_rank_dict, gene_ix, conseq_ix) 115 | if gene_id != 'NA': 116 | for j in i.split(';'): 117 | if j in variant_set: 118 | f.write(j+'\t'+gene_id+'\t'+conseq+'\n') 119 | print() 120 | 121 | print('Converting to dictionary') 122 | vep_df = pd.read_csv(vep_file, sep='\t', index_col=0) 123 | vep_df['combined'] = [(i,j) for i,j in zip(vep_df['gene_id'], vep_df['vep'])] 124 | vep_dict = vep_df['combined'].to_dict() 125 | 126 | print('Saving as pickle') 127 | with open(os.path.join(args.output_dir, prefix+'.vep_dict.pickle'), 'wb') as f: 128 | pickle.dump(vep_dict, f, pickle.HIGHEST_PROTOCOL) 129 | -------------------------------------------------------------------------------- /qtl/src/eqtl_prepare_expression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | 4 | import numpy as np 5 | import pandas as pd 6 | import gzip 7 | import subprocess 8 | import scipy.stats as stats 9 | import argparse 10 | import os 11 | import qtl.io 12 | import qtl.norm 13 | 14 | 15 | def prepare_expression(counts_df, tpm_df, sample_frac_threshold=0.2, 16 | count_threshold=6, tpm_threshold=0.1, mode='tmm'): 17 | """ 18 | Genes are filtered using the following expression thresholds: 19 | TPM >= tpm_threshold in >= sample_frac_threshold * samples 20 | read counts >= count_threshold in sample_frac_threshold * samples 21 | 22 | The filtered counts matrix is then normalized using: 23 | TMM (mode='tmm'; default) or 24 | quantile normalization (mode='qn') 25 | """ 26 | 27 | # expression thresholds 28 | ns = tpm_df.shape[1] 29 | mask = ( 30 | (np.sum(tpm_df >= tpm_threshold, axis=1) >= sample_frac_threshold * ns) & 31 | (np.sum(counts_df >= count_threshold, axis=1) >= sample_frac_threshold * ns) 32 | ).values 33 | 34 | # apply normalization 35 | if mode.lower() == 'tmm': 36 | tmm_counts_df = qtl.norm.edger_cpm(counts_df, normalized_lib_sizes=True) 37 | norm_df = qtl.norm.inverse_normal_transform(tmm_counts_df[mask]) 38 | elif mode.lower() == 'qn': 39 | qn_df = qtl.norm.normalize_quantiles(tpm_df.loc[mask]) 40 | norm_df = qtl.norm.inverse_normal_transform(qn_df) 41 | else: 42 | raise ValueError(f'Unsupported mode {mode}') 43 | 44 | return norm_df 45 | 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser(description='Generate normalized expression BED files for 
eQTL analyses') 50 | parser.add_argument('tpm_gct', help='GCT or Parquet file with expression in normalized units, e.g., TPM or FPKM') 51 | parser.add_argument('counts_gct', help='GCT or Parquet file with read counts') 52 | parser.add_argument('annotation_gtf', help='GTF annotation used for generating expression matrices') 53 | parser.add_argument('sample_to_participant', help='TSV linking sample IDs (columns in expression matrices) to participant IDs (VCF IDs)') 54 | parser.add_argument('prefix', help='Prefix for output file names') 55 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 56 | parser.add_argument('--sample_ids', default=None, help='File listing sample IDs to include') 57 | parser.add_argument('--chrs', help='File listing chromosomes to include (default: chr1-22 + chrX)') 58 | parser.add_argument('--convert_tpm', action='store_true', help='Convert to TPM (in case input is in RPKM/FPKM)') 59 | parser.add_argument('--tpm_threshold', type=np.double, default=0.1, help='Selects genes with > expression_threshold expression in at least sample_frac_threshold') 60 | parser.add_argument('--count_threshold', type=np.int32, default=6, help='Selects genes with >= count_threshold reads in at least sample_frac_threshold samples') 61 | parser.add_argument('--sample_frac_threshold', type=np.double, default=0.2, help='Minimum fraction of samples that must satisfy thresholds') 62 | parser.add_argument('--normalization_method', default='tmm', help='Normalization method: TMM or quantile normalization (qn)') 63 | parser.add_argument('--parquet', action='store_true', help='Write output in Parquet format') 64 | args = parser.parse_args() 65 | 66 | print('Loading expression data', flush=True) 67 | sample_ids = None 68 | if args.sample_ids is not None: 69 | with open(args.sample_ids) as f: 70 | sample_ids = f.read().strip().split('\n') 71 | print(f' * Loading {len(sample_ids)} samples', flush=True) 72 | 73 | counts_df = qtl.io.read_gct(args.counts_gct, sample_ids=sample_ids, load_description=False) 74 | tpm_df = qtl.io.read_gct(args.tpm_gct, sample_ids=sample_ids, load_description=False) 75 | 76 | sample_to_participant_s = pd.read_csv(args.sample_to_participant, sep='\t', index_col=0, 77 | header=None, dtype=str).squeeze('columns') 78 | 79 | # check inputs 80 | if not counts_df.columns.equals(tpm_df.columns): 81 | raise ValueError('Sample IDs in the TPM and read counts files must match.') 82 | missing_ids = ~counts_df.columns.isin(sample_to_participant_s.index) 83 | if missing_ids.any(): 84 | raise ValueError(f"Sample IDs in expression files and participant lookup table must match ({missing_ids.sum()} sample IDs missing from {os.path.basename(args.sample_to_participant)}).") 85 | 86 | if args.convert_tpm: 87 | print(' * Converting to TPM', flush=True) 88 | tpm_df = tpm_df / tpm_df.sum(0) * 1e6 89 | 90 | print(f'Normalizing data ({args.normalization_method})', flush=True) 91 | norm_df = prepare_expression(counts_df, tpm_df, 92 | sample_frac_threshold=args.sample_frac_threshold, 93 | count_threshold=args.count_threshold, 94 | tpm_threshold=args.tpm_threshold, 95 | mode=args.normalization_method) 96 | print(f' * {counts_df.shape[0]} genes in input tables', flush=True) 97 | print(f' * {norm_df.shape[0]} genes remain after thresholding', flush=True) 98 | 99 | # change sample IDs to participant IDs 100 | norm_df.rename(columns=sample_to_participant_s, inplace=True) 101 | 102 | if args.chrs is not None: 103 | with open(args.chrs) as f: 104 | chrs = f.read().strip().split('\n') 
105 | else: 106 | chrs = [f'chr{i}' for i in range(1,23)] + ['chrX'] 107 | 108 | # prepare BED 109 | bed_template_df = qtl.io.gtf_to_tss_bed(args.annotation_gtf, feature='transcript') 110 | bed_template_df = bed_template_df[bed_template_df['chr'].isin(chrs)] 111 | bed_df = pd.merge(bed_template_df, norm_df, left_index=True, right_index=True) 112 | qtl.io.sort_bed(bed_df, inplace=True) 113 | print(f' * {bed_df.shape[0]} genes remain after selecting chromosomes', flush=True) 114 | 115 | print('Writing BED file', flush=True) 116 | if not args.parquet: 117 | qtl.io.write_bed(bed_df, os.path.join(args.output_dir, f'{args.prefix}.expression.bed.gz')) 118 | else: 119 | bed_df.to_parquet(os.path.join(args.output_dir, f'{args.prefix}.expression.bed.parquet')) 120 | -------------------------------------------------------------------------------- /qtl/src/metasoft_postprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | import os 6 | import gzip 7 | 8 | 9 | def load_pair_data(path): 10 | if path.endswith('.txt.gz'): 11 | return pd.read_csv(path, sep='\t', usecols=['pair_id', 'pval_nominal'], index_col=0, dtype={'pair_id':str, 'pval_nominal':np.float64}) 12 | elif path.endswith('.parquet'): 13 | return pd.read_parquet(path, columns=['pval_nominal']) 14 | else: 15 | raise ValueError('Input format not recognized.') 16 | 17 | 18 | parser = argparse.ArgumentParser(description='METASOFT post-processing.') 19 | parser.add_argument('metasoft_output_chunks', help="List of metasoft outputs.") 20 | parser.add_argument('metasoft_pairs', help="List of metasoft input pair files (containing nominal p-values).") 21 | parser.add_argument('prefix', help='Prefix for output file: .metasoft_input.[chunk000.]txt.gz') 22 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 23 | args = parser.parse_args() 24 | 25 | with open(args.metasoft_output_chunks) as f: 26 | chunk_paths = f.read().strip().split('\n') 27 | 28 | with open(args.metasoft_pairs) as f: 29 | pair_paths = f.read().strip().split('\n') 30 | 31 | # parse sample IDs from pairs 32 | sample_ids = np.array([os.path.split(i)[1].split('.')[0] for i in pair_paths]) 33 | assert len(sample_ids)==len(np.unique(sample_ids)) 34 | # sort by sample ID 35 | i = np.argsort(sample_ids) 36 | sample_ids = sample_ids[i] 37 | pair_paths = np.array(pair_paths)[i] 38 | 39 | # prepare header 40 | header = ['RSID', '#STUDY', 'PVALUE_FE', 'BETA_FE', 'STD_FE', 'PVALUE_RE', 'BETA_RE', 'STD_RE', 41 | 'PVALUE_RE2', 'STAT1_RE2', 'STAT2_RE2', 'PVALUE_BE', 'I_SQUARE', 'Q', 'PVALUE_Q', 'TAU_SQUARE'] 42 | header += ['pval_'+i for i in sample_ids] + ['mval_'+i for i in sample_ids] 43 | 44 | # concatenate chunks 45 | print('Loading chunks') 46 | output_df = [] 47 | for i,c in enumerate(chunk_paths): 48 | print(' * loading chunk {}/{}'.format(i+1, len(chunk_paths)), flush=True) 49 | output_df.append( 50 | pd.read_csv(c, sep='\t', header=None, skiprows=1, names=header, usecols=header, index_col=0) 51 | ) 52 | print('Concatenating chunks') 53 | output_df = pd.concat(output_df, axis=0) 54 | 55 | # sort chunks by input order 56 | pair_df = load_pair_data(pair_paths[0]) 57 | if not np.all(output_df.index==pair_df.index): 58 | print('Sorting output') 59 | output_df = output_df.loc[pair_df.index] 60 | 61 | print('Substituting p-values') 62 | output_df['pval_'+sample_ids[0]] = pair_df['pval_nominal'] 63 | for i,p in zip(sample_ids[1:], 
pair_paths[1:]): 64 | pair_df = load_pair_data(p) 65 | output_df['pval_'+i] = pair_df['pval_nominal'] 66 | 67 | print('Writing output') 68 | with gzip.open(os.path.join(args.output_dir, args.prefix+'.metasoft.txt.gz'), 'wt', compresslevel=6) as f: 69 | output_df.to_csv(f, sep='\t', float_format='%.6g', na_rep='NA') 70 | -------------------------------------------------------------------------------- /qtl/src/metasoft_prepare_input.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | import subprocess 6 | import os 7 | import gzip 8 | 9 | 10 | def load_pair_data(path): 11 | if path.endswith('.txt.gz'): 12 | return pd.read_csv(path, sep='\t', usecols=['pair_id', 'slope', 'slope_se'], index_col=0, dtype={'pair_id':str, 'slope':np.float32, 'slope_se':np.float32}) 13 | elif path.endswith('.parquet'): 14 | return pd.read_parquet(path, columns=['slope', 'slope_se']) 15 | else: 16 | raise ValueError('Input format not recognized.') 17 | 18 | 19 | parser = argparse.ArgumentParser(description='Prepare METASOFT input.') 20 | parser.add_argument('variant_gene_pair_files', help="List of variant-gene pair association result. Header must specify 'slope' and 'slope_se' columns.") 21 | parser.add_argument('prefix', help='Prefix for output file: .metasoft_input.[chunk000.]txt.gz') 22 | parser.add_argument('--chunks', default=None, type=int, help='') 23 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 24 | parser.add_argument('--write_full', action='store_true', help='Write full input table') 25 | args = parser.parse_args() 26 | 27 | with open(args.variant_gene_pair_files) as f: 28 | paths = f.read().strip().split('\n') 29 | 30 | sample_ids = np.array([os.path.split(i)[1].split('.')[0] for i in paths]) 31 | assert len(sample_ids)==len(np.unique(sample_ids)) 32 | # sort by sample ID 33 | i = np.argsort(sample_ids) 34 | sample_ids = sample_ids[i] 35 | paths = np.array(paths)[i] 36 | 37 | print('Reading input files') 38 | df = load_pair_data(paths[0]) 39 | 40 | # input format: pair_id, tissue1_slope, tissue1_slope_se, tissue2_slope, tissue2_slope_s2, ... 
41 | metasoft_df = pd.DataFrame(0, index=df.index, columns=[j for i in sample_ids for j in [i+'_slope', i+'_slope_se']], dtype=np.float32) 42 | metasoft_df[sample_ids[0]+'_slope'] = df['slope'] 43 | metasoft_df[sample_ids[0]+'_slope_se'] = df['slope_se'] 44 | for k,(i,p) in enumerate(zip(sample_ids[1:], paths[1:])): 45 | print(' * processing {}/{}'.format(k+2, len(paths)), flush=True) 46 | df = load_pair_data(p) 47 | metasoft_df[i+'_slope'] = df['slope'] 48 | metasoft_df[i+'_slope_se'] = df['slope_se'] 49 | print() 50 | 51 | print('Writing Metasoft input') 52 | # split into chunks for parallelization 53 | if args.chunks is not None: 54 | chunk_size = int(np.ceil(metasoft_df.shape[0] / args.chunks)) 55 | for i in np.arange(args.chunks): 56 | print(' * writing chunk {}/{}'.format(i+1, args.chunks), flush=True) 57 | with gzip.open(os.path.join(args.output_dir, args.prefix+'.metasoft_input.chunk{:03d}.txt.gz'.format(i)), 'wt', compresslevel=1) as f: 58 | metasoft_df.iloc[i*chunk_size:(i+1)*chunk_size].to_csv(f, sep='\t', float_format='%.6g', na_rep='NA') 59 | print() 60 | 61 | if args.write_full: 62 | print('Writing full table') 63 | with gzip.open(os.path.join(args.output_dir, args.prefix+'.metasoft_input.txt.gz'), 'wt', compresslevel=1) as f: 64 | metasoft_df.to_csv(f, sep='\t', float_format='%.6g', na_rep='NA') 65 | -------------------------------------------------------------------------------- /qtl/src/run_GATK_ASEReadCounter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | 4 | import argparse 5 | import os 6 | import subprocess 7 | from datetime import datetime 8 | 9 | parser = argparse.ArgumentParser(description='Run GATK ASEReadCounter') 10 | parser.add_argument('gatk_jar', help='GATK4 .jar file') 11 | parser.add_argument('genome_fasta', help='FASTA reference genome') 12 | parser.add_argument('het_vcf', help='VCF with het sites (biallelic only)') 13 | parser.add_argument('bam_file', help='RNA-seq BAM file filtered for biased reads, e.g., using STAR/WASP (grep -v "vW:i:[2-7]")') 14 | parser.add_argument('prefix', help='Prefix for output file names') 15 | parser.add_argument('--min_depth', default=1, type=int, help='Minimum depth after filters') 16 | parser.add_argument('--min_mapping_quality', default=255, type=int, help='Minimum read mapping quality (255 for unique mapping reads from STAR).') 17 | parser.add_argument('--min_base_quality', default=10, type=int, help='Minimum base quality') 18 | parser.add_argument('--disable_drf', action='store_true', help='Disable DuplicateRead filter') 19 | args = parser.parse_args() 20 | 21 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running GATK ASEReadCounter', flush=True) 22 | 23 | cmd = f"java -jar {args.gatk_jar} \ 24 | ASEReadCounter \ 25 | -R {args.genome_fasta} \ 26 | -I {args.bam_file} \ 27 | -V {args.het_vcf} \ 28 | -O {args.prefix}.readcounts.txt \ 29 | -min-depth {args.min_depth} \ 30 | --min-mapping-quality {args.min_mapping_quality} \ 31 | --min-base-quality {args.min_base_quality}" 32 | 33 | if args.disable_drf: 34 | cmd += ' -DF NotDuplicateReadFilter' 35 | 36 | subprocess.check_call(cmd, shell=True, executable='/bin/bash') 37 | subprocess.check_call(f'gzip {args.prefix}.readcounts.txt', shell=True) 38 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Done', flush=True) 39 | -------------------------------------------------------------------------------- /qtl/src/run_PEER.R: 
-------------------------------------------------------------------------------- 1 | # Author: Francois Aguet 2 | 3 | library(peer, quietly=TRUE) # https://github.com/PMBio/peer 4 | library(argparser, quietly=TRUE) 5 | 6 | WriteTable <- function(data, filename, index.name) { 7 | datafile <- file(filename, open = "wt") 8 | on.exit(close(datafile)) 9 | header <- c(index.name, colnames(data)) 10 | writeLines(paste0(header, collapse="\t"), con=datafile, sep="\n") 11 | write.table(data, datafile, sep="\t", col.names=FALSE, quote=FALSE) 12 | } 13 | 14 | p <- arg_parser("Run PEER factor estimation") 15 | p <- add_argument(p, "expr.file", help="") 16 | p <- add_argument(p, "prefix", help="") 17 | p <- add_argument(p, "n", help="Number of hidden confounders to estimate") 18 | p <- add_argument(p, "--covariates", help="Observed covariates") 19 | p <- add_argument(p, "--alphaprior_a", help="", default=0.001) 20 | p <- add_argument(p, "--alphaprior_b", help="", default=0.01) 21 | p <- add_argument(p, "--epsprior_a", help="", default=0.1) 22 | p <- add_argument(p, "--epsprior_b", help="", default=10) 23 | p <- add_argument(p, "--max_iter", help="", default=1000) 24 | p <- add_argument(p, "--output_dir", short="-o", help="Output directory", default=".") 25 | argv <- parse_args(p) 26 | 27 | cat("PEER: loading expression data ... ") 28 | if (grepl('.gz$', argv$expr.file)) { 29 | nrows <- as.integer(system(paste0("zcat ", argv$expr.file, " | wc -l | cut -d' ' -f1 "), intern=TRUE, wait=TRUE)) 30 | } else { 31 | nrows <- as.integer(system(paste0("wc -l ", argv$expr.file, " | cut -d' ' -f1 "), intern=TRUE, wait=TRUE)) 32 | } 33 | if (grepl('.bed$', argv$expr.file) || grepl('.bed.gz$', argv$expr.file)) { 34 | df <- read.table(argv$expr.file, sep="\t", nrows=nrows, header=TRUE, check.names=FALSE, comment.char="") 35 | row.names(df) <- df[, 4] 36 | df <- df[, 5:ncol(df)] 37 | } else { 38 | df <- read.table(argv$expr.file, sep="\t", nrows=nrows, header=TRUE, check.names=FALSE, comment.char="", row.names=1) 39 | } 40 | M <- t(as.matrix(df)) 41 | cat("done.\n") 42 | 43 | # run PEER 44 | cat(paste0("PEER: estimating hidden confounders (", argv$n, ")\n")) 45 | model <- PEER() 46 | invisible(PEER_setNk(model, argv$n)) 47 | invisible(PEER_setPhenoMean(model, M)) 48 | invisible(PEER_setPriorAlpha(model, argv$alphaprior_a, argv$alphaprior_b)) 49 | invisible(PEER_setPriorEps(model, argv$epsprior_a, argv$epsprior_b)) 50 | invisible(PEER_setNmax_iterations(model, argv$max_iter)) 51 | if (!is.null(argv$covariates) && !is.na(argv$covariates)) { 52 | has.cov <- TRUE 53 | covar.df <- read.table(argv$covariates, sep="\t", header=TRUE, row.names=1, as.is=TRUE) 54 | covar.df[] <- sapply(covar.df, as.numeric) 55 | cat(paste0(" * including ", dim(covar.df)[2], " covariates", "\n")) 56 | invisible(PEER_setCovariates(model, as.matrix(covar.df[rownames(M), ]))) # samples x covariates 57 | } else { 58 | has.cov <- FALSE 59 | } 60 | time <- system.time(PEER_update(model)) 61 | 62 | X <- PEER_getX(model) # samples x PEER factors 63 | A <- PEER_getAlpha(model) # PEER factors x 1 64 | R <- t(PEER_getResiduals(model)) # genes x samples 65 | 66 | # add relevant row/column names 67 | if (has.cov) { 68 | cols <- c(colnames(covar.df), paste0("InferredCov",1:(ncol(X)-dim(covar.df)[2]))) 69 | } else { 70 | cols <- paste0("InferredCov",1:ncol(X)) 71 | } 72 | rownames(X) <- rownames(M) 73 | colnames(X) <- cols 74 | rownames(A) <- cols 75 | colnames(A) <- "Alpha" 76 | A <- as.data.frame(A) 77 | A$Relevance <- 1.0 / A$Alpha 78 | rownames(R) <- 
colnames(M) 79 | colnames(R) <- rownames(M) 80 | 81 | # write results 82 | cat("PEER: writing results ... ") 83 | WriteTable(t(X), file.path(argv$output_dir, paste0(argv$prefix, ".PEER_covariates.txt")), "ID") # format(X, digits=6) 84 | WriteTable(A, file.path(argv$output_dir, paste0(argv$prefix, ".PEER_alpha.txt")), "ID") 85 | WriteTable(R, file.path(argv$output_dir, paste0(argv$prefix, ".PEER_residuals.txt")), "ID") 86 | cat("done.\n") 87 | -------------------------------------------------------------------------------- /qtl/src/run_metasoft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import subprocess 4 | import os 5 | import tempfile 6 | 7 | parser = argparse.ArgumentParser(description='Run METASOFT.') 8 | parser.add_argument('metasoft_jar', help='metasoft.jar') 9 | parser.add_argument('metasoft_input', help='METASOFT input file, e.g., generated by metasoft_prepare_input.py') 10 | parser.add_argument('prefix', help='Prefix for output file: .metasoft.txt.gz') 11 | parser.add_argument('--pvalue_table', default=None, help='HanEskinPvalueTable.txt if not in same directory as metasoft.jar') 12 | parser.add_argument('--seed', default=100, type=int, help='Random seed, passed to METASOFT') 13 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 14 | args = parser.parse_args() 15 | 16 | if args.pvalue_table is not None: 17 | pvalue_table = args.pvalue_table 18 | else: 19 | pvalue_table = os.path.join(os.path.split(args.metasoft_jar)[0], 'HanEskinPvalueTable.txt') 20 | assert os.path.exists(pvalue_table) 21 | 22 | if args.metasoft_input.endswith('.gz'): 23 | input_file = os.path.splitext(os.path.split(args.metasoft_input)[1])[0] 24 | subprocess.check_call('gunzip -c {} > {}'.format(args.metasoft_input, input_file), shell=True, executable='/bin/bash') 25 | else: 26 | input_file = args.metasoft_input 27 | 28 | # strip header if present 29 | with open(input_file) as f: 30 | header = f.readline().strip().split('\t') 31 | if header[0]=='pair_id': 32 | subprocess.check_call('tail -n+2 {0} > tmp.txt && mv tmp.txt {0}'.format(input_file), shell=True, executable='/bin/bash') 33 | 34 | cmd = 'java -jar '+args.metasoft_jar\ 35 | +' -input '+input_file\ 36 | +' -pvalue_table '+pvalue_table\ 37 | +' -output '+os.path.join(args.output_dir, args.prefix+'.metasoft.txt')\ 38 | +' -mvalue_p_thres 1.0'\ 39 | +' -mvalue_method mcmc'\ 40 | +' -seed '+str(args.seed)\ 41 | +' -log '+os.path.join(args.output_dir, args.prefix+'.metasoft.log')\ 42 | +' -mvalue' 43 | subprocess.check_call(cmd, shell=True, executable='/bin/bash') 44 | subprocess.check_call('gzip -1 '+os.path.join(args.output_dir, args.prefix+'.metasoft.txt'), shell=True, executable='/bin/bash') 45 | -------------------------------------------------------------------------------- /qtl/tensorqtl_cis_independent.wdl: -------------------------------------------------------------------------------- 1 | task tensorqtl_cis_independent { 2 | 3 | File plink_pgen 4 | File plink_pvar 5 | File plink_psam 6 | 7 | File phenotype_bed 8 | File covariates 9 | String prefix 10 | File cis_output 11 | File? phenotype_groups 12 | Int? seed 13 | 14 | Int memory 15 | Int disk_space 16 | Int num_threads 17 | Int num_preempt 18 | 19 | command { 20 | set -euo pipefail 21 | plink_base=$(echo "${plink_pgen}" | rev | cut -f 2- -d '.'
| rev) 22 | python3 -m tensorqtl \ 23 | $plink_base ${phenotype_bed} ${prefix} \ 24 | --mode cis_independent \ 25 | --covariates ${covariates} \ 26 | --cis_output ${cis_output} \ 27 | ${"--seed " + seed} \ 28 | ${"--phenotype_groups " + phenotype_groups} 29 | } 30 | 31 | runtime { 32 | docker: "gcr.io/broad-cga-francois-gtex/tensorqtl:latest" 33 | memory: "${memory}GB" 34 | disks: "local-disk ${disk_space} HDD" 35 | bootDiskSizeGb: 25 36 | cpu: "${num_threads}" 37 | preemptible: "${num_preempt}" 38 | gpuType: "nvidia-tesla-p100" 39 | gpuCount: 1 40 | zones: ["us-central1-c"] 41 | } 42 | 43 | output { 44 | File cis_independent_qtl="${prefix}.cis_independent_qtl.txt.gz" 45 | File log="${prefix}.tensorQTL.cis_independent.log" 46 | } 47 | 48 | meta { 49 | author: "Francois Aguet" 50 | } 51 | } 52 | 53 | workflow tensorqtl_cis_independent_workflow { 54 | call tensorqtl_cis_independent 55 | } 56 | -------------------------------------------------------------------------------- /qtl/tensorqtl_cis_nominal.wdl: -------------------------------------------------------------------------------- 1 | task tensorqtl_cis_nominal { 2 | 3 | File plink_pgen 4 | File plink_pvar 5 | File plink_psam 6 | 7 | File phenotype_bed 8 | File covariates 9 | String prefix 10 | 11 | File? interaction 12 | File? phenotype_groups 13 | 14 | Int memory 15 | Int disk_space 16 | Int num_threads 17 | Int num_preempt 18 | 19 | command { 20 | set -euo pipefail 21 | plink_base=$(echo "${plink_pgen}" | rev | cut -f 2- -d '.' | rev) 22 | python3 -m tensorqtl \ 23 | $plink_base ${phenotype_bed} ${prefix} \ 24 | --mode cis_nominal \ 25 | --covariates ${covariates} \ 26 | ${"--interaction " + interaction} \ 27 | ${"--phenotype_groups " + phenotype_groups} 28 | } 29 | 30 | runtime { 31 | docker: "gcr.io/broad-cga-francois-gtex/tensorqtl:latest" 32 | memory: "${memory}GB" 33 | disks: "local-disk ${disk_space} HDD" 34 | bootDiskSizeGb: 25 35 | cpu: "${num_threads}" 36 | preemptible: "${num_preempt}" 37 | gpuType: "nvidia-tesla-p100" 38 | gpuCount: 1 39 | zones: ["us-central1-c"] 40 | } 41 | 42 | output { 43 | Array[File] chr_parquet=glob("${prefix}*.parquet") 44 | File log=glob("${prefix}*.log")[0] 45 | } 46 | 47 | meta { 48 | author: "Francois Aguet" 49 | } 50 | } 51 | 52 | workflow tensorqtl_cis_nominal_workflow { 53 | call tensorqtl_cis_nominal 54 | } 55 | -------------------------------------------------------------------------------- /qtl/tensorqtl_cis_permutations.wdl: -------------------------------------------------------------------------------- 1 | task tensorqtl_cis_permutations { 2 | 3 | File plink_pgen 4 | File plink_pvar 5 | File plink_psam 6 | 7 | File phenotype_bed 8 | File covariates 9 | String prefix 10 | 11 | File? phenotype_groups 12 | Float? fdr 13 | Float? qvalue_lambda 14 | Int? seed 15 | String? flags 16 | 17 | Int memory 18 | Int disk_space 19 | Int num_threads 20 | Int num_preempt 21 | 22 | command { 23 | set -euo pipefail 24 | plink_base=$(echo "${plink_pgen}" | rev | cut -f 2- -d '.' 
| rev) 25 | python3 -m tensorqtl \ 26 | $plink_base ${phenotype_bed} ${prefix} \ 27 | --mode cis \ 28 | --covariates ${covariates} \ 29 | ${"--phenotype_groups " + phenotype_groups} \ 30 | ${"--fdr " + fdr} \ 31 | ${"--qvalue_lambda " + qvalue_lambda} \ 32 | ${"--seed " + seed} \ 33 | ${flags} 34 | } 35 | 36 | runtime { 37 | docker: "gcr.io/broad-cga-francois-gtex/tensorqtl:latest" 38 | memory: "${memory}GB" 39 | disks: "local-disk ${disk_space} HDD" 40 | bootDiskSizeGb: 25 41 | cpu: "${num_threads}" 42 | preemptible: "${num_preempt}" 43 | gpuType: "nvidia-tesla-p100" 44 | gpuCount: 1 45 | zones: ["us-central1-c"] 46 | } 47 | 48 | output { 49 | File cis_qtl="${prefix}.cis_qtl.txt.gz" 50 | File log="${prefix}.tensorQTL.cis.log" 51 | } 52 | 53 | meta { 54 | author: "Francois Aguet" 55 | } 56 | } 57 | 58 | workflow tensorqtl_cis_permutations_workflow { 59 | call tensorqtl_cis_permutations 60 | } 61 | -------------------------------------------------------------------------------- /qtl/tensorqtl_cis_susie.wdl: -------------------------------------------------------------------------------- 1 | task tensorqtl_cis_susie { 2 | 3 | File plink_pgen 4 | File plink_pvar 5 | File plink_psam 6 | 7 | File phenotype_bed 8 | File covariates 9 | String prefix 10 | File cis_output 11 | 12 | Float? fdr 13 | Int? max_effects 14 | 15 | Int memory 16 | Int disk_space 17 | Int num_threads 18 | Int num_preempt 19 | 20 | command { 21 | set -euo pipefail 22 | plink_base=$(echo "${plink_pgen}" | rev | cut -f 2- -d '.' | rev) 23 | python3 -m tensorqtl \ 24 | $plink_base ${phenotype_bed} ${prefix} \ 25 | --mode cis_susie \ 26 | --covariates ${covariates} \ 27 | --cis_output ${cis_output} \ 28 | ${"--fdr " + fdr} \ 29 | ${"--max_effects " + max_effects} 30 | } 31 | 32 | runtime { 33 | docker: "gcr.io/broad-cga-francois-gtex/tensorqtl:latest" 34 | memory: "${memory}GB" 35 | disks: "local-disk ${disk_space} HDD" 36 | bootDiskSizeGb: 25 37 | cpu: "${num_threads}" 38 | preemptible: "${num_preempt}" 39 | gpuType: "nvidia-tesla-p100" 40 | gpuCount: 1 41 | zones: ["us-central1-c"] 42 | } 43 | 44 | output { 45 | File susie_summary="${prefix}.SuSiE_summary.parquet" 46 | File susie_pickle="${prefix}.SuSiE.pickle" 47 | File log="${prefix}.tensorQTL.cis_susie.log" 48 | } 49 | 50 | meta { 51 | author: "Francois Aguet" 52 | } 53 | } 54 | 55 | workflow tensorqtl_cis_susie_workflow { 56 | call tensorqtl_cis_susie 57 | } 58 | -------------------------------------------------------------------------------- /qtl/torus/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for Torus (https://github.com/xqwen/torus) 2 | FROM ubuntu:16.04 3 | MAINTAINER Francois Aguet 4 | 5 | RUN apt-get -qq update && apt-get install -qqy \ 6 | build-essential \ 7 | curl \ 8 | git-all \ 9 | lbzip2 \ 10 | libboost-all-dev \ 11 | libcurl3-dev \ 12 | libgsl-dev \ 13 | python3 \ 14 | python3-pip \ 15 | unzip \ 16 | wget \ 17 | zlib1g-dev \ 18 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 19 | apt-get clean && \ 20 | apt-get autoremove -y && \ 21 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 22 | 23 | 24 | # Torus 25 | RUN cd /opt && \ 26 | git clone https://github.com/xqwen/torus.git && \ 27 | cd torus/src && make && mkdir ../bin && cp torus ../bin/ && make clean 28 | ENV PATH /opt/torus/bin:$PATH 29 | 30 | # copy scripts 31 | COPY src src/ 32 | -------------------------------------------------------------------------------- /qtl/torus/src/run_torus.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | from datetime import datetime 5 | 6 | parser = argparse.ArgumentParser(description='Run TORUS') 7 | parser.add_argument('qtl_file', help='') 8 | parser.add_argument('annotation_file', help='') 9 | parser.add_argument('prefix', help='Prefix for output file names') 10 | parser.add_argument('-o', '--output_dir', default='./', help='Output directory') 11 | args = parser.parse_args() 12 | 13 | 14 | def run_torus(qtl_file, annotation_file, prefix, output_dir='.'): 15 | out_file = os.path.join(output_dir, prefix+'.torus_enrichment.txt') 16 | log_file = os.path.join(output_dir, prefix+'.torus_enrichment.log') 17 | 18 | s = '['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running Torus' 19 | print(s, flush=True) 20 | with open(log_file, 'w') as f: 21 | f.write(s+'\n') 22 | f.flush() 23 | 24 | cmd = 'torus \ 25 | -d {} \ 26 | --fastqtl \ 27 | -est \ 28 | -annot {} \ 29 | 1> {} \ 30 | 2> >(tee -a {} >&2)'.format(qtl_file, annotation_file, out_file, log_file) 31 | subprocess.check_call(cmd, shell=True, executable='/bin/bash') 32 | 33 | s = '['+datetime.now().strftime("%b %d %H:%M:%S")+'] Done' 34 | print(s, flush=True) 35 | with open(log_file, 'a') as f: 36 | f.write(s+'\n') 37 | f.flush() 38 | 39 | 40 | if __name__=='__main__': 41 | run_torus(args.qtl_file, args.annotation_file, args.prefix, output_dir=args.output_dir) 42 | -------------------------------------------------------------------------------- /qtl/torus/torus.wdl: -------------------------------------------------------------------------------- 1 | task torus { 2 | 3 | File qtl_file 4 | File annotation_file 5 | String prefix 6 | 7 | Int memory 8 | Int disk_space 9 | Int num_threads 10 | Int num_preempt 11 | 12 | command { 13 | set -euo pipefail 14 | python3 /src/run_torus.py ${qtl_file} ${annotation_file} ${prefix} 15 | } 16 | 17 | runtime { 18 | docker: "gcr.io/broad-cga-francois-gtex/torus:latest" 19 | memory: "${memory}GB" 20 | disks: "local-disk ${disk_space} HDD" 21 | cpu: "${num_threads}" 22 | preemptible: "${num_preempt}" 23 | } 24 | 25 | output { 26 | File torus_output="${prefix}.torus_enrichment.txt" 27 | File torus_log="${prefix}.torus_enrichment.log" 28 | } 29 | 30 | meta { 31 | author: "Francois Aguet" 32 | } 33 | } 34 | 35 | 36 | workflow torus_workflow { 37 | call torus 38 | } 39 | -------------------------------------------------------------------------------- /rnaseq/Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for GTEx RNA-seq pipeline 2 | FROM ubuntu:22.04 3 | MAINTAINER Francois Aguet 4 | 5 | RUN apt-get update && apt-get install -y software-properties-common && \ 6 | apt-get update && apt-get install -y \ 7 | build-essential \ 8 | cmake \ 9 | curl \ 10 | git \ 11 | libboost-all-dev \ 12 | libbz2-dev \ 13 | libcurl3-dev \ 14 | libhdf5-serial-dev \ 15 | liblzma-dev \ 16 | libncurses5-dev \ 17 | libssl-dev \ 18 | openjdk-17-jdk \ 19 | python3 \ 20 | python3-pip \ 21 | unzip \ 22 | vim-common \ 23 | wget \ 24 | zlib1g-dev \ 25 | && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ 26 | apt-get clean && \ 27 | apt-get autoremove -y && \ 28 | rm -rf /var/lib/{apt,dpkg,cache,log}/ 29 | 30 | 31 | #----------------------------- 32 | # Pipeline components 33 | #----------------------------- 34 | 35 | # htslib 36 | RUN cd /opt && \ 37 | wget --no-check-certificate 
https://github.com/samtools/htslib/releases/download/1.19.1/htslib-1.19.1.tar.bz2 && \ 38 | tar -xf htslib-1.19.1.tar.bz2 && rm htslib-1.19.1.tar.bz2 && cd htslib-1.19.1 && \ 39 | ./configure --enable-libcurl --enable-s3 --enable-plugins --enable-gcs && \ 40 | make && make install && make clean 41 | 42 | # samtools 43 | RUN cd /opt && \ 44 | wget --no-check-certificate https://github.com/samtools/samtools/releases/download/1.19.2/samtools-1.19.2.tar.bz2 && \ 45 | tar -xf samtools-1.19.2.tar.bz2 && rm samtools-1.19.2.tar.bz2 && cd samtools-1.19.2 && \ 46 | ./configure --with-htslib=/opt/htslib-1.19.1 && make && make install && make clean 47 | 48 | # STAR v2.7.11b 49 | RUN cd /opt && \ 50 | wget --no-check-certificate https://github.com/alexdobin/STAR/archive/2.7.11b.tar.gz && \ 51 | tar -xf 2.7.11b.tar.gz && rm 2.7.11b.tar.gz 52 | ENV PATH /opt/STAR-2.7.11b/bin/Linux_x86_64_static:$PATH 53 | 54 | # RSEM v1.3.3 55 | RUN cd /opt && \ 56 | wget --no-check-certificate https://github.com/deweylab/RSEM/archive/v1.3.3.tar.gz && \ 57 | tar -xvf v1.3.3.tar.gz && rm v1.3.3.tar.gz && cd RSEM-1.3.3 && make 58 | ENV PATH /opt/RSEM-1.3.3:$PATH 59 | 60 | # bamtools 61 | RUN cd /opt && \ 62 | wget --no-check-certificate https://github.com/pezmaster31/bamtools/archive/v2.4.1.tar.gz && \ 63 | tar -xf v2.4.1.tar.gz && rm v2.4.1.tar.gz && cd bamtools-2.4.1 && mkdir build && cd build && cmake .. && make && make install && make clean 64 | ENV LD_LIBRARY_PATH /usr/local/lib/bamtools:$LD_LIBRARY_PATH 65 | 66 | # bamsync 67 | COPY bamsync /opt/bamsync 68 | RUN cd /opt/bamsync && make 69 | ENV PATH /opt/bamsync:$PATH 70 | 71 | # Picard tools 72 | RUN mkdir /opt/picard-tools && \ 73 | wget --no-check-certificate -P /opt/picard-tools/ https://github.com/broadinstitute/picard/releases/download/2.27.1/picard.jar 74 | 75 | # kallisto 76 | RUN cd /opt && \ 77 | wget https://github.com/pachterlab/kallisto/releases/download/v0.50.1/kallisto_linux-v0.50.1.tar.gz && \ 78 | tar -xf kallisto_linux-v0.50.1.tar.gz && rm kallisto_linux-v0.50.1.tar.gz 79 | ENV PATH $PATH:/opt/kallisto_linux-v0.50.1 80 | 81 | # bedtools 82 | RUN cd /opt && \ 83 | wget --no-check-certificate https://github.com/arq5x/bedtools2/releases/download/v2.31.1/bedtools-2.31.1.tar.gz && \ 84 | tar -xf bedtools-2.31.1.tar.gz && rm bedtools-2.31.1.tar.gz && \ 85 | cd bedtools2 && make && make install && make clean 86 | 87 | # UCSC tools 88 | RUN mkdir /opt/ucsc && \ 89 | wget --no-check-certificate -P /opt/ucsc/ http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigWigToBedGraph && \ 90 | wget --no-check-certificate -P /opt/ucsc/ http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig && \ 91 | chmod 755 /opt/ucsc/* 92 | ENV PATH /opt/ucsc:$PATH 93 | 94 | # python modules 95 | RUN pip3 install --upgrade pip setuptools 96 | RUN pip3 install tables numpy pandas scipy pyarrow matplotlib seaborn 97 | # packages that require numpy: 98 | RUN pip3 install pyBigWig 99 | 100 | # FastQC 101 | RUN cd /opt && \ 102 | wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip && \ 103 | unzip fastqc_v0.11.9.zip && mv FastQC FastQC-0.11.9 && cd FastQC-0.11.9 && chmod 775 fastqc 104 | ENV PATH /opt/FastQC-0.11.9:$PATH 105 | 106 | # RNA-SeQC 107 | RUN mkdir /opt/rnaseqc && cd /opt/rnaseqc && \ 108 | wget https://github.com/getzlab/rnaseqc/releases/download/v2.4.2/rnaseqc.v2.4.2.linux.gz && \ 109 | gunzip rnaseqc.v2.4.2.linux.gz && mv rnaseqc.v2.4.2.linux rnaseqc && chmod 775 rnaseqc 110 | RUN pip3 install rnaseqc 111 | ENV PATH /opt/rnaseqc:$PATH
112 | 113 | # gcloud 114 | RUN apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg sudo 115 | RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ 116 | | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 117 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \ 118 | apt-get update -y && apt-get install google-cloud-sdk -y 119 | 120 | # scripts 121 | COPY src src/ 122 | -------------------------------------------------------------------------------- /rnaseq/README.md: -------------------------------------------------------------------------------- 1 | # RNA-seq pipeline for the [GTEx Consortium](https://www.gtexportal.org) 2 | 3 | This repository contains all components of the RNA-seq pipeline used by the GTEx Consortium, including alignment, expression quantification, and quality control. 4 | 5 | ## Docker image 6 | The GTEx RNA-seq pipeline is provided as a Docker image, available at https://hub.docker.com/r/broadinstitute/gtex_rnaseq/ 7 | 8 | To download the image, run: 9 | ```bash 10 | docker pull broadinstitute/gtex_rnaseq:V10 11 | ``` 12 | 13 | #### Image contents and pipeline components 14 | The following tools are included in the Docker image: 15 | 16 | * [SamToFastq](http://broadinstitute.github.io/picard/command-line-overview.html#SamToFastq): BAM to FASTQ conversion 17 | * [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/): sequencing quality control 18 | * [STAR](https://github.com/alexdobin/STAR): spliced alignment of RNA-seq reads 19 | * [Picard MarkDuplicates](https://broadinstitute.github.io/picard/command-line-overview.html#MarkDuplicates): mark duplicate reads 20 | * [RSEM](http://deweylab.github.io/RSEM): transcript expression quantification 21 | * [bamsync](bamsync): utility for transferring QC flags and re-generating read group IDs when realigning BAMs 22 | * [RNA-SeQC](https://github.com/getzlab/rnaseqc): RNA-seq quality control (metrics and gene-level expression quantification) 23 | 24 | Versions used across GTEx releases*: 25 | | | V7 | V8 | V10 | 26 | | ------- | ------- | ------- | -------- | 27 | | STAR | v2.4.2a | v2.5.3a | v2.7.10a | 28 | | RSEM | v1.2.22 | v1.3.0 | v1.3.3 | 29 | | RNA-SeQC| v1.1.8 | v1.1.9 | v2.4.2 | 30 | | Genome | GRCh37 | GRCh38 | GRCh38 | 31 | | GENCODE | [v19](https://www.gencodegenes.org/human/release_19.html) | [v26](https://www.gencodegenes.org/human/release_26.html) | [v39](https://www.gencodegenes.org/human/release_39.html) | 32 | 33 | *V9 did not include any RNA-seq updates 34 | 35 | ## Setup steps 36 | #### Reference genome and annotation 37 | Reference indexes for STAR and RSEM are needed to run the pipeline. All reference files are available at [gs://gtex-resources](https://console.cloud.google.com/storage/browser/gtex-resources). 38 | 39 | GTEx releases from V8 onward are based on the GRCh38/hg38 reference genome. Please see [TOPMed_RNAseq_pipeline.md](https://github.com/broadinstitute/gtex-pipeline/blob/master/TOPMed_RNAseq_pipeline.md) for details and links for this reference. Releases up to V7 were based on the GRCh37/hg19 reference genome ([download](http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta)).
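As a sketch of the download step (the object paths under the bucket vary, so list the bucket first to confirm the actual file names), the references can be copied locally with `gsutil` from the Google Cloud SDK:
```bash
# list the available reference files (requires the Google Cloud SDK)
gsutil ls gs://gtex-resources

# copy files locally (illustrative destination; adjust to your layout)
gsutil -m cp -r gs://gtex-resources/* $path_to_references/
```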
40 | 41 | For hg19-based analyses, the GENCODE annotation should be patched to use Ensembl chromosome names: 42 | ``` 43 | zcat gencode.v19.annotation.gtf.gz | \ 44 | sed 's/chrM/chrMT/;s/chr//' > gencode.v19.annotation.patched_contigs.gtf 45 | ``` 46 | The collapsed version for RNA-SeQC was generated with: 47 | ``` 48 | python collapse_annotation.py --transcript_blacklist gencode19_unannotated_readthrough_blacklist.txt \ 49 | gencode.v19.annotation.patched_contigs.gtf gencode.v19.annotation.patched_contigs.collapsed.gtf 50 | ``` 51 | 52 | #### Building the indexes 53 | The STAR index should be built to match the sequencing read length, specified by the `sjdbOverhang` parameter. GTEx samples were sequenced using a 2x76 bp paired-end sequencing protocol, and the matching `sjdbOverhang` is 75. 54 | 55 | ```bash 56 | # build the STAR index: 57 | mkdir $path_to_references/star_index_oh75 58 | docker run --rm -v $path_to_references:/data -t broadinstitute/gtex_rnaseq:V10 \ 59 | /bin/bash -c "STAR \ 60 | --runMode genomeGenerate \ 61 | --genomeDir /data/star_index_oh75 \ 62 | --genomeFastaFiles /data/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta \ 63 | --sjdbGTFfile /data/gencode.v39.GRCh38.annotation.gtf \ 64 | --sjdbOverhang 75 \ 65 | --runThreadN 4" 66 | 67 | # build the RSEM index: 68 | docker run --rm -v $path_to_references:/data -t broadinstitute/gtex_rnaseq:V10 \ 69 | /bin/bash -c "rsem-prepare-reference \ 70 | /data/Homo_sapiens_assembly38_noALT_noHLA_noDecoy.fasta \ 71 | /data/rsem_reference/rsem_reference \ 72 | --gtf /data/gencode.v39.GRCh38.annotation.gtf \ 73 | --num-threads 4" 74 | ``` 75 | 76 | ## Running the pipeline 77 | Individual components of the pipeline can be run using the commands below. It is assumed that the `$path_to_data` directory contains the input data and reference indexes. 
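The `$path_to_data`, `$input_bam`, and `$sample_id` variables referenced in these commands are placeholders; a minimal setup might look like:
```bash
# placeholder values; substitute your own paths and IDs
path_to_data=/path/to/data   # host directory mounted into the containers as /data
input_bam=sample1.bam        # input BAM located in $path_to_data
sample_id=sample1
```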
78 | 79 | ```bash 80 | # BAM to FASTQ conversion 81 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 82 | /bin/bash -c "/src/run_SamToFastq.py /data/$input_bam -p ${sample_id} -o /data" 83 | 84 | # STAR alignment 85 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 86 | /bin/bash -c "/src/run_STAR.py \ 87 | /data/star_index_oh75 \ 88 | /data/${sample_id}_1.fastq.gz \ 89 | /data/${sample_id}_2.fastq.gz \ 90 | ${sample_id} \ 91 | --threads 4 \ 92 | --output_dir /tmp/star_out && mv /tmp/star_out /data/star_out" 93 | 94 | # sync BAMs (optional; copy QC flags and read group IDs) 95 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 96 | /bin/bash -c "/src/run_bamsync.sh \ 97 | /data/$input_bam \ 98 | /data/star_out/${sample_id}.Aligned.sortedByCoord.out.bam \ 99 | /data/star_out/${sample_id}" 100 | 101 | # mark duplicates (Picard) 102 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 103 | /bin/bash -c "/src/run_MarkDuplicates.py \ 104 | /data/star_out/${sample_id}.Aligned.sortedByCoord.out.patched.bam \ 105 | ${sample_id}.Aligned.sortedByCoord.out.patched.md \ 106 | --output_dir /data" 107 | 108 | # RNA-SeQC 109 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 110 | /bin/bash -c "/src/run_rnaseqc.py \ 111 | ${sample_id}.Aligned.sortedByCoord.out.patched.md.bam \ 112 | ${genes_gtf} \ 113 | ${genome_fasta} \ 114 | ${sample_id} \ 115 | --output_dir /data" 116 | 117 | # RSEM transcript quantification 118 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 119 | /bin/bash -c "/src/run_RSEM.py \ 120 | /data/rsem_reference \ 121 | /data/star_out/${sample_id}.Aligned.toTranscriptome.out.bam \ 122 | /data/${sample_id} \ 123 | --threads 4" 124 | ``` 125 | 126 | #### Aggregating outputs 127 | Sample-level outputs in GCT format can be concatenated using `combine_GCTs.py`: 128 | ``` 129 | docker run --rm -v $path_to_data:/data -t broadinstitute/gtex_rnaseq:V10 \ 130 | /bin/bash -c "python3 /src/combine_GCTs.py \ 131 | ${rnaseqc_tpm_gcts} ${sample_set_id}.rnaseqc_tpm" 132 | ``` 133 | -------------------------------------------------------------------------------- /rnaseq/bam2coverage.wdl: -------------------------------------------------------------------------------- 1 | task bam_to_coverage { 2 | 3 | File bam_file 4 | File chr_sizes 5 | String prefix 6 | File?
intervals_bed 7 | 8 | Int memory 9 | Int disk_space 10 | Int num_threads 11 | Int num_preempt 12 | 13 | command { 14 | set -euo pipefail 15 | python3 /src/bam2coverage.py ${bam_file} ${chr_sizes} ${prefix} ${"--intersect " + intervals_bed} 16 | } 17 | 18 | output { 19 | File coverage_file = "${prefix}.coverage.gz" 20 | } 21 | 22 | runtime { 23 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 24 | memory: "${memory}GB" 25 | disks: "local-disk ${disk_space} HDD" 26 | cpu: "${num_threads}" 27 | preemptible: "${num_preempt}" 28 | } 29 | 30 | meta { 31 | author: "Francois Aguet" 32 | } 33 | } 34 | 35 | 36 | workflow bam_to_coverage_workflow { 37 | call bam_to_coverage 38 | } 39 | -------------------------------------------------------------------------------- /rnaseq/bamsync.wdl: -------------------------------------------------------------------------------- 1 | task bamsync { 2 | 3 | File source_bam 4 | File target_bam 5 | File target_bam_index 6 | String prefix 7 | 8 | Int memory 9 | Int disk_space 10 | Int num_threads 11 | Int num_preempt 12 | 13 | command { 14 | set -euo pipefail 15 | echo $(date +"[%b %d %H:%M:%S] Running bamsync") 16 | /src/run_bamsync.sh ${source_bam} ${target_bam} ${prefix} 17 | echo $(date +"[%b %d %H:%M:%S] Running samtools flagstat") 18 | samtools flagstat ${prefix}.Aligned.sortedByCoord.out.patched.bam > ${prefix}.Aligned.sortedByCoord.out.patched.bam.flagstat.txt 19 | } 20 | 21 | output { 22 | File patched_bam_file="${prefix}.Aligned.sortedByCoord.out.patched.bam" 23 | File patched_bam_index="${prefix}.Aligned.sortedByCoord.out.patched.bam.bai" 24 | File patched_bam_flagstat="${prefix}.Aligned.sortedByCoord.out.patched.bam.flagstat.txt" 25 | } 26 | 27 | runtime { 28 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 29 | memory: "${memory}GB" 30 | disks: "local-disk ${disk_space} HDD" 31 | cpu: "${num_threads}" 32 | preemptible: "${num_preempt}" 33 | } 34 | 35 | meta { 36 | author: "Francois Aguet" 37 | } 38 | } 39 | 40 | 41 | workflow bamsync_workflow { 42 | call bamsync 43 | } 44 | -------------------------------------------------------------------------------- /rnaseq/bamsync/Makefile: -------------------------------------------------------------------------------- 1 | 2 | CCFLAGS := -O3 -Wall -Wextra\ 3 | -I/usr/local/include/bamtools 4 | 5 | LDFLAGS := -L /usr/local/lib/bamtools 6 | 7 | bamsync: bamsync.cpp 8 | g++ -std=c++11 $(CCFLAGS) bamsync.cpp $(LDFLAGS) -o bamsync -lboost_program_options -lbamtools 9 | -------------------------------------------------------------------------------- /rnaseq/bamsync/README.md: -------------------------------------------------------------------------------- 1 | ### bamsync 2 | This utility adds the following information that is lost when realigning BAM files: 3 | * Copies the "read fails platform/vendor quality checks" flag (0x200) from the original BAM 4 | * Adds a read group ID to each read based on the read ID, in the format RG:Z:xxxxx.x 5 | 6 | Command: 7 | ``` 8 | bamsync reference.bam target.bam 9 | ``` 10 | -------------------------------------------------------------------------------- /rnaseq/fastqc.wdl: -------------------------------------------------------------------------------- 1 | task fastqc { 2 | 3 | File fastq1 4 | File?
fastq2 5 | 6 | Float memory 7 | Int disk_space 8 | Int num_threads 9 | Int num_preempt 10 | 11 | String fastq1_name = sub(sub(basename(fastq1), "\\.fastq.gz$", ""), "\\.fq.gz$", "" ) 12 | String fastq2_name = sub(sub(basename(fastq2), "\\.fastq.gz$", ""), "\\.fq.gz$", "" ) 13 | 14 | command <<< 15 | set -euo pipefail 16 | fastqc ${fastq1} ${fastq2} \ 17 | --threads ${num_threads} \ 18 | --outdir . 19 | unzip -p ${fastq1_name}_fastqc.zip ${fastq1_name}_fastqc/fastqc_data.txt | gzip > ${fastq1_name}.fastqc_data.txt.gz 20 | unzip -p ${fastq2_name}_fastqc.zip ${fastq2_name}_fastqc/fastqc_data.txt | gzip > ${fastq2_name}.fastqc_data.txt.gz 21 | >>> 22 | 23 | output { 24 | File fastq1_fastqc_html = "${fastq1_name}_fastqc.html" 25 | File fastq1_fastqc_zip = "${fastq1_name}_fastqc.zip" 26 | File fastq1_fastqc_data = "${fastq1_name}.fastqc_data.txt.gz" 27 | File fastq2_fastqc_html = "${fastq2_name}_fastqc.html" 28 | File fastq2_fastqc_zip = "${fastq2_name}_fastqc.zip" 29 | File fastq2_fastqc_data = "${fastq2_name}.fastqc_data.txt.gz" 30 | } 31 | 32 | runtime { 33 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 34 | memory: "${memory}GB" 35 | disks: "local-disk ${disk_space} HDD" 36 | cpu: "${num_threads}" 37 | preemptible: "${num_preempt}" 38 | } 39 | 40 | meta { 41 | author: "Francois Aguet" 42 | } 43 | } 44 | 45 | workflow fastqc_workflow { 46 | call fastqc 47 | } 48 | -------------------------------------------------------------------------------- /rnaseq/markduplicates.wdl: -------------------------------------------------------------------------------- 1 | task markduplicates { 2 | 3 | File input_bam 4 | String prefix 5 | Int? max_records_in_ram 6 | Float? sorting_collection_size_ratio 7 | 8 | Float memory 9 | Int java_memory = floor(memory - 0.5) 10 | Int disk_space 11 | Int num_threads 12 | Int num_preempt 13 | 14 | String output_bam = sub(basename(input_bam), "\\.bam$", ".md.bam") 15 | 16 | command { 17 | set -euo pipefail 18 | python3 -u /src/run_MarkDuplicates.py ${input_bam} ${prefix} \ 19 | --memory ${java_memory} \ 20 | ${"--max_records_in_ram " + max_records_in_ram} \ 21 | ${"--sorting_collection_size_ratio " + sorting_collection_size_ratio} 22 | samtools index ${output_bam} 23 | } 24 | 25 | output { 26 | File bam_file = "${output_bam}" 27 | File bam_index = "${output_bam}.bai" 28 | File metrics = "${prefix}.marked_dup_metrics.txt" 29 | } 30 | 31 | runtime { 32 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 33 | memory: "${memory}GB" 34 | disks: "local-disk ${disk_space} HDD" 35 | cpu: "${num_threads}" 36 | preemptible: "${num_preempt}" 37 | } 38 | 39 | meta { 40 | author: "Francois Aguet" 41 | } 42 | } 43 | 44 | 45 | workflow markduplicates_workflow { 46 | call markduplicates 47 | } 48 | -------------------------------------------------------------------------------- /rnaseq/references/ERCC92.chrsizes: -------------------------------------------------------------------------------- 1 | ERCC_00002 1061 2 | ERCC_00003 1023 3 | ERCC_00004 523 4 | ERCC_00009 984 5 | ERCC_00012 994 6 | ERCC_00013 808 7 | ERCC_00014 1957 8 | ERCC_00016 844 9 | ERCC_00017 1136 10 | ERCC_00019 644 11 | ERCC_00022 751 12 | ERCC_00024 536 13 | ERCC_00025 1994 14 | ERCC_00028 1130 15 | ERCC_00031 1138 16 | ERCC_00033 2022 17 | ERCC_00034 1019 18 | ERCC_00035 1130 19 | ERCC_00039 740 20 | ERCC_00040 744 21 | ERCC_00041 1122 22 | ERCC_00042 1023 23 | ERCC_00043 1023 24 | ERCC_00044 1156 25 | ERCC_00046 522 26 | ERCC_00048 992 27 | ERCC_00051 274 28 | ERCC_00053 1023 29 | ERCC_00054 274 30 | 
ERCC_00057 1021 31 | ERCC_00058 1136 32 | ERCC_00059 525 33 | ERCC_00060 523 34 | ERCC_00061 1136 35 | ERCC_00062 1023 36 | ERCC_00067 644 37 | ERCC_00069 1137 38 | ERCC_00071 642 39 | ERCC_00073 603 40 | ERCC_00074 522 41 | ERCC_00075 1023 42 | ERCC_00076 642 43 | ERCC_00077 273 44 | ERCC_00078 993 45 | ERCC_00079 644 46 | ERCC_00081 534 47 | ERCC_00083 1022 48 | ERCC_00084 994 49 | ERCC_00085 844 50 | ERCC_00086 1020 51 | ERCC_00092 1124 52 | ERCC_00095 521 53 | ERCC_00096 1107 54 | ERCC_00097 523 55 | ERCC_00098 1143 56 | ERCC_00099 1350 57 | ERCC_00104 2022 58 | ERCC_00108 1022 59 | ERCC_00109 536 60 | ERCC_00111 994 61 | ERCC_00112 1136 62 | ERCC_00113 840 63 | ERCC_00116 1991 64 | ERCC_00117 1136 65 | ERCC_00120 536 66 | ERCC_00123 1022 67 | ERCC_00126 1118 68 | ERCC_00130 1059 69 | ERCC_00131 771 70 | ERCC_00134 274 71 | ERCC_00136 1033 72 | ERCC_00137 537 73 | ERCC_00138 1024 74 | ERCC_00142 493 75 | ERCC_00143 784 76 | ERCC_00144 538 77 | ERCC_00145 1042 78 | ERCC_00147 1023 79 | ERCC_00148 494 80 | ERCC_00150 743 81 | ERCC_00154 537 82 | ERCC_00156 494 83 | ERCC_00157 1019 84 | ERCC_00158 1027 85 | ERCC_00160 743 86 | ERCC_00162 523 87 | ERCC_00163 543 88 | ERCC_00164 1022 89 | ERCC_00165 872 90 | ERCC_00168 1024 91 | ERCC_00170 1023 92 | ERCC_00171 505 93 | -------------------------------------------------------------------------------- /rnaseq/references/ERCC92.gtf: -------------------------------------------------------------------------------- 1 | ERCC-00002 ERCC exon 1 1061 0.000000 + . gene_id "ERCC-00002"; transcript_id "DQ459430"; 2 | ERCC-00003 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00003"; transcript_id "DQ516784"; 3 | ERCC-00004 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00004"; transcript_id "DQ516752"; 4 | ERCC-00009 ERCC exon 1 984 0.000000 + . gene_id "ERCC-00009"; transcript_id "DQ668364"; 5 | ERCC-00012 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00012"; transcript_id "DQ883670"; 6 | ERCC-00013 ERCC exon 1 808 0.000000 + . gene_id "ERCC-00013"; transcript_id "EF011062"; 7 | ERCC-00014 ERCC exon 1 1957 0.000000 + . gene_id "ERCC-00014"; transcript_id "DQ875385"; 8 | ERCC-00016 ERCC exon 1 844 0.000000 + . gene_id "ERCC-00016"; transcript_id "DQ883664"; 9 | ERCC-00017 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00017"; transcript_id "DQ459420"; 10 | ERCC-00019 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00019"; transcript_id "DQ883651"; 11 | ERCC-00022 ERCC exon 1 751 0.000000 + . gene_id "ERCC-00022"; transcript_id "DQ855004"; 12 | ERCC-00024 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00024"; transcript_id "DQ854993"; 13 | ERCC-00025 ERCC exon 1 1994 0.000000 + . gene_id "ERCC-00025"; transcript_id "DQ883689"; 14 | ERCC-00028 ERCC exon 1 1130 0.000000 + . gene_id "ERCC-00028"; transcript_id "DQ459419"; 15 | ERCC-00031 ERCC exon 1 1138 0.000000 + . gene_id "ERCC-00031"; transcript_id "DQ459431"; 16 | ERCC-00033 ERCC exon 1 2022 0.000000 + . gene_id "ERCC-00033"; transcript_id "DQ516796"; 17 | ERCC-00034 ERCC exon 1 1019 0.000000 + . gene_id "ERCC-00034"; transcript_id "DQ855001"; 18 | ERCC-00035 ERCC exon 1 1130 0.000000 + . gene_id "ERCC-00035"; transcript_id "DQ459413"; 19 | ERCC-00039 ERCC exon 1 740 0.000000 + . gene_id "ERCC-00039"; transcript_id "DQ883656"; 20 | ERCC-00040 ERCC exon 1 744 0.000000 + . gene_id "ERCC-00040"; transcript_id "DQ883661"; 21 | ERCC-00041 ERCC exon 1 1122 0.000000 + . gene_id "ERCC-00041"; transcript_id "EF011069"; 22 | ERCC-00042 ERCC exon 1 1023 0.000000 + . 
gene_id "ERCC-00042"; transcript_id "DQ516783"; 23 | ERCC-00043 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00043"; transcript_id "DQ516787"; 24 | ERCC-00044 ERCC exon 1 1156 0.000000 + . gene_id "ERCC-00044"; transcript_id "DQ459424"; 25 | ERCC-00046 ERCC exon 1 522 0.000000 + . gene_id "ERCC-00046"; transcript_id "DQ516748"; 26 | ERCC-00048 ERCC exon 1 992 0.000000 + . gene_id "ERCC-00048"; transcript_id "DQ883671"; 27 | ERCC-00051 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00051"; transcript_id "DQ516740"; 28 | ERCC-00053 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00053"; transcript_id "DQ516785"; 29 | ERCC-00054 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00054"; transcript_id "DQ516731"; 30 | ERCC-00057 ERCC exon 1 1021 0.000000 + . gene_id "ERCC-00057"; transcript_id "DQ668366"; 31 | ERCC-00058 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00058"; transcript_id "DQ459418"; 32 | ERCC-00059 ERCC exon 1 525 0.000000 + . gene_id "ERCC-00059"; transcript_id "DQ668356"; 33 | ERCC-00060 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00060"; transcript_id "DQ516763"; 34 | ERCC-00061 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00061"; transcript_id "DQ459426"; 35 | ERCC-00062 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00062"; transcript_id "DQ516786"; 36 | ERCC-00067 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00067"; transcript_id "DQ883653"; 37 | ERCC-00069 ERCC exon 1 1137 0.000000 + . gene_id "ERCC-00069"; transcript_id "DQ459421"; 38 | ERCC-00071 ERCC exon 1 642 0.000000 + . gene_id "ERCC-00071"; transcript_id "DQ883654"; 39 | ERCC-00073 ERCC exon 1 603 0.000000 + . gene_id "ERCC-00073"; transcript_id "DQ668358"; 40 | ERCC-00074 ERCC exon 1 522 0.000000 + . gene_id "ERCC-00074"; transcript_id "DQ516754"; 41 | ERCC-00075 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00075"; transcript_id "DQ516778"; 42 | ERCC-00076 ERCC exon 1 642 0.000000 + . gene_id "ERCC-00076"; transcript_id "DQ883650"; 43 | ERCC-00077 ERCC exon 1 273 0.000000 + . gene_id "ERCC-00077"; transcript_id "DQ516742"; 44 | ERCC-00078 ERCC exon 1 993 0.000000 + . gene_id "ERCC-00078"; transcript_id "DQ883673"; 45 | ERCC-00079 ERCC exon 1 644 0.000000 + . gene_id "ERCC-00079"; transcript_id "DQ883652"; 46 | ERCC-00081 ERCC exon 1 534 0.000000 + . gene_id "ERCC-00081"; transcript_id "DQ854991"; 47 | ERCC-00083 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00083"; transcript_id "DQ516780"; 48 | ERCC-00084 ERCC exon 1 994 0.000000 + . gene_id "ERCC-00084"; transcript_id "DQ883682"; 49 | ERCC-00085 ERCC exon 1 844 0.000000 + . gene_id "ERCC-00085"; transcript_id "DQ883669"; 50 | ERCC-00086 ERCC exon 1 1020 0.000000 + . gene_id "ERCC-00086"; transcript_id "DQ516791"; 51 | ERCC-00092 ERCC exon 1 1124 0.000000 + . gene_id "ERCC-00092"; transcript_id "DQ459425"; 52 | ERCC-00095 ERCC exon 1 521 0.000000 + . gene_id "ERCC-00095"; transcript_id "DQ516759"; 53 | ERCC-00096 ERCC exon 1 1107 0.000000 + . gene_id "ERCC-00096"; transcript_id "DQ459429"; 54 | ERCC-00097 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00097"; transcript_id "DQ516758"; 55 | ERCC-00098 ERCC exon 1 1143 0.000000 + . gene_id "ERCC-00098"; transcript_id "DQ459415"; 56 | ERCC-00099 ERCC exon 1 1350 0.000000 + . gene_id "ERCC-00099"; transcript_id "DQ875387"; 57 | ERCC-00104 ERCC exon 1 2022 0.000000 + . gene_id "ERCC-00104"; transcript_id "DQ516815"; 58 | ERCC-00108 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00108"; transcript_id "DQ668365"; 59 | ERCC-00109 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00109"; transcript_id "DQ854998"; 60 | ERCC-00111 ERCC exon 1 994 0.000000 + . 
gene_id "ERCC-00111"; transcript_id "DQ883685"; 61 | ERCC-00112 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00112"; transcript_id "DQ459422"; 62 | ERCC-00113 ERCC exon 1 840 0.000000 + . gene_id "ERCC-00113"; transcript_id "DQ883663"; 63 | ERCC-00116 ERCC exon 1 1991 0.000000 + . gene_id "ERCC-00116"; transcript_id "DQ668367"; 64 | ERCC-00117 ERCC exon 1 1136 0.000000 + . gene_id "ERCC-00117"; transcript_id "DQ459412"; 65 | ERCC-00120 ERCC exon 1 536 0.000000 + . gene_id "ERCC-00120"; transcript_id "DQ854992"; 66 | ERCC-00123 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00123"; transcript_id "DQ516782"; 67 | ERCC-00126 ERCC exon 1 1118 0.000000 + . gene_id "ERCC-00126"; transcript_id "DQ459427"; 68 | ERCC-00130 ERCC exon 1 1059 0.000000 + . gene_id "ERCC-00130"; transcript_id "EF011072"; 69 | ERCC-00131 ERCC exon 1 771 0.000000 + . gene_id "ERCC-00131"; transcript_id "DQ855003"; 70 | ERCC-00134 ERCC exon 1 274 0.000000 + . gene_id "ERCC-00134"; transcript_id "DQ516739"; 71 | ERCC-00136 ERCC exon 1 1033 0.000000 + . gene_id "ERCC-00136"; transcript_id "EF011063"; 72 | ERCC-00137 ERCC exon 1 537 0.000000 + . gene_id "ERCC-00137"; transcript_id "DQ855000"; 73 | ERCC-00138 ERCC exon 1 1024 0.000000 + . gene_id "ERCC-00138"; transcript_id "DQ516777"; 74 | ERCC-00142 ERCC exon 1 493 0.000000 + . gene_id "ERCC-00142"; transcript_id "DQ883646"; 75 | ERCC-00143 ERCC exon 1 784 0.000000 + . gene_id "ERCC-00143"; transcript_id "DQ668362"; 76 | ERCC-00144 ERCC exon 1 538 0.000000 + . gene_id "ERCC-00144"; transcript_id "DQ854995"; 77 | ERCC-00145 ERCC exon 1 1042 0.000000 + . gene_id "ERCC-00145"; transcript_id "DQ875386"; 78 | ERCC-00147 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00147"; transcript_id "DQ516790"; 79 | ERCC-00148 ERCC exon 1 494 0.000000 + . gene_id "ERCC-00148"; transcript_id "DQ883642"; 80 | ERCC-00150 ERCC exon 1 743 0.000000 + . gene_id "ERCC-00150"; transcript_id "DQ883659"; 81 | ERCC-00154 ERCC exon 1 537 0.000000 + . gene_id "ERCC-00154"; transcript_id "DQ854997"; 82 | ERCC-00156 ERCC exon 1 494 0.000000 + . gene_id "ERCC-00156"; transcript_id "DQ883643"; 83 | ERCC-00157 ERCC exon 1 1019 0.000000 + . gene_id "ERCC-00157"; transcript_id "DQ839618"; 84 | ERCC-00158 ERCC exon 1 1027 0.000000 + . gene_id "ERCC-00158"; transcript_id "DQ516795"; 85 | ERCC-00160 ERCC exon 1 743 0.000000 + . gene_id "ERCC-00160"; transcript_id "DQ883658"; 86 | ERCC-00162 ERCC exon 1 523 0.000000 + . gene_id "ERCC-00162"; transcript_id "DQ516750"; 87 | ERCC-00163 ERCC exon 1 543 0.000000 + . gene_id "ERCC-00163"; transcript_id "DQ668359"; 88 | ERCC-00164 ERCC exon 1 1022 0.000000 + . gene_id "ERCC-00164"; transcript_id "DQ516779"; 89 | ERCC-00165 ERCC exon 1 872 0.000000 + . gene_id "ERCC-00165"; transcript_id "DQ668363"; 90 | ERCC-00168 ERCC exon 1 1024 0.000000 + . gene_id "ERCC-00168"; transcript_id "DQ516776"; 91 | ERCC-00170 ERCC exon 1 1023 0.000000 + . gene_id "ERCC-00170"; transcript_id "DQ516773"; 92 | ERCC-00171 ERCC exon 1 505 0.000000 + . 
gene_id "ERCC-00171"; transcript_id "DQ854994"; 93 | -------------------------------------------------------------------------------- /rnaseq/rnaseq_pipeline_bam.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:samtofastq_v1-0_BETA/versions/7/plain-WDL/descriptor" as samtofastq_wdl 2 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:star_v1-0_BETA/versions/8/plain-WDL/descriptor" as star_wdl 3 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:markduplicates_v1-0_BETA/versions/6/plain-WDL/descriptor" as markduplicates_wdl 4 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:rsem_v1-0_BETA/versions/6/plain-WDL/descriptor" as rsem_wdl 5 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:rnaseqc2_v1-0_BETA/versions/4/plain-WDL/descriptor" as rnaseqc_wdl 6 | 7 | workflow rnaseq_pipeline_bam_workflow { 8 | 9 | String prefix 10 | 11 | call samtofastq_wdl.samtofastq { 12 | input: prefix=prefix 13 | } 14 | 15 | call star_wdl.star { 16 | input: fastq1=samtofastq.fastq1, fastq2=samtofastq.fastq2, prefix=prefix 17 | } 18 | 19 | call markduplicates_wdl.markduplicates { 20 | input: input_bam=star.bam_file, prefix=prefix 21 | } 22 | 23 | call rsem_wdl.rsem { 24 | input: transcriptome_bam=star.transcriptome_bam, prefix=prefix 25 | } 26 | 27 | call rnaseqc_wdl.rnaseqc2 { 28 | input: bam_file=markduplicates.bam_file, sample_id=prefix 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /rnaseq/rnaseq_pipeline_fastq.wdl: -------------------------------------------------------------------------------- 1 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:fastqc_v1-0_BETA/versions/2/plain-WDL/descriptor" as fastqc_wdl 2 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:star_v1-0_BETA/versions/8/plain-WDL/descriptor" as star_wdl 3 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:markduplicates_v1-0_BETA/versions/6/plain-WDL/descriptor" as markduplicates_wdl 4 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:rsem_v1-0_BETA/versions/6/plain-WDL/descriptor" as rsem_wdl 5 | import "https://api.firecloud.org/ga4gh/v1/tools/broadinstitute_gtex:rnaseqc2_v1-0_BETA/versions/4/plain-WDL/descriptor" as rnaseqc_wdl 6 | 7 | workflow rnaseq_pipeline_fastq_workflow { 8 | 9 | String prefix 10 | 11 | call fastqc_wdl.fastqc {} 12 | 13 | call star_wdl.star { 14 | input: prefix=prefix 15 | } 16 | 17 | call markduplicates_wdl.markduplicates { 18 | input: input_bam=star.bam_file, prefix=prefix 19 | } 20 | 21 | call rsem_wdl.rsem { 22 | input: transcriptome_bam=star.transcriptome_bam, prefix=prefix 23 | } 24 | 25 | call rnaseqc_wdl.rnaseqc2 { 26 | input: bam_file=markduplicates.bam_file, sample_id=prefix 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /rnaseq/rnaseqc2.wdl: -------------------------------------------------------------------------------- 1 | task rnaseqc2 { 2 | 3 | File bam_file 4 | File genes_gtf 5 | String sample_id 6 | String? strandedness 7 | File? intervals_bed 8 | File? reference_fasta 9 | File? reference_fasta_index 10 | String? 
flags 11 | 12 | Int memory 13 | Int disk_space 14 | Int num_threads 15 | Int num_preempt 16 | 17 | command { 18 | set -euo pipefail 19 | echo $(date +"[%b %d %H:%M:%S] Running RNA-SeQC 2") 20 | touch ${sample_id}.fragmentSizes.txt 21 | touch ${sample_id}.gc_content.tsv 22 | rnaseqc ${genes_gtf} ${bam_file} . -s ${sample_id} ${"--bed " + intervals_bed} ${"--stranded " + strandedness} ${"--fasta " + reference_fasta} -vv ${flags} 23 | echo " * compressing outputs" 24 | gzip *.gct 25 | echo $(date +"[%b %d %H:%M:%S] done") 26 | } 27 | 28 | output { 29 | File gene_tpm = "${sample_id}.gene_tpm.gct.gz" 30 | File gene_counts = "${sample_id}.gene_reads.gct.gz" 31 | File exon_counts = "${sample_id}.exon_reads.gct.gz" 32 | File metrics = "${sample_id}.metrics.tsv" 33 | File gc_content = "${sample_id}.gc_content.tsv" 34 | File insertsize_distr = "${sample_id}.fragmentSizes.txt" 35 | } 36 | 37 | runtime { 38 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 39 | memory: "${memory}GB" 40 | disks: "local-disk ${disk_space} HDD" 41 | cpu: "${num_threads}" 42 | preemptible: "${num_preempt}" 43 | } 44 | 45 | meta { 46 | author: "Francois Aguet" 47 | } 48 | } 49 | 50 | 51 | workflow rnaseqc2_workflow { 52 | call rnaseqc2 53 | } 54 | -------------------------------------------------------------------------------- /rnaseq/rnaseqc2_aggregate.wdl: -------------------------------------------------------------------------------- 1 | task rnaseqc2_aggregate { 2 | 3 | Array[File] tpm_gcts 4 | Array[File] count_gcts 5 | Array[File] exon_count_gcts 6 | Array[File] metrics_tsvs 7 | String prefix 8 | Array[File]? insertsize_hists 9 | String? flags 10 | 11 | Int memory 12 | Int disk_space 13 | Int num_threads 14 | Int num_preempt 15 | 16 | command { 17 | set -euo pipefail 18 | echo $(date +"[%b %d %H:%M:%S] Aggregating RNA-SeQC outputs") 19 | mkdir individual_outputs 20 | mv ${sep=' ' tpm_gcts} individual_outputs/ 21 | mv ${sep=' ' count_gcts} individual_outputs/ 22 | mv ${sep=' ' exon_count_gcts} individual_outputs/ 23 | mv ${sep=' ' metrics_tsvs} individual_outputs/ 24 | if [ -n '${sep=',' insertsize_hists}' ]; then 25 | mv ${sep=' ' insertsize_hists} individual_outputs/ 26 | fi 27 | touch ${prefix}.insert_size_hists.txt.gz 28 | python3 -m rnaseqc aggregate \ 29 | -o . \ 30 | individual_outputs \ 31 | ${prefix} \ 32 | ${flags} 33 | echo $(date +"[%b %d %H:%M:%S] done") 34 | } 35 | 36 | output { 37 | File metrics="${prefix}.metrics.txt.gz" 38 | File insert_size_hists="${prefix}.insert_size_hists.txt.gz" 39 | File tpm_gct=glob("${prefix}.gene_tpm.*")[0] 40 | File count_gct=glob("${prefix}.gene_reads.*")[0] 41 | File exon_count_gct=glob("${prefix}.exon_reads.*")[0] 42 | } 43 | 44 | runtime { 45 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 46 | memory: "${memory}GB" 47 | disks: "local-disk ${disk_space} HDD" 48 | cpu: "${num_threads}" 49 | preemptible: "${num_preempt}" 50 | } 51 | 52 | meta { 53 | author: "Francois Aguet" 54 | } 55 | } 56 | 57 | 58 | workflow rnaseqc2_aggregate_workflow { 59 | call rnaseqc2_aggregate 60 | } 61 | -------------------------------------------------------------------------------- /rnaseq/rsem.wdl: -------------------------------------------------------------------------------- 1 | task rsem { 2 | 3 | File transcriptome_bam 4 | File rsem_reference 5 | String prefix 6 | 7 | Int memory 8 | Int disk_space 9 | Int num_threads 10 | Int num_preempt 11 | 12 | Int? max_frag_len 13 | String? estimate_rspd 14 | String? is_stranded 15 | String? 
paired_end 16 | 17 | command { 18 | set -euo pipefail 19 | mkdir rsem_reference 20 | tar -xvvf ${rsem_reference} -C rsem_reference --strip-components=1 21 | 22 | /src/run_RSEM.py \ 23 | ${"--max_frag_len " + max_frag_len} \ 24 | ${"--estimate_rspd " + estimate_rspd} \ 25 | ${"--is_stranded " + is_stranded} \ 26 | ${"--paired_end " + paired_end} \ 27 | --threads ${num_threads} \ 28 | rsem_reference ${transcriptome_bam} ${prefix} 29 | gzip *.results 30 | } 31 | 32 | output { 33 | File genes="${prefix}.rsem.genes.results.gz" 34 | File isoforms="${prefix}.rsem.isoforms.results.gz" 35 | } 36 | 37 | runtime { 38 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 39 | memory: "${memory}GB" 40 | disks: "local-disk ${disk_space} HDD" 41 | cpu: "${num_threads}" 42 | preemptible: "${num_preempt}" 43 | } 44 | 45 | meta { 46 | author: "Francois Aguet" 47 | } 48 | } 49 | 50 | 51 | workflow rsem_workflow { 52 | call rsem 53 | } 54 | -------------------------------------------------------------------------------- /rnaseq/rsem_aggregate.wdl: -------------------------------------------------------------------------------- 1 | task rsem_aggregate_results { 2 | 3 | Array[File] rsem_isoforms 4 | Array[File] rsem_genes 5 | String prefix 6 | 7 | Int memory 8 | Int disk_space 9 | Int num_threads 10 | Int num_preempt 11 | 12 | command { 13 | echo $(date +"[%b %d %H:%M:%S] Combining transcript-level output") 14 | python3 /src/aggregate_rsem_results.py ${write_lines(rsem_isoforms)} TPM IsoPct expected_count ${prefix} 15 | echo $(date +"[%b %d %H:%M:%S] Combining gene-level output") 16 | python3 /src/aggregate_rsem_results.py ${write_lines(rsem_genes)} TPM expected_count ${prefix} 17 | } 18 | 19 | output { 20 | File transcripts_tpm="${prefix}.rsem_transcripts_tpm.txt.gz" 21 | File transcripts_isopct="${prefix}.rsem_transcripts_isopct.txt.gz" 22 | File transcripts_expected_count="${prefix}.rsem_transcripts_expected_count.txt.gz" 23 | File genes_tpm="${prefix}.rsem_genes_tpm.txt.gz" 24 | File genes_expected_count="${prefix}.rsem_genes_expected_count.txt.gz" 25 | } 26 | 27 | runtime { 28 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 29 | memory: "${memory}GB" 30 | disks: "local-disk ${disk_space} HDD" 31 | cpu: "${num_threads}" 32 | preemptible: "${num_preempt}" 33 | } 34 | 35 | meta { 36 | author: "Francois Aguet" 37 | } 38 | } 39 | 40 | 41 | workflow rsem_aggregate_results_workflow { 42 | call rsem_aggregate_results 43 | } 44 | -------------------------------------------------------------------------------- /rnaseq/rsem_preprocessing.wdl: -------------------------------------------------------------------------------- 1 | task remove_IDS_reads { 2 | 3 | File transcriptome_bam 4 | String prefix 5 | 6 | Int memory 7 | Int disk_space 8 | Int num_threads 9 | Int num_preempt 10 | 11 | command { 12 | set -euo pipefail 13 | echo $(date +"[%b %d %H:%M:%S] Running remove_IDS_reads") 14 | /src/run_remove_IDS_reads.sh ${transcriptome_bam} ${prefix} 15 | } 16 | 17 | output { 18 | File transcriptome_noIDS_bam = "${prefix}.Aligned.toTranscriptome_noIDS.out.bam" 19 | } 20 | 21 | runtime { 22 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 23 | memory: "${memory}GB" 24 | disks: "local-disk ${disk_space} HDD" 25 | cpu: "${num_threads}" 26 | preemptible: "${num_preempt}" 27 | } 28 | 29 | meta { 30 | author: "Abhishek Choudhary" 31 | } 32 | } 33 | 34 | 35 | workflow rsem_preprocessing_workflow { 36 | call remove_IDS_reads 37 | } 38 | 
-------------------------------------------------------------------------------- /rnaseq/rsem_reference.wdl: -------------------------------------------------------------------------------- 1 | task rsem_reference { 2 | 3 | File reference_fasta 4 | File annotation_gtf 5 | String prefix 6 | 7 | Int memory 8 | Int disk_space 9 | Int num_threads 10 | Int num_preempt 11 | 12 | command { 13 | mkdir ${prefix} && cd ${prefix} 14 | rsem-prepare-reference ${reference_fasta} rsem_reference --gtf ${annotation_gtf} --num-threads ${num_threads} 15 | cd .. && tar -cvzf ${prefix}.tar.gz ${prefix} 16 | } 17 | 18 | output { 19 | File rsem_reference = "${prefix}.tar.gz" 20 | } 21 | 22 | runtime { 23 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 24 | memory: "${memory}GB" 25 | disks: "local-disk ${disk_space} HDD" 26 | cpu: "${num_threads}" 27 | preemptible: "${num_preempt}" 28 | } 29 | 30 | meta { 31 | author: "Francois Aguet" 32 | } 33 | } 34 | 35 | 36 | workflow rsem_reference_workflow { 37 | call rsem_reference 38 | } 39 | -------------------------------------------------------------------------------- /rnaseq/samtofastq.wdl: -------------------------------------------------------------------------------- 1 | task samtofastq { 2 | 3 | File input_bam_cram 4 | String prefix 5 | File? reference_fasta 6 | File? reference_fasta_index 7 | 8 | Float memory 9 | Int java_memory = floor(memory - 0.5) 10 | Int disk_space 11 | Int num_threads 12 | Int num_preempt 13 | 14 | command { 15 | set -euo pipefail 16 | 17 | # make sure path is absolute 18 | input_bam_abs=${input_bam_cram} 19 | if [[ $input_bam_abs != /* ]]; then 20 | input_bam_abs=$PWD/$input_bam_abs 21 | fi 22 | 23 | mkdir samtofastq # workaround for named pipes 24 | python3 -u /src/run_SamToFastq.py $input_bam_abs -p ${prefix} ${"--reference_fasta " + reference_fasta} --output_dir samtofastq --memory ${java_memory} 25 | mv samtofastq/${prefix}_*.fastq.gz . 26 | } 27 | 28 | output { 29 | File fastq1="${prefix}_1.fastq.gz" 30 | File fastq2="${prefix}_2.fastq.gz" 31 | } 32 | 33 | runtime { 34 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 35 | memory: "${memory}GB" 36 | disks: "local-disk ${disk_space} HDD" 37 | cpu: "${num_threads}" 38 | preemptible: "${num_preempt}" 39 | } 40 | 41 | meta { 42 | author: "Francois Aguet" 43 | } 44 | } 45 | 46 | 47 | workflow samtofastq_workflow { 48 | call samtofastq 49 | } 50 | -------------------------------------------------------------------------------- /rnaseq/samtools_view.wdl: -------------------------------------------------------------------------------- 1 | task samtools_view { 2 | 3 | File bam_file 4 | File bam_index 5 | String prefix 6 | String? options 7 | String? region 8 | File? reference_fasta 9 | File? 
reference_fasta_index 10 | 11 | Int memory 12 | Int disk_space 13 | Int num_threads 14 | Int num_preempt 15 | 16 | command { 17 | set -euo pipefail 18 | echo $(date +"[%b %d %H:%M:%S] Running 'samtools view'.") 19 | samtools view ${options} ${"-T " + reference_fasta} ${bam_file} ${region} > ${prefix}.bam 20 | samtools index ${prefix}.bam 21 | echo $(date +"[%b %d %H:%M:%S] done.") 22 | } 23 | 24 | output { 25 | File output_bam_file = "${prefix}.bam" 26 | File output_bam_index = "${prefix}.bam.bai" 27 | } 28 | 29 | runtime { 30 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 31 | memory: "${memory}GB" 32 | disks: "local-disk ${disk_space} HDD" 33 | cpu: "${num_threads}" 34 | preemptible: "${num_preempt}" 35 | } 36 | 37 | meta { 38 | author: "Francois Aguet" 39 | } 40 | } 41 | 42 | 43 | workflow samtools_view_workflow { 44 | call samtools_view 45 | } 46 | -------------------------------------------------------------------------------- /rnaseq/src/aggregate_rnaseqc_metrics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import argparse 4 | import os 5 | 6 | parser = argparse.ArgumentParser(description='Aggregate RNA-SeQC metrics from multiple samples') 7 | parser.add_argument('input_files_tsv', help='TSV file with sample IDs and paths to metrics files (or paths only)') 8 | parser.add_argument('prefix', help='Prefix for output file name') 9 | parser.add_argument('--annotation_headers', default='', help='Comma-separated list of column headers for the annotation TSVs') 10 | parser.add_argument('--annotation_tsvs', nargs='+', default=[]) 11 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 12 | args = parser.parse_args() 13 | 14 | annotation_headers = args.annotation_headers.split(',') 15 | if len(annotation_headers)==1 and annotation_headers[0]=='': 16 | annotation_headers = [] 17 | assert len(args.annotation_tsvs)==len(annotation_headers) 18 | 19 | path_s = pd.read_csv(args.input_files_tsv, sep='\t', index_col=0, header=None, names=['sample_id','metrics_path'], squeeze=True) 20 | if path_s.isnull().all(): # ID not provided 21 | path_s = pd.Series(path_s.index, index=[os.path.split(i)[1].split('.metrics.tsv')[0] for i in path_s.index]) 22 | 23 | # check format 24 | df = pd.read_csv(path_s.iloc[0], sep='\t', header=None) 25 | if df.shape[0]==2: # RNA-SeQC v1.1.9 26 | dfs = [pd.read_csv(i, sep='\t') for i in path_s] 27 | elif df.shape[1]==2: # RNA-SeQC v2 28 | dfs = [pd.read_csv(i, sep='\t', header=None, index_col=0).T for i in path_s] 29 | else: 30 | raise ValueError('Unrecognized input format (shape {}).'.format(df.shape)) 31 | metrics_df = pd.concat(dfs, axis=0) 32 | metrics_df.index = metrics_df['Sample'] 33 | 34 | # add optional annotations as additional columns 35 | for h,tsv in zip(annotation_headers, args.annotation_tsvs): 36 | annotation_df = pd.read_csv(tsv, sep='\t', index_col=0, header=None, names=['sample_id',h]) 37 | metrics_df[h] = annotation_df.loc[metrics_df.index, h] 38 | 39 | metrics_df.to_csv(os.path.join(args.output_dir, args.prefix+'.metrics.tsv'), sep='\t', index=False, float_format='%.8g') 40 | -------------------------------------------------------------------------------- /rnaseq/src/aggregate_rsem_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import argparse 5 | import gzip 6 | import os 7 | import itertools 8 | from collections import defaultdict 9 | import tempfile 10 | 11 | 12 | def 
load_isoform_results(rsem_output, cols=None): 13 | dtype = {'transcript_id':str, 'gene_id':str, 14 | 'length':np.int32, 'effective_length':np.float32, 15 | 'expected_count':np.float32, 'TPM':np.float32, 16 | 'FPKM':np.float32, 'IsoPct':np.float32} 17 | if cols is None: 18 | return pd.read_csv(rsem_output, sep='\t',index_col=0, dtype=dtype) 19 | else: 20 | return pd.read_csv(rsem_output, sep='\t',index_col=0, dtype=dtype, usecols=['transcript_id']+cols) 21 | 22 | 23 | def load_gene_results(rsem_output, cols=None): 24 | dtype = {'gene_id':str, 'transcript_id(s)':str, 25 | 'length':np.float32, 'effective_length':np.float32, 26 | 'expected_count':np.float32, 'TPM':np.float32, 'FPKM':np.float32} 27 | if cols is None: 28 | return pd.read_csv(rsem_output, sep='\t',index_col=0, dtype=dtype) 29 | else: 30 | return pd.read_csv(rsem_output, sep='\t',index_col=0, dtype=dtype, usecols=['gene_id']+cols) 31 | 32 | 33 | def aggregate_rsem_results(file_list, col_ids, rsem_loader): 34 | """ 35 | Concatenate columns 'col_ids' from multiple RSEM output files; 36 | return as dict of DataFrames 37 | """ 38 | df_dict = defaultdict(list) 39 | for k,f in enumerate(file_list, 1): 40 | filename = os.path.split(f)[1] 41 | sample_id = filename.split('.')[0] 42 | print(f'\rProcessing RSEM output {k}/{len(file_list)}', end='' if k < len(file_list) else None) 43 | rsem_df = rsem_loader(f, cols=col_ids) 44 | for c in col_ids: 45 | s = rsem_df[c] 46 | s.name = sample_id 47 | df_dict[c].append(s) 48 | 49 | for c in col_ids: 50 | df_dict[c] = pd.concat(df_dict[c], axis=1) 51 | return df_dict 52 | 53 | 54 | if __name__=='__main__': 55 | 56 | parser = argparse.ArgumentParser(description='Aggregate RSEM expression from multiple samples.') 57 | parser.add_argument('rsem_output_list', help='File listing RSEM output files, with format $sample_id.rsem.{genes|isoforms}.results') 58 | parser.add_argument('col_ids', choices=['expected_count', 'TPM', 'FPKM', 'IsoPct'], nargs='+', help='Column header') 59 | parser.add_argument('output_prefix', help='Prefix for output file: ${prefix}.txt.gz') 60 | parser.add_argument('--chunk_size', default=500, type=int, help='Files to process simultaneously') 61 | parser.add_argument('--parquet', action='store_true', help='Write to parquet format instead of txt.gz') 62 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 63 | args = parser.parse_args() 64 | 65 | 66 | with open(args.rsem_output_list) as f: 67 | file_list = f.read().strip().split('\n') 68 | sample_ids = [os.path.split(i)[1].split('.')[0] for i in file_list] 69 | 70 | if np.all(['isoform' in os.path.split(i)[1] for i in file_list]): 71 | prefix = f'{args.output_prefix}.rsem_transcripts_' 72 | rsem_loader = load_isoform_results 73 | index_df = load_isoform_results(file_list[0], cols=['gene_id']) 74 | elif np.all(['gene' in os.path.split(i)[1] for i in file_list]): 75 | prefix = f'{args.output_prefix}.rsem_genes_' 76 | rsem_loader = load_gene_results 77 | index_df = load_gene_results(file_list[0], cols=['transcript_id(s)']) 78 | else: 79 | raise ValueError('Unrecognized input format.') 80 | 81 | # merge outputs in chunks, store to hdf 82 | tmp_store = tempfile.NamedTemporaryFile(dir=args.output_dir) 83 | nchunks = int(np.ceil(len(file_list) / args.chunk_size)) 84 | iargs = [iter(file_list)] * args.chunk_size 85 | for k,sub_list in enumerate(itertools.zip_longest(*iargs)): 86 | sub_list = [j for j in sub_list if j is not None] # last chunk 87 | print(f'Processing chunk {k+1}/{nchunks}') 88 | df_dict = 
aggregate_rsem_results(sub_list, args.col_ids, rsem_loader) 89 | for c in args.col_ids: 90 | df_dict[c].to_hdf(tmp_store.name, key=f'{c}{k}') 91 | 92 | # aggregate chunks for each output type 93 | for c in args.col_ids: 94 | dfs = [index_df] 95 | for k in range(nchunks): 96 | print(f'\rLoading chunk {k+1}/{nchunks}', end='' if k+1 < nchunks else None) 97 | dfs.append(pd.read_hdf(tmp_store.name, key=f'{c}{k}')) 98 | dfs = pd.concat(dfs, axis=1) 99 | 100 | if args.parquet: 101 | fname = f'{prefix}{c.lower()}.parquet' 102 | print(f'Writing {fname}') 103 | dfs.to_parquet(os.path.join(args.output_dir, fname)) 104 | 105 | else: # txt.gz 106 | fname = f'{prefix}{c.lower()}.txt.gz' 107 | print(f'Writing {fname}') 108 | with gzip.open(os.path.join(args.output_dir, fname), 'wt', compresslevel=6) as f: 109 | dfs.to_csv(f, sep='\t', float_format='%.5g') 110 | 111 | tmp_store.close() 112 | -------------------------------------------------------------------------------- /rnaseq/src/bam2coverage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | 4 | import argparse 5 | import subprocess 6 | import os 7 | from datetime import datetime 8 | import gzip 9 | import pyBigWig 10 | 11 | parser = argparse.ArgumentParser(description='Computes coverage in bigWig and/or bedGraph format from input BAM.') 12 | parser.add_argument('bam_file', help='Input BAM file') 13 | parser.add_argument('chr_sizes', help='Chromosome sizes for the reference genome used') 14 | parser.add_argument('prefix', help='Prefix for output file names') 15 | parser.add_argument('--intersect', default=None, type=str, help='BED file containing intervals to calculate coverage on') 16 | parser.add_argument('-f', '--format', default=['bigwig'], type=str.lower, nargs='+', choices=['bigwig', 'bedgraph']) 17 | parser.add_argument('--sam_flags', default='-q 255 -F 2816', help='Flags for samtools. 
Default: filter out secondary, supplementary and QC-failed reads') 18 | parser.add_argument('--stranded', action='store_true', help='Generate a track for each strand') 19 | parser.add_argument('--output_dir', default='.', help='Output directory') 20 | args = parser.parse_args() 21 | 22 | 23 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Starting coverage computation', flush=True) 24 | 25 | print(' * generating bedGraph from BAM', flush=True) 26 | # bedGraph file must be sorted for compatibility with bedGraphToBigWig 27 | # default sam_flags (-q 255 -F 2816) retain uniquely mapped reads (MAPQ 255 in STAR) and exclude secondary (0x100), QC-failed (0x200), and supplementary (0x800) alignments 28 | if args.stranded: 29 | bgpath_plus = os.path.join(args.output_dir, f'{args.prefix}.plus.bedGraph') 30 | cmd = f'samtools view {args.sam_flags} -b {args.bam_file} | bedtools genomecov -bga -split -strand + -ibam - | sort -k1,1 -k2,2n > {bgpath_plus}' 31 | subprocess.call(cmd, shell=True) 32 | bgpath_minus = os.path.join(args.output_dir, f'{args.prefix}.minus.bedGraph') 33 | cmd = f'samtools view {args.sam_flags} -b {args.bam_file} | bedtools genomecov -bga -split -strand - -ibam - | sort -k1,1 -k2,2n > {bgpath_minus}' 34 | subprocess.call(cmd, shell=True) 35 | bedgraph_files = [bgpath_plus, bgpath_minus] 36 | else: 37 | bgpath = os.path.join(args.output_dir, f'{args.prefix}.bedGraph') 38 | cmd = f'samtools view {args.sam_flags} -b {args.bam_file} | bedtools genomecov -bga -split -ibam - | sort -k1,1 -k2,2n > {bgpath}' 39 | subprocess.call(cmd, shell=True) 40 | bedgraph_files = [bgpath] 41 | 42 | if 'bigwig' in args.format: 43 | print(' * generating bigWig from bedGraph', flush=True) 44 | for bgpath in bedgraph_files: 45 | bwpath = bgpath.replace('.bedGraph', '.bigWig') 46 | subprocess.call(f'bedGraphToBigWig {bgpath} {args.chr_sizes} {bwpath}', shell=True) 47 | 48 | if args.intersect is not None: 49 | print(' * calculating coverage on BED intervals', flush=True) 50 | bw = pyBigWig.open(bwpath) # note: with --stranded, this uses the last (minus-strand) bigWig 51 | with gzip.open(bwpath.replace('.bigWig', '.coverage.gz'), 'wt') as f, gzip.open(args.intersect) as bed: 52 | f.write('\t'.join(['gene_id', 'chr', 'start', 'end', 'coverage'])+'\n') 53 | for line in bed: 54 | line = line.decode().strip().split() 55 | c = bw.values(line[0], int(line[1]), int(line[2]), numpy=True) 56 | f.write('\t'.join([line[3], line[0], line[1], line[2], ','.join(c.astype(int).astype(str))])+'\n') 57 | bw.close() 58 | 59 | if 'bedgraph' not in args.format: 60 | for bg in bedgraph_files: # remove all intermediate bedGraphs, including both strand-specific files 61 | os.remove(bg) 62 | 63 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Done', flush=True) 64 | -------------------------------------------------------------------------------- /rnaseq/src/combine_GCTs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | import pandas as pd 4 | import numpy as np 5 | import argparse 6 | import gzip 7 | import os 8 | import qtl.io 9 | 10 | parser = argparse.ArgumentParser(description='Combine GCT files') 11 | parser.add_argument('input_files', nargs='+', help='List of GCT files, or file containing paths to GCTs.') 12 | parser.add_argument('prefix', help='Prefix for output file: ${prefix}.gct.gz') 13 | parser.add_argument('--parquet', action='store_true', help='Write to parquet format instead of GCT') 14 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 15 | args = parser.parse_args() 16 | 17 | if len(args.input_files) == 1 and '.gct' not in args.input_files[0]: 18 | with open(args.input_files[0]) as f: 19 | paths = 
f.read().strip().split('\n') 20 | else: 21 | paths = args.input_files 22 | 23 | path_dict = {os.path.split(i)[1].split('.')[0]:i for i in paths} 24 | sample_ids = sorted(path_dict.keys()) 25 | assert len(sample_ids) == len(paths) 26 | 27 | # detect format 28 | if all([i.endswith('.parquet') for i in paths]): 29 | sample_id = sample_ids[0] 30 | df = pd.read_parquet(path_dict[sample_id]) 31 | gct_df = pd.DataFrame(0, index=df.index, columns=['Description']+list(sample_ids), dtype=df[sample_id].dtype) 32 | gct_df['Description'] = df['Description'] 33 | gct_df[sample_id] = df[sample_id] 34 | for k, sample_id in enumerate(sample_ids[1:], 2): 35 | print(f"\rProcessing {k}/{len(sample_ids)}", end='' if k < len(sample_ids) else None, flush=True) 36 | df = pd.read_parquet(path_dict[sample_id], columns=[sample_id]) 37 | gct_df[sample_id] = df[sample_id] 38 | else: # .gct or .gct.gz 39 | sample_id = sample_ids[0] 40 | df = pd.read_csv(path_dict[sample_id], sep='\t', skiprows=3, header=None, index_col=0, names=['Name', 'Description', sample_id]) 41 | if df[sample_id].dtype == np.float64: 42 | dtype = np.float32 43 | elif df[sample_id].dtype == np.int64: 44 | dtype = np.int32 45 | else: 46 | dtype = df[sample_id].dtype.type 47 | gct_df = pd.DataFrame(0, index=df.index, columns=['Description']+list(sample_ids), dtype=dtype) 48 | gct_df['Description'] = df['Description'] 49 | gct_df[sample_id] = df[sample_id].astype(dtype) 50 | for k, sample_id in enumerate(sample_ids[1:], 2): 51 | print(f"\rProcessing {k}/{len(sample_ids)}", end='' if k < len(sample_ids) else None, flush=True) 52 | df = pd.read_csv(path_dict[sample_id], sep='\t', skiprows=3, header=None, usecols=[0,2], 53 | index_col=0, names=['Name', sample_id], dtype={'Name':str, sample_id:dtype}) 54 | gct_df[sample_id] = df[sample_id] 55 | 56 | if args.parquet: 57 | gct_df.to_parquet(os.path.join(args.output_dir, f"{args.prefix}.gct.parquet")) 58 | else: 59 | qtl.io.write_gct(gct_df, os.path.join(args.output_dir, f"{args.prefix}.gct.gz"), float_format='%.6g', compresslevel=6) 60 | -------------------------------------------------------------------------------- /rnaseq/src/process_star_junctions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | import os 5 | import gzip 6 | import qtl.io 7 | 8 | parser = argparse.ArgumentParser(description='Convert STAR junction output (uniquely mapping reads) to GCT.') 9 | parser.add_argument('star_junction_output', help='SJ.out.tab from STAR') 10 | parser.add_argument('reference_junctions', help='File (tsv) containing columns: chr, intron_start, intron_end, strand, gene_id') 11 | parser.add_argument('prefix', help='Prefix for output file: ${prefix}.SJ.gct.gz') 12 | parser.add_argument('--parquet', action='store_true', help='Write to parquet format instead of GCT') 13 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 14 | args = parser.parse_args() 15 | 16 | # STAR junction format: 17 | # 0) chromosome 18 | # 1) 1st base of intron (1-based) 19 | # 2) last base of intron (1-based) 20 | # 3) strand (0: undefined, 1: +, 2: -) 21 | # 4) intron motif: 0: non-canonical 22 | # 1: GT/AG 23 | # 2: CT/AC 24 | # 3: GC/AG 25 | # 4: CT/GC 26 | # 5: AT/AC 27 | # 6: GT/AT 28 | # 5) annotation status: 0: unannotated 29 | # 1: annotated 30 | # 6) number of uniquely mapping reads crossing the junction 31 | # 7) number of multi-mapping reads crossing the junction 32 | # 8) maximum spliced alignment overhang 33 | 34 | 
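# Illustrative example (hypothetical values): an SJ.out.tab row such as
#   chr1  14830  14969  2  2  1  10  3  38
# encodes a minus-strand (2), CT/AC-motif (2), annotated (1) junction crossed by
# 10 uniquely mapping and 3 multi-mapping reads; below, it is indexed as
# 'chr1:14830-14969:-' and quantified with the unique-read count (10).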
35 | # read STAR output 36 | columns = ['chr', 'intron_start', 'intron_end', 'strand', 'motif', 'status', 'n_unique', 'n_multi', 'max_overhang'] 37 | dtype = {'chr':str, 'intron_start':np.int32, 'intron_end':np.int32, 'strand':np.int32, 'motif':np.int32, 38 | 'status':np.int32, 'n_unique':np.int32, 'n_multi':np.int32, 'max_overhang':np.int32} 39 | junctions_df = pd.read_csv(args.star_junction_output, sep='\t', header=None, names=columns, dtype=dtype) 40 | junctions_df['strand'] = junctions_df['strand'].map({0: '?', 1:'+', 2:'-'}) 41 | junctions_df.index = (junctions_df['chr'] + ':' + junctions_df['intron_start'].astype(str) 42 | + '-' + junctions_df['intron_end'].astype(str) + ':' + junctions_df['strand']) 43 | 44 | # read reference 45 | reference_df = pd.read_csv(args.reference_junctions, sep='\t', 46 | dtype={'chr':str, 'intron_start':np.int32, 'intron_end':np.int32, 'gene_id':str}) 47 | reference_df.index = (reference_df['chr'] + ':' + reference_df['intron_start'].astype(str) 48 | + '-' + reference_df['intron_end'].astype(str) + ':' + reference_df['strand']) 49 | assert not reference_df.index.duplicated().any(), "Junction annotation must not contain any duplicated entries" 50 | 51 | # use unique-mapping reads 52 | gct_df = reference_df[['gene_id']].join(junctions_df['n_unique']) 53 | gct_df['n_unique'] = gct_df['n_unique'].fillna(0).astype(np.int32) 54 | # write as GCT 55 | gct_df.rename(columns={'gene_id':'Description', 'n_unique':args.prefix}, inplace=True) 56 | gct_df.index.name = 'Name' 57 | if args.parquet: 58 | gct_df.to_parquet(os.path.join(args.output_dir, f"{args.prefix}.SJ.parquet")) 59 | else: 60 | qtl.io.write_gct(gct_df, os.path.join(args.output_dir, f"{args.prefix}.SJ.gct.gz")) 61 | -------------------------------------------------------------------------------- /rnaseq/src/run_MarkDuplicates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | 4 | import argparse 5 | import os 6 | import struct 7 | import subprocess 8 | from datetime import datetime 9 | import contextlib 10 | 11 | @contextlib.contextmanager 12 | def cd(cd_path): 13 | saved_path = os.getcwd() 14 | os.chdir(cd_path) 15 | yield 16 | os.chdir(saved_path) 17 | 18 | parser = argparse.ArgumentParser(description='Run Picard MarkDuplicates.') 19 | parser.add_argument('input_bam', type=str, help='BAM file') 20 | parser.add_argument('prefix', type=str, help='Prefix for output files; usually sample_id') 21 | parser.add_argument('-o', '--output_dir', default=os.getcwd(), help='Output directory') 22 | parser.add_argument('-m', '--memory', default=3, type=int, help='Memory, in GB') 23 | parser.add_argument('--max_records_in_ram', default=500000, type=int, 24 | help='Number of records stored in RAM before spilling to disk') 25 | parser.add_argument('--sorting_collection_size_ratio', default=0.25, type=float) 26 | parser.add_argument('--tagging_policy', default='DontTag', choices=['All', 'OpticalOnly', 'DontTag']) 27 | parser.add_argument('--optical_duplicate_pixel_distance', default=100, 28 | help='Maximum offset between two duplicate clusters. 
100 (default) is appropriate for unpatterned, 2500 recommended for patterned flowcells.') 29 | parser.add_argument('--jar', default='/opt/picard-tools/picard.jar', help='Path to Picard jar') 30 | args = parser.parse_args() 31 | 32 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Starting MarkDuplicates', flush=True) 33 | 34 | if not os.path.exists(args.output_dir): 35 | os.makedirs(args.output_dir) 36 | 37 | with cd(args.output_dir): 38 | subprocess.check_call('java -jar -Xmx{}g {}'.format(args.memory, args.jar)\ 39 | +' MarkDuplicates I={}'.format(args.input_bam)\ 40 | +' O={}'.format(os.path.basename(args.input_bam).replace('.bam', '.md.bam'))\ 41 | +' PROGRAM_RECORD_ID=null'\ 42 | +' MAX_RECORDS_IN_RAM={}'.format(args.max_records_in_ram)\ 43 | +' SORTING_COLLECTION_SIZE_RATIO={}'.format(args.sorting_collection_size_ratio)\ 44 | +' TMP_DIR={}'.format(args.output_dir)\ 45 | +' M={}.marked_dup_metrics.txt'.format(args.prefix)\ 46 | +' ASSUME_SORT_ORDER=coordinate'\ 47 | +' TAGGING_POLICY={}'.format(args.tagging_policy)\ 48 | +' OPTICAL_DUPLICATE_PIXEL_DISTANCE={}'.format(args.optical_duplicate_pixel_distance), 49 | shell=True) 50 | 51 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Finished MarkDuplicates', flush=True) 52 | -------------------------------------------------------------------------------- /rnaseq/src/run_RSEM.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | 4 | import argparse 5 | import os.path 6 | import subprocess 7 | from datetime import datetime 8 | import contextlib 9 | 10 | @contextlib.contextmanager 11 | def cd(cd_path): 12 | saved_path = os.getcwd() 13 | os.chdir(cd_path) 14 | yield 15 | os.chdir(saved_path) 16 | 17 | parser = argparse.ArgumentParser(description='Run RSEM') 18 | parser.add_argument('rsem_ref_dir', type=str, help='Path to RSEM reference files generated with rsem-prepare-reference. 
The reference file prefix (for files within rsem_ref_dir) must be "rsem_reference"') 19 | parser.add_argument('input_file', type=str, help='BAM file or .gz.list file with paths to fastq.gz files') 20 | parser.add_argument('prefix', help='Prefix for output file names') 21 | parser.add_argument('-o', '--output_dir', default='.', help='Output directory') 22 | parser.add_argument('--max_frag_len', default='1000', help='Maximum fragment length') 23 | parser.add_argument('--estimate_rspd', type=str.lower, choices=['true', 'false'], default='true', help='Set to estimate the read start position distribution from data (recommended)') 24 | parser.add_argument('--calc_ci', type=str.lower, choices=['true', 'false'], default='false', help='Calculate 95%% credibility intervals and posterior mean estimates') # '%%' renders as '%'; argparse %-formats help strings 25 | parser.add_argument('--is_stranded', type=str.lower, choices=['true', 'false'], default='false', help='Stranded protocol') 26 | parser.add_argument('--paired_end', type=str.lower, choices=['true', 'false'], default='true', help='Paired-end protocol') 27 | parser.add_argument('-t', '--threads', default='4', help='Number of threads') 28 | parser.add_argument('--bowtie_version', choices=['1', '2'], default='2', help='Select Bowtie version') 29 | args = parser.parse_args() 30 | 31 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running RSEM', flush=True) 32 | with cd(args.output_dir): 33 | cmd = 'rsem-calculate-expression --num-threads '+args.threads+' --fragment-length-max '+args.max_frag_len+' --no-bam-output' 34 | 35 | if args.paired_end=='true': 36 | cmd += ' --paired-end' 37 | 38 | if args.estimate_rspd=='true': 39 | cmd += ' --estimate-rspd' 40 | 41 | if args.calc_ci=='true': 42 | cmd += ' --calc-ci' 43 | 44 | if args.is_stranded=='true': 45 | cmd += ' --forward-prob 0.0' 46 | 47 | if os.path.splitext(args.input_file)[1]=='.bam': 48 | cmd += ' --bam '+args.input_file+' '+os.path.join(args.rsem_ref_dir,'rsem_reference')+' '+args.prefix+'.rsem' 49 | else: 50 | with open(args.input_file) as fqlist: 51 | fastq1 = fqlist.readline().strip() 52 | fastq2 = fqlist.readline().strip() 53 | if args.bowtie_version=='2': 54 | cmd += ' --bowtie2' 55 | cmd += ' --bowtie-chunkmbs 128 <(gunzip -c '+fastq1+') <(gunzip -c '+fastq2+') '+os.path.join(args.rsem_ref_dir,'rsem_reference')+' '+args.prefix+'.rsem' 56 | 57 | # run RSEM 58 | print(' * command: '+cmd, flush=True) 59 | subprocess.check_call(cmd, shell=True, executable='/bin/bash') 60 | 61 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Finished RSEM', flush=True) 62 | -------------------------------------------------------------------------------- /rnaseq/src/run_SamToFastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | 4 | import argparse 5 | import os 6 | import struct 7 | import subprocess 8 | from datetime import datetime 9 | import contextlib 10 | 11 | @contextlib.contextmanager 12 | def cd(cd_path): 13 | saved_path = os.getcwd() 14 | os.chdir(cd_path) 15 | yield 16 | os.chdir(saved_path) 17 | 18 | 19 | parser = argparse.ArgumentParser(description='Convert BAM/CRAM to FASTQ using Picard SamToFastq.') 20 | parser.add_argument('bam_file', type=str, help='BAM or CRAM file') 21 | parser.add_argument('-p', '--prefix', type=str, default='Reads', help='Prefix for output files; usually sample_id') 22 | parser.add_argument('-o', '--output_dir', default=os.getcwd(), help='Directory to which FASTQs will be written') 23 | parser.add_argument('-m', 
'--memory', default='8', type=str, help='Memory, in GB') 24 | parser.add_argument('--reference_fasta', default=None, help='Path to reference sequence FASTA (required if input is CRAM)') 25 | parser.add_argument('--jar', default='/opt/picard-tools/picard.jar', help='Path to Picard jar') 26 | parser.add_argument('--gzip', type=str.lower, default='1', help='gzip compression level for FASTQs; see "man gzip"') 27 | parser.add_argument('--include_non_pf_reads', type=str.lower, choices=['true', 'false'], default='true', help='Sets INCLUDE_NON_PF_READS option (PF: passed filtering). SamToFastq default: false') 28 | parser.add_argument('--include_non_primary_alignments', type=str.lower, choices=['true', 'false'], default='false', help='Sets INCLUDE_NON_PRIMARY_ALIGNMENTS option. SamToFastq default: false') 29 | args = parser.parse_args() 30 | 31 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Starting SamToFastq', flush=True) 32 | 33 | if not os.path.exists(args.output_dir): 34 | os.makedirs(args.output_dir) 35 | 36 | # Make named pipes for gzip 37 | with cd(args.output_dir): 38 | fastq1 = args.prefix+'_1.fastq.gz' 39 | fastq2 = args.prefix+'_2.fastq.gz' 40 | fastq0 = args.prefix+'_unpaired.fastq.gz' 41 | 42 | subprocess.check_call('mkfifo read1_pipe read2_pipe read0_pipe', shell=True) 43 | 44 | # Set gzip streams 45 | subprocess.check_call('gzip -'+args.gzip+' -c < read1_pipe > '+fastq1+' &', shell=True) 46 | subprocess.check_call('gzip -'+args.gzip+' -c < read2_pipe > '+fastq2+' &', shell=True) 47 | subprocess.check_call('gzip -'+args.gzip+' -c < read0_pipe > '+fastq0+' &', shell=True) 48 | 49 | # SamToFastq (write to pipes) 50 | cmd = 'java -jar -Xmx'+str(int(args.memory))+'g '+args.jar+' SamToFastq INPUT='+args.bam_file\ 51 | +' INCLUDE_NON_PF_READS='+args.include_non_pf_reads\ 52 | +' INCLUDE_NON_PRIMARY_ALIGNMENTS='+args.include_non_primary_alignments\ 53 | +' VALIDATION_STRINGENCY=SILENT FASTQ=read1_pipe SECOND_END_FASTQ=read2_pipe UNPAIRED_FASTQ=read0_pipe' 54 | if args.reference_fasta is not None: 55 | cmd += ' REFERENCE_SEQUENCE={}'.format(args.reference_fasta) 56 | subprocess.check_call(cmd, shell=True) 57 | 58 | # Delete named pipes 59 | subprocess.check_call('rm read1_pipe read2_pipe read0_pipe', shell=True) 60 | 61 | # Delete unpaired reads FASTQ if empty 62 | with open(fastq0, 'rb') as f0: 63 | f0.seek(-4,2) 64 | if struct.unpack('<I', f0.read(4))[0]==0: # gzip stores the uncompressed size (ISIZE) in the last 4 bytes; 0 means no unpaired reads were written 65 | os.remove(fastq0) 66 | 67 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Finished SamToFastq', flush=True) 68 | -------------------------------------------------------------------------------- /rnaseq/src/run_remove_IDS_reads.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Removes reads whose CIGAR contains insertions, deletions, or soft-clips (I/D/S) from a transcriptome BAM 3 | # Usage: run_remove_IDS_reads.sh <input_bam> <prefix> 4 | 5 | set -euo pipefail 6 | 7 | input_bam=$1 8 | prefix=$2 9 | output_bam=${prefix}.Aligned.toTranscriptome_noIDS.out.bam 10 | 11 | # partition read IDs by CIGAR content 12 | samtools view ${input_bam} | awk '{print $1}' | sort | uniq > readids_all 13 | samtools view ${input_bam} | awk '$6 ~ "I|D|S"' | awk '{print $1}' | sort | uniq > readids_flagIDS 14 | comm -23 readids_all readids_flagIDS > readids_flagnonIDS 15 | 16 | echo readids all = $(cat readids_all | wc -l) 17 | echo readids for I/D/S flag = $(cat readids_flagIDS | wc -l) 18 | echo readids for non I/D/S flag = $(cat readids_flagnonIDS | wc -l) 19 | 20 | samtools view -N readids_flagnonIDS -hb -o ${output_bam} ${input_bam} 21 | -------------------------------------------------------------------------------- /rnaseq/src/run_rnaseqc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Author: Francois Aguet 3 | import argparse 4 | import subprocess 5 | from datetime import datetime 6 | import os 7 | 8 | 9 | parser = argparse.ArgumentParser(description='Wrapper for RNA-SeQC 2') 10 | parser.add_argument('genes_gtf', type=str, help='Gene annotation GTF') 11 | parser.add_argument('bam_file', type=str, help='BAM file') 12 | parser.add_argument('prefix', type=str, default='Reads', help='Prefix for output files; usually sample_id') 13 | parser.add_argument('-o', 
'--output_dir', default=os.getcwd(), help='Output directory') 14 | parser.add_argument('--stranded', default=None, choices=['rf', 'fr'], help='Strandedness for stranded libraries') 15 | parser.add_argument('--bed', default=None, help='BED file with intervals for estimating insert size distribution') 16 | args = parser.parse_args() 17 | 18 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Running RNA-SeQC', flush=True) 19 | 20 | cmd = 'rnaseqc {} {} {}'.format(args.genes_gtf, args.bam_file, args.output_dir) \ 21 | + ' -s '+args.prefix \ 22 | + ' -vv' 23 | if args.stranded is not None: 24 | cmd += ' --stranded '+args.stranded 25 | if args.bed is not None: 26 | cmd += ' --bed '+args.bed 27 | print(' * command: "{}"'.format(cmd), flush=True) 28 | subprocess.check_call(cmd, shell=True) 29 | 30 | # gzip GCTs (RNA-SeQC writes them to output_dir) 31 | subprocess.check_call('gzip {0}.exon_reads.gct {0}.gene_tpm.gct {0}.gene_reads.gct'.format(os.path.join(args.output_dir, args.prefix)), shell=True) 32 | 33 | print('['+datetime.now().strftime("%b %d %H:%M:%S")+'] Finished RNA-SeQC', flush=True) 34 | -------------------------------------------------------------------------------- /rnaseq/star.wdl: -------------------------------------------------------------------------------- 1 | task star { 2 | 3 | File fastq1 4 | File? fastq2 5 | String prefix 6 | File star_index 7 | 8 | # STAR options 9 | Int? outFilterMultimapNmax 10 | Int? alignSJoverhangMin 11 | Int? alignSJDBoverhangMin 12 | Int? outFilterMismatchNmax 13 | Float? outFilterMismatchNoverLmax 14 | Int? alignIntronMin 15 | Int? alignIntronMax 16 | Int? alignMatesGapMax 17 | String? outFilterType 18 | Float? outFilterScoreMinOverLread 19 | Float? outFilterMatchNminOverLread 20 | Int? limitSjdbInsertNsj 21 | String? outSAMstrandField 22 | String? outFilterIntronMotifs 23 | String? alignSoftClipAtReferenceEnds 24 | String? quantMode 25 | String? outSAMattrRGline 26 | String? outSAMattributes 27 | File? varVCFfile 28 | String? waspOutputMode 29 | Int? chimSegmentMin 30 | Int? chimJunctionOverhangMin 31 | String? chimOutType 32 | Int? chimMainSegmentMultNmax 33 | Int? chimOutJunctionFormat 34 | File? sjdbFileChrStartEnd 35 | String? quantTranscriptomeSAMoutput 36 | Int? winAnchorMultimapNmax 37 | String? 
genomeTransformOutput 38 | 39 | Int memory 40 | Int disk_space 41 | Int num_threads 42 | Int num_preempt 43 | 44 | command { 45 | set -euo pipefail 46 | 47 | if [[ ${fastq1} == *".tar" || ${fastq1} == *".tar.gz" ]]; then 48 | tar -xvvf ${fastq1} 49 | fastq1_abs=$(for f in *_1.fastq*; do echo "$(pwd)/$f"; done | paste -s -d ',') 50 | fastq2_abs=$(for f in *_2.fastq*; do echo "$(pwd)/$f"; done | paste -s -d ',') 51 | if [[ $fastq1_abs == *"*_1.fastq*" ]]; then # no paired-end FASTQs found; check for single-end FASTQ 52 | fastq1_abs=$(for f in *.fastq*; do echo "$(pwd)/$f"; done | paste -s -d ',') 53 | fastq2_abs='' 54 | fi 55 | else 56 | # make sure paths are absolute 57 | fastq1_abs=${fastq1} 58 | fastq2_abs=${fastq2} 59 | if [[ $fastq1_abs != /* ]]; then 60 | fastq1_abs=$PWD/$fastq1_abs 61 | fastq2_abs=$PWD/$fastq2_abs 62 | fi 63 | fi 64 | 65 | echo "FASTQs:" 66 | echo $fastq1_abs 67 | echo $fastq2_abs 68 | 69 | # extract index 70 | echo $(date +"[%b %d %H:%M:%S] Extracting STAR index") 71 | mkdir star_index 72 | tar -xvvf ${star_index} -C star_index --strip-components=1 73 | 74 | mkdir star_out 75 | # placeholders for optional outputs 76 | touch star_out/${prefix}.Aligned.toTranscriptome.out.bam 77 | touch star_out/${prefix}.Chimeric.out.junction.gz 78 | touch star_out/${prefix}.Chimeric.out.sorted.bam 79 | touch star_out/${prefix}.Chimeric.out.sorted.bam.bai 80 | touch star_out/${prefix}.ReadsPerGene.out.tab # run_STAR.py will gzip 81 | 82 | /src/run_STAR.py \ 83 | star_index $fastq1_abs $fastq2_abs ${prefix} \ 84 | --output_dir star_out \ 85 | ${"--outFilterMultimapNmax " + outFilterMultimapNmax} \ 86 | ${"--alignSJoverhangMin " + alignSJoverhangMin} \ 87 | ${"--alignSJDBoverhangMin " + alignSJDBoverhangMin} \ 88 | ${"--outFilterMismatchNmax " + outFilterMismatchNmax} \ 89 | ${"--outFilterMismatchNoverLmax " + outFilterMismatchNoverLmax} \ 90 | ${"--alignIntronMin " + alignIntronMin} \ 91 | ${"--alignIntronMax " + alignIntronMax} \ 92 | ${"--alignMatesGapMax " + alignMatesGapMax} \ 93 | ${"--outFilterType " + outFilterType} \ 94 | ${"--outFilterScoreMinOverLread " + outFilterScoreMinOverLread} \ 95 | ${"--outFilterMatchNminOverLread " + outFilterMatchNminOverLread} \ 96 | ${"--limitSjdbInsertNsj " + limitSjdbInsertNsj} \ 97 | ${"--outSAMstrandField " + outSAMstrandField} \ 98 | ${"--outFilterIntronMotifs " + outFilterIntronMotifs} \ 99 | ${"--alignSoftClipAtReferenceEnds " + alignSoftClipAtReferenceEnds} \ 100 | ${"--quantMode " + quantMode} \ 101 | ${"--outSAMattrRGline " + outSAMattrRGline} \ 102 | ${"--outSAMattributes " + outSAMattributes} \ 103 | ${"--varVCFfile " + varVCFfile} \ 104 | ${"--waspOutputMode " + waspOutputMode} \ 105 | ${"--chimSegmentMin " + chimSegmentMin} \ 106 | ${"--chimJunctionOverhangMin " + chimJunctionOverhangMin} \ 107 | ${"--chimOutType " + chimOutType} \ 108 | ${"--chimMainSegmentMultNmax " + chimMainSegmentMultNmax} \ 109 | ${"--chimOutJunctionFormat " + chimOutJunctionFormat} \ 110 | ${"--sjdbFileChrStartEnd " + sjdbFileChrStartEnd} \ 111 | ${"--quantTranscriptomeSAMoutput " + quantTranscriptomeSAMoutput} \ 112 | ${"--winAnchorMultimapNmax " + winAnchorMultimapNmax} \ 113 | ${"--genomeTransformOutput " + genomeTransformOutput} \ 114 | --threads ${num_threads} 115 | } 116 | 117 | output { 118 | File bam_file = "star_out/${prefix}.Aligned.sortedByCoord.out.bam" 119 | File bam_index = "star_out/${prefix}.Aligned.sortedByCoord.out.bam.bai" 120 | File transcriptome_bam = "star_out/${prefix}.Aligned.toTranscriptome.out.bam" 121 | File chimeric_junctions = 
"star_out/${prefix}.Chimeric.out.junction.gz" 122 | File chimeric_bam_file = "star_out/${prefix}.Chimeric.out.sorted.bam" 123 | File chimeric_bam_index = "star_out/${prefix}.Chimeric.out.sorted.bam.bai" 124 | File read_counts = "star_out/${prefix}.ReadsPerGene.out.tab.gz" 125 | File junctions = "star_out/${prefix}.SJ.out.tab.gz" 126 | File junctions_pass1 = "star_out/${prefix}._STARpass1/${prefix}.SJ.pass1.out.tab.gz" 127 | Array[File] logs = ["star_out/${prefix}.Log.final.out", "star_out/${prefix}.Log.out", "star_out/${prefix}.Log.progress.out"] 128 | } 129 | 130 | runtime { 131 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 132 | memory: "${memory}GB" 133 | disks: "local-disk ${disk_space} HDD" 134 | cpu: "${num_threads}" 135 | preemptible: "${num_preempt}" 136 | } 137 | 138 | meta { 139 | author: "Francois Aguet" 140 | } 141 | } 142 | 143 | 144 | workflow star_workflow { 145 | call star 146 | } 147 | -------------------------------------------------------------------------------- /rnaseq/star_fastq_list.wdl: -------------------------------------------------------------------------------- 1 | task star_fastq_list { 2 | 3 | Array[File] fastq1 4 | Array[File]? fastq2 5 | String prefix 6 | File star_index 7 | 8 | # STAR options 9 | Int? outFilterMultimapNmax 10 | Int? alignSJoverhangMin 11 | Int? alignSJDBoverhangMin 12 | Int? outFilterMismatchNmax 13 | Float? outFilterMismatchNoverLmax 14 | Int? alignIntronMin 15 | Int? alignIntronMax 16 | Int? alignMatesGapMax 17 | String? outFilterType 18 | Float? outFilterScoreMinOverLread 19 | Float? outFilterMatchNminOverLread 20 | Int? limitSjdbInsertNsj 21 | String? outSAMstrandField 22 | String? outFilterIntronMotifs 23 | String? alignSoftClipAtReferenceEnds 24 | String? quantMode 25 | String? outSAMattrRGline 26 | String? outSAMattributes 27 | File? varVCFfile 28 | String? waspOutputMode 29 | Int? chimSegmentMin 30 | Int? chimJunctionOverhangMin 31 | String? chimOutType 32 | Int? chimMainSegmentMultNmax 33 | Int? chimOutJunctionFormat 34 | File? 
sjdbFileChrStartEnd 35 | 36 | Int memory 37 | Int disk_space 38 | Int num_threads 39 | Int num_preempt 40 | 41 | command { 42 | set -euo pipefail 43 | 44 | # extract index 45 | echo $(date +"[%b %d %H:%M:%S] Extracting STAR index") 46 | mkdir star_index 47 | tar -xvvf ${star_index} -C star_index --strip-components=1 48 | 49 | mkdir star_out 50 | # placeholders for optional outputs 51 | touch star_out/${prefix}.Aligned.toTranscriptome.out.bam 52 | touch star_out/${prefix}.Chimeric.out.sorted.bam 53 | touch star_out/${prefix}.Chimeric.out.sorted.bam.bai 54 | touch star_out/${prefix}.ReadsPerGene.out.tab # run_STAR.py will gzip 55 | 56 | /src/run_STAR.py \ 57 | star_index ${sep=',' fastq1} ${sep=',' fastq2} ${prefix} \ 58 | --output_dir star_out \ 59 | ${"--outFilterMultimapNmax " + outFilterMultimapNmax} \ 60 | ${"--alignSJoverhangMin " + alignSJoverhangMin} \ 61 | ${"--alignSJDBoverhangMin " + alignSJDBoverhangMin} \ 62 | ${"--outFilterMismatchNmax " + outFilterMismatchNmax} \ 63 | ${"--outFilterMismatchNoverLmax " + outFilterMismatchNoverLmax} \ 64 | ${"--alignIntronMin " + alignIntronMin} \ 65 | ${"--alignIntronMax " + alignIntronMax} \ 66 | ${"--alignMatesGapMax " + alignMatesGapMax} \ 67 | ${"--outFilterType " + outFilterType} \ 68 | ${"--outFilterScoreMinOverLread " + outFilterScoreMinOverLread} \ 69 | ${"--outFilterMatchNminOverLread " + outFilterMatchNminOverLread} \ 70 | ${"--limitSjdbInsertNsj " + limitSjdbInsertNsj} \ 71 | ${"--outSAMstrandField " + outSAMstrandField} \ 72 | ${"--outFilterIntronMotifs " + outFilterIntronMotifs} \ 73 | ${"--alignSoftClipAtReferenceEnds " + alignSoftClipAtReferenceEnds} \ 74 | ${"--quantMode " + quantMode} \ 75 | ${"--outSAMattrRGline " + outSAMattrRGline} \ 76 | ${"--outSAMattributes " + outSAMattributes} \ 77 | ${"--varVCFfile " + varVCFfile} \ 78 | ${"--waspOutputMode " + waspOutputMode} \ 79 | ${"--chimSegmentMin " + chimSegmentMin} \ 80 | ${"--chimJunctionOverhangMin " + chimJunctionOverhangMin} \ 81 | ${"--chimOutType " + chimOutType} \ 82 | ${"--chimMainSegmentMultNmax " + chimMainSegmentMultNmax} \ 83 | ${"--chimOutJunctionFormat " + chimOutJunctionFormat} \ 84 | ${"--sjdbFileChrStartEnd " + sjdbFileChrStartEnd} \ 85 | --threads ${num_threads} 86 | } 87 | 88 | output { 89 | File bam_file = "star_out/${prefix}.Aligned.sortedByCoord.out.bam" 90 | File bam_index = "star_out/${prefix}.Aligned.sortedByCoord.out.bam.bai" 91 | File transcriptome_bam = "star_out/${prefix}.Aligned.toTranscriptome.out.bam" 92 | File chimeric_junctions = "star_out/${prefix}.Chimeric.out.junction.gz" 93 | File chimeric_bam_file = "star_out/${prefix}.Chimeric.out.sorted.bam" 94 | File chimeric_bam_index = "star_out/${prefix}.Chimeric.out.sorted.bam.bai" 95 | File read_counts = "star_out/${prefix}.ReadsPerGene.out.tab.gz" 96 | File junctions = "star_out/${prefix}.SJ.out.tab.gz" 97 | File junctions_pass1 = "star_out/${prefix}._STARpass1/${prefix}.SJ.pass1.out.tab.gz" 98 | Array[File] logs = ["star_out/${prefix}.Log.final.out", "star_out/${prefix}.Log.out", "star_out/${prefix}.Log.progress.out"] 99 | } 100 | 101 | runtime { 102 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 103 | memory: "${memory}GB" 104 | disks: "local-disk ${disk_space} HDD" 105 | cpu: "${num_threads}" 106 | preemptible: "${num_preempt}" 107 | } 108 | 109 | meta { 110 | author: "Francois Aguet" 111 | } 112 | } 113 | 114 | 115 | workflow star_fastq_list_workflow { 116 | call star_fastq_list 117 | } 118 | -------------------------------------------------------------------------------- 
/rnaseq/star_index.wdl: -------------------------------------------------------------------------------- 1 | task star_index { 2 | 3 | File reference_fasta 4 | File annotation_gtf 5 | String prefix 6 | Int overhang 7 | Int? limit_sjdb_insert_junctions 8 | Int? suffix_length_max 9 | String? transform_type 10 | File? transform_vcf 11 | 12 | Int memory 13 | Int disk_space 14 | Int num_threads 15 | Int num_preempt 16 | 17 | command { 18 | set -euo pipefail 19 | mkdir ${prefix} 20 | STAR \ 21 | --runMode genomeGenerate \ 22 | --genomeDir ${prefix} \ 23 | --genomeFastaFiles ${reference_fasta} \ 24 | --sjdbGTFfile ${annotation_gtf} \ 25 | --sjdbOverhang ${overhang} \ 26 | ${"--limitSjdbInsertNsj " + limit_sjdb_insert_junctions} \ 27 | ${"--genomeSuffixLengthMax " + suffix_length_max} \ 28 | ${"--genomeTransformType " + transform_type} \ 29 | ${"--genomeTransformVCF " + transform_vcf} \ 30 | --runThreadN ${num_threads} 31 | tar -cvzf ${prefix}.tar.gz ${prefix} 32 | } 33 | 34 | output { 35 | File star_index = "${prefix}.tar.gz" 36 | } 37 | 38 | runtime { 39 | docker: "gcr.io/broad-cga-francois-gtex/gtex_rnaseq:V10" 40 | memory: "${memory}GB" 41 | disks: "local-disk ${disk_space} HDD" 42 | cpu: "${num_threads}" 43 | preemptible: "${num_preempt}" 44 | } 45 | 46 | meta { 47 | author: "Francois Aguet" 48 | } 49 | } 50 | 51 | 52 | workflow star_index_workflow { 53 | call star_index 54 | } 55 | --------------------------------------------------------------------------------
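Note on the index-packaging convention: `rsem_reference.wdl` and `star_index.wdl` both archive a single top-level directory (`${prefix}/`), and the consuming tasks (`rsem.wdl`, `star.wdl`) extract with `tar -x ... --strip-components=1`, which silently misplaces files if the archive has any other layout. A minimal pre-flight check, as a sketch — the tarball name is hypothetical:

```python
import tarfile

def check_index_tarball(path):
    """Verify that every member sits under a single top-level directory,
    as assumed by the `--strip-components=1` extraction in rsem.wdl/star.wdl."""
    with tarfile.open(path) as tar:
        names = [m.name for m in tar.getmembers()]
    top = {n.split('/')[0] for n in names}
    assert len(top) == 1, f'expected one top-level directory, found: {sorted(top)}'
    # return the member paths relative to the top-level directory
    return sorted(n.split('/', 1)[1] for n in names if '/' in n)

# e.g. (hypothetical file name):
# print(check_index_tarball('GRCh38_GENCODEv26_star_index.tar.gz'))
```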