├── .gitattributes
├── .gitignore
├── GLASS-WG.Rproj
├── LICENSE
├── R
    ├── cnv
    │   ├── README.md
    │   ├── aneuploidy.R
    │   ├── archive
    │   │   ├── check-gatk-modelsegments-hets.R
    │   │   ├── cnv-sig.R
    │   │   ├── compare-CNV.R
    │   │   ├── gatk_write_seg_to_db.R
    │   │   ├── loh_to_igv.R
    │   │   ├── noncodel-cell-cycle-aneuploidy.R
    │   │   ├── paired_delta_seg_to_igv.R
    │   │   └── prop_het_plot.R
    │   ├── copy-number-optimization.R
    │   ├── copynumber-seg-gene-level.R
    │   ├── determine-codel-status.R
    │   ├── fig3-aneuploidy.R
    │   ├── figEDF-cnv-arm-heatmap.R
    │   ├── prepare_gistic.R
    │   ├── titan-assessment.R
    │   └── titan-results-enumerate-events.R
    ├── figures
    │   ├── EDF2-fractionmf.R
    │   ├── F1-F3-GLASS-heatmap.R
    │   ├── F3b_d-cell-cycle-analysis.R
    │   ├── fig-PyClone-ccf-shared.R
    │   └── fig-mutational-signatures.R
    ├── manifest
    │   ├── README.md
    │   ├── case-western-create-manifest.R
    │   ├── create-stjude-barcodes.R
    │   ├── dfci-create-wxs-readgroups.R
    │   ├── dkfz-create-manifest.R
    │   ├── gdc-create-manifest.R
    │   ├── glass-LU-create-manifest.R
    │   ├── glass-stjude-create-manifest.R
    │   ├── glass-wxs-create-manifest.R
    │   ├── hf-create-manifest.R
    │   ├── hk-create-manifest.R
    │   ├── hongkong-seq-json-metadata.R
    │   ├── jdg-create-manifest.R
    │   ├── life-history-barcode-generation.R
    │   ├── make-manifest-K2.R
    │   ├── mda-create-manifest.R
    │   ├── merge-manifest.R
    │   ├── mgh-create-manifest.R
    │   └── nested-list-example.R
    ├── misc
    │   ├── README.md
    │   ├── blocklist2db.R
    │   ├── cytoband2DB.R
    │   ├── dashboard.R
    │   ├── geneTable2DB.R
    │   ├── roel-grant.R
    │   ├── seqz2DB.R
    │   ├── st-jude-life-history-identification.R
    │   ├── table-to-json-example.R
    │   ├── titan2DB.R
    │   └── titanparams2db.R
    ├── neoantigens
    │   ├── analysis
    │   │   ├── SuppTable6_writetotext.r
    │   │   ├── neoag_depletion_hla_count.r
    │   │   ├── neoantigen_depletion_subclonal_selection.r
    │   │   └── neoantigen_depletion_survival_cox.r
    │   ├── figures
    │   │   ├── ExtendedDataFig_neoag_ccf_shared.r
    │   │   ├── ExtendedDataFig_neoag_depletion_CIBERSORT_barplots.r
    │   │   ├── Fig4_neoag_depletion_clonality_timepoint.r
    │   │   ├── Fig4_neoag_depletion_subtype_timepoint_hm.r
    │   │   ├── Fig4_neoag_nonsyn_rate.r
    │   │   └── neoag_depletion_fraction_subtype_hm_boxplots.r
    │   └── upload
    │   │   ├── cibersort_table.r
    │   │   └── combine_neoag_tables.r
    ├── preprocess
    │   ├── Novogene-MDACC-sample-status.R
    │   ├── README.md
    │   ├── add_aligned_bam_to_files.R
    │   ├── aliquots-coverage-metrics.R
    │   ├── crosscheck-mismatch-identification.R
    │   ├── crosscheckmetricscluster.R
    │   ├── glass-surgery-clinical-data.R
    │   ├── metrics.R
    │   ├── ucsf-clinical-update.R
    │   └── vcf_aliquot_qc.R
    ├── pyclone_paired.R
    ├── shiny
    │   ├── RShinyDBFrontend.R
    │   └── shinyvaf.R
    ├── snakemake
    │   ├── cov2db.R
    │   ├── geno2db.R
    │   ├── pyclone_create_tsv.R
    │   ├── runSeqz.R
    │   ├── seg2db.R
    │   ├── snv2db.R
    │   └── vep_upload.r
    ├── snv
    │   ├── Fig2b-ccf-rank-kendall.R
    │   ├── GLASS_gene_comparison_clean.R
    │   ├── README.md
    │   ├── archive
    │   │   ├── allMutationDbToGRanges.R
    │   │   ├── called_mut_db_as_granges.R
    │   │   ├── ensembl_genes_to_db.R
    │   │   ├── heatmap-snv.R
    │   │   ├── mf_longitudinal.R
    │   │   ├── mf_private_shared_time.R
    │   │   ├── mut-freq.R
    │   │   ├── mutfreq_interval.R
    │   │   ├── pri_vs_rec_muts.R
    │   │   ├── privateVsSharedMutationDbToGRanges.R
    │   │   ├── sample_variants.R
    │   │   ├── shared_private_to_vcf.R
    │   │   ├── signature1_by_age_and_interval.R
    │   │   ├── sigproba2db.R
    │   │   ├── test-gene-vaf.R
    │   │   └── vaf-freq.R
    │   ├── clonevol.R
    │   ├── dndscv.R
    │   ├── driver-evolution-associations.R
    │   ├── fig1-hypermutators-survival.R
    │   ├── fig1-temporal-somatic-changes.R
    │   ├── fig2-neutralitytestR-subtype.R
    │   ├── fig2-subclonal-selection.R
    │   ├── figEDF-clonal-dynamics.R
    │   ├── figEDF-hypermutation-clonality.R
    │   ├── figEDF-longitudinal-gene-comparison.R
    │   ├── figR-purity-mf.R
    │   ├── figR-somatic-burden-survival.R
    │   ├── gene-driver-evolution.R
    │   ├── hypermutators-overall-survival.R
    │   ├── longitudinal-mutational-frequency.R
    │   ├── longitudinal-vaf-analyses.R
    │   ├── maf-comparison-tcga-pcwag-glass.R
    │   ├── mutect2-varscan2-results.R
    │   ├── neutral-evolution-mutect2.R
    │   ├── neutral-evolution.R
    │   ├── neutralitytestr-aliquots-results.R
    │   ├── neutralitytestr-aliquots.R
    │   ├── neutralitytestr-fractionated-results.R
    │   ├── notch1-mutations-glass.R
    │   ├── subclonalselection-neutralitytestr-integration.R
    │   └── temporal-mutation-analyses.R
    ├── telseq
    │   └── telseq.R
    ├── timing
    │   ├── fig3c-cdkn2a-aneuploidy-timing.R
    │   ├── pyclone_cluster_ccf.R
    │   └── timing.R
    └── vcf_filter.R
├── README.md
├── Snakefile
├── bin
    ├── bam-rg-insert-size-calc.pl
    ├── bamtofastq-rename.sh
    ├── bedtovcf.sh
    ├── create_fake_wgs_bams.sh
    ├── extractSplitReads_BwaMem
    ├── get-readgroups.sh
    ├── gistic_run.pbs
    ├── preprocess-intervals.sh
    ├── reset_directory_structure.sh
    ├── scatter-interval-list-to-bed.sh
    ├── select-germline-variants.sh
    ├── snakemake-run.sh
    └── svaba-test-parameters.sh
├── conf
    ├── cluster.json
    ├── config.yaml
    └── optitype_config.ini
├── dag
    ├── align.rulegraph.png
    ├── fingerprint.rulegraph.png
    ├── gatk-cnv.rulegraph.png
    ├── mt2.rulegraph.png
    ├── svdetect.rulegraph.png
    └── vs2.rulegraph.png
├── dbm
    ├── glass-rearranged.png
    ├── glass.dbm
    ├── glass.png
    └── glass.svg
├── envs
    ├── absolute.yaml
    ├── align.yaml
    ├── bcftools.yaml
    ├── delly.yaml
    ├── freebayes.yaml
    ├── gatk4.yaml
    ├── gdc-client.yaml
    ├── glass.yaml
    ├── glass.yml
    ├── haplotype.yaml
    ├── lumpy-sv.yaml
    ├── manta.yaml
    ├── optitype.yaml
    ├── pvacseq.yaml
    ├── pyclone.yaml
    ├── r.yaml
    ├── sequenza.yaml
    ├── somaticseq.yaml
    ├── telseq.yaml
    ├── titan.yaml
    ├── varscan2.yaml
    ├── vcf2maf.yaml
    └── vep.yaml
├── jar
    ├── VarScan.v2.4.2.jar
    └── VarScan.v2.4.3.jar
├── julia
    ├── README.md
    ├── extract_vafs.py
    ├── runsubclonalselection.jl.txt
    ├── subclonalselection.qsubsec.txt
    └── subclonalselection.tff.txt
├── python
    ├── .ipynb_checkpoints
    │   ├── LearnRegexp-checkpoint.ipynb
    │   ├── LearningJSON-1-checkpoint.ipynb
    │   ├── LearningJSON-2-checkpoint.ipynb
    │   ├── LearningJSON-3-checkpoint.ipynb
    │   ├── LearningJSON-4-checkpoint.ipynb
    │   ├── SoftwareCarpentryExercises-checkpoint.ipynb
    │   ├── Untitled-checkpoint.ipynb
    │   ├── Untitled1-checkpoint.ipynb
    │   ├── Untitled2-checkpoint.ipynb
    │   └── Untitled3-checkpoint.ipynb
    ├── JSONManifestHandler.py
    ├── ManifestHandler.py
    ├── PostgreSQLManifestHandler.py
    ├── __init__.py
    ├── countPysam.py
    ├── dexseq_prepare_annotation.py
    ├── glassfunc.py
    ├── manifest_tester.py
    └── map_building_functions.py
├── snakemake
    ├── align.smk
    ├── batches2db.R
    ├── cnv-post.smk
    ├── cnv.smk
    ├── cnvnator.smk
    ├── delly.smk
    ├── download.smk
    ├── fingerprinting.smk
    ├── fusorsv.smk
    ├── haplotype-map.smk
    ├── lumpy.smk
    ├── manta.smk
    ├── mutect2-post.smk
    ├── mutect2.smk
    ├── optitype.smk
    ├── pvacseq.smk
    ├── pyclone.smk
    ├── sequenza.smk
    ├── somaticseq.smk
    ├── telseq.smk
    ├── titan.smk
    └── varscan2.smk
└── sql
    ├── clinical
        ├── clinical-tumor-pairs-db2.sql
        └── clinical_by_tumor_pair.sql
    ├── cnv
        ├── c710_status.sql
        ├── call_10q25_26.sql
        ├── cdkn2a_genome_ccf.sql
        ├── cnv_by_gene_gatk.sql
        ├── gatk_seg_diff_call.sql
        ├── gistic_prepare.sql
        ├── prop_heterozygous_gatk.sql
        ├── recapseg_postgres.sql
        └── taylor_aneuploidy.sql
    ├── compare_seg_stats.sql
    ├── compute_chr7_10.sql
    ├── dndscv
        ├── dndscv_input_by_fraction.sql
        ├── dndscv_input_by_fraction_hyperm.sql
        ├── dndscv_input_by_fraction_triplet.sql
        ├── dndscv_input_by_sample.sql
        └── dndscv_input_by_sample_hyperm.sql
    ├── drivers
        ├── driver_status.sql
        ├── driver_status_arm.sql
        ├── driver_status_cnv.sql
        ├── driver_status_snv.sql
        └── driver_status_snv_neoag.sql
    ├── figures
        ├── mutsig_boxplot_fig1.sql
        └── mutsig_corr.sql
    ├── fred_cnv.sql
    ├── fred_mutation.sql
    ├── fred_mutation2.sql
    ├── get_gene_variant_coverage_sample.sql
    ├── heatmap
        ├── heatmap_aneuploidy.sql
        ├── heatmap_arm.sql
        ├── heatmap_arm_by_arm.sql
        ├── heatmap_c710.sql
        ├── heatmap_clinical.sql
        ├── heatmap_cnv.sql
        ├── heatmap_cnv_by_gene.sql
        ├── heatmap_drivers.sql
        ├── heatmap_evolution.sql
        ├── heatmap_mf.sql
        ├── heatmap_purity.sql
        ├── heatmap_pyclone_clusters.sql
        ├── heatmap_signatures.sql
        ├── heatmap_snv.sql
        ├── heatmap_snv_by_gene.sql
        └── heatmap_time.sql
    ├── id_multiple_aliquot_driver_change.sql
    ├── mf_longitudinal_analysis.sql
    ├── mut_freq
        └── mut_freq.sql
    ├── mut_sig
        ├── archive
        │   ├── mut_sig_aliquot.sql
        │   ├── mut_sig_driver_genes.sql
        │   ├── mut_sig_drivers.sql
        │   ├── mut_sig_effect.sql
        │   ├── mut_sig_fraction_subtype.sql
        │   ├── mut_sig_fraction_subtype_hypermutation.sql
        │   ├── mut_sig_gene.sql
        │   └── mut_sig_variant_classification.sql
        ├── mut_sig_class.sql
        ├── mut_sig_fraction.sql
        └── mut_sig_fraction_limited.sql
    ├── neoag
        ├── cibersort_depletion.sql
        ├── cibersort_depletion_fraction.sql
        ├── neoag_ccf_shared.sql
        ├── neoag_depletion_aliquot.sql
        ├── neoag_depletion_fraction.sql
        ├── neoag_freq.sql
        ├── neoantigen_depletion.sql
        ├── neoantigen_depletion_clonality.sql
        ├── neoantigen_depletion_fraction.sql
        ├── neoantigen_peptide_counts.sql
        ├── neoantigens_by_aliquot.sql
        └── neoantigens_by_pair.sql
    ├── neutrality
        ├── neutralitytestr-subtype.sql
        └── original_submission
        │   ├── neutrality-testr-input-mutect2.sql
        │   ├── neutrality_testr_input.sql
        │   └── neutralitytestr-input-aliquot-level.sql
    ├── prop_equal_paired_seg.sql
    ├── pyclone
        ├── pyclone_aliquots.sql
        ├── pyclone_cluster_pairs.sql
        ├── pyclone_cluster_pairs_anno_drivers.sql
        ├── pyclone_cluster_stats.sql
        ├── pyclone_cluster_stats2.sql
        └── pyclone_create_tsv.sql
    ├── roeltable.sql
    ├── selected_aliquots.sql
    ├── selected_tumor_pairs_silver.sql
    ├── set
        ├── gold_set.sql
        └── silver_set.sql
    ├── snv
        ├── longitudinal_gene_comparison_snv_all_genes.sql
        ├── longitudinal_gene_comparison_snv_smg.sql
        ├── tumor_mut_comparison.sql
        └── tumor_mut_comparison_anno.sql
    ├── tel.sql
    ├── test.sql
    ├── timing
        ├── ccf_shared.sql
        ├── timing.sql
        ├── timing_cnv.sql
        ├── timing_pairs.sql
        ├── timing_snv.sql
        └── timing_tp53_idh1_atrx.sql
    ├── titan_vs_seqz.sql
    ├── tumor_mut_comparison.sql
    ├── vaf_compare.sql
    ├── variant_status_leeds.sql
    └── variants
        ├── passanno.sql
        └── passgeno.sql


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.smk linguist-language=Python
2 | Snakefile linguist-language=Python
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | benchmarks/**
 2 | data/**
 3 | results/**
 4 | logs/**
 5 | dbm/glass-dbdump-20181004.sql
 6 | R/md-anderson-clinical.R
 7 | R/manifest/yung-create-manifest.R
 8 | R/RData/**
 9 | sandbox/**
10 | archive/**
11 | documents/**
12 | tmp/
13 | .snakemake/**
14 | .Rhistory
15 | *.Rdata
16 | *.RData
17 | *.Rda
18 | *.RDa
19 | .Rproj.user
20 | .Rproj.user/**
21 | .**
22 | *.pyc
23 | *.pyo
24 | figures/**
25 | 


--------------------------------------------------------------------------------
/GLASS-WG.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Floris P Barthel, Kevin C Johnson and Collaborators
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/R/cnv/README.md:
--------------------------------------------------------------------------------
1 | ### Copy Number Variation analyses
2 | 
3 | Scripts for copy number enumeration and for determining 1p/19q co-deletion status in the WXS/WGS samples. Some temporary files processed in these scripts are no longer available because they have been replaced by the final calls.
4 | 


--------------------------------------------------------------------------------
/R/cnv/archive/check-gatk-modelsegments-hets.R:
--------------------------------------------------------------------------------
 1 | # Sanity check of the GATK ModelSegments het-site outputs: inventory the
 2 | # hets files by size, then compare tumor vs normal allele fractions for one aliquot.
 3 | library(tidyverse)
 4 | 
 5 | # Every tumor/normal hets table found below the modelsegments results folder.
 6 | files <- list.files(path = "results/cnv/modelsegments/", pattern = "hets.tsv|hets.normal.tsv", recursive = TRUE, full.names = TRUE)
 7 | 
 8 | # One row per file; case and analysis are fixed-width slices of the file name.
 9 | df <- data.frame(fn = files,
10 |                  case = substr(basename(files), 1, 12),
11 |                  analysis = substr(basename(files), 21, 23),
12 |                  size = file.size(files))
13 | 
14 | # Variance of file size within each case/analysis group.
15 | df2 <- ungroup(mutate(group_by(df, case, analysis), var = var(size)))
16 | 
17 | # Tumor and matched-normal het counts for a single aliquot; "@" header lines are
18 | # skipped. Total depth (ct) and alt-allele fraction (vaf) are added on read.
19 | thets <- read_tsv("results/cnv/modelsegments/TCGA-DU-7304-R1-12D-WGS-TNHHDG/TCGA-DU-7304-R1-12D-WGS-TNHHDG.hets.tsv", comment = "@", col_types = "ciiicc") %>%
20 |   mutate(ct = REF_COUNT + ALT_COUNT, vaf = ALT_COUNT / ct)
21 | nhets <- read_tsv("results/cnv/modelsegments/TCGA-DU-7304-R1-12D-WGS-TNHHDG/TCGA-DU-7304-R1-12D-WGS-TNHHDG.hets.normal.tsv", comment = "@", col_types = "ciiicc") %>%
22 |   mutate(ct = REF_COUNT + ALT_COUNT, vaf = ALT_COUNT / ct)
23 | 
24 | # Density of allele fractions: normal first, then tumor.
25 | plot(density(nhets$vaf))
26 | plot(density(thets$vaf))
27 | 


--------------------------------------------------------------------------------
/R/cnv/archive/cnv-sig.R:
--------------------------------------------------------------------------------
 1 | devtools::install_github("ShixiangWang/VSHunter", build_vignettes = TRUE) # installs VSHunter from GitHub every time the script is run
 2 | library(VSHunter)
 3 | vignette('VSHunter')
 4 | load(system.file("extdata/example_cn_list.RData", package = "VSHunter")) # example data shipped with the package; tcga_segTabs presumably comes from this file - TODO confirm
 5 | tcga_frac = cnv_getLengthFraction(tcga_segTabs)
 6 | tcga_features = cnv_derivefeatures(CN_data = tcga_segTabs, cores = 1, genome_build = "hg19")
 7 | tcga_components = cnv_fitMixModels(CN_features = tcga_features, cores = 4)
 8 | tcga_sample_component_matrix = cnv_generateSbCMatrix(tcga_features, tcga_components, cores = 4)
 9 | tcga_sig_choose = cnv_chooseSigNumber(tcga_sample_component_matrix, nrun = 10, cores = 4)
10 | tcga_sig_choose2 = cnv_chooseSigNumber(tcga_sample_component_matrix, nrun = 10, cores = 4, testRandom = FALSE) # same call as above but with testRandom switched off
11 | tcga_signatures = cnv_extractSignatures(tcga_sample_component_matrix, nsig = 3, cores = 4) # nsig is hard-coded to 3 rather than taken from tcga_sig_choose
12 | w = NMF::basis(tcga_signatures)
13 | tcga_exposure = cnv_quantifySigExposure(sample_by_component = tcga_sample_component_matrix, component_by_signature = w)
14 | tcga_results = cnv_autoCaptureSignatures(tcga_sample_component_matrix, nrun=10, cores = 4)
15 | cnv_plotDistributionProfile(tcga_frac)
16 | cnv_plotDistributionProfile(tcga_frac, mode = "cd")
17 | cnv_plotDistributionProfile(tcga_frac, mode = "cd" , fill = TRUE)
18 | # NOTE(review): the four calls below pass no arguments; presumably unfinished placeholders - confirm before running.
19 | cnv_plotDistributionProfile()
20 | cnv_plotFeatureDistribution()
21 | cnv_plotMixComponents()
22 | cnv_plotSignatures()


--------------------------------------------------------------------------------
/R/cnv/archive/gatk_write_seg_to_db.R:
--------------------------------------------------------------------------------
 1 | # Load the GATK called-segment files and write them to analysis.gatk_seg.
 2 | library(DBI)
 3 | library(tidyverse)
 4 | 
 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 6 | 
 7 | # All files directly under the callsegments results folder.
 8 | segfiles <- list.files("results/cnv/callsegments", full.names = TRUE)
 9 | 
10 | # Parse the files on up to 8 cores. Lines starting with "@" are skipped, the
11 | # aliquot barcode is the first 30 characters of the file name, and START/END
12 | # are folded into a single "[start,end]" range string.
13 | segs <- parallel::mclapply(segfiles, function(seg_path) {
14 |   read.delim(seg_path, comment.char = "@", as.is = TRUE) %>%
15 |     mutate(aliquot_barcode = substr(basename(seg_path), 1, 30),
16 |            pos = sprintf("[%s,%s]", START, END)) %>%
17 |     select(aliquot_barcode,
18 |            chrom = CONTIG,
19 |            pos,
20 |            num_points = NUM_POINTS_COPY_RATIO,
21 |            log2_copy_ratio = MEAN_LOG2_COPY_RATIO,
22 |            call = CALL)
23 | }, mc.cores = 8)
24 | 
25 | # Stack the per-file tables into one plain data frame.
26 | segs <- as.data.frame(data.table::rbindlist(segs))
27 | 
28 | dbWriteTable(con, Id(schema = "analysis", table = "gatk_seg"), segs)
29 | 


--------------------------------------------------------------------------------
/R/cnv/archive/loh_to_igv.R:
--------------------------------------------------------------------------------
 1 | library(DBI)
 2 | library(tidyverse)
 3 | library(ggplot2)
 4 | # Open an ODBC connection using the "VerhaakDB" data source name.
 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 6 | # Query analysis.titan_seg: pos is split into start (lower) and end (upper-1), and median_ratio is rescaled as 2*(median_ratio-0.75).
 7 | q <- "SELECT
 8 | 	pair_barcode,
 9 | chrom::varchar(2),
10 | lower(pos) as \"start\",
11 | upper(pos)-1 as \"end\",
12 | num_snp,
13 | 2*(median_ratio-0.75) AS median_ratio
14 | FROM analysis.titan_seg"
15 | # Run the query, then keep only rows with no missing start, end, num_snp or median_ratio.
16 | qres <- dbGetQuery(con, q)
17 | qres = qres %>% filter(complete.cases(start,end,num_snp,median_ratio))
18 | # Write a tab-delimited table with a header row to "loh.seg" in the working directory.
19 | write.table(qres, file = "loh.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
20 | 


--------------------------------------------------------------------------------
/R/cnv/archive/noncodel-cell-cycle-aneuploidy.R:
--------------------------------------------------------------------------------
 1 | # To get mutations in cell cycle genes from Figure 1.
 2 | mutation_genes = read_file("sql/heatmap/build_heatmap_data_mutation.sql") # NOTE(review): no file with this name appears under sql/heatmap in the repository listing - confirm the path
 3 | mutations_selected = dbGetQuery(con, mutation_genes)
 4 | # NOTE(review): this script has no library() calls or connection setup; `con`, `pairs`, read_file() and the dplyr verbs must already exist in the session.
 5 | # To query genes that are found to be altered in the cell cycle.
 6 | cell_cycle_cnv_titan = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene WHERE gene_symbol IN ('CDK4','CCND2','CDK6','CDKN2A','RB1')")
 7 | cell_cycle_cnv_gatk = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene_gatk WHERE gene_symbol IN ('CDK4','CCND2','CDK6','CDKN2A','RB1')")
 8 | cell_cycle_cnv_titan = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene WHERE gene_symbol = 'CDKN2A'") # replaces the five-gene TITAN result above with CDKN2A only
 9 | cell_cycle_cnv_gatk = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene_gatk WHERE gene_symbol = 'CDKN2A'") # replaces the five-gene GATK result above with CDKN2A only
10 | # Join the TITAN calls to `pairs` on pair_barcode (presumably where tumor_barcode comes from - TODO confirm) and keep the call columns.
11 | cnv_titan = cell_cycle_cnv_titan %>% 
12 |   inner_join(pairs, by="pair_barcode") %>% 
13 |   select(tumor_barcode, gene_symbol, copy_number, corrected_cn, titan_call)
14 | # Match each GATK call to the TITAN call whose tumor_barcode equals the GATK aliquot_barcode.
15 | cell_cycle_cnv_merged = cell_cycle_cnv_gatk %>% 
16 |   select(aliquot_barcode, corrected_cn_gatk = corrected_cn, cn_call_gatk = cn_call) %>% 
17 |   inner_join(cnv_titan, by=c("aliquot_barcode"="tumor_barcode"))
18 | # Cross-tabulate the GATK call against the TITAN call.
19 | table(cell_cycle_cnv_merged$cn_call_gatk, cell_cycle_cnv_merged$titan_call)


--------------------------------------------------------------------------------
/R/cnv/archive/paired_delta_seg_to_igv.R:
--------------------------------------------------------------------------------
 1 | # Export paired TITAN copy-number deltas as a GISTIC seg file plus a markers file.
 2 | library(DBI)
 3 | library(tidyverse)
 4 | library(ggplot2)
 5 | 
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | 
 8 | # Per-pair delta segments; num_snps is a constant 0 column in this query.
 9 | q<- "SELECT
10 | 	tumor_pair_barcode,
11 | chrom::varchar(2),
12 | lower(pos) as \"start\",
13 | upper(pos)-1 as \"end\",
14 | 0 as num_snps,
15 | delta_cn
16 | FROM analysis.titan_seg_paired_delta pa"
17 | 
18 | qres <- dbGetQuery(con, q)
19 | # Drop segments with a missing start, end or num_snps.
20 | seg = qres %>% filter(complete.cases(start,end,num_snps))
21 | 
22 | #write.table(seg, file = "diff.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
23 | 
24 | # diamond_set flag for each tumor pair.
25 | q <- "SELECT tumor_pair_barcode, diamond_set::integer
26 | FROM analysis.titan_seg_paired_comparison ts
27 | LEFT JOIN biospecimen.aliquots al ON al.aliquot_barcode = ts.tumor_barcode_a
28 | LEFT JOIN clinical.surgeries cl ON cl.sample_barcode = al.sample_barcode"
29 | 
30 | qres <- dbGetQuery(con, q)
31 | 
32 | # Keep only pairs flagged diamond_set == 1. The join key is spelled out:
33 | # tumor_pair_barcode is the only column the two query results share.
34 | seg <- seg %>%
35 |   left_join(qres, by = "tumor_pair_barcode") %>%
36 |   filter(diamond_set==1) %>%
37 |   select(-diamond_set)
38 | 
39 | write.table(seg, file = "results/cnv/gistic/input.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
40 | 
41 | # Markers file: one row per unique segment boundary (id, chr, pos).
42 | # The id is assigned AFTER de-duplication; the previous version created a unique
43 | # id first, which made distinct() a no-op and kept every repeated boundary.
44 | # Building the id from the de-duplicated rows also works when seg has zero rows
45 | # (1:(2*nrow(seg)) would have produced c(1, 0) and failed).
46 | markers = data.frame(chr = c(seg$chrom, seg$chrom), pos = c(seg$start, seg$end), stringsAsFactors = FALSE) %>%
47 |   distinct() %>%
48 |   mutate(id = row_number()) %>%
49 |   select(id, chr, pos)
50 | write.table(markers, file = "results/cnv/gistic/markers.txt", sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE)
51 | 
52 | #write.table(qres, file = "diff.annoseg.txt", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
53 | 


--------------------------------------------------------------------------------
/R/cnv/archive/prop_het_plot.R:
--------------------------------------------------------------------------------
 1 | # Density plots of paired TITAN segment-comparison metrics, coloured by IDH/codel subtype.
 2 | library(DBI)
 3 | library(tidyverse)
 4 | library(ggplot2)
 5 | 
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | # Every column of the paired comparison table plus the subtype recorded for the
 8 | # surgery that tumor_barcode_a belongs to.
 9 | q <- "SELECT ts.*, cl.idh_codel_subtype
10 | FROM analysis.titan_seg_paired_comparison ts
11 | LEFT JOIN biospecimen.aliquots al ON al.aliquot_barcode = ts.tumor_barcode_a
12 | LEFT JOIN clinical.surgeries cl ON cl.sample_barcode = al.sample_barcode"
13 | 
14 | qres <- dbGetQuery(con, q)
15 | 
16 | ggplot(qres, aes(x=delta_prop_het, color = idh_codel_subtype)) + geom_density() + coord_cartesian(xlim=c(-1,1)) + labs(x="Heterozygous proportion of the genome\n(recurrence-primary)")
17 | ggplot(qres, aes(x=prop_delta_eq, color = idh_codel_subtype)) + geom_density() + labs(x="Proportion of the genome with identical copy number states\n(recurrence-primary)")
18 | 
19 | # The line below was an orphaned continuation with no plot in front of it. R
20 | # parses a leading `+` as unary plus, which ggplot2 rejects, so sourcing the
21 | # script stopped here. It is disabled; the original text is kept for reference.
22 | # + geom_smooth(method = "lm") + facet_wrap(~mutation_status)
23 | 
24 | # NOTE(review): relative_contribution, mutation_status, case_age_diagnosis_years
25 | # and surgical_interval_mo are not named in the query above; they would have to
26 | # be part of ts.* - confirm before relying on the code below.
27 | ggplot(qres, aes(x=case_age_diagnosis_years, y = relative_contribution)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~mutation_status + idh_codel_subtype)
28 | 
29 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="shared")
30 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="primary")
31 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="recurrent")
32 | 
33 | ggplot(qres, aes(x=surgical_interval_mo, y = relative_contribution)) + 
34 |   geom_point() + 
35 |   geom_smooth(method = "lm") +
36 |   facet_wrap(~mutation_status + idh_codel_subtype, scales = "free_x")
37 | 


--------------------------------------------------------------------------------
/R/cnv/prepare_gistic.R:
--------------------------------------------------------------------------------
 1 | ##################################################
 2 | # Prepare input files for running GISTIC separately for primaries and recurrences
 3 | # Ignores multi-sector samples (one sample per patient and timepoint)
 4 | # Updated: 2019.04.19
 5 | # Author: Floris B
 6 | ##################################################
 7 | 
 8 | library(DBI)
 9 | library(tidyverse)
10 | library(ggplot2)
11 | 
12 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")
13 | 
14 | # The segment query is kept in its own SQL file.
15 | q <- read_file("sql/cnv/gistic_prepare.sql")
16 | 
17 | qres <- dbGetQuery(con, q)
18 | # Drop segments with a missing start, end or num_snps.
19 | seg = qres %>% filter(complete.cases(start,end,num_snps))
20 | 
21 | # Split by sample_type ("P" / "R"); the type column itself is not written out.
22 | seg_p = seg %>% filter(sample_type == "P") %>% select(-sample_type)
23 | seg_r = seg %>% filter(sample_type == "R") %>% select(-sample_type)
24 | 
25 | write.table(seg_p, file = "results/gistic2/primary.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
26 | write.table(seg_r, file = "results/gistic2/recurrence.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
27 | 
28 | # Markers file shared by both runs: one row per unique segment boundary
29 | # (id, chr, pos), taken from all segments before the P/R split.
30 | # The id is assigned AFTER de-duplication; the previous version created a unique
31 | # id first, which made distinct() a no-op and kept every repeated boundary.
32 | # Building the id from the de-duplicated rows also works when seg has zero rows
33 | # (1:(2*nrow(seg)) would have produced c(1, 0) and failed).
34 | markers = data.frame(chr = c(seg$chrom, seg$chrom), pos = c(seg$start, seg$end), stringsAsFactors = FALSE) %>%
35 |   distinct() %>%
36 |   mutate(id = row_number()) %>%
37 |   select(id, chr, pos)
38 | write.table(markers, file = "results/gistic2/markers.txt", sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE)


--------------------------------------------------------------------------------
/R/cnv/titan-results-enumerate-events.R:
--------------------------------------------------------------------------------
 1 | #######################################################
 2 | # Use the segmented copy number calls to derive gene-level gains/losses
 3 | # Date: 2018.11.01 
 4 | # Author: Kevin J.
 5 | #######################################################
 6 | # NOTE(review): the code in this file only loads cytoband-level data; the gene-level derivation described above is not implemented here.
 7 | # Directory for GLASS analysis.
 8 | mybasedir = 'Volumes/verhaak-lab/GLASS-analysis/' # NOTE(review): no leading "/", so this is a relative path - confirm intent
 9 | datadir  = 'results/cnv/'
10 | pattern   = '.called.seg$'
11 | # NOTE(review): mybasedir, datadir and pattern are not referenced again in this script.
12 | #######################################################
13 | 
14 | # Necessary packages:
15 | library(parallel)
16 | library(tidyverse)
17 | library(data.table)
18 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
19 | library(org.Hs.eg.db)
20 | library(DBI)
21 | 
22 | #######################################################
23 | # Establish connection with GLASS database.
24 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
25 | 
26 | # Downloaded the UCSC cytoband file for hg19.
27 | cytoband_file = "/Users/johnsk/Documents/Life-History/GLASS-WG/data/ref/human_grch37_hg19_ucsc_cytoBand.txt" # absolute path on one user's machine
28 | cytobands = read.delim(cytoband_file, header=FALSE) # no header row, so columns are named V1, V2, ...
29 | 
30 | # Summarize the number of cytobands per chromosomal arm.
31 | cytobands %>% 
32 |   mutate(arm = substring(V4, 1, 1), # arm = first character of the V4 value (presumably "p" or "q" - TODO confirm)
33 |     chr_cyto = paste(V1, arm, sep='.')) %>% 
34 |   group_by(chr_cyto) %>% 
35 |   summarise(cyto_per_chr = n())
36 | # NOTE: the summary above is not assigned to a variable, so it is only printed.
37 | 
38 | # Retrieve cytoband-specific copy number calls
39 | cytoband = dbGetQuery(con,"SELECT * FROM analysis.cnv_by_cytoband")
40 | 
41 | # MERGE cytoband calls with tumor_purity and tumor_ploidy.
42 | # DETERMINE status based on whether its copy number was greater, smaller, or equal to the sample's ploidy.
43 | # TODO: the two steps described above are not implemented; the script ends here.
44 | 
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/R/manifest/README.md:
--------------------------------------------------------------------------------
1 | ### Generating manifest files for Snakemake pipeline
2 | 
3 | There were a large number of differences in how metadata was stored for each of the GLASS cohorts. We sought to standardize metadata and sequencing information. Each R script in this directory represents our attempt to wrangle the data into a structure that works with the Snakemake pipeline. We later migrated to a PostgreSQL format, but these scripts may be helpful to others trying to implement the GLASS workflow.
4 | 


--------------------------------------------------------------------------------
/R/manifest/merge-manifest.R:
--------------------------------------------------------------------------------
 1 | ## Merge manifest
 2 | ## Author: Floris Barthel
 3 | ## Date: Jun 24 2018
 4 | ##
 5 | ## Collects the manifest tables (*.tsv) found anywhere under `manifest_dir`,
 6 | ## stacks the files of each table type and exports one JSON file per type.
 7 | 
 8 | setwd("~/projects/GLASS-WG/")
 9 | 
10 | library(tidyverse)
11 | 
12 | manifest_dir      = "data/manifest"
13 | cases_prefix      = "cases"
14 | samples_prefix    = "samples"
15 | aliquots_prefix   = "aliquots"
16 | readgroups_prefix = "readgroups"
17 | files_prefix      = "files"
18 | pairs_prefix      = "pairs"
19 | 
20 | ## Read every file under `dir` whose name matches "<prefix>.tsv" and stack the rows.
21 | ## bind_rows() on the whole list also copes with zero matching files (it returns
22 | ## an empty table), where reduce(bind_rows) stopped with an error.
23 | ## NOTE(review): the pattern is an unanchored regular expression in which "."
24 | ## matches any character; it is left unchanged because the naming of the
25 | ## per-cohort files is not visible from this script.
26 | read_manifest_table = function(prefix, dir = manifest_dir) {
27 |   list.files(dir, pattern = sprintf("%s.tsv", prefix), recursive = TRUE, full.names = TRUE) %>%
28 |     map(read.delim, as.is = TRUE) %>%
29 |     bind_rows()
30 | }
31 | 
32 | ## Write one table as pretty-printed JSON to "<dir>/<prefix>.json".
33 | write_manifest_json = function(tbl, prefix, dir = manifest_dir) {
34 |   write(jsonlite::toJSON(tbl, pretty = TRUE), file = sprintf("%s/%s.json", dir, prefix))
35 | }
36 | 
37 | ## Only `cases` is de-duplicated, exactly as before.
38 | cases = read_manifest_table(cases_prefix) %>%
39 |   distinct() ## NEED TO REMOVE DUPLICATE ROWS IN SOURCE FILES
40 | 
41 | samples    = read_manifest_table(samples_prefix)
42 | aliquots   = read_manifest_table(aliquots_prefix)
43 | readgroups = read_manifest_table(readgroups_prefix)
44 | files      = read_manifest_table(files_prefix)
45 | pairs      = read_manifest_table(pairs_prefix)
46 | 
47 | print(sprintf("Exporting manifest as json files for snakemake use."))
48 | write_manifest_json(aliquots, aliquots_prefix)
49 | write_manifest_json(files, files_prefix)
50 | write_manifest_json(cases, cases_prefix)
51 | write_manifest_json(pairs, pairs_prefix)
52 | write_manifest_json(readgroups, readgroups_prefix)
53 | write_manifest_json(samples, samples_prefix)
54 | 
55 | ## END ##


--------------------------------------------------------------------------------
/R/misc/README.md:
--------------------------------------------------------------------------------
1 | ### Random R
2 | 
3 | An assortment of analyses tangentially related to the GLASS project.
4 | 


--------------------------------------------------------------------------------
/R/misc/blocklist2db.R:
--------------------------------------------------------------------------------
 1 | ## Push the aliquot blocklist to the database as analysis.blocklist
 2 | 
 3 | library(tidyverse)
 4 | library(DBI)
 5 | library(odbc)
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | 
 8 | ## Review sheet with one row per aliquot; strings are kept as character.
 9 | tmp = read.delim("data/ref/block_review_allow_quality_lists_20181206.txt", as.is = TRUE)
10 | 
11 | ## Keep the exclusion flags and their reasons under the database column names.
12 | df <- select(tmp,
13 |              aliquot_barcode,
14 |              fingerprint_exclusion,
15 |              coverage_exclusion = coverage_mut_exclusion,
16 |              cnv_exclusion = manual_cn_exlusion,
17 |              clinical_exclusion = surgical_interval_exclusion,
18 |              fingerprint_exclusion_reason,
19 |              coverage_exclusion_reason = mut_exclusion_reason,
20 |              cnv_exclusion_reason = cn_exclusion_reason,
21 |              clinical_exclusion_reason = surgical_interval_exclusion_reason)
22 | 
23 | ## An empty clinical reason becomes NA; the other reason columns are left as read.
24 | df$clinical_exclusion_reason <- ifelse(df$clinical_exclusion_reason == "", NA, df$clinical_exclusion_reason)
25 | 
26 | dbWriteTable(con, name = Id(schema = "analysis", table = "blocklist"), value = df, append = FALSE)
27 | 


--------------------------------------------------------------------------------
/R/misc/cytoband2DB.R:
--------------------------------------------------------------------------------
 1 | ## Export the UCSC cytoband reference (GRCh37/hg19) to the database (ref.cytobands)
 2 | 
 3 | library(tidyverse)
 4 | library(DBI)
 5 | 
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | 
 8 | setwd("/Volumes/Helix-Common/GLASS-analysis/")
 9 | 
10 | # The cytoband file is read without a header row, so the five columns are named by hand.
11 | cytoband_file <- "data/ref/human_grch37_hg19_ucsc_cytoBand.txt"
12 | cbref <- read.delim(file = cytoband_file, as.is = TRUE, header = FALSE)
13 | names(cbref) <- c("chrom", "start", "end", "cytoband", "giestain")
14 | 
15 | # Strip the "chr" prefix, express each band as a "[start,end)" text range,
16 | # then keep the four output columns in the order they are written.
17 | df <- cbref %>%
18 |   mutate(chrom = gsub("chr","",chrom),
19 |          pos = sprintf("[%s,%s)",start,end)) %>%
20 |   select(cytoband, chrom, pos, gie_stain = giestain)
21 | 
22 | # append=T: rows are added to the table rather than replacing it.
23 | dbWriteTable(con, Id(schema="ref",table="cytobands"), df, append=T)
24 | 


--------------------------------------------------------------------------------
/R/misc/dashboard.R:
--------------------------------------------------------------------------------
 1 | ## Shiny dashboard with one tab per database table, grouped by schema in the sidebar.
 2 | library(shinydashboard)
 3 | library(shiny)
 4 | library(DBI)
 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 6 | tables <- dbGetQuery(con, "SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema != 'pg_catalog' AND table_schema !='information_schema'")
 7 | # A table name may occur in more than one schema. Keying tabs and outputs by the bare table name
 8 | # would then create duplicate ids and hand dbReadTable() several schemas at once, so every table
 9 | # gets a schema-qualified id instead.
10 | tables$tab_id <- paste(tables$table_schema, tables$table_name, sep = "__")
11 | ui <- dashboardPage(
12 |   dashboardHeader(),
13 |   dashboardSidebar(
14 |     sidebarMenu(
15 |       # one menu entry per schema, holding one sub-item per table of that schema
16 |       lapply(unique(tables$table_schema), function(schema) {
17 |         rows <- which(tables$table_schema == schema)
18 |         menuItem(schema,
19 |                  lapply(rows, function(i) menuSubItem(tables$table_name[i], tabName = tables$tab_id[i], icon = icon("th"))),
20 |                  tabName = schema, icon = icon("dashboard"))
21 |       })
22 |     )
23 |   ),
24 |   dashboardBody(
25 |     do.call(tabItems,
26 |       lapply(seq_len(nrow(tables)), function(i) {
27 |         tabItem(tabName = tables$tab_id[i],
28 |                 h2(paste(tables$table_schema[i], tables$table_name[i], sep = ".")),
29 |                 DT::dataTableOutput(outputId = tables$tab_id[i]))
30 |       })
31 |     )
32 |   )
33 | )
34 | server <- function(input, output) {
35 |   # register one renderer per table; schema and table are captured per iteration
36 |   lapply(seq_len(nrow(tables)), function(i) {
37 |     schema <- tables$table_schema[i]
38 |     table <- tables$table_name[i]
39 |     output[[tables$tab_id[i]]] <- DT::renderDataTable(dbReadTable(con, Id(schema = schema, table = table)))
40 |   })
41 | }
42 | runApp(shinyApp(ui, server), host = "10.7.0.151", port = 2018)


--------------------------------------------------------------------------------
/R/misc/geneTable2DB.R:
--------------------------------------------------------------------------------
 1 | ## Export the curated RefSeq gene table (hg19) to the database (ref.genes)
 2 | 
 3 | library(tidyverse)
 4 | library(DBI)
 5 | 
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | 
 8 | setwd("/Volumes/Helix-Common/GLASS-analysis/")
 9 | 
10 | generef = read.delim(file = "data/ref/ncbiRefSeqCurated_hg19.tsv", as.is = TRUE)
11 | genecov = read_tsv(file = "data/ref/gene.covariates.txt")
12 | 
13 | # exonStarts / exonEnds are comma-separated coordinate strings. Per transcript, sum
14 | # (end - start) over the exons; pieces that do not parse to a number (NA) are ignored.
15 | exon_coords <- function(x) lapply(str_split(x, ","), as.numeric)
16 | summed_exon_length <- function(starts, ends) {
17 |   map2_dbl(exon_coords(ends), exon_coords(starts), ~ sum(.x - .y, na.rm = TRUE))
18 | }
19 | 
20 | df = generef %>%
21 |   as_tibble() %>%
22 |   # one transcript per gene symbol: the first row listed for each name2
23 |   distinct(name2, .keep_all = TRUE) %>%
24 |   mutate(chrom = gsub("chr","",chrom),
25 |          pos = sprintf("[%s,%s)",txStart,txEnd),
26 |          tx_size = txEnd - txStart,
27 |          cds_size = summed_exon_length(exonStarts, exonEnds)) %>%
28 |   select(gene_symbol = name2,
29 |          transcript_id = name,
30 |          chrom,
31 |          pos,
32 |          strand,
33 |          exons = exonCount,
34 |          tx_size,
35 |          cds_size) %>%
36 |   # autosomes plus X and Y only
37 |   filter(chrom %in% c(1:22,"X","Y")) %>%
38 |   left_join(genecov, by = c("gene_symbol" = "gene")) %>%
39 |   # NaN covariates become 0
40 |   mutate(expr = ifelse(is.nan(expr), 0, expr),
41 |          reptime = ifelse(is.nan(reptime), 0, reptime),
42 |          hic = ifelse(is.nan(hic), 0, hic)) %>%
43 |   arrange(gene_symbol)
44 | 
45 | dbWriteTable(con, Id(schema="ref",table="genes"), df, append=T)
46 | 


--------------------------------------------------------------------------------
/R/misc/seqz2DB.R:
--------------------------------------------------------------------------------
 1 | ### Push Sequenza segments and purity/ploidy estimates into the database
 2 | 
 3 | library(tidyverse)
 4 | library(DBI)
 5 | library(odbc)
 6 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")
 7 | 
 8 | seg <- read.delim("results/sequenza/glass_seqz_segments.tsv", as.is = TRUE)
 9 | pp <- read.delim("results/sequenza/glass_seqz_purity_ploidy.tsv", as.is = TRUE)
10 | 
11 | # Per-pair parameters; SLPP is stored under the lower-case name slpp.
12 | pp <- select(pp, pair_barcode, cellularity, ploidy, slpp = SLPP)
13 | 
14 | # Per-segment table. Chromosome 'X' is stored as 23, every other label goes through
15 | # as.integer(); each segment is written as a closed "[start,end]" text range.
16 | seg <- seg %>%
17 |   mutate(chrom = ifelse(chromosome=='X',23,as.integer(chromosome)),
18 |          pos = sprintf("[%s,%s]",start.pos,end.pos)) %>%
19 |   select(pair_barcode,
20 |          chrom,
21 |          pos,
22 |          baf = Bf,
23 |          baf_n = N.BAF,
24 |          baf_sd = sd.BAF,
25 |          ratio = depth.ratio,
26 |          ratio_n = N.ratio,
27 |          ratio_sd = sd.ratio,
28 |          copy_number = CNt,
29 |          major_cn = A,
30 |          minor_cn = B,
31 |          log_posterior_proba = LPP)
32 | 
33 | # Both tables are appended to.
34 | dbWriteTable(con, Id(schema="variants",table="seqz_seg"), seg, append=T)
35 | dbWriteTable(con, Id(schema="variants",table="seqz_params"), pp, append=T)


--------------------------------------------------------------------------------
/R/misc/st-jude-life-history-identification.R:
--------------------------------------------------------------------------------
 1 | #######################################################
 2 | # Identify the St. Jude pediatric brain tumors that have 
 3 | # available germline, diagnosis, and recurrent tumors.
 4 | # Date: 2018.05.14
 5 | # Author: Kevin J
 6 | #######################################################
 7 | 
 8 | # Project directory (hard-coded to the author's machine); the data path below is relative to it.
 9 | setwd("/Users/johnsk/Documents/Life-History/GLASS-WG/")
10 | StJude_dataset_path = "data/st-jude-data/StJude.20180511.xlsx"
11 | 
12 | #######################################################
13 | 
14 | library(tidyverse)
15 | library(openxlsx)
16 | 
17 | #######################################################
18 | 
19 | # Roel provided the life history working group a file containing the St. Jude data set to which we were
20 | # granted access. The first sheet is read, with the first row used as column names.
21 | StJude_avail_data = readWorkbook(StJude_dataset_path, sheet = 1, startRow = 1, colNames = TRUE)
22 | 
23 | ################################
24 | # The goal is to identify paired primary-recurrent samples that have WGS data.
25 | ################################
26 | # What datasets are available? (counts of rows per disease label)
27 | table(StJude_avail_data$sj_diseases) 
28 | 
29 | # HGG = High Grade Glioma. Keep WGS BAM rows, then drop rows whose path contains 'bai'.
30 | StJude_HGG = StJude_avail_data %>% 
31 |   filter(sequencing_type=="WGS" & sj_diseases=="HGG" & file_type=="BAM")
32 | StJude_HGG_bams = filter(StJude_HGG, !grepl('bai', file_path))
33 | table(StJude_HGG_bams$subject_name, StJude_HGG_bams$sample_type)
34 | # Author's reading of the table: 7 pairs, 3 with a sample at autopsy and 4 with samples at recurrence.
35 | 
36 | # LGG = Low Grade Glioma. Same filtering as for HGG.
37 | StJude_LGG = StJude_avail_data %>% 
38 |   filter(sequencing_type=="WGS" & sj_diseases=="LGG" & file_type=="BAM")
39 | StJude_LGG_bams = filter(StJude_LGG, !grepl('bai', file_path))
40 | table(StJude_LGG_bams$subject_name, StJude_LGG_bams$sample_type)
41 | # Author's reading of the table: 3 LGGs with primary and relapse.
42 | 
43 | # Might there be patients that started off with LGG and progressed to HGG? Pool both disease labels.
44 | StJude_glioma = StJude_avail_data %>% 
45 |   filter(sequencing_type=="WGS" & sj_diseases%in% c("LGG","HGG") & file_type=="BAM")
46 | StJude_glioma_bams = filter(StJude_glioma, !grepl('bai', file_path))
47 | StJude_glioma_table = table(StJude_glioma_bams$subject_name, StJude_glioma_bams$sample_type)
48 | 
49 | # Gather names of possible trios: subjects with more than two distinct sample types.
50 | StJude_glioma_trios = StJude_glioma_bams %>% 
51 |   group_by(subject_name) %>% summarise(Trio = n_distinct(sample_type)) %>% filter(Trio>2)
52 | StJude_glioma_trio_names = StJude_glioma_trios$subject_name
53 | 
54 | # Check available data for all trio names (all rows of those subjects, then WGS BAM rows only).
55 | StJude_glioma_trio_data = StJude_avail_data[StJude_avail_data$subject_name%in%StJude_glioma_trio_names, ]
56 | StJude_glioma_trio_data_wgs = StJude_glioma_trio_data %>% filter(sequencing_type=="WGS" & file_type=="BAM")
57 | 
58 | # Write out the list of files to be downloaded from St. Jude Cloud.
59 | write.csv(StJude_glioma_trio_data_wgs, "data/st-jude-data/st-jude_glioma_trio_data_wgs.csv")
60 | 
61 | 


--------------------------------------------------------------------------------
/R/misc/table-to-json-example.R:
--------------------------------------------------------------------------------
 1 | #######################################################
 2 | # Generate a metadata json file for Life-History sequencing samples.
 3 | # Date: 2018.05.21
 4 | # Authors: Kevin, Samir.
 5 | #######################################################
 6 | 
 7 | library(tidyverse)
 8 | library(jsonlite)
 9 | library(purrr)
10 | library(listviewer)
11 | 
12 | #######################################################
13 | # An example of tabular data containing read groups, file locations, and basic subject covariate information.
14 | metadf <- read_tsv("/Users/johnsk/Documents/Life-History/GLASS-WG/data/ref/table_to_json_test.tsv")
15 | 
16 | # Some columns did not need to be included in JSON.
17 | # This is a fabricated test that included a few BAM files 
18 | # from these libraries to test parsing ability.
19 | metajson <- metadf %>%
20 |   select(-one_of("Mate_ID")) %>%                           # drop the mate column
21 |   group_by(Patient_ID, Cohort, Sex, Age) %>%               # patient-level grouping
22 |   rename(fileNames = Fastq_Filenames) %>%
23 |   group_by(FlowCell_ID, Lane_ID, add = TRUE) %>%           # add flowcell/lane to the existing grouping
24 |   mutate(files = list(list(fileType, fileNames))) %>%      # list-column pairing file types with file names per group
25 |   ungroup() %>%
26 |   select(-one_of("fileType", "fileNames")) %>%             # now carried inside `files`
27 |   filter(!duplicated(files)) %>%                           # keep the first row for each distinct `files` value
28 |   nest(-Patient_ID, -Cohort, -Sex, -Age, .key = PatientLevel) %>%   # NOTE(review): legacy tidyr nest() interface (.key) - confirm against the installed tidyr
29 |   mutate(PatientLevel = purrr::map(PatientLevel, ~ .x %>%
30 |                                      group_by(Sample_Type) %>%
31 |                                      nest(.key = SampleLevel)))
32 | # Interactively assess list tree structure.
33 | listviewer::jsonedit(metajson)
34 | 
35 | # If opening in SublimeText use app (shift + cmd + p) to get in JSON pretty format.
36 | write_json(metajson, "/Users/johnsk/Documents/Life-History/GLASS-WG/data/metadata-json.json")
37 | 
38 | 


--------------------------------------------------------------------------------
/R/misc/titan2DB.R:
--------------------------------------------------------------------------------
 1 | ### Push TITAN segment files into the database (analysis.titan_seg), one file at a time
 2 | 
 3 | library(tidyverse)
 4 | library(DBI)
 5 | library(odbc)
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | 
 8 | segfiles <- list.files('results/cnv/titanfinal/seg', full.names = TRUE)
 9 | 
10 | # Read one segment file, rename its columns to the database names and append the rows.
11 | # The file name is logged first, and the function pauses for a second after each upload.
12 | push_seg_file <- function(path){
13 |   message(path)
14 |   raw <- read.delim(path, as.is=T, header=T, row.names = NULL)
15 |   # segment coordinates become a closed "[start,end]" text range
16 |   raw <- mutate(raw, pos = sprintf("[%s,%s]",Start_Position.bp.,End_Position.bp.))
17 |   segs <- select(raw,
18 |                  pair_barcode = Sample,
19 |                  chrom = Chromosome,
20 |                  pos,
21 |                  num_snp = Length.snp.,
22 |                  median_ratio = Median_Ratio,
23 |                  median_logr = Median_logR,
24 |                  titan_state = TITAN_state,
25 |                  titan_call = TITAN_call,
26 |                  copy_number = Copy_Number,
27 |                  major_cn = MajorCN,
28 |                  minor_cn = MinorCN,
29 |                  clonal_cluster = Clonal_Cluster,
30 |                  cellular_prevalence = Cellular_Prevalence,
31 |                  logr_copy_number = logR_Copy_Number,
32 |                  corrected_copy_number = Corrected_Copy_Number,
33 |                  corrected_call = Corrected_Call)
34 | 
35 |   dbWriteTable(con, Id(schema="analysis",table="titan_seg"), segs, append=T)
36 |   Sys.sleep(1)
37 | }
38 | 
39 | lapply(segfiles, push_seg_file)


--------------------------------------------------------------------------------
/R/misc/titanparams2db.R:
--------------------------------------------------------------------------------
 1 | ### push titan params to db (analysis.titan_params)
 2 | 
 3 | library(tidyverse)
 4 | library(DBI)
 5 | library(odbc)
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | # Every file in the params directory is listed (no pattern filter), with its full path.
 8 | paramfiles <- list.files('results/cnv/titanfinal/params', full.names = TRUE)
 9 | 
10 | ## Summarise one TITAN params table as a one-row data.frame (adapted from Gavin's script).
11 | ## `params`: one value column; rownames = parameter labels, colname = "<barcode>_cluster..." id.
12 | formatParams <- function(params){
13 |   value_of <- function(label) params[grepl(label, rownames(params)), 1]
14 |   id <- colnames(params)
15 |   barcode <- strsplit(id, "_cluster")[[1]][1]
16 |   # Prevalences arrive as one space-separated string; convert to numbers before rounding,
17 |   # because format(<character>, digits=) ignores `digits` and only pads to a common width.
18 |   cellPrev <- as.numeric(strsplit(trimws(value_of("Clonal cluster cellular prevalence")), " +")[[1]])
19 |   numClust <- length(cellPrev)
20 |   cellPrev <- paste0(signif(cellPrev, 4), collapse=",")
21 |   norm <- as.numeric(value_of("Normal contamination estimate"))
22 |   ploidy <- as.numeric(value_of("Average tumour ploidy estimate"))
23 |   data.frame(id=id, barcode=barcode, numClust=numClust, cellPrev=cellPrev, purity=1 - norm, norm=norm, ploidy=ploidy,
24 |              loglik=as.numeric(value_of("likelihood")), sdbw=as.numeric(value_of("S_Dbw validity index \\(Both\\)")),
25 |              stringsAsFactors = FALSE)
26 | }
27 | 
28 | # Read one params file (tab-separated, no header, first column used as row names) and
29 | # summarise it with formatParams(); the value column is named after the file.
30 | readTitanParams <- function(path) {
31 |   raw <- read.delim(path, header=F, row.names=1, stringsAsFactors=F, sep="\t")
32 |   colnames(raw) <- gsub(".params.txt", "", basename(path))
33 |   formatParams(raw)
34 | }
35 | datlist = lapply(paramfiles, readTitanParams)
36 | 
37 | # Stack the per-file summaries and rename the columns for the database.
38 | dat = as.data.frame(data.table::rbindlist(datlist))
39 | dat = select(dat,
40 |              pair_barcode = barcode,
41 |              num_clones = numClust,
42 |              cellular_prevalence = cellPrev,
43 |              purity,
44 |              normal_contamination = norm,
45 |              ploidy,
46 |              loglik,
47 |              sdbw)
48 | 
49 | dbWriteTable(con, Id(schema="analysis",table="titan_params"), dat, append = FALSE)
50 | 


--------------------------------------------------------------------------------
/R/neoantigens/analysis/SuppTable6_writetotext.r:
--------------------------------------------------------------------------------
 1 | #This script saves Supplementary Table 6 (generated using the neoantigen_peptide_counts.sql query) to a text file
 2 | #NOTE(review): the output file is named SuppTableS4.txt although the header says Table 6 - confirm which is current
 3 | #-----------------------------------------------------
 4 | 
 5 | library(DBI)
 6 | library(odbc)
 7 | library(ggplot2)
 8 | library(reshape)
 9 | 
10 | rm(list=ls())
11 | 
12 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
13 | 
14 | #read_file() is a readr function and readr is not attached by this script, so the query is read with base R.
15 | #Lines are re-joined with newlines so that any "--" SQL comments still end at their line break.
16 | sql_path <- "sql/neoag/neoantigen_peptide_counts.sql"
17 | query <- paste(readLines(sql_path, warn = FALSE), collapse = "\n")
18 | res <- dbGetQuery(con, query)
19 | 
20 | write.table(res,"/projects/varnf/GLASS/Figures/resubmission/final/SuppTableS4.txt",sep="\t",quote=F,row.names=F)


--------------------------------------------------------------------------------
/R/neoantigens/analysis/neoag_depletion_hla_count.r:
--------------------------------------------------------------------------------
 1 | #Correlates each sample's number of unique HLA alleles with its neoantigen depletion value (rneo)
 2 | #The query counts the distinct HLA alleles per aliquot in analysis.neoantigens_by_aliquot and joins the count to analysis.neoantigen_depletion
 3 | #Three Spearman correlations are run: all samples (reported in manuscript), initial samples only, recurrent samples only
 4 | #-----------------------------------------------------
 5 | 
 6 | library(DBI)
 7 | library(odbc)
 8 | library(ggplot2)
 9 | library(reshape)
10 | 
11 | rm(list=ls())
12 | 
13 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
14 | 
15 | q = "WITH hla_table AS
16 | (
17 | 	SELECT aliquot_barcode, hla_allele
18 | 	FROM analysis.neoantigens_by_aliquot
19 | 	GROUP BY aliquot_barcode, hla_allele
20 | ),
21 | hla_tot AS
22 | (
23 | 	SELECT aliquot_barcode, COUNT(*) AS hla_num
24 | 	FROM hla_table
25 | 	GROUP BY aliquot_barcode
26 | )
27 | 
28 | SELECT nd.*,hla.hla_num
29 | FROM analysis.neoantigen_depletion nd
30 | INNER JOIN hla_tot hla ON hla.aliquot_barcode = nd.aliquot_barcode
31 | ORDER BY rneo"
32 | 
33 | res <- dbGetQuery(con,q)
34 | 
35 | #The allele count is converted to a plain numeric before correlating
36 | res$hla_num <- as.numeric(res$hla_num)
37 | 
38 | #All samples (values recorded by the author: R = 0.29, P = 2.1e-9)
39 | cor.test(res[,"rneo"],res[,"hla_num"],method="s")
40 | 
41 | #Split on the sample-type token found in the first result column
42 | #(assumes that column holds the aliquot barcode - confirm against the table definition)
43 | first_col <- res[,1]
44 | pri <- res[grepl("-TP-",first_col),]
45 | rec <- res[grepl("-R1-|-R2-|-R3-|-R4-",first_col),]
46 | 
47 | #Initial only (recorded: R = 0.23, P = 5e-4), then recurrent only (recorded: R = 0.32, P = 5.6e-7)
48 | cor.test(pri[,"rneo"],pri[,"hla_num"],method="s")
49 | cor.test(rec[,"rneo"],rec[,"hla_num"],method="s")
50 | 


--------------------------------------------------------------------------------
/R/neoantigens/analysis/neoantigen_depletion_subclonal_selection.r:
--------------------------------------------------------------------------------
 1 | #Compares the observed-to-expected neoantigen ratios between samples classified as "selected" (S) and "neutral" (N) by the subclonalSelection method
 2 | #Query at the top joins analysis.neoantigen_depletion table to analysis.subclonalselection table (and others)
 3 | #Comparisons are made per subtype for initial tumors, for recurrent tumors, and for both pooled
 4 | #No significant associations
 5 | #This analysis is reported in the manuscript
 6 | #------------------------------------------------------------------------------
 7 | 
 8 | library(odbc)
 9 | library(DBI)
10 | library(ggplot2)
11 | 
12 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")  
13 | 
14 | q <- "SELECT gs.* , nd1.rneo AS nd_a, nd2.rneo AS nd_b, clin.idh_codel_subtype AS subtype,sc1.most_probable_classification AS neut1, sc2.most_probable_classification AS neut2, sc1.probability_neutral AS prob1, sc2.probability_neutral AS prob2
15 | FROM analysis.gold_set gs
16 | LEFT JOIN analysis.neoantigen_depletion nd1 ON nd1.aliquot_barcode = gs.tumor_barcode_a
17 | LEFT JOIN analysis.neoantigen_depletion nd2 ON nd2.aliquot_barcode = gs.tumor_barcode_b
18 | LEFT JOIN analysis.mut_freq mf1 ON mf1.aliquot_barcode = gs.tumor_barcode_a
19 | LEFT JOIN analysis.mut_freq mf2 ON mf2.aliquot_barcode = gs.tumor_barcode_b
20 | LEFT JOIN clinical.subtypes clin ON clin.case_barcode = gs.case_barcode
21 | LEFT JOIN analysis.subclonalselection sc1 ON sc1.aliquot_barcode = gs.tumor_barcode_a
22 | LEFT JOIN analysis.subclonalselection sc2 ON sc2.aliquot_barcode = gs.tumor_barcode_b
23 | WHERE nd1.rneo IS NOT NULL AND nd2.rneo IS NOT NULL AND (nd1.nobs >= 3 AND nd2.nobs >= 3) --AND
24 | --sc1.most_probable_classification IS NOT NULL AND sc2.most_probable_classification IS NOT NULL
25 | ORDER BY nd1.rneo"
26 | 
27 | res <- dbGetQuery(con, q)
28 | 
29 | #Run one selected-vs-neutral t-test and print it under a label.
30 | #Inside a for loop nothing is auto-printed, so the result has to be printed explicitly;
31 | #a test that cannot be computed (e.g. too few observations) is reported and skipped
32 | #instead of aborting the remaining subtypes.
33 | report_ttest <- function(label, selected, neutral)
34 | {
35 | 	cat("\n", label, "\n", sep="")
36 | 	out <- tryCatch(t.test(selected, neutral), error=function(e) e)
37 | 	if(inherits(out, "error"))
38 | 	{
39 | 		message("  t-test not computed: ", conditionMessage(out))
40 | 		return(invisible(NULL))
41 | 	}
42 | 	print(out)
43 | 	invisible(out)
44 | }
45 | 
46 | subtypes <- unique(res[,"subtype"])
47 | for(subtype in subtypes)
48 | {
49 | 	sub_res <- res[which(res[,"subtype"]==subtype),]
50 | 	pri_s <- sub_res[which(sub_res[,"neut1"]=="S"),"nd_a"]
51 | 	pri_n <- sub_res[which(sub_res[,"neut1"]=="N"),"nd_a"]
52 | 	
53 | 	rec_s <- sub_res[which(sub_res[,"neut2"]=="S"),"nd_b"]
54 | 	rec_n <- sub_res[which(sub_res[,"neut2"]=="N"),"nd_b"]
55 | 	
56 | 	report_ttest(paste0(subtype, ": initial tumors, selected vs neutral"), pri_s, pri_n)
57 | 	report_ttest(paste0(subtype, ": recurrent tumors, selected vs neutral"), rec_s, rec_n)
58 | 	
59 | 	#initial and recurrent values pooled
60 | 	s1 <- c(pri_s,rec_s)
61 | 	n1 <- c(pri_n,rec_n)
62 | 	report_ttest(paste0(subtype, ": initial + recurrent, selected vs neutral"), s1, n1)
63 | }
64 | 


--------------------------------------------------------------------------------
/R/neoantigens/upload/cibersort_table.r:
--------------------------------------------------------------------------------
 1 | #Code to upload the CIBERSORT data from the Wang et al Cancer Cell paper (PMID: 28697342)
 2 | #Produces a table in long format that has a row for each aliquot/cell combination
 3 | #This table is used to make Extended Data Figure 12C
 4 | #Manually fixes the name of TCGA-14-1402 to match up with the db
 5 | #-----------------------------------------------------
 6 | 
 7 | library(DBI)
 8 | library(odbc)
 9 | library(reshape)
10 | 
11 | rm(list=ls())
12 | 
13 | cibersort <- read.delim("/projects/varnf/GLASS/data/CIBERSORT/CIBERSORT_cancer_cell.txt",sep="\t",header=T,stringsAsFactors=F)
14 | mapping_table <- read.delim("/projects/varnf/GLASS/data/CIBERSORT/cancer_cell_RNAseq_mapping.txt",sep="\t",header=T,stringsAsFactors=F)
15 | myoutf <- "/projects/varnf/GLASS/data/CIBERSORT/CIBERSORT_GLASS_format.txt"
16 | 
17 | #Add information for TCGA-14-1402
18 | mapping_table[which(mapping_table[,"SampleId"]=="TCGA.14.1402.01"),"GLSS_barcodeTP"] <- "TCGA-14-1402-TP"
19 | mapping_table[which(mapping_table[,"SampleId2"]=="TCGA.14.1402.02A"),"GLSS_barcodeR1"] <- "TCGA-14-1402-R1"
20 | 
21 | #Dashes in the second column become dots, the form used by the mapping table's SampleId columns
22 | cibersort[,2] <- gsub("-",".",cibersort[,2])
23 | 
24 | #Lookup vector: paper sample id -> GLASS sample barcode (initial and first-recurrence ids combined)
25 | mapping <- c(mapping_table[,"GLSS_barcodeTP"],mapping_table[,"GLSS_barcodeR1"])
26 | names(mapping) <- c(mapping_table[,"SampleId"],mapping_table[,"SampleId2"])
27 | 
28 | ordered_names <- mapping[cibersort[,"SampleId"]]
29 | cibersort[,"sample_barcode"] <- ordered_names
30 | 
31 | #Drop unmapped samples and samples outside the data freeze.
32 | #Logical filters are used instead of x[-which(cond),]: when no row matches, which() returns
33 | #integer(0) and negative indexing with it removes EVERY row instead of none.
34 | cibersort <- cibersort[!is.na(cibersort[,"sample_barcode"]),]
35 | cibersort <- cibersort[cibersort[,"sample_barcode"]!="not in data freeze",]
36 | 
37 | #Keep columns 3 to 24 (NOTE(review): presumably the cell-type fractions - confirm) and put the barcode first
38 | rownames(cibersort) <- cibersort[,"sample_barcode"]
39 | cibersort <- cibersort[,3:24]
40 | cibersort <- cbind(rownames(cibersort),cibersort)
41 | colnames(cibersort)[1] <- "sample_barcode"
42 | colnames(cibersort) <- gsub("\\.","",colnames(cibersort))
43 | 
44 | #Wide -> long
45 | cibersort <- melt(cibersort)
46 | 
47 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
48 | dbWriteTable(con, Id(schema="analysis",table="cibersort"), cibersort, overwrite=TRUE, row.names=FALSE)
49 | 
50 | write.table(cibersort, myoutf,sep="\t",quote=F,row.names=F)
51 | 


--------------------------------------------------------------------------------
/R/preprocess/README.md:
--------------------------------------------------------------------------------
1 | ### Basic preprocessing 
2 | 
3 | These scripts are **not essential** for preprocessing, but rather they relate to quality assessment or data collection. There are likely to be many files specified that cannot be found in this repository. This code is being maintained here largely as a reference point for the Verhaak laboratory and for parties interested in how some of the sequencing metrics were generated.
4 | 


--------------------------------------------------------------------------------
/R/preprocess/add_aligned_bam_to_files.R:
--------------------------------------------------------------------------------
 1 | ## Register the aligned GLSS-MG BAM files in analysis.files, then dump biospecimen.aliquots to CSV
 2 | 
 3 | library(odbc)
 4 | library(DBI)
 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 6 | 
 7 | bqsr_dir = "/projects/verhaak-lab/GLASS-analysis/results/align/bqsr"
 8 | bamfiles = list.files(bqsr_dir, pattern = "^GLSS-MG.*bam$", full.names = T)
 9 | md5files = list.files(bqsr_dir, pattern = "^GLSS-MG.*md5$", full.names = T)
10 | 
11 | # Size of every BAM and content of every md5 file.
12 | # The two listings are matched purely by position in their respective listings.
13 | filesizes = sapply(bamfiles, function(path) file.info(path)[["size"]])
14 | filemd5s  = sapply(md5files, function(path) readLines(path, warn=F))
15 | 
16 | # One row per BAM; the aliquot barcode is the file name without its .realn.mdup.bqsr.bam suffix.
17 | bam_names = basename(bamfiles)
18 | files_add = data.frame(aliquot_barcode = gsub(".realn.mdup.bqsr.bam", "", bam_names),
19 |                        file_name = bam_names,
20 |                        file_size = unname(filesizes),
21 |                        file_md5sum= unname(filemd5s),
22 |                        file_format = "aligned BAM",
23 |                        file_path = bamfiles,
24 |                        stringsAsFactors = F)
25 | 
26 | dbWriteTable(con, Id(schema="analysis",table="files"), files_add, append=T)
27 | 
28 | # Export the aliquots table to the working directory.
29 | tmp = dbReadTable(con, Id(schema="biospecimen",table="aliquots"))
30 | write.csv(tmp, file = "aliquots.csv")
31 | 


--------------------------------------------------------------------------------
/R/preprocess/aliquots-coverage-metrics.R:
--------------------------------------------------------------------------------
 1 | #######################################################
 2 | # Enumerate cumulative coverage per aliquot for WGS/WXS
 3 | # Date: 2018.11.06 
 4 | # Author: Kevin J.
 5 | #######################################################
 6 | 
 7 | # Directory for GLASS analysis.
 8 | # NOTE(review): mybasedir is defined but never used; datadir is resolved against the working directory.
 9 | mybasedir = '/Volumes/verhaak-lab/GLASS-analysis/'
10 | datadir  = 'results/align/wgsmetrics/'
11 | pattern   = '.WgsMetrics.txt$'
12 | 
13 | #######################################################
14 | 
15 | # Necessary packages:
16 | library(parallel)
17 | library(tidyverse)
18 | library(data.table)
19 | library(DBI)
20 | 
21 | #######################################################
22 | # Establish connection with the database.
23 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
24 | 
25 | ## List every "*.WgsMetrics.txt" file below datadir.
26 | files = list.files(datadir, full.names = T, pattern = pattern, recursive=T)
27 | 
28 | # If it is desirable to include the sample names (library type = characters 21-23 of the file name).
29 | samples = data.frame(sample_id=gsub(".WgsMetrics.txt", "", basename(files)), library_type = substring(basename(files), 21, 23))
30 | 
31 | # The first 10 rows of each file represent a header of additional information.
32 | cov_dat = mclapply(files, function(f){
33 |   dat = tryCatch(read.delim(f,as.is=T, header=T, row.names = NULL, skip = 10), error=function(e) e)
34 |   if(inherits(dat,'error')) {
35 |     # Unreadable file: report it and contribute nothing (NULL entries are skipped by rbindlist).
36 |     message(f, '\n', dat, '\n')
37 |     return()
38 |   }
39 |   # Truncate the file name to just the sample_id.
40 |   # Counts are cast to numeric: read.delim yields integers, and the cumulative sum below
41 |   # exceeds the integer range for whole genomes, which would turn the totals into NA.
42 |   dat = dat %>%
43 |     mutate(sample_id = gsub(".WgsMetrics.txt", "", basename(f)),
44 |            high_quality_coverage_count = as.numeric(high_quality_coverage_count))
45 |   
46 |   return(dat)
47 |   
48 | }, mc.cores=20)
49 | 
50 | ## Combine all the samples from the GLASS cohort.
51 | glass_cov = data.table::rbindlist(cov_dat)
52 | 
53 | # Cumulatively add the number of bases at each level:
54 | # reverse cumulative sum, i.e. per row the count at this row plus all rows after it.
55 | glass_samples_cumulative_cov = glass_cov %>% 
56 |   group_by(sample_id) %>% 
57 |   mutate(cumulative_coverage = rev(cumsum(rev(high_quality_coverage_count)))) %>% 
58 |   # Rename to the database column names.
59 |   select(aliquot_barcode = sample_id, coverage, high_quality_coverage_count, cumulative_coverage) 
60 |   
61 | 
62 | # Total number should be 1166 (2019.03.08).
63 | n_distinct(glass_samples_cumulative_cov$aliquot_barcode)
64 | 
65 | # Write output as one table or a table for each file:
66 | # write.table(glass_samples_cumulative_cov, file = "/Users/johnsk/Documents/Life-History/GLASS-WG/data/ref/glass-cumulative-coverage.txt", sep="\t", row.names = F, col.names = T, quote = F)
67 | 
68 | # Append the cumulative coverage rows to the database.
69 | dbWriteTable(con, Id(schema="analysis",table="coverage"), glass_samples_cumulative_cov, append=T)
70 | 


--------------------------------------------------------------------------------
/R/preprocess/crosscheckmetricscluster.R:
--------------------------------------------------------------------------------
 1 | ## Exploratory clustering of the fingerprinting crosscheck metrics
 2 | library(tidyr)   # provides %>% and spread(), which were used below without any package attached
 3 | 
 4 | tmp = read.delim("/fastscratch/verhaak-lab/GLASS-WG/results/fingerprinting/GLASS-WG.crosscheck_metrics", skip = 6)
 5 | 
 6 | # hclust() needs a dissimilarity object, not a data frame, so cluster on the distances computed here.
 7 | # NOTE(review): columns 3, 4 and 7 are picked by position - confirm they are the intended numeric metrics.
 8 | d = dist(tmp[,c(3,4,7)])
 9 | fit = hclust(d)
10 | plot(as.dendrogram(fit), horiz=T)
11 | 
12 | # Wide matrix: one row per LEFT_GROUP_VALUE, one column per RIGHT_GROUP_VALUE, cells = LOD_SCORE.
13 | x=tmp[,c(1,2,5)] %>% spread(RIGHT_GROUP_VALUE, LOD_SCORE)
14 | rownames(x) = x$LEFT_GROUP_VALUE
15 | x$LEFT_GROUP_VALUE = NULL
16 | x = as.matrix(x)
17 | 
18 | # How many cells of the matrix are missing?
19 | table(is.na(x))
20 | 
21 | # Cluster the groups on their LOD-score profiles. The former extra call hclust(tmp[,c(1,2,5)])
22 | # passed the long-format table itself (not a dist object) and could not run, so it is dropped.
23 | fit = hclust(dist(x))
24 | 
25 | plot(fit)
26 | 


--------------------------------------------------------------------------------
/R/preprocess/vcf_aliquot_qc.R:
--------------------------------------------------------------------------------
 1 | ## QC: check that the samples in each filtered Mutect2 VCF belong to the case the file is named after
 2 | library(VariantAnnotation)
 3 | library(DBI)
 4 | library(odbc)
 5 | 
 6 | rm(list=ls())
 7 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 8 | q <- "SELECT * FROM biospecimen.aliquots"
 9 | aliquots <- dbGetQuery(con,q)
10 | # case barcode = first three dash-separated fields of the sample barcode
11 | aliquots[,"case_barcode"] <- sapply(strsplit(aliquots[,"sample_barcode"],"-"),function(x)paste(x[1:3],collapse="-"))
12 | 
13 | myDir1 <- "/projects/verhaak-lab/GLASS-analysis/results/mutect2/m2filter"
14 | 
15 | mytag <- dir(myDir1)
16 | mytag <- mytag[grep("filtered.vcf.gz$",mytag)]
17 | # file.path() yields an empty vector when no file matches (paste() would return one bogus path)
18 | vcff <- file.path(myDir1,mytag)
19 | mytag <- gsub(".filtered.vcf.gz","",mytag)
20 | 
21 | # Per file: number of samples, number of samples whose case prefix equals the file tag, and the
22 | # fraction of the case's database aliquots present in the VCF. Plain vectors replace the former
23 | # fixed 15-column matrix, which could not hold files with 15 or more samples.
24 | n_samples <- setNames(integer(length(vcff)), mytag)
25 | n_case_match <- setNames(integer(length(vcff)), mytag)
26 | aliquot_match <- rep(0,length(vcff))
27 | for(i in seq_along(vcff))
28 | {
29 | 	cat("\r",i)
30 | 	# only the sample names are needed, so read the header instead of parsing the whole VCF
31 | 	samp_names <- samples(scanVcfHeader(vcff[i]))
32 | 	case_names <- sapply(strsplit(samp_names,"-"),function(x)paste(x[1:3],collapse="-"))
33 | 	
34 | 	n_samples[i] <- length(samp_names)
35 | 	n_case_match[i] <- sum(case_names == mytag[i])
36 | 	
37 | 	sub_aliquots <- aliquots[which(aliquots[,"case_barcode"]==mytag[i]),]
38 | 	aliquot_match[i] <- sum(samp_names %in% sub_aliquots[,"aliquot_barcode"])/nrow(sub_aliquots)
39 | }
40 | 
41 | # TRUE for a file when every sample in it carries the file's case barcode
42 | all_from_case <- n_case_match == n_samples
43 | all_from_case
44 | all(all_from_case)
45 | aliquot_match
46 | 
47 | 


--------------------------------------------------------------------------------
/R/snakemake/cov2db.R:
--------------------------------------------------------------------------------
 1 | #######################################################
 2 | # Enumerate cumulative coverage per aliquot for WGS/WXS
 3 | # Date: 2018.11.06 
 4 | # Author: Kevin J., FP Barthel
 5 | #######################################################
 6 | 
 7 | # Avoid scientific notation in the written counts.
 8 | options(scipen=999)
 9 | 
10 | ## Parse snakemake
11 | if(exists("snakemake")) {
12 |   files = snakemake@input[["metrics"]]
13 |   outfn = snakemake@output[["tsv"]]
14 | } else {
15 |   # Stand-alone run: pick up every WgsMetrics file below the results directory.
16 |   files = list.files("results/align/wgsmetrics", recursive = T, pattern = "WgsMetrics.txt", full.names = T) # list("results/align/wgsmetrics/GLSS-DK-0012-NB-01D-WXS-ABCB18.WgsMetrics.txt", "results/align/wgsmetrics/GLSS-DK-0003-TP-01D-WXS-E43D26.WgsMetrics.txt")
17 |   # outfn used to be undefined in this branch, so the final write.table() failed.
18 |   # "" makes write.table() print the table to the console instead of a file.
19 |   outfn = ""
20 | }
21 | 
22 | # Necessary packages:
23 | library(parallel)
24 | library(tidyverse)
25 | library(data.table)
26 | library(DBI)
27 | 
28 | # The first 10 rows of each file represent a header of additional information.
29 | cov_dat = lapply(files, function(f){
30 |   dat = tryCatch(read.delim(f,as.is=T, header=T, row.names = NULL, skip = 10), error=function(e) e)
31 |   if(inherits(dat,'error')) {
32 |     # Unreadable file: report it and contribute nothing (NULL entries are skipped by rbindlist).
33 |     message(f, '\n', dat, '\n')
34 |     return()
35 |   }
36 |   # Truncate the file name to just the sample_id; counts are cast to numeric before summing.
37 |   dat = dat %>%
38 |     mutate(sample_id = gsub(".WgsMetrics.txt", "", basename(f)),
39 |            high_quality_coverage_count = as.numeric(high_quality_coverage_count)) # %>%  
40 |   #    filter(coverage!="0") # Filter out those bases with `0` coverage.
41 |   
42 |   return(dat)
43 | })#, mc.cores=20)
44 | 
45 | ## Combine all the samples from the GLASS cohort.
46 | glass_cov = data.table::rbindlist(cov_dat)
47 | 
48 | # Cumulatively add the number of bases at each level:
49 | # reverse cumulative sum, i.e. per row the count at this row plus all rows after it.
50 | glass_samples_cumulative_cov = glass_cov %>% 
51 |   group_by(sample_id) %>% 
52 |   mutate(cumulative_coverage = rev(cumsum(rev(high_quality_coverage_count)))) %>% 
53 |   select(aliquot_barcode = sample_id, coverage, high_quality_coverage_count, cumulative_coverage)
54 | 
55 | # Write one headerless, tab-separated table for all files.
56 | write.table(glass_samples_cumulative_cov, file = outfn, quote = F, sep = "\t", row.names = FALSE, col.names = FALSE)


--------------------------------------------------------------------------------
/R/snakemake/pyclone_create_tsv.R:
--------------------------------------------------------------------------------
 1 | ## Export the mutation table of one aliquot as a tab-separated file (with header).
 2 | ## The query in sql/pyclone/pyclone_create_tsv.sql is run for the aliquot's case
 3 | ## (first 12 characters of the barcode) and then filtered down to the aliquot.
 4 | library(tidyverse)
 5 | library(DBI)
 6 | 
 7 | .libPaths("/home/barthf/R/x86_64-pc-linux-gnu-library/3.3")
 8 | dyn.load("/projects/verhaak-lab/verhaak_env/anaconda/v4.2.0/envs/rvenv2018/lib/libodbc.so")
 9 | ## database connection
10 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2b")
11 | 
12 | ## input/output parameters
13 | barcode   <- snakemake@wildcards[["aliquot_barcode"]]
14 | tsv       <- snakemake@output[["tsv"]]
15 | 
16 | ## Logging
17 | message("Processing ", barcode)
18 | 
19 | ## process parameters
20 | case_barcode  <- substring(barcode,1,12)
21 | 
22 | ## Fetch data from DB
23 | rs <- dbSendQuery(con,read_file("sql/pyclone/pyclone_create_tsv.sql"))
24 | dbBind(rs, list(case_barcode))
25 | qres <- dbFetch(rs)
26 | 
27 | ## Release the result set and the connection as soon as the data is in memory
28 | dbClearResult(rs)
29 | dbDisconnect(con)
30 |   
31 | df <- qres %>%
32 |   filter(aliquot_barcode == barcode) %>% 
33 |   select(mutation_id,ref_counts,var_counts,normal_cn,minor_cn,major_cn)
34 |   
35 | write.table(df, file = tsv, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)


--------------------------------------------------------------------------------
/R/snakemake/seg2db.R:
--------------------------------------------------------------------------------
 1 | #######################################################
 2 | # Segments to database
 3 | #
 4 | # Reads each *.called.seg file (lines starting with "@" are skipped as
 5 | # comments), renames the segment columns and writes all segments as one
 6 | # tab-separated table without a header row.
 7 | #######################################################
 8 | 
 9 | options(scipen=999)
10 | 
11 | ## Parse snakemake
12 | if(exists("snakemake")) {
13 |   files = snakemake@input[["seg"]]
14 |   outfn = snakemake@output[["tsv"]]
15 | } else {
16 |   ## Interactive fallback: use at most the first 10 files (head() does not
17 |   ## pad with NA when fewer exist) and write to a temporary file, because no
18 |   ## output path is available without a snakemake object.
19 |   files = head(list.files("results/cnv/callsegments/", recursive = T, pattern = "called.seg", full.names = T), 10)
20 |   outfn = tempfile(pattern = "seg2db_", fileext = ".tsv")
21 |   message("snakemake object not found; writing output to ", outfn)
22 | }
23 | 
24 | # Necessary packages:
25 | library(parallel)
26 | library(tidyverse)
27 | library(data.table)
28 | library(DBI)
29 | 
30 | segs = lapply(files, function(f){
31 |   dat <- read.delim(f, comment.char = "@", as.is= TRUE)
32 |   # aliquot_barcode is the first 30 characters of the file name;
33 |   # pos is formatted as "[START,END]".
34 |   dat <- dat %>%
35 |     mutate(aliquot_barcode = substr(basename(f),1,30), pos = sprintf("[%s,%s]", START, END)) %>%
36 |     select(aliquot_barcode, chrom = CONTIG, pos, num_points = NUM_POINTS_COPY_RATIO, log2_copy_ratio = MEAN_LOG2_COPY_RATIO, call = CALL)
37 |   return(dat)
38 | })
39 | segs <- data.table::rbindlist(segs) %>% as.data.frame()
40 | 
41 | write.table(segs, file = outfn, quote = F, sep = "\t", row.names = FALSE, col.names = FALSE)


--------------------------------------------------------------------------------
/R/snakemake/snv2db.R:
--------------------------------------------------------------------------------
 1 | library(VariantAnnotation)
 2 | library(stringr)
 3 | # Flatten the FUNCOTATION annotation of a VCF (snakemake input "vcf") into a headerless TSV (snakemake output "tsv").
 4 | vcff = snakemake@input[["vcf"]]
 5 | tsvf = snakemake@output[["tsv"]]
 6 | 
 7 | vcf <- readVcf(vcff)
 8 | # Field names: third column of the FUNCOTATION INFO header entry, text after ": ", split on "|".
 9 | funcolumns <- unlist(strsplit(unlist(strsplit(info(header(vcf))['FUNCOTATION',3], '\\: '))[2],'\\|'))
10 | funcotation <- as.data.frame(do.call('rbind', str_split(gsub("^\\[|\\]$","",as.character(info(vcf)[,'FUNCOTATION'])), "\\|"))) # strip the enclosing [ ] and split each record's value on "|"
11 | colnames(funcotation) <- funcolumns
12 | # NOTE(review): assumes one funcotation per record with as many fields as the header lists - confirm for multi-allelic sites.
13 | df <- data.frame(chrom = gsub("^chr","",as.character(seqnames(vcf))),
14 |                  pos = sprintf("[%s,%s]", start(vcf), end(vcf)),
15 |                  ref = ref(vcf),
16 |                  alt = unstrsplit(CharacterList(alt(vcf)), sep=","),
17 |                  gene_symbol = funcotation$Gencode_19_hugoSymbol,
18 |                  variant_classification = funcotation$Gencode_19_variantClassification,
19 |                  secondary_variant_classification = funcotation$Gencode_19_secondaryVariantClassification,
20 |                  variant_type = funcotation$Gencode_19_variantType,
21 |                  genome_change = funcotation$Gencode_19_genomeChange,
22 |                  transcript = funcotation$Gencode_19_annotationTranscript,
23 |                  transcript_strand = funcotation$Gencode_19_transcriptStrand,
24 |                  transcript_exon = funcotation$Gencode_19_transcriptExon,
25 |                  transcript_position = funcotation$Gencode_19_transcriptPos,
26 |                  cdna_change = funcotation$Gencode_19_cDnaChange,
27 |                  cds_change = funcotation$Gencode_19_codonChange,
28 |                  protein_change = funcotation$Gencode_19_proteinChange,
29 |                  gc_content = funcotation$Gencode_19_gcContent, 
30 |                  reference_context = funcotation$Gencode_19_referenceContext,
31 |                  stringsAsFactors = FALSE)
32 | # No header row is written; columns appear in the order of the data.frame above.
33 | write.table(df, file = tsvf, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
34 | 


--------------------------------------------------------------------------------
/R/snakemake/vep_upload.r:
--------------------------------------------------------------------------------
 1 | #This script takes the output of the annotate_vep rule in the Snakemake mutect2-post.smk module and reformats it for uploading to the db (variants.vep table)
 2 | #Additionally generates a .tsv file for backup
 3 | #-----------------------------------------------------
 4 | 
 5 | library(VariantAnnotation)
 6 | 
 7 | library(ensemblVEP)
 8 | library(tidyverse)
 9 | library(DBI)
10 | 
11 | 
12 | setwd('/projects/varnf/GLASS/GLASS/')
13 | 
14 | ## Input/output paths (hard-coded, relative to the working directory set above)
15 | maff = "results/mutect2/annoconsensusvcf/consensus.normalized.sorted.vep.maf"
16 | vcff = "results/mutect2/consensusvcf/consensus.normalized.sorted.vcf.gz"
17 | tsvf = "results/mutect2/maf2db/consensus.normalized.sorted.vep.tsv"
18 | 
19 | vcf = readVcf(vcff, "hg19")
20 | maf = read.delim(maff, as.is = T, comment.char = '#')
21 | 
22 | message("Read file ", basename(vcff))
23 | message("Read file ", basename(maff))
24 | 
25 | ## Coordinates/alleles come from the VCF and annotation from the MAF, combined
26 | ## row by row, so both files must hold the same number of records.
27 | if(nrow(maf) != length(vcf))
28 |   stop("Record count mismatch: ", basename(vcff), " has ", length(vcf), " records, ", basename(maff), " has ", nrow(maf))
29 | 
30 | df = data.frame(chrom = as.character(seqnames(vcf)),
31 | 				pos = sprintf("[%s,%s]", start(vcf), end(vcf)),
32 |                 ref = ref(vcf),
33 |                 alt = unstrsplit(CharacterList(alt(vcf)), sep=","),
34 |                 gene_id = maf$Gene,
35 |                 gene_symbol = maf$Hugo_Symbol,
36 |                 variant_classification = maf$Variant_Classification,
37 |                 variant_type = maf$Variant_Type,
38 |                 cdna_position = maf$cDNA_position,
39 |                 cds_position = maf$CDS_position,
40 |                 protein_position = maf$Protein_position,
41 |                 amino_acids = maf$Amino_acids, 
42 |                 codons = maf$Codons,
43 |                 hgvs_c = maf$HGVSc,
44 |                 hgvs_p = maf$HGVSp_Short,
45 |                 polyphen = maf$PolyPhen,
46 |                 sift = maf$SIFT,
47 |                 stringsAsFactors = F)
48 | 
49 | #Change chromosome X to chromosome 23
50 | #NOTE(review): any other non-numeric contig name (e.g. Y or MT) becomes NA in as.numeric() below - confirm none are expected
51 | df[which(df[,"chrom"]=='X'),"chrom"] <- 23
52 | df[,"chrom"] = as.numeric(df[,"chrom"])
53 | 
54 | #Manual edit to match GLASS variant_classifications table; this is now done in SQL
55 | #df[which(df[,"variant_classification"]=="Splice_Region"),"variant_classification"]  <- "Splice_Site"
56 | 
57 | write.table(df, file = tsvf, quote = F, sep = "\t", row.names = F, col.names = T)
58 | 
59 | message("Wrote output ", basename(tsvf))
60 | 
61 | #Replace the variants.vep table with the new data, then release the connection
62 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
63 | dbWriteTable(con, Id(schema="variants",table="vep"), df, overwrite=TRUE)
64 | dbDisconnect(con)
65 | 
66 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/R/snv/README.md:
--------------------------------------------------------------------------------
1 | ### Mutation analysis scripts
2 | 
3 | These R scripts represent **ongoing** mutation data exploration and analyses. Note that not all files specified here are available for download.
4 | 
5 | Mutect2 calls generated from the GLASS dataset are available on Synapse. 
6 | 


--------------------------------------------------------------------------------
/R/snv/archive/ensembl_genes_to_db.R:
--------------------------------------------------------------------------------
 1 | ## Download the Ensembl gene ID / HGNC symbol mapping for human genes from
 2 | ## biomaRt and store it in the ref.ensembl_genes table.
 3 | library(biomaRt)
 4 | library(DBI)
 5 | 
 6 | ## Select the human gene dataset of the "ensembl" mart
 7 | mart = useDataset("hsapiens_gene_ensembl", mart = useMart("ensembl"))
 8 | 
 9 | ## Two-column table: ensembl_gene_id, hgnc_symbol
10 | gene_map = getBM(attributes = c('ensembl_gene_id','hgnc_symbol'), mart = mart)
11 | 
12 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")
13 | dbWriteTable(con, Id(schema='ref',table='ensembl_genes'), gene_map)
10 | 


--------------------------------------------------------------------------------
/R/snv/archive/mf_longitudinal.R:
--------------------------------------------------------------------------------
 1 | # Plot mf against time, faceted by hypermutator_status, from the result of sql/mf_longitudinal_analysis.sql.
 2 | library(tidyverse)
 3 | library(DBI)
 4 | 
 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 6 | dat <- dbGetQuery(con, read_file("sql/mf_longitudinal_analysis.sql"))
 7 | # Reshape: columns time_birth..mf_recurrence are stacked, their names split into "var" and "descriptor", then each var is spread back into its own column.
 8 | dat <- dat %>% 
 9 |   gather(v, value, time_birth:mf_recurrence) %>%
10 |   separate(v, c("var", "descriptor")) %>%
11 |   spread(var, value)
12 | # aes(group = 1) overrides the per-pair grouping for the linear fit; coord_cartesian limits the visible y range to 0-10.
13 | p <- ggplot(data = dat, aes(x = time, y = mf, group = tumor_pair_barcode, color = descriptor)) +
14 |   #geom_line() +
15 |   stat_smooth(aes(group = 1), method = "lm") +
16 |   #stat_summary(aes(group = 1), fun.y = mean, geom = "point",
17 |   #             shape = 17, size = 3) +
18 |   facet_wrap(~hypermutator_status ) + 
19 |   coord_cartesian(ylim = c(0,10))
20 | 
21 | p
22 | 


--------------------------------------------------------------------------------
/R/snv/archive/mf_private_shared_time.R:
--------------------------------------------------------------------------------
 1 | library(DBI)
 2 | library(tidyverse)
 3 | library(ggplot2)
 4 | library(RColorBrewer)
 5 | # Scatter plots (with linear fit) of shared and private mutation frequency against surgical interval, one panel per idh_codel_subtype.
 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 7 | 
 8 | dat <- dbGetQuery(con, read_file("sql/mutation_freq_private_shared.sql"))
 9 | clindata <- dbGetQuery(con, "SELECT DISTINCT case_barcode, idh_codel_subtype FROM clinical.surgeries WHERE idh_codel_subtype IS NOT NULL")
10 | dat <- dat %>% left_join(clindata) %>% filter(mf_a < 10, mf_b < 10) # join uses the columns both tables share; rows with mf_a or mf_b >= 10 are dropped
11 | # Plot 1: mf_shared
12 | ggplot(dat, aes(x = surgical_interval_mo, y = mf_shared)) + 
13 |   geom_point() + 
14 |   geom_smooth(method="lm") +
15 |   facet_wrap(~idh_codel_subtype, scales = "free")
16 | # Plot 2: mf_private_b
17 | ggplot(dat, aes(x = surgical_interval_mo, y = mf_private_b)) + 
18 |   geom_point() + 
19 |   geom_smooth(method="lm") +
20 |   facet_wrap(~idh_codel_subtype, scales = "free")
21 | # Plot 3: mf_private_a
22 | ggplot(dat, aes(x = surgical_interval_mo, y = mf_private_a)) + 
23 |   geom_point() + 
24 |   geom_smooth(method="lm") +
25 |   facet_wrap(~idh_codel_subtype, scales = "free")


--------------------------------------------------------------------------------
/R/snv/archive/sample_variants.R:
--------------------------------------------------------------------------------
 1 | ## Genotype one aliquot: keep the freebayes calls that are part of the consensus
 2 | ## callset, flag those also present in the Mutect2 VCF (if one is given), append
 3 | ## them to analysis.snv_genotypes and write the row count to a trigger file.
 4 | library(VariantAnnotation)
 5 | setwd("/fastscratch/verhaak-lab/GLASS-WG")
 6 | 
 7 | ## Parse snakemake
 8 | fbf = snakemake@input[["freebayes"]]
 9 | csf = snakemake@input[["consensus"]]
10 | mtf = snakemake@params[["mutect2"]]
11 | trf = snakemake@output[["trigger"]]
12 | spl = snakemake@wildcards[["aliquot_barcode"]]
13 | 
14 | ## Read freebayes and consensus input as VRanges
15 | fb = readVcfAsVRanges(fbf, "hg19")
16 | cs = readVcfAsVRanges(csf, "hg19", param=ScanVcfParam(fixed = "ALT", info = NA, geno = "AD"))
17 | message("Loaded ", basename(fbf))
18 | message("Loaded ", basename(csf))
19 | 
20 | ## If sample is a tumor sample, read mutect calls
21 | ## (scalar && with a length guard: a missing/NULL parameter means "no Mutect2
22 | ## file" instead of raising "argument is of length zero")
23 | paired = FALSE
24 | if(length(mtf) == 1 && !is.na(mtf) && file.exists(mtf)) {
25 |   paired = TRUE
26 |   mt = readVcfAsVRanges(mtf, "hg19", param=ScanVcfParam(fixed = "ALT", info = NA, geno = "AD"))
27 |   message("Loaded ", basename(mtf))
28 | }
29 | 
30 | ## Count overlap between freebayes calls and consensus callset
31 | hitscs = fb %in% cs
32 | 
33 | ## Print some numbers
34 | prop_cs = round(length(fb)/length(cs)*100,1)
35 | prop_fb = round(sum(hitscs)/length(hitscs)*100,1)
36 | message("Found ", length(cs), " variants in consensus callset")
37 | message("Found ", length(fb), " freebayes calls (", prop_cs, "% of callset), amongst which ",
38 |         sum(hitscs), " (", prop_fb, "%) matched calls from the consensus callset.")
39 | 
40 | ## Subset freebayes output by only variants present in db
41 | fb = fb[hitscs]
42 | 
43 | ## Clear some memory
44 | rm(cs)
45 | 
46 | ## If a mutect callset is available, quantify overlap between mutect and freebayes
47 | if(paired) {
48 |   hitsmt = fb %in% mt
49 |   prop_mt = round(sum(hitsmt)/length(mt)*100,1)
50 |   message("Found ", length(mt), " filtered Mutect2 calls, of which ", sum(hitsmt), " (",
51 |           prop_mt, "%) exactly match calls from the consensus callset.")
52 |   
53 |   ## Annotate M2-called variants
54 |   fb$called = hitsmt
55 | } else {
56 |   fb$called = FALSE
57 | }
58 | 
59 | ## Create output dataframe
60 | df = data.frame(aliquot_barcode = spl, chrom = seqnames(fb), start = start(fb), end = end(fb), alt = alt(fb),
61 |                 genotype = fb$GT, read_depth = totalDepth(fb), ref_count = refDepth(fb), alt_count = altDepth(fb),
62 |                 called = fb$called,
63 |                 stringsAsFactors = F)
64 | 
65 | ## Drop variants without read counts
66 | df = df[which(!is.na(df$read_depth)),]
67 | 
68 | ## Clear more memory
69 | rm(fb)
70 | 
71 | ## Write to database
72 | .libPaths('/home/barthf/R/x86_64-pc-linux-gnu-library/3.3')
73 | 
74 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
75 | DBI::dbWriteTable(con, DBI::Id(schema="analysis",table="snv_genotypes"), df, append=T)
76 | DBI::dbDisconnect(con)
77 | 
78 | ## Write a trigger with the number of rows added
79 | cat(nrow(df), file = trf)
80 | message("Printed number of rows (", nrow(df), ") to file: ", basename(trf))
81 | 
82 | ## END ##


--------------------------------------------------------------------------------
/R/snv/archive/shared_private_to_vcf.R:
--------------------------------------------------------------------------------
 1 | ## Write one VCF per tumor pair and variant status from the result of
 2 | ## sql/variant_status_leeds.sql. Every VCF has a single placeholder sample
 3 | ## column "TEST" with genotype ".".
 4 | library(VariantAnnotation)
 5 | library(DBI)
 6 | library(tidyverse)
 7 | library(BSgenome.Hsapiens.UCSC.hg19)
 8 | 
 9 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
10 | time <- system.time(qres <- dbGetQuery(con, read_file(('sql/variant_status_leeds.sql'))))
11 | 
12 | ## Reference sequence info, switched to NCBI-style contig names
13 | ref_genome <- BSgenome.Hsapiens.UCSC.hg19
14 | seqlevelsStyle(ref_genome) = "NCBI"
15 | 
16 | ## The GT matrix has one row per variant and one column named after the
17 | ## placeholder sample, so it lines up with the row name given in colData.
18 | vcf <- VCF(rowRanges = GRanges(seqnames = trimws(qres$chrom),
19 |                         ranges = IRanges(start = as.integer(qres$start_pos),
20 |                                          end = as.integer(qres$end_pos)),
21 |                         seqinfo = seqinfo(ref_genome),
22 |                         paramRangeID = rep(factor(NA),nrow(qres))),
23 |     fixed = DataFrame(REF = DNAStringSet(qres$ref),
24 |                       ALT = unname(split(DNAStringSet(qres$alt),seq_along(qres$alt))),
25 |                       QUAL = as.numeric(NA_integer_),
26 |                       FILTER = 'PASS'),
27 |     geno = SimpleList(GT = matrix(".", nrow = nrow(qres), ncol = 1, dimnames = list(NULL, "TEST"))),
28 |     colData = DataFrame(Samples = 1, row.names = c("TEST")))
29 | 
30 | ## Split by "<tumor_pair_barcode>-<variant_status>"; the key becomes the file name
31 | vcfout <- split(vcf, sprintf("%s-%s", qres$tumor_pair_barcode, qres$variant_status))
32 | #rm(vcf)
33 | 
34 | for (vcf_name in names(vcfout)) {
35 |   vcf = vcfout[[vcf_name]]
36 |   message("Writing ", vcf_name, " with ", nrow(vcf), " rows.")
37 |   writeVcf(vcf, file = sprintf("results/mutect2/fractionated-vcf/%s.vcf", vcf_name), index = FALSE)
38 | }


--------------------------------------------------------------------------------
/R/snv/archive/signature1_by_age_and_interval.R:
--------------------------------------------------------------------------------
 1 | library(DBI)
 2 | library(tidyverse)
 3 | library(ggplot2)
 4 | # Relate the relative contribution of Signature.1 to age at diagnosis and to surgical interval (only rows with mut_count >= 100).
 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 6 | q <- "SELECT ms.tumor_pair_barcode, tp.case_barcode, mutation_status, ms.relative_contribution, cs.case_age_diagnosis_years, tp.surgical_interval_mo, su.idh_codel_subtype
 7 | FROM analysis.mutsig_private_vs_shared ms
 8 | LEFT JOIN analysis.tumor_pairs tp ON tp.tumor_pair_barcode = ms.tumor_pair_barcode
 9 | LEFT JOIN clinical.cases cs ON cs.case_barcode = tp.case_barcode
10 | LEFT JOIN clinical.surgeries su ON su.sample_barcode = substring(tp.tumor_pair_barcode from 1 for 15)
11 | WHERE signature = 'Signature.1' AND mut_count >= 100"
12 | 
13 | qres <- dbGetQuery(con, q)
14 | # Contribution vs. age at diagnosis, per mutation status and per mutation status x subtype
15 | ggplot(qres, aes(x=case_age_diagnosis_years, y = relative_contribution)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~mutation_status)
16 | ggplot(qres, aes(x=case_age_diagnosis_years, y = relative_contribution)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~mutation_status + idh_codel_subtype)
17 | # Correlation tests (cor.test defaults) between contribution and age, one per mutation status
18 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="shared")
19 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="primary")
20 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="recurrent")
21 | # Contribution vs. surgical interval, per mutation status x subtype with free x axes
22 | ggplot(qres, aes(x=surgical_interval_mo, y = relative_contribution)) + 
23 |   geom_point() + 
24 |   geom_smooth(method = "lm") +
25 |   facet_wrap(~mutation_status + idh_codel_subtype, scales = "free_x")
26 |        


--------------------------------------------------------------------------------
/R/snv/archive/sigproba2db.R:
--------------------------------------------------------------------------------
 1 | ## Load the signature probability table into ref.signature_proba (long format:
 2 | ## one row per signature and substitution context).
 3 | ## tidyverse provides %>%, gather, filter and transmute; DBI provides Id and dbWriteTable.
 4 | library(tidyverse)
 5 | library(DBI)
 6 | 
 7 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")
 8 | 
 9 | tmp <- read.delim('Downloads/signatures_probabilities.txt', as.is = TRUE)
10 | 
11 | ## Stack every column except the first three, keeping only the "Signature..." columns
12 | df <- tmp %>% gather(-(1:3),key="signature",value="proba") %>% filter(grepl("^Signature",signature))
13 | 
14 | ## signature becomes its number; alt is the third character of the substitution type
15 | df2 <- df %>% transmute(signature = as.numeric(gsub("Signature.","",signature)),
16 |                         ref_context = Trinucleotide,
17 |                         alt = substring(Substitution.Type,3,3),
18 |                         substitution_type = Substitution.Type,
19 |                         proba)
20 | 
21 | dbWriteTable(con, Id(schema="ref",table="signature_proba"), df2, overwrite = TRUE)
22 | dbDisconnect(con)


--------------------------------------------------------------------------------
/R/snv/archive/vaf-freq.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(DBI)
 3 | library(ggthemes)
 4 | library(ggplot2)
 5 | library(RColorBrewer)
 6 | # Scatter plot of vaf_a against vaf_b, one panel per gene, coloured by variant classification, saved as PDF.
 7 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
 8 | vaf_res <- dbGetQuery(con, read_file('sql/vaf_compare.sql'))
 9 | # The dashed abline (slope 1) marks equal VAF on both axes; both axes are limited to 0-1.
10 | g <-
11 |   ggplot(vaf_res, aes(vaf_a, vaf_b)) +
12 |   geom_point(aes(color=variant_classification)) + 
13 |   geom_abline(slope=1, alpha=0.2, linetype=2) +
14 |   labs(x="Primary VAF", y="Optimal recurrence VAF", color = "Variant Classification") +
15 |   coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
16 |   theme_bw(base_size = 18) +
17 |   theme(axis.text=element_text(size=10)) +
18 |   facet_wrap(~gene_symbol, ncol = 3) + 
19 |   scale_color_manual(values=c("5'Flank" = brewer.pal(9, "Paired")[9],
20 |                               "Frame_Shift_Del" = brewer.pal(7, "Paired")[1],
21 |                               "Frame_Shift_Ins" = brewer.pal(7, "Paired")[2],
22 |                               "In_Frame_Del" = brewer.pal(7, "Paired")[3],
23 |                               "In_Frame_Ins" = brewer.pal(7, "Paired")[4],
24 |                               "Missense_Mutation" = brewer.pal(7, "Paired")[5],
25 |                               "Nonsense_Mutation" = brewer.pal(7, "Paired")[6],
26 |                               "Splice_Site" = brewer.pal(9, "Paired")[7],
27 |                               "Translation_Start_Site" = brewer.pal(9, "Paired")[8]))
28 | # Write the figure to a 10 x 9 inch PDF
29 | pdf(file = "~/The Jackson Laboratory/GLASS - Documents/Figure 1/suppl_vaf.pdf", width = 10, height = 9)
30 | plot(g)
31 | dev.off()
32 | 


--------------------------------------------------------------------------------
/R/telseq/telseq.R:
--------------------------------------------------------------------------------
 1 | library(data.table)
 2 | library(dplyr)
 3 | library(tidyr)
 4 | library(parallel)
 5 | 
 6 | ## Constants/parameters
 7 | ## K: TEL<n> columns with K <= n <= 99 are summed into the telomeric read count
 8 | ## G, c: scaling constants of the length estimate, length = (tel/gc) * (G/c)
 9 | K = 7
10 | G = 332720800
11 | c = 46000
12 | 
13 | resultsdir  = "results/telseq"
14 | ptrn        = "telseq.txt$"
15 | 
16 | ##################################################################################################################################################################
17 | 
18 | message("Merging telseq output")
19 | 
20 | # list of telseq data files
21 | tsfiles = list.files(resultsdir, pattern=ptrn, recursive=T, full.names=T)
22 | 
23 | # Summarise each telseq file into one row. Read counts are summed over the rows
24 | # of the file; tel, gc and length are means over rows weighted by total reads.
25 | tsdat = mclapply(tsfiles, function(fn) {
26 |   # Aliquot barcode = file name up to the first "."
27 |   aliquot_barcode = unlist(strsplit(basename(fn),"\\."))[1]
28 |   
29 |   f = tryCatch(data.table::fread(fn), error=function(e) e)
30 |   
31 |   # An unreadable file is reported and skipped; continuing would try to build
32 |   # a data.frame from the error object. rbindlist() ignores NULL entries.
33 |   if(inherits(f, "error")) {
34 |     message("Could not read ", fn, ": ", conditionMessage(f))
35 |     return(NULL)
36 |   }
37 |   
38 |   f = data.frame(aliquot_barcode,f)
39 |   
40 |   total_reads = as.numeric(as.character(f$Total))
41 |   total_reads_wm = round(sum(total_reads,na.rm=T))
42 |   
43 |   mapped_reads = as.numeric(as.character(f$Mapped))
44 |   mapped_reads_wm = round(sum(mapped_reads,na.rm=T))
45 |   
46 |   duplicate_reads = as.numeric(as.character(f$Duplicates))
47 |   duplicate_reads_wm = round(sum(duplicate_reads,na.rm=T))
48 |   
49 |   # drop = FALSE keeps a data.frame even if only one TEL column matches
50 |   tel = apply(f[,na.omit(match(paste('TEL',K:99,sep=''),colnames(f))), drop = FALSE],1,sum,na.rm=T)
51 |   tel_wm = weighted.mean(tel, total_reads, na.rm=T)
52 |   
53 |   gc = apply(f[,match(paste('GC',4:5,sep=''),colnames(f))],1,sum)
54 |   gc_wm = weighted.mean(gc, total_reads, na.rm=T)
55 |   
56 |   len = (tel/gc)*(G/c)
57 |   len_wm = weighted.mean(len, total_reads, na.rm=T)
58 |   
59 |   out = data.frame(aliquot_barcode,
60 |                    total_reads=total_reads_wm,
61 |                    mapped_reads=mapped_reads_wm,
62 |                    duplicate_reads=duplicate_reads_wm,
63 |                    tel=tel_wm,
64 |                    K,
65 |                    G,
66 |                    c,
67 |                    gc=gc_wm,
68 |                    length=len_wm)
69 |   
70 |   return(out)
71 | })
72 | 
73 | tsdat = rbindlist(tsdat) %>% as.data.frame()
74 | 
75 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv3")
76 | DBI::dbWriteTable(con, DBI::Id(schema="analysis",table="telseq"), tsdat)
77 | DBI::dbDisconnect(con)
78 | 


--------------------------------------------------------------------------------
/bin/bam-rg-insert-size-calc.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | 
 3 | # Script that retrieves the reads for a particular read group from a BAM file, and
 4 | # does a calculation of the average and median insert size.
 5 | #
 6 | # Only properly paired reads (samtools -f 0x2) whose mate is on the same
 7 | # reference (column 7 is "=") are used, and only the first 1,000,000 lines
 8 | # returned by samtools are examined. The size recorded for a read is
 9 | # |PNEXT - POS| + length(SEQ); reads with |PNEXT - POS| outside 1..5000 are ignored.
10 | 
11 | use strict;
12 | use warnings;
13 | 
14 | my $MAX_READS = 1000000;	# number of samtools output lines to examine
15 | my $MAX_DISTANCE = 5000;	# largest accepted |PNEXT - POS|
16 | 
17 | if (scalar(@ARGV) != 2) {
18 | 	print "Usage: perl bam-rg-insert-size-calc.pl [infile] [read group]\n";
19 | 	exit(1);
20 | }
21 | 
22 | my ($infile, $rg) = @ARGV;
23 | 
24 | my $n = 0;
25 | my $sum = 0;
26 | my @insert_sizes = ();
27 | 
28 | # List-form pipe open runs samtools without a shell, so file names or read
29 | # group IDs containing spaces or shell metacharacters are passed through intact.
30 | # Reading line by line also avoids holding all lines in memory at once.
31 | open(my $sam, '-|', 'samtools', 'view', '-r', $rg, '-f', '0x2', $infile)
32 | 	or die "Can't run samtools: $!\n";
33 | 
34 | my $lines_read = 0;
35 | my $truncated = 0;
36 | while (my $line = <$sam>) {
37 | 	if (++$lines_read > $MAX_READS) {
38 | 		$truncated = 1;
39 | 		last;
40 | 	}
41 | 	chomp($line);
42 | 
43 | 	# SAM columns (0-based index): 3 = POS, 6 = RNEXT, 7 = PNEXT, 9 = SEQ
44 | 	my @pieces = split(/\t/, $line);
45 | 	next if scalar(@pieces) < 10;
46 | 	next unless $pieces[6] eq "=";
47 | 
48 | 	my $insert_size = abs($pieces[7] - $pieces[3]);
49 | 	next unless $insert_size > 0 && $insert_size <= $MAX_DISTANCE;
50 | 
51 | 	my $length = length($pieces[9]);
52 | 	push(@insert_sizes, $insert_size + $length);
53 | 	$sum += $insert_size + $length;
54 | 	$n++;
55 | }
56 | 
57 | # Closing the pipe early makes samtools exit on SIGPIPE, so a failed close is
58 | # only reported when the whole stream was consumed.
59 | if (!close($sam) && !$truncated) {
60 | 	warn "samtools view exited abnormally (status $?)\n";
61 | }
62 | 
63 | if ($n == 0) {
64 | 	print "No insert sizes above zero\n";
65 | 	exit(0);
66 | }
67 | 
68 | print "Average: ".($sum/$n)."\n";
69 | 
70 | @insert_sizes = sort {$a <=> $b} @insert_sizes;
71 | my $median;
72 | if ($n % 2) { # Odd case
73 | 	$median = $insert_sizes[int($n/2)];
74 | } else { # Even case
75 | 	$median = ($insert_sizes[($n/2)-1]+$insert_sizes[($n/2)])/2;
76 | }
77 | print "Median: ".$median."\n";
78 | 
79 | exit(0);
80 | 


--------------------------------------------------------------------------------
/bin/bamtofastq-rename.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## Script to quickly turn uBAM into FASTQ with appropriate filenames
3 | ## Tested on "_test2" per readgroup "RevertSam" output uBAM files
4 | for i in *.bam; do ID="_test2"; SM=`samtools view -H $i | grep '^@RG' | sed "s/.*SM:\([^\t]*\).*/\1/g"`; FC=`samtools view -H $i | grep '^@RG' | sed "s/.*PU:[^_]*_[^_]*_[^_]*_[^_]*_\([^_]*\).*/\1/g"`; LN=`samtools view -H $i | grep '^@RG' | sed "s/.*PU:[^_]*_[^_]*_[^_]*_[^_]*_[^_]*_\([^#]*\).*/\1/g"`; bedtools bamtofastq -i $i -fq ${ID}_${SM}_${FC}_L${LN}_R1.fq -fq2 ${ID}_${SM}_${FC}_L${LN}_R2.fq; done


--------------------------------------------------------------------------------
/bin/bedtovcf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ## Print VCF records as tab-separated intervals: chrom, start, end, "REF/ALT", "+".
3 | ## Records where REF or ALT is longer than one base are widened by |len(REF)-len(ALT)|+1 on both sides of POS; all others span POS-1..POS.
4 | ## Usage: bedtovcf.sh [vcf.gz]   (input defaults to consensus.norm.vcf.gz)
5 | zcat "${1:-consensus.norm.vcf.gz}" | awk '{OFS="\t"; \
6 | 	if (!/^#/ && (length($4) > 1 || length($5) > 1))\
7 | 	{ print $1,$2-sqrt((length($4)-length($5))^2)-1,$2+sqrt((length($4)-length($5))^2)+1,$4"/"$5,"+" } \
8 | 	else if (!/^#/) \
9 | 	{ print $1,$2-1,$2,$4"/"$5,"+" } \
10 | 	}' | less -S


--------------------------------------------------------------------------------
/bin/get-readgroups.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ## Get readgroups: for every file matching *bam* at most two levels below BAMDIR, print its @RG header lines prefixed with the file path and a tab; lines starting with "[" are dropped
3 | BAMDIR="/fastscratch/barthf/GLASS-WG/download"
4 | find "${BAMDIR}" -maxdepth 2 -type f -name "*bam*" | xargs -I% sh -c "samtools view -H % | grep ^@RG | sed 's|^|%\t|' | grep -v '^\['"


--------------------------------------------------------------------------------
/bin/gistic_run.pbs:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #PBS -V
 3 | #PBS -N GISTIC
 4 | #PBS -j oe
 5 | #PBS -M floris.barthel@jax.org
 6 | #PBS -m a
 7 | #PBS -l nodes=1:ppn=8,walltime=72:00:00
 8 | #PBS -l mem=24gb
 9 | # Runs gistic_run twice with identical options: once on the primary segments, once on the recurrence segments.
10 | BDP=/projects/verhaak-lab/GLASS-analysis/results/gistic2/primary # -b for the primary run
11 | BDR=/projects/verhaak-lab/GLASS-analysis/results/gistic2/recurrence # -b for the recurrence run
12 | SEGP=/projects/verhaak-lab/GLASS-analysis/results/gistic2/primary.seg # -seg for the primary run
13 | SEGR=/projects/verhaak-lab/GLASS-analysis/results/gistic2/recurrence.seg # -seg for the recurrence run
14 | MK=/projects/verhaak-lab/GLASS-analysis/results/gistic2/markers.txt # -mk, shared by both runs
15 | REFGENE=/projects/verhaak-lab/FRONTIER/data/ref/hg19_v19.mat # -refgene, shared by both runs
16 | CNV=/home/barthf/opt/GISTIC_2_0_22/ref/CNV.hg19.bypos.111213.txt # -cnv; NOTE(review): taken from GISTIC_2_0_22 while gistic_run below is GISTIC_2_0_23 - confirm this is intended
17 | # Primary run, then recurrence run
18 | /home/barthf/opt/GISTIC_2_0_23/gistic_run -b $BDP -seg $SEGP -mk $MK -cnv $CNV -refgene $REFGENE -genegistic 1 -smallmem 1 -broad 1 -brlen 0.5 -conf 0.99 -armpeel 1 -savegene 1 -gcm extreme -v 25 -rx 0 -maxspace 1 -js 1 -cap 0.5 -td 0.05 -ta 0.05
19 | /home/barthf/opt/GISTIC_2_0_23/gistic_run -b $BDR -seg $SEGR -mk $MK -cnv $CNV -refgene $REFGENE -genegistic 1 -smallmem 1 -broad 1 -brlen 0.5 -conf 0.99 -armpeel 1 -savegene 1 -gcm extreme -v 25 -rx 0 -maxspace 1 -js 1 -cap 0.5 -td 0.05 -ta 0.05


--------------------------------------------------------------------------------
/bin/preprocess-intervals.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | ## GATK pre-process intervals for CNV calling
 3 | ## See https://software.broadinstitute.org/gatk/documentation/article?id=11682
 4 | ## Bins the b37 decoy reference into 1000 bp intervals without padding; X, Y, MT, the GL* contigs, NC_007605 and hs37d5 are excluded.
 5 | gatk PreprocessIntervals \
 6 |     -R /projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta \
 7 |     --bin-length 1000 \
 8 |     --padding 0 \
 9 |     --interval-merging-rule OVERLAPPING_ONLY \
10 |     -O human_g1k_v37_decoy.preprocessed.interval_list \
11 |     --exclude-intervals X \
12 |     --exclude-intervals Y \
13 |     --exclude-intervals MT \
14 | 	--exclude-intervals GL000207.1 \
15 | 	--exclude-intervals GL000226.1 \
16 | 	--exclude-intervals GL000229.1 \
17 | 	--exclude-intervals GL000231.1 \
18 | 	--exclude-intervals GL000210.1 \
19 | 	--exclude-intervals GL000239.1 \
20 | 	--exclude-intervals GL000235.1 \
21 | 	--exclude-intervals GL000201.1 \
22 | 	--exclude-intervals GL000247.1 \
23 | 	--exclude-intervals GL000245.1 \
24 | 	--exclude-intervals GL000197.1 \
25 | 	--exclude-intervals GL000203.1 \
26 | 	--exclude-intervals GL000246.1 \
27 | 	--exclude-intervals GL000249.1 \
28 | 	--exclude-intervals GL000196.1 \
29 | 	--exclude-intervals GL000248.1 \
30 | 	--exclude-intervals GL000244.1 \
31 | 	--exclude-intervals GL000238.1 \
32 | 	--exclude-intervals GL000202.1 \
33 | 	--exclude-intervals GL000234.1 \
34 | 	--exclude-intervals GL000232.1 \
35 | 	--exclude-intervals GL000206.1 \
36 | 	--exclude-intervals GL000240.1 \
37 | 	--exclude-intervals GL000236.1 \
38 | 	--exclude-intervals GL000241.1 \
39 | 	--exclude-intervals GL000243.1 \
40 | 	--exclude-intervals GL000242.1 \
41 | 	--exclude-intervals GL000230.1 \
42 | 	--exclude-intervals GL000237.1 \
43 | 	--exclude-intervals GL000233.1 \
44 | 	--exclude-intervals GL000204.1 \
45 | 	--exclude-intervals GL000198.1 \
46 | 	--exclude-intervals GL000208.1 \
47 | 	--exclude-intervals GL000191.1 \
48 | 	--exclude-intervals GL000227.1 \
49 | 	--exclude-intervals GL000228.1 \
50 | 	--exclude-intervals GL000214.1 \
51 | 	--exclude-intervals GL000221.1 \
52 | 	--exclude-intervals GL000209.1 \
53 | 	--exclude-intervals GL000218.1 \
54 | 	--exclude-intervals GL000220.1 \
55 | 	--exclude-intervals GL000213.1 \
56 | 	--exclude-intervals GL000211.1 \
57 | 	--exclude-intervals GL000199.1 \
58 | 	--exclude-intervals GL000217.1 \
59 | 	--exclude-intervals GL000216.1 \
60 | 	--exclude-intervals GL000215.1 \
61 | 	--exclude-intervals GL000205.1 \
62 | 	--exclude-intervals GL000219.1 \
63 | 	--exclude-intervals GL000224.1 \
64 | 	--exclude-intervals GL000223.1 \
65 | 	--exclude-intervals GL000195.1 \
66 | 	--exclude-intervals GL000212.1 \
67 | 	--exclude-intervals GL000222.1 \
68 | 	--exclude-intervals GL000200.1 \
69 | 	--exclude-intervals GL000193.1 \
70 | 	--exclude-intervals GL000194.1 \
71 | 	--exclude-intervals GL000225.1 \
72 | 	--exclude-intervals GL000192.1 \
73 | 	--exclude-intervals NC_007605 \
74 | 	--exclude-intervals hs37d5


--------------------------------------------------------------------------------
/bin/reset_directory_structure.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Reset the working tree: delete benchmarks/, logs/ and results/ (all relative
 3 | # to the current directory), then recreate logs/drmaa.
 4 | #
 5 | # rm -rf replaces the former fixed-depth "rm dir/*/*" + rmdir cascade, which
 6 | # printed errors for sub-directories and missing paths and left behind dotfiles
 7 | # or anything nested deeper than the globs reached, so the reset was partial.
 8 | rm -rf -- benchmarks logs results
 9 | #
10 | # logs/drmaa is the only directory that is recreated after the wipe.
11 | mkdir -p logs/drmaa


--------------------------------------------------------------------------------
/bin/scatter-interval-list-to-bed.sh:
--------------------------------------------------------------------------------
1 | #!\bin\bash
2 | cd /projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/scattered_wgs_intervals/scatter-5
3 | for f in `find . -type f -name "scattered.interval_list"`;
4 | do
5 |     cat $f | grep -vE "^@" | awk 'OFS="\t" {print $1, $2-1, $3, $5, 0, $4}' > ${f%.*}.bed;
6 | done


--------------------------------------------------------------------------------
/bin/select-germline-variants.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | ## Select variants with AF > 0.05 from af-only-gnomad.raw.sites.b37.vcf.gz (normalize, reheader, index, then select)
 3 | 
 4 | module load bcftools
 5 | 
 6 | ## Step 1: split multi-allelic sites across multiple lines (bcftools norm -m-), see
 7 | ## https://gatkforums.broadinstitute.org/gatk/discussion/10975/use-select-variants-on-a-gnomad-vcf-for-mutect2-contamination-filtering
 8 | bcftools norm \
 9 | 	-f "/projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta" \
10 | 	-m- \
11 | 	-o "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.vcf.gz" \
12 | 	-O z \
13 | 	--threads 6 \
14 | 	"/projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/Mutect2/af-only-gnomad.raw.sites.b37.vcf.gz"
15 | 
16 | ## Step 2: write a header without ##contig lines and reheader the normalized VCF with it. This is needed because
17 | ## bcftools does not add the length attribute to contigs, and SelectVariants complains if they are missing
18 | bcftools view \
19 | 	-h "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.vcf.gz" | \
20 | 	sed '/^##contig/d' \
21 | 	> "/projects/verhaak-lab/verhaak_ref/gatk-cnv/newheader.txt"
22 | 
23 | bcftools reheader \
24 | 	-h "/projects/verhaak-lab/verhaak_ref/gatk-cnv/newheader.txt" \
25 | 	-o "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.reheader.vcf.gz" \
26 | 	"/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.vcf.gz"
27 | 
28 | ## Step 3: index the reheadered VCF, because SelectVariants complains w/o index
29 | gatk IndexFeatureFile \
30 | 	-F "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.reheader.vcf.gz"
31 | 
32 | ## Step 4: finally, select the variants that satisfy AF>0.05
33 | gatk SelectVariants \
34 | 	-V  "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.reheader.vcf.gz" \
35 | 	-O "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.selected.vcf.gz" \
36 | 	-R "/projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta" \
37 | 	--select "AF>0.05"


--------------------------------------------------------------------------------
/bin/svaba-test-parameters.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #PBS -N svaba-test
 4 | #PBS -l walltime=20:00:00
 5 | #PBS -l nodes=1:ppn=15
 6 | #PBS -r n
 7 | #PBS -M kevin.c.johnson@jax.org
 8 | #PBS -m a
 9 | #PBS -k oe
10 | #PBS -q batch
11 | #PBS -o /projects/verhaak-lab/sulman_GSCs/GSC-BAM/svaba-test/logs/svaba-${PBS_JOBID}.log
12 | #PBS -e /projects/verhaak-lab/sulman_GSCs/GSC-BAM/svaba-test/logs/svaba-${PBS_JOBID}.err
13 | #PBS -V
14 | 
15 | # Example tumor BAM from the Sulman GSC project, run against its matched normal.
16 | TUM_BAM="/projects/verhaak-lab/sulman_GSCs/GSC-BAM/BAM/GS6-27-sample1_S1.aln.dup.realn.recal.rp.bam"
17 | NORM_BAM="/projects/verhaak-lab/sulman_GSCs/GSC-BAM/BAM/N6-27-sample2_S2.aln.dup.realn.recal.rp.bam"
18 | DBSNP="/projects/verhaak-lab/sulman_GSCs/GSC-BAM/svaba-test/test-reference/dbsnp_indel.vcf"
19 | CORES=10
20 | REF="/projects/verhaak-lab/glassdir/dockscratch/bundle/human_g1k_v37_decoy.fasta"
21 | 
22 | # Samir downloaded svaba and saved it to Verhaak_env.
23 | module load rvsvaba
24 | 
25 | # Run from the directory the job was submitted from; stop if it cannot be
26 | # entered so svaba does not write its output somewhere unintended.
27 | cd "$PBS_O_WORKDIR" || exit 1
28 | 
29 | # Log the start time and the tumor BAM being processed.
30 | STARTTIME=$(date)
31 | echo "$STARTTIME"
32 | echo "Processing $TUM_BAM"
33 | 
34 | # Run on all chromosomes, including scaffolds.
35 | svaba run -t "$TUM_BAM" -n "$NORM_BAM" -D "$DBSNP" -a GS6-27 -G "$REF" --hp -p "$CORES"


--------------------------------------------------------------------------------
/conf/optitype_config.ini:
--------------------------------------------------------------------------------
 1 | [mapping]
 2 | 
 3 | # Path to the RazerS3 binary (the bare name below is not absolute, so it must be findable on PATH), and number of threads to use for mapping
 4 | 
 5 | razers3=razers3
 6 | threads=1
 7 | 
 8 | [ilp]
 9 | 
10 | # A Pyomo-supported ILP solver. The solver must be globally accessible in the
11 | # environment in which OptiType is run, so make sure to include it in PATH.
12 | # Note: this is NOT a path to the solver binary, but a keyword argument for
13 | # Pyomo. Examples: glpk, cplex, cbc.
14 | 
15 | solver=glpk
16 | threads=1
17 | 
18 | [behavior]
19 | 
20 | # tempdir=/path/to/tempdir  # we may enable this setting later. Not used now.
21 | 
22 | # Delete intermediate bam files produced by RazerS3 after OptiType has finished
23 | # loading them. If you plan to re-analyze your samples with different settings,
24 | # disabling this option can be a time-saver, as you'll be able to pass the bam
25 | # files to OptiType directly as input and spare the expensive read mapping
26 | # step.
27 | 
28 | deletebam=true
29 | 
30 | # In paired-end mode one might want to use reads with just one mapped end (e.g.,
31 | # the other end falls outside the reference region). This setting allows the
32 | # user to keep them with an optionally reduced weight. A value of 0 means they
33 | # are discarded for typing, 0.2 means single reads are "worth" 20% of paired
34 | # reads, and a value of 1 means they are treated as valuable as properly mapped
35 | # read pairs. Note: unpaired reads will be reported on the result coverage plots
36 | # for completeness, regardless of this setting.
37 | 
38 | unpaired_weight=0
39 | 
40 | # We call a read pair discordant if its two ends best-map to two disjoint sets
41 | # of alleles. Such reads can be either omitted or either of their ends treated
42 | # as unpaired hits. Note: discordant read pairs are reported on the coverage
43 | # plots as unpaired reads, regardless of this setting.
44 | 
45 | use_discordant=false
46 | 


--------------------------------------------------------------------------------
/dag/align.rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/align.rulegraph.png


--------------------------------------------------------------------------------
/dag/fingerprint.rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/fingerprint.rulegraph.png


--------------------------------------------------------------------------------
/dag/gatk-cnv.rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/gatk-cnv.rulegraph.png


--------------------------------------------------------------------------------
/dag/mt2.rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/mt2.rulegraph.png


--------------------------------------------------------------------------------
/dag/svdetect.rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/svdetect.rulegraph.png


--------------------------------------------------------------------------------
/dag/vs2.rulegraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/vs2.rulegraph.png


--------------------------------------------------------------------------------
/dbm/glass-rearranged.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dbm/glass-rearranged.png


--------------------------------------------------------------------------------
/dbm/glass.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dbm/glass.png


--------------------------------------------------------------------------------
/envs/absolute.yaml:
--------------------------------------------------------------------------------
 1 | name: absolute
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - r-devtools
 9 |   - r-numderiv
10 | 


--------------------------------------------------------------------------------
/envs/align.yaml:
--------------------------------------------------------------------------------
 1 | name: align
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - bwa=0.7.17=ha92aebf_3
 9 |   - fastqc=0.11.7=pl5.22.0_0
10 |   - gatk4=4.0.9.0=0
11 |   - multiqc=1.6a0=py36h24bf2e0_2
12 |   # - simplejson=3.8.1=py36_0
13 |   # - colormath=3.0.0=py_2
14 |   # - lzstring=1.0.3=py36_0
15 |   # - perl=5.22.0.1=0
16 |   # - spectra=0.0.11=py_0
17 |   # - asn1crypto=0.24.0=py36_0
18 |   # - blas=1.0=mkl
19 |   # - ca-certificates=2018.03.07=0
20 |   # - certifi=2018.4.16=py36_0
21 |   # - cffi=1.11.5=py36h9745a5d_0
22 |   # - chardet=3.0.4=py36_1
23 |   # - click=6.7=py36_0
24 |   # - cryptography=2.2.2=py36h14c3975_0
25 |   # - cycler=0.10.0=py36_0
26 |   # - dbus=1.13.2=h714fa37_1
27 |   # - decorator=4.3.0=py36_0
28 |   # - expat=2.2.5=he0dffb1_0
29 |   # - font-ttf-dejavu-sans-mono=2.37=h6964260_0
30 |   # - fontconfig=2.13.0=h9420a91_0
31 |   # - freetype=2.9.1=h8a8886c_0
32 |   # - future=0.16.0=py36_0
33 |   # - glib=2.56.1=h000015b_0
34 |   # - gst-plugins-base=1.14.0=hbbd80ab_1
35 |   # - gstreamer=1.14.0=hb453b48_1
36 |   # - icu=58.2=h9c2bf20_1
37 |   # - idna=2.7=py36_0
38 |   # - intel-openmp=2018.0.3=0
39 |   # - jinja2=2.10=py36_0
40 |   # - jpeg=9b=h024ee3a_2
41 |   # - kiwisolver=1.0.1=py36hf484d3e_0
42 |   # - libedit=3.1.20170329=h6b74fdf_2
43 |   # - libffi=3.2.1=hd88cf55_4
44 |   # - libgcc-ng=7.2.0=hdf63c60_3
45 |   # - libgfortran-ng=7.2.0=hdf63c60_3
46 |   # - libpng=1.6.34=hb9fc6fc_0
47 |   # - libstdcxx-ng=7.2.0=hdf63c60_3
48 |   # - libuuid=1.0.3=h1bed415_2
49 |   # - libxcb=1.13=h1bed415_1
50 |   # - libxml2=2.9.8=h26e45fe_1
51 |   # - markdown=2.6.11=py36_0
52 |   # - markupsafe=1.0=py36h14c3975_1
53 |   # - matplotlib=2.2.2=py36hb69df0a_2
54 |   # - mkl=2018.0.3=1
55 |   # - mkl_fft=1.0.4=py36h4414c95_1
56 |   # - mkl_random=1.0.1=py36h4414c95_1
57 |   # - ncurses=6.1=hf484d3e_0
58 |   # - networkx=2.0=py36h7e96fb8_0
59 |   # - numpy=1.15.0=py36h1b885b7_0
60 |   # - numpy-base=1.15.0=py36h3dfced4_0
61 |   # - openjdk=8.0.121=1
62 |   # - openssl=1.0.2o=h20670df_0
63 |   # - pcre=8.42=h439df22_0
64 |   # - pip=10.0.1=py36_0
65 |   # - pycparser=2.18=py36_1
66 |   # - pyopenssl=18.0.0=py36_0
67 |   # - pyparsing=2.2.0=py36_1
68 |   # - pyqt=5.9.2=py36h22d08a2_0
69 |   # - pysocks=1.6.8=py36_0
70 |   # - python=3.6.6=hc3d631a_0
71 |   # - python-dateutil=2.7.3=py36_0
72 |   # - pytz=2018.5=py36_0
73 |   # - pyyaml=3.13=py36h14c3975_0
74 |   # - qt=5.9.6=h52aff34_0
75 |   # - readline=7.0=ha6073c6_4
76 |   # - requests=2.19.1=py36_0
77 |   # - setuptools=39.2.0=py36_0
78 |   # - sip=4.19.8=py36hf484d3e_0
79 |   # - six=1.11.0=py36_1
80 |   # - sqlite=3.24.0=h84994c4_0
81 |   # - tk=8.6.7=hc745277_3
82 |   # - tornado=5.0.2=py36h14c3975_0
83 |   # - urllib3=1.23=py36_0
84 |   # - wheel=0.31.1=py36_0
85 |   # - xz=5.2.4=h14c3975_4
86 |   # - yaml=0.1.7=had09818_2
87 |   # - zlib=1.2.11=ha838bed_2
88 | 
89 | 


--------------------------------------------------------------------------------
/envs/bcftools.yaml:
--------------------------------------------------------------------------------
 1 | name: bcftools  # was "freebayes", a copy-paste leftover that collided with the environment defined in envs/freebayes.yaml
 2 | channels:
 3 |   - bioconda
 4 |   - defaults
 5 |   - conda-forge
 6 | dependencies:
 7 |   - htslib=1.8=1
 8 |   - libdeflate=1.0=h470a237_0
 9 |   - parallel=20160622=1
10 |   - bcftools=1.9
11 | #   - perl-threaded=5.22.0=13
12 | #   - bzip2=1.0.6=h470a237_2
13 | #   - ca-certificates=2018.8.24=ha4d7672_0
14 | #   - certifi=2018.8.24=py36_1
15 | #   - libgcc-ng=7.2.0=hdf63c60_3
16 | #   - libstdcxx-ng=7.2.0=hdf63c60_3
17 | #   - openssl=1.0.2o=h470a237_1
18 | #   - perl=5.26.2=h470a237_0
19 | #   - curl=7.54.1=0
20 | #   - krb5=1.13.2=0
21 | #   - libgcc=5.2.0=0
22 | #   - libssh2=1.8.0=0
23 | #   - pip=9.0.1=py36_1
24 | #   - python=3.6.2=0
25 | #   - readline=6.2=2
26 | #   - setuptools=36.4.0=py36_1
27 | #   - sqlite=3.13.0=0
28 | #   - tk=8.5.18=0
29 | #   - wheel=0.29.0=py36_0
30 | #   - xz=5.2.3=0
31 | #   - zlib=1.2.11=0
32 | # prefix: /projects/barthf/opt/miniconda3/envs/freebayes
33 | 
34 | 


--------------------------------------------------------------------------------
/envs/delly.yaml:
--------------------------------------------------------------------------------
 1 | name: delly2
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - bcftools=1.9=h4da6232_0
 9 |   - delly=0.7.8=hd37b1a0_2
10 |   - htslib=1.7=0
11 |   - libdeflate=1.0=h470a237_0
12 |   - boost=1.67.0=py36h3e44d54_0
13 |   - boost-cpp=1.67.0=h3a22d5f_0
14 |   - bzip2=1.0.6=h470a237_2
15 |   - curl=7.61.0=h93b3f91_1
16 |   - icu=58.2=hfc679d8_0
17 |   - krb5=1.14.6=0
18 |   - libffi=3.2.1=3
19 |   - libssh2=1.8.0=h5b517e9_2
20 |   - ncurses=6.1=hfc679d8_1
21 |   - pip=18.0=py36_0
22 |   - python=3.6.6=h5001a0f_0
23 |   - readline=7.0=haf1bffa_1
24 |   - setuptools=40.0.0=py36_0
25 |   - sqlite=3.24.0=h2f33b56_0
26 |   - tk=8.6.8=0
27 |   - wheel=0.31.1=py36_0
28 |   - xz=5.2.4=h470a237_0
29 |   - zlib=1.2.11=h470a237_3
30 |   - ca-certificates=2018.03.07=0
31 |   - certifi=2018.4.16=py36_0
32 |   - libgcc=7.2.0=h69d50b8_2
33 |   - libgcc-ng=7.2.0=hdf63c60_3
34 |   - libgfortran-ng=7.2.0=hdf63c60_3
35 |   - libopenblas=0.2.20=h9ac9557_7
36 |   - libstdcxx-ng=7.2.0=hdf63c60_3
37 |   - numpy=1.14.3=py36h28100ab_2
38 |   - numpy-base=1.14.3=py36h0ea5e3f_1
39 |   - openssl=1.0.2o=h20670df_0
40 | 
41 | 


--------------------------------------------------------------------------------
/envs/freebayes.yaml:
--------------------------------------------------------------------------------
 1 | name: freebayes
 2 | channels:
 3 |   - bioconda
 4 |   - defaults
 5 |   - conda-forge
 6 | dependencies:
 7 |   - freebayes=1.2.0=py36h82df9c4_2
 8 |   - htslib=1.8=1
 9 |   - libdeflate=1.0=h470a237_0
10 |   - parallel=20160622=1
11 |   - bcftools
12 |   - vt=2015.11.10=he941832_3
13 | #   - perl-threaded=5.22.0=13
14 | #   - bzip2=1.0.6=h470a237_2
15 | #   - ca-certificates=2018.8.24=ha4d7672_0
16 | #   - certifi=2018.8.24=py36_1
17 | #   - libgcc-ng=7.2.0=hdf63c60_3
18 | #   - libstdcxx-ng=7.2.0=hdf63c60_3
19 | #   - openssl=1.0.2o=h470a237_1
20 | #   - perl=5.26.2=h470a237_0
21 | #   - curl=7.54.1=0
22 | #   - krb5=1.13.2=0
23 | #   - libgcc=5.2.0=0
24 | #   - libssh2=1.8.0=0
25 | #   - pip=9.0.1=py36_1
26 | #   - python=3.6.2=0
27 | #   - readline=6.2=2
28 | #   - setuptools=36.4.0=py36_1
29 | #   - sqlite=3.13.0=0
30 | #   - tk=8.5.18=0
31 | #   - wheel=0.29.0=py36_0
32 | #   - xz=5.2.3=0
33 | #   - zlib=1.2.11=0
34 | # prefix: /projects/barthf/opt/miniconda3/envs/freebayes
35 | 
36 | 


--------------------------------------------------------------------------------
/envs/gatk4.yaml:
--------------------------------------------------------------------------------
 1 | name: gatk4
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - gatk4=4.1.0.0
 9 |   # - samtools=1.7=1
10 |   # - python=3.6.6
11 |   # - r=3.3.2=r3.3.2_0
12 |   # - r-base=3.3.2=0
13 |   # - r-boot=1.3_18=r3.3.2_0
14 |   # - r-class=7.3_14=r3.3.2_0
15 |   # - r-cluster=2.0.5=r3.3.2_0
16 |   # - r-codetools=0.2_15=r3.3.2_0
17 |   # - r-foreign=0.8_67=r3.3.2_0
18 |   # - r-getopt=1.20.0=r3.3.2_0
19 |   # - r-kernsmooth=2.23_15=r3.3.2_0
20 |   # - r-lattice=0.20_34=r3.3.2_0
21 |   # - r-mass=7.3_45=r3.3.2_0
22 |   # - r-matrix=1.2_7.1=r3.3.2_0
23 |   # - r-mgcv=1.8_16=r3.3.2_0
24 |   # - r-nlme=3.1_128=r3.3.2_0
25 |   # - r-nnet=7.3_12=r3.3.2_0
26 |   # - r-recommended=3.3.2=r3.3.2_0
27 |   # - r-rpart=4.1_10=r3.3.2_0
28 |   # - r-spatial=7.3_11=r3.3.2_0
29 |   # - r-survival=2.40_1=r3.3.2_0
30 |   # - r-optparse
31 |   # - r-data.table
32 |   # - bcftools
33 |   # - bzip2=1.0.6=h14c3975_5
34 |   # - ca-certificates=2018.03.07=0
35 |   # - cairo=1.14.8=0
36 |   # - certifi=2018.4.16=py36_0
37 |   # - curl=7.61.0=h84994c4_0
38 |   # - fontconfig=2.12.1=3
39 |   # - freetype=2.5.5=2
40 |   # - glib=2.50.2=1
41 |   # - gsl=2.4=h14c3975_4
42 |   # - harfbuzz=0.9.39=2
43 |   # - icu=54.1=0
44 |   # - jbig=2.1=hdba287a_0
45 |   # - jpeg=8d=2
46 |   # - libcurl=7.61.0=h1ad7b7a_0
47 |   # - libedit=3.1.20170329=h6b74fdf_2
48 |   # - libffi=3.2.1=hd88cf55_4
49 |   # - libgcc=7.2.0=h69d50b8_2
50 |   # - libgcc-ng=7.2.0=hdf63c60_3
51 |   # - libiconv=1.14=0
52 |   # - libpng=1.6.34=hb9fc6fc_0
53 |   # - libssh2=1.8.0=h9cfc8f7_4
54 |   # - libstdcxx-ng=7.2.0=hdf63c60_3
55 |   # - libtiff=4.0.6=2
56 |   # - libxml2=2.9.4=0
57 |   # - ncurses=6.1=hf484d3e_0
58 |   # - openjdk=8.0.121=1
59 |   # - openssl=1.0.2o=h14c3975_1
60 |   # - pango=1.40.3=1
61 |   # - pcre=8.39=1
62 |   # - pip=10.0.1=py36_0
63 |   # - pixman=0.34.0=hceecf20_3
64 |   # - readline=7.0=ha6073c6_4
65 |   # - setuptools=39.2.0=py36_0
66 |   # - sqlite=3.24.0=h84994c4_0
67 |   # - tk=8.6.7=hc745277_3
68 |   # - wheel=0.31.1=py36_0
69 |   # - xz=5.2.4=h14c3975_4
70 |   # - zlib=1.2.11=ha838bed_2
71 |   # - _r-mutex=1.0.0=anacondar_1
72 | 
73 | 


--------------------------------------------------------------------------------
/envs/gdc-client.yaml:
--------------------------------------------------------------------------------
 1 | name: gdc
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - ca-certificates=2018.03.07=0
 9 |   - certifi=2018.4.16=py27_0
10 |   - icu=58.2=h9c2bf20_1
11 |   - libedit=3.1=heed3624_0
12 |   - libffi=3.2.1=hd88cf55_4
13 |   - libgcc-ng=7.2.0=hdf63c60_3
14 |   - libstdcxx-ng=7.2.0=hdf63c60_3
15 |   - libxml2=2.9.8=hf84eae3_0
16 |   - libxslt=1.1.32=h1312cb7_0
17 |   - lxml=4.2.1=py27h23eabaa_0
18 |   - ncurses=6.0=h9df7e31_2
19 |   - openssl=1.0.2o=h20670df_0
20 |   - pip=9.0.3=py27_0
21 |   - python=2.7.14=h1571d57_31
22 |   - readline=7.0=ha6073c6_4
23 |   - setuptools=39.0.1=py27_0
24 |   - sqlite=3.23.1=he433501_0
25 |   - tk=8.6.7=hc745277_3
26 |   - wheel=0.31.0=py27_0
27 |   - xz=5.2.3=h5e939de_4
28 |   - zlib=1.2.11=ha838bed_2
29 |   - pip:
30 |     - asn1crypto==0.24.0
31 |     - cffi==1.11.5
32 |     - cmd2==0.6.8
33 |     - cryptography==2.2.2
34 |     - enum34==1.1.6
35 |     - flask==0.10.1
36 |     - functools32==3.2.3.post2
37 |     - gdc-client==1.3.0
38 |     - idna==2.6
39 |     - intervaltree==2.0.4
40 |     - ipaddress==1.0.22
41 |     - itsdangerous==0.24
42 |     - jinja2==2.10
43 |     - jsonschema==2.5.1
44 |     - markupsafe==1.0
45 |     - ndg-httpsclient==0.4.2
46 |     - parcel==0.1.13
47 |     - progressbar==2.3
48 |     - pyasn1==0.2.3
49 |     - pycparser==2.18
50 |     - pyopenssl==17.1.0
51 |     - pyparsing==2.2.0
52 |     - pyyaml==3.11
53 |     - requests==2.5.1
54 |     - six==1.11.0
55 |     - sortedcontainers==1.5.10
56 |     - termcolor==1.1.0
57 |     - werkzeug==0.14.1
58 | 
59 | 


--------------------------------------------------------------------------------
/envs/haplotype.yaml:
--------------------------------------------------------------------------------
 1 | name: haplotype
 2 | channels:
 3 |   - bioconda
 4 |   - defaults
 5 |   - conda-forge
 6 | dependencies:
 7 |   - bcftools=1.7=0
 8 |   - bedtools=2.27.1=he941832_2
 9 |   - htslib=1.7=0
10 |   - plink=1.90b4=h0a6d026_2
11 |   - plink2=1.90b3.35=0
12 |   - pybedtools=0.7.10=py27_2
13 |   - pysam=0.14.1=py27_htslib1.7_0
14 |   - samtools=1.7=2
15 |   - vcftools=0.1.16=he941832_2
16 |   - libstdcxx-ng=7.2.0=hdf63c60_3
17 |   - openblas=0.2.20=8
18 |   - bitarray=0.8.1=py27_0
19 |   - blas=1.0=mkl
20 |   - bzip2=1.0.6=3
21 |   - certifi=2016.2.28=py27_0
22 |   - curl=7.54.1=0
23 |   - krb5=1.13.2=0
24 |   - libgcc=5.2.0=0
25 |   - libgfortran=3.0.0=1
26 |   - libssh2=1.8.0=0
27 |   - mkl=2017.0.3=0
28 |   - ncurses=5.9=10
29 |   - nose=1.3.7=py27_1
30 |   - numpy=1.12.1=py27_0
31 |   - openssl=1.0.2l=0
32 |   - pandas=0.20.3=py27_0
33 |   - pip=9.0.1=py27_1
34 |   - python=2.7.13=0
35 |   - python-dateutil=2.6.1=py27_0
36 |   - pytz=2017.2=py27_0
37 |   - readline=6.2=2
38 |   - scipy=0.18.1=np112py27_1
39 |   - setuptools=36.4.0=py27_1
40 |   - six=1.10.0=py27_0
41 |   - sqlite=3.13.0=0
42 |   - tk=8.5.18=0
43 |   - wheel=0.29.0=py27_0
44 |   - xz=5.2.3=0
45 |   - zlib=1.2.11=0
46 | 
47 | 


--------------------------------------------------------------------------------
/envs/lumpy-sv.yaml:
--------------------------------------------------------------------------------
 1 | name: lumpy-sv
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - bcftools
 9 |   - htslib
10 |   - libdeflate
11 |   - lumpy-sv
12 |   - pysam
13 |   - sambamba
14 |   - samblaster
15 |   - samtools
16 |   - svtyper
17 |   - cytoolz
18 |   - r-jsonlite
19 | 


--------------------------------------------------------------------------------
/envs/manta.yaml:
--------------------------------------------------------------------------------
 1 | name: manta
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - manta=1.4.0=py27_1
 9 |   - ca-certificates=2018.03.07=0
10 |   - certifi=2018.4.16=py27_0
11 |   - libedit=3.1.20170329=h6b74fdf_2
12 |   - libffi=3.2.1=hd88cf55_4
13 |   - libgcc-ng=7.2.0=hdf63c60_3
14 |   - libstdcxx-ng=7.2.0=hdf63c60_3
15 |   - ncurses=6.1=hf484d3e_0
16 |   - openssl=1.0.2o=h20670df_0
17 |   - pip=10.0.1=py27_0
18 |   - python=2.7.15=h1571d57_0
19 |   - readline=7.0=ha6073c6_4
20 |   - setuptools=39.2.0=py27_0
21 |   - sqlite=3.24.0=h84994c4_0
22 |   - tk=8.6.7=hc745277_3
23 |   - wheel=0.31.1=py27_0
24 |   - zlib=1.2.11=ha838bed_2
25 | 
26 | 


--------------------------------------------------------------------------------
/envs/pvacseq.yaml:
--------------------------------------------------------------------------------
 1 | name: pvacseq
 2 | channels:
 3 |   - conda-forge
 4 |   - bioconda
 5 |   - defaults
 6 |   - r
 7 |   - vacation
 8 | dependencies:
 9 |   - asn1crypto=0.24.0=py35_3
10 |   - blas=2.4=openblas
11 |   - bzip2=1.0.6=h14c3975_1002
12 |   - ca-certificates=2019.3.9=hecc5488_0
13 |   - certifi=2018.8.24=py35_1001
14 |   - cffi=1.11.5=py35h5e8e0c9_1
15 |   - chardet=3.0.4=py35_3
16 |   - click=7.0=py_0
17 |   - clickclick=1.2.2=py_1
18 |   - connexion=1.5.3=py35_0
19 |   - cryptography=2.3.1=py35hdffb7b8_0
20 |   - cryptography-vectors=2.3.1=py35_0
21 |   - flask=1.0.2=py_2
22 |   - idna=2.7=py35_2
23 |   - inflection=0.3.1=py35_0
24 |   - itsdangerous=1.1.0=py_0
25 |   - jinja2=2.10=py_1
26 |   - jsonschema=2.6.0=py35_2
27 |   - libblas=3.8.0=4_openblas
28 |   - libcblas=3.8.0=4_openblas
29 |   - libffi=3.2.1=he1b5a44_1006
30 |   - libgcc-ng=8.2.0=hdf63c60_1
31 |   - libgfortran-ng=7.3.0=hdf63c60_0
32 |   - liblapack=3.8.0=4_openblas
33 |   - liblapacke=3.8.0=4_openblas
34 |   - libopenblas=0.3.3=h5a2b251_3
35 |   - libstdcxx-ng=8.2.0=hdf63c60_1
36 |   - markupsafe=1.0=py35h470a237_1
37 |   - ncurses=6.1=hf484d3e_1002
38 |   - numpy=1.15.2=py35h99e49ec_0
39 |   - numpy-base=1.15.2=py35h2f8d375_0
40 |   - openblas=0.3.5=h9ac9557_1001
41 |   - openssl=1.0.2r=h14c3975_0
42 |   - pandas=0.23.4=py35hf8a1672_0
43 |   - pip=18.0=py35_1001
44 |   - pvacseq=4.0.10=py35_2
45 |   - pycparser=2.19=py_0
46 |   - pyopenssl=18.0.0=py35_0
47 |   - pysocks=1.6.8=py35_2
48 |   - python=3.5.5=h5001a0f_2
49 |   - python-dateutil=2.8.0=py_0
50 |   - pytz=2018.9=py_0
51 |   - pyvcf=0.6.8=py35_0
52 |   - pyyaml=3.13=py35h470a237_1
53 |   - readline=7.0=hf8c457e_1001
54 |   - requests=2.19.1=py35_1
55 |   - setuptools=40.4.3=py35_0
56 |   - six=1.11.0=py35_1
57 |   - sqlite=3.26.0=h67949de_1001
58 |   - swagger-spec-validator=2.4.3=py_0
59 |   - tk=8.6.9=h84994c4_1001
60 |   - typing=3.6.6=py35_0
61 |   - urllib3=1.23=py35_1
62 |   - werkzeug=0.15.1=py_0
63 |   - wheel=0.32.0=py35_1000
64 |   - xz=5.2.4=h14c3975_1001
65 |   - yaml=0.1.7=h14c3975_1001
66 |   - zlib=1.2.11=h14c3975_1004
67 | 
68 | 


--------------------------------------------------------------------------------
/envs/pyclone.yaml:
--------------------------------------------------------------------------------
 1 | name: pyclone
 2 | channels:
 3 |   - aroth85
 4 |   - bioconda
 5 |   - conda-forge
 6 |   - defaults
 7 | dependencies:
 8 |   - pyclone=0.13.1=py27_0
 9 |   - pydp=0.2.4=py27_0
10 |   - backports=1.0=py_2
11 |   - backports.functools_lru_cache=1.5=py_1
12 |   - backports_abc=0.5=py_1
13 |   - ca-certificates=2018.11.29=ha4d7672_0
14 |   - certifi=2018.11.29=py27_1000
15 |   - cycler=0.10.0=py_1
16 |   - dbus=1.13.0=h3a4f0e9_0
17 |   - enum34=1.1.6=py27_1001
18 |   - expat=2.2.5=hfc679d8_2
19 |   - fontconfig=2.13.1=h65d0f4c_0
20 |   - freetype=2.9.1=h6debe1e_4
21 |   - funcsigs=1.0.2=py_3
22 |   - functools32=3.2.3.2=py_3
23 |   - futures=3.2.0=py27_1000
24 |   - gettext=0.19.8.1=h5e8e0c9_1
25 |   - glib=2.56.2=h464dc38_1
26 |   - gst-plugins-base=1.12.5=hde13a9d_0
27 |   - gstreamer=1.12.5=h5856ed1_0
28 |   - icu=58.2=hfc679d8_0
29 |   - jpeg=9c=h470a237_1
30 |   - kiwisolver=1.0.1=py27h2d50403_2
31 |   - libffi=3.2.1=hfc679d8_5
32 |   - libgcc-ng=7.2.0=hdf63c60_3
33 |   - libgfortran=3.0.0=1
34 |   - libiconv=1.15=h470a237_3
35 |   - libpng=1.6.36=ha92aebf_0
36 |   - libstdcxx-ng=7.2.0=hdf63c60_3
37 |   - libuuid=2.32.1=h470a237_2
38 |   - libxcb=1.13=h470a237_2
39 |   - libxml2=2.9.8=h422b904_5
40 |   - llvmlite=0.26.0=py27hd28b015_0
41 |   - matplotlib=2.2.3=py27h8e2386c_0
42 |   - ncurses=6.1=hfc679d8_2
43 |   - numba=0.41.0=py27hf8a1672_0
44 |   - openssl=1.0.2p=h470a237_1
45 |   - pandas=0.23.4=py27hf8a1672_0
46 |   - patsy=0.5.1=py_0
47 |   - pcre=8.41=hfc679d8_3
48 |   - pip=18.1=py27_1000
49 |   - pthread-stubs=0.4=h470a237_1
50 |   - pyparsing=2.3.0=py_0
51 |   - pyqt=5.6.0=py27h8210e8a_7
52 |   - python=2.7.15=h33da82c_6
53 |   - python-dateutil=2.7.5=py_0
54 |   - pytz=2018.7=py_0
55 |   - pyyaml=3.13=py27h470a237_1
56 |   - qt=5.6.2=hf70d934_9
57 |   - readline=7.0=haf1bffa_1
58 |   - seaborn=0.9.0=py_0
59 |   - setuptools=40.6.3=py27_0
60 |   - singledispatch=3.4.0.3=py27_1000
61 |   - sip=4.18.1=py27hfc679d8_0
62 |   - six=1.12.0=py27_1000
63 |   - sqlite=3.26.0=hb1c47c0_0
64 |   - statsmodels=0.9.0=py27h7eb728f_0
65 |   - subprocess32=3.5.3=py27h470a237_0
66 |   - tk=8.6.9=ha92aebf_0
67 |   - tornado=5.1.1=py27h470a237_0
68 |   - wheel=0.32.3=py27_0
69 |   - xorg-libxau=1.0.8=h470a237_6
70 |   - xorg-libxdmcp=1.1.2=h470a237_7
71 |   - xz=5.2.4=h470a237_1
72 |   - yaml=0.1.7=h470a237_1
73 |   - zlib=1.2.11=h470a237_3
74 |   - blas=1.0=mkl
75 |   - mkl=2017.0.3=0
76 |   - numpy=1.13.1=py27_0
77 |   - scipy=0.19.1=np113py27_0
78 | 
79 | 


--------------------------------------------------------------------------------
/envs/sequenza.yaml:
--------------------------------------------------------------------------------
 1 | name: sequenza
 2 | channels:
 3 |   - dranew
 4 |   - biobuilds
 5 |   - http://conda.anaconda.org/dranew
 6 |   - bioconda
 7 |   - conda-forge
 8 |   - defaults
 9 | dependencies:
10 |   - bioconductor-biocgenerics=0.24.0=r342h10e8652_0
11 |   - bioconductor-copynumber=1.18.0=r342h84c3342_0
12 |   - bioconductor-genomeinfodb=1.14.0=r342h317c8a6_0
13 |   - bioconductor-genomeinfodbdata=0.99.1=r342h4c5fc93_0
14 |   - bioconductor-genomicranges=1.30.0=r342hbf6d5b2_0
15 |   - bioconductor-iranges=2.12.0=r342hb627adb_0
16 |   - bioconductor-s4vectors=0.16.0=r342ha375a43_0
17 |   - bioconductor-xvector=0.18.0=r342h80a1e3f_0
18 |   - bioconductor-zlibbioc=1.24.0=r342h5ff288e_0
19 |   - bzip2=1.0.6=h966e7de_0
20 |   - ca-certificates=2018.11.29=ha4d7672_0
21 |   - cairo=1.14.10=h021c1ba_0
22 |   - certifi=2018.11.29=py36_1000
23 |   - curl=7.56.1=h15b681c_0
24 |   - fontconfig=2.12.4=h3f6a2db_0
25 |   - freetype=2.8=h48caf01_1
26 |   - glib=2.53.6=py36h5cf23cf_0
27 |   - graphite2=1.3.10=hd5afa3c_0
28 |   - gsl=2.4=h9aeeda3_0
29 |   - harfbuzz=1.5.0=h7cf9945_0
30 |   - icu=60.1=h58d5639_1
31 |   - jpeg=9b=h67a1377_0
32 |   - libffi=3.2.1=hfc679d8_5
33 |   - libgcc=7.2.0=h69d50b8_2
34 |   - libgcc-ng=7.2.0=hdf63c60_3
35 |   - libgfortran-ng=7.2.0=hdf63c60_3
36 |   - libidn2=2.0.4=hb0ec843_0
37 |   - libpng=1.6.34=ha6fa132_1
38 |   - libssh2=1.8.0=hb91037a_2
39 |   - libstdcxx-ng=7.2.0=hdf63c60_3
40 |   - libtiff=4.0.8=h04300b7_0
41 |   - libxcb=1.13=h470a237_2
42 |   - libxml2=2.9.4=hc2fdcf8_0
43 |   - ncurses=5.9=701
44 |   - openssl=1.0.2p=h470a237_2
45 |   - pango=1.40.11=h6a13506_1
46 |   - pcre=8.41=h39f570f_0
47 |   - pip=19.0.3=py36_0
48 |   - pixman=0.34.0=h0ca3aba_701
49 |   - pthread-stubs=0.4=h470a237_1
50 |   - python=3.6.5=1
51 |   - r-base=3.4.2=h3655213_0
52 |   - r-bitops=1.0_6=r342h6d3b7a6_1
53 |   - r-rcurl=1.95_4.8=r342hfdac255_0
54 |   - r-sequenza=2.1.2=r342h39d70e4_0
55 |   - r-squash=1.0.8=r342ha8977b7_0
56 |   - readline=7.0=h5a58b2a_0
57 |   - samtools=1.5=0
58 |   - setuptools=40.8.0=py36_0
59 |   - sqlite=3.20.1=2
60 |   - tabix=0.2.6=ha92aebf_0
61 |   - tk=8.6.7=he069c39_0
62 |   - wheel=0.33.1=py36_0
63 |   - xorg-libxau=1.0.8=h470a237_6
64 |   - xorg-libxdmcp=1.1.2=h470a237_7
65 |   - xz=5.2.3=h5714765_0
66 |   - zlib=1.2.11=h3b3956b_0
67 |   - pip:
68 |     - sequenza-utils==2.1.9999b0
69 | 
70 | 


--------------------------------------------------------------------------------
/envs/somaticseq.yaml:
--------------------------------------------------------------------------------
 1 | name: somaticseq
 2 | channels:
 3 |   - bioconda
 4 |   - defaults
 5 |   - conda-forge
 6 | dependencies:
 7 |   - bamtools=2.4.1=0
 8 |   - bcftools=1.6=0
 9 |   - bedtools=2.27.1=0
10 |   - gatk4=4.0.9.0=0
11 |   - htslib=1.7=0
12 |   - lofreq=2.1.3.1=py36_0
13 |   - muse=1.0.rc=0
14 |   - pysam=0.14.0=py36_htslib1.7_0
15 |   - r-ada=2.0_5=r3.3.2_0
16 |   - regex=2016.06.24=py36_1
17 |   - samtools=1.7=0
18 |   - scalpel=0.5.3=h2407274_2
19 |   - somaticseq=2.8.1=py36_0
20 |   - vardict=2018.09.21=0
21 |   - varscan=2.4.3=1
22 |   - icu=58.2=hfc679d8_0
23 |   - libgcc-ng=7.2.0=hdf63c60_3
24 |   - libstdcxx-ng=7.2.0=hdf63c60_3
25 |   - perl=5.26.2=h470a237_0
26 |   - r-base=3.3.2=5
27 |   - r-rpart=4.1_13=r3.3.2_0
28 |   - blas=1.0=mkl
29 |   - bzip2=1.0.6=3
30 |   - cairo=1.14.8=0
31 |   - certifi=2016.2.28=py36_0
32 |   - curl=7.54.1=0
33 |   - fontconfig=2.12.1=3
34 |   - freetype=2.5.5=2
35 |   - glib=2.50.2=1
36 |   - gsl=2.2.1=0
37 |   - harfbuzz=0.9.39=2
38 |   - jbig=2.1=0
39 |   - jpeg=9b=0
40 |   - krb5=1.13.2=0
41 |   - libffi=3.2.1=1
42 |   - libgcc=5.2.0=0
43 |   - libgfortran=3.0.0=1
44 |   - libiconv=1.14=0
45 |   - libpng=1.6.30=1
46 |   - libssh2=1.8.0=0
47 |   - libtiff=4.0.6=3
48 |   - libxml2=2.9.4=0
49 |   - mkl=2017.0.3=0
50 |   - ncurses=5.9=10
51 |   - numpy=1.13.1=py36_0
52 |   - openjdk=8.0.121=1
53 |   - openssl=1.0.2l=0
54 |   - pango=1.40.3=1
55 |   - pcre=8.39=1
56 |   - pip=9.0.1=py36_1
57 |   - pixman=0.34.0=0
58 |   - python=3.6.2=0
59 |   - readline=6.2=2
60 |   - scipy=0.19.1=np113py36_0
61 |   - setuptools=36.4.0=py36_1
62 |   - sqlite=3.13.0=0
63 |   - tk=8.5.18=0
64 |   - wheel=0.29.0=py36_0
65 |   - xz=5.2.3=0
66 |   - zlib=1.2.8=3
67 | 
68 | 


--------------------------------------------------------------------------------
/envs/telseq.yaml:
--------------------------------------------------------------------------------
 1 | name: telseq
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - bamtools=2.4.1=1
 9 |   - telseq=0.0.1=hbed2392_1
10 |  # - libgcc=7.2.0=h69d50b8_2
11 |  # - libgcc-ng=7.2.0=hdf63c60_3
12 |  # - libstdcxx-ng=7.2.0=hdf63c60_3
13 |  # - zlib=1.2.11=ha838bed_2
14 | 
15 | 


--------------------------------------------------------------------------------
/envs/varscan2.yaml:
--------------------------------------------------------------------------------
 1 | name: varscan2
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - bam-readcount=0.8=py36pl5.22.0_3
 9 |   - samtools=1.7=1
10 |   - varscan=2.4.3=1
11 |   - libedit=3.1.20170329=0
12 |   - perl=5.22.0.1=0
13 |   - bzip2=1.0.6=h14c3975_5
14 |   - ca-certificates=2018.03.07=0
15 |   - certifi=2018.4.16=py36_0
16 |   - curl=7.61.0=h84994c4_0
17 |   - libcurl=7.61.0=h1ad7b7a_0
18 |   - libffi=3.2.1=hd88cf55_4
19 |   - libgcc=7.2.0=h69d50b8_2
20 |   - libgcc-ng=7.2.0=hdf63c60_3
21 |   - libssh2=1.8.0=h9cfc8f7_4
22 |   - libstdcxx-ng=7.2.0=hdf63c60_3
23 |   - ncurses=5.9=10
24 |   - openjdk=8.0.121=1
25 |   - openssl=1.0.2o=h14c3975_1
26 |   - pip=10.0.1=py36_0
27 |   - python=3.6.3=h1284df2_4
28 |   - readline=7.0=hb321a52_4
29 |   - setuptools=39.2.0=py36_0
30 |   - sqlite=3.24.0=h84994c4_0
31 |   - tk=8.6.7=hc745277_3
32 |   - wheel=0.31.1=py36_0
33 |   - xz=5.2.4=h14c3975_4
34 |   - zlib=1.2.11=ha838bed_2
35 | 
36 | 


--------------------------------------------------------------------------------
/envs/vcf2maf.yaml:
--------------------------------------------------------------------------------
 1 | name: vcf2maf
 2 | channels:
 3 |   - bioconda
 4 |   - r
 5 |   - defaults
 6 |   - conda-forge
 7 | dependencies:
 8 |   - ensembl-vep
 9 |   - vcf2maf
10 |   - samtools
11 |   - bcftools
12 |   - tabix
13 | 


--------------------------------------------------------------------------------
/jar/VarScan.v2.4.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/jar/VarScan.v2.4.2.jar


--------------------------------------------------------------------------------
/jar/VarScan.v2.4.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/jar/VarScan.v2.4.3.jar


--------------------------------------------------------------------------------
/julia/README.md:
--------------------------------------------------------------------------------
 1 | # SubClonalSelection Pipeline
 2 | 
 3 | ## Files used
 4 | |File|Description|
 5 | |---|---|
 6 | |GLASS_genotypes.csv| Contains information for every mutation called in a sample.|
 7 | |GLASS\_genotype\_comparison| Contains shared/private status for every mutation called in a tumour pair, including whether it is shared or private and also the TITAN copy number estimate for the position.|
 8 | |GLASS\_genotype\_comparison\_extracted.tsv| The extracted GLASS\_genotype\_comparison file.|
 9 | |silver_set.csv| Contains a list of paired samples to include in the analysis.|
10 | |titanparams_synapse.tsv| Contains TITAN ploidy and purity estimates for each sample.|
11 | 
12 | ## Pipeline
13 | 
14 | ### 1. Extract necessary columns from the genotypes comparisons file
15 | 
16 | (The position column counts as 2 due to the comma).
17 | 
18 | ```
19 | cat GLASS_genotype_comparison.tsv | cut -d, -f 1,9,10,12,21,22 | tr "," "\\t" > GLASS_genotype_comparison_extracted.tsv
20 | ```
21 | 
22 | ### 2. Run extrac_vafs.py 
23 | 
24 | ```
25 | python extrac_vafs.py -c GLASS_genotype_comparison_extracted.tsv -g GLASS_genotypes.csv -s silver_set.csv -t titanparams_synapse.tsv -o ./
26 | ```
27 | 
28 | ### 3. Add minimum VAF column
29 | 
30 | Manually add a minimum-VAF column to the metadata file output by extrac_vafs.py: inspect the histogram output for each sample and choose the VAF at the highest point of the leftmost peak.
31 | 
32 | ### 4. Run the analysis through qsubsec
33 | See https://www.ncbi.nlm.nih.gov/pubmed/26635140 for details on qsubsec.
34 | 
35 | ```
36 | qsubsec subclonalselection.qsubsec subclonalselection.tff -s
37 | ```
38 | 
39 | ### 5. Subsample inputs 
40 | 
41 | For runs that don't finish within 48h due to large numbers of mutations, subsample their VAF inputs and rerun.
42 | 
43 | ```
44 | shuf -n 20000 XXX.txt > XXX.subsampled.txt && mv XXX.subsampled.txt XXX.txt
45 | ``` 
46 | 
47 | ### 6. Remove runs with high error in the model results
48 | 
49 | Remove any runs with "New ϵ is within 7.0% of previous population, stop ABC SMC" warning in logs.


--------------------------------------------------------------------------------
/julia/runsubclonalselection.jl.txt:
--------------------------------------------------------------------------------
 1 | using ArgParse
 2 | using SubClonalSelection
 3 | 
 4 | # Command-line wrapper: parses the run settings and hands them to fitABCmodels.
 5 | s = ArgParseSettings()
 6 | @add_arg_table s begin
 7 |     "--readdepth", "-d"
 8 |         help = "Mean read depth"
 9 |         arg_type=Float64
10 |     "--minvaf", "-v"
11 |         help = "Minimum VAF"
12 |         arg_type=Float64
13 |     "--fmin", "-f"
14 |         help = "Minimum VAFs to model"
15 |         arg_type=Float64
16 |     "--mincellularity", "-m"
17 |         help = "Min cellularity"
18 |         arg_type=Float64
19 |     "--maxcellularity", "-x"
20 |         help = "Max cellularity"
21 |         arg_type=Float64
22 |     "--ploidy", "-l"
23 |         help = "Ploidy"
24 |         arg_type=Float64
25 |     "--maxiterations", "-i"
26 |         help = "Max iterations"
27 |         arg_type=Int
28 |     "--nparticles", "-p"
29 |         help = "Number of particles"
30 |         arg_type=Int
31 |     "--resultsdirectory", "-r"
32 |         help = "Results directory"
33 |     "--name", "-n"
34 |         help = "Name for run"
35 |     "file"
36 |         help = "Directory and file name containing VAFs"
37 |         required = true
38 |     
39 | end
40 | # No option declares a default, so (per ArgParse.jl) omitted options come back as `nothing`.
41 | args = parse_args(s)
42 | # Nmaxinf, save and adaptpriors are fixed here; every other setting comes from the command line.
43 | out = fitABCmodels(args["file"], args["name"], read_depth = args["readdepth"], resultsdirectory = args["resultsdirectory"], minvaf = args["minvaf"], fmin = args["fmin"], ploidy = args["ploidy"], maxiterations = args["maxiterations"], nparticles = args["nparticles"], mincellularity = args["mincellularity"], maxcellularity = args["maxcellularity"], Nmaxinf = 10^6, save = true, adaptpriors = true)
44 | 


--------------------------------------------------------------------------------
/julia/subclonalselection.qsubsec.txt:
--------------------------------------------------------------------------------
 1 | # This script runs a single sample through subclonalselection
 2 | # Georgette Tanner & ALastair Droop, 2019-01-11
 3 | 
 4 | 
 5 | section('subclonalselection-{SAMPLE}-{FRACTION}-{RUN}', description='Run sample {SAMPLE} through subclonalselection.')
 6 | limits(h_rt='48:00:00', h_vmem='16G')
 7 | options('V', 'cwd', 'notify')
 8 | outputs('{LOG_DIR}')
 9 | 
10 | # Load the metadata file:
11 | metadata = dict()
12 | try:
13 |     metadata_file = open('{METADATA_DIR}/{METADATA_FILE}', 'rt')
14 |     headers = metadata_file.readline()
15 |     for metadata_line in metadata_file.readlines():
16 |         metadata_data = metadata_line.strip().split('\t')
17 |         metadata[metadata_data[1]] = metadata_data
18 | except FileNotFoundError as err:
19 |     message('ERROR: metadata file "{METADATA_FILE}" does not exist')
20 |     raise
21 | except Exception as err: raise
22 | 
23 | # Check that the given SAMPLE is present in the metadata dictionary:
24 | if '{SAMPLE}' not in metadata.keys(): raise Exception('Sample {SAMPLE} not in metadata')
25 | 
26 | # Check number of suclonal VAFs
27 | if '{FRACTION}' == 'shared': NUM = int(metadata['{SAMPLE}'][3])
28 | elif '{FRACTION}' == 'private': NUM = int(metadata['{SAMPLE}'][5])
29 | elif '{FRACTION}' == 'all': NUM = int(metadata['{SAMPLE}'][7])
30 | 
31 | if NUM<25 :
32 |     command('echo "Sample {SAMPLE}_{FRACTION} has less than 25 subclonal VAFs"', name = 'quit')
33 | elif metadata['{SAMPLE}'][14]=='-':
34 |     command('echo "Sample {SAMPLE}_{FRACTION} has no minimum VAF given - assuming non suitable sample"', name = 'quit')
35 | else:
36 | 
37 |     # Extract values:
38 |     read_depth = float(metadata['{SAMPLE}'][8])
39 |     min_vaf = float(metadata['{SAMPLE}'][14])
40 |     f_min = float(metadata['{SAMPLE}'][14])
41 |     ploidy = int(metadata['{SAMPLE}'][13])
42 |     min_cellularity = float(metadata['{SAMPLE}'][11])
43 |     max_cellularity = float(metadata['{SAMPLE}'][12])
44 | 
45 |     # Build the command to submit:
46 |     command('{JULIA_EXEC} {JULIA_SCRIPT} {VAF_FILE} --name {SAMPLE}_{FRACTION}_{RUN} --resultsdirectory {OUTPUT_DIR} --readdepth %s  --maxiterations {ITERATIONS} --nparticles 500 --minvaf %s --fmin %s --ploidy %s --mincellularity %s --maxcellularity %s' % (read_depth, min_vaf, f_min, ploidy, min_cellularity, max_cellularity), name='run_julia')


--------------------------------------------------------------------------------
/julia/subclonalselection.tff.txt:
--------------------------------------------------------------------------------
 1 | # Define run:
 2 | RUN="run1"
 3 | 
 4 | 
 5 | # Define the basic project structure:
 6 | BASE_DIR = "./"
 7 | LOG_DIR = "{BASE_DIR}/logs"
 8 | INPUT_DIR = "{BASE_DIR}/input"
 9 | OUTPUT_DIR = "{BASE_DIR}/output"
10 | 
11 | # Define the metadata file:
12 | METADATA_DIR = "{BASE_DIR}/metadata"
13 | METADATA_FILE = "metadata.tsv"
14 | SAMPLE = FILE("{METADATA_DIR}/samples_to_run.txt")
15 | 
16 | # Define the input VAF filename structure:
17 | VAF_FILE = "{INPUT_DIR}/{SAMPLE}_{FRACTION}.txt"
18 | 
19 | # Define the mutation fraction to use:
20 | FRACTION = "shared", "private", "all"
21 | 
22 | # Define the executables:
23 | JULIA_EXEC = "~/julia-1.0.3/bin/julia"
24 | JULIA_SCRIPT = "{BASE_DIR}/scripts/runsubclonalselection.jl"
25 | 
26 | # Iterations:
27 | ITERATIONS = "100000"


--------------------------------------------------------------------------------
/python/.ipynb_checkpoints/LearnRegexp-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/python/.ipynb_checkpoints/LearningJSON-1-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 2,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "hello world\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "print(\"hello world\")"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "code",
22 |    "execution_count": null,
23 |    "metadata": {},
24 |    "outputs": [],
25 |    "source": []
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": null,
30 |    "metadata": {},
31 |    "outputs": [],
32 |    "source": []
33 |   }
34 |  ],
35 |  "metadata": {
36 |   "kernelspec": {
37 |    "display_name": "Python 3",
38 |    "language": "python",
39 |    "name": "python3"
40 |   },
41 |   "language_info": {
42 |    "codemirror_mode": {
43 |     "name": "ipython",
44 |     "version": 3
45 |    },
46 |    "file_extension": ".py",
47 |    "mimetype": "text/x-python",
48 |    "name": "python",
49 |    "nbconvert_exporter": "python",
50 |    "pygments_lexer": "ipython3",
51 |    "version": "3.6.4"
52 |   }
53 |  },
54 |  "nbformat": 4,
55 |  "nbformat_minor": 2
56 | }
57 | 


--------------------------------------------------------------------------------
/python/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "CLUSTER_META    = json.load(\"/fastscratch/verhaak-lab/GLASS-WG/conf/cluster.json\")"
10 |    ]
11 |   }
12 |  ],
13 |  "metadata": {
14 |   "kernelspec": {
15 |    "display_name": "Python 3",
16 |    "language": "python",
17 |    "name": "python3"
18 |   },
19 |   "language_info": {
20 |    "codemirror_mode": {
21 |     "name": "ipython",
22 |     "version": 3
23 |    },
24 |    "file_extension": ".py",
25 |    "mimetype": "text/x-python",
26 |    "name": "python",
27 |    "nbconvert_exporter": "python",
28 |    "pygments_lexer": "ipython3",
29 |    "version": "3.6.4"
30 |   }
31 |  },
32 |  "nbformat": 4,
33 |  "nbformat_minor": 2
34 | }
35 | 


--------------------------------------------------------------------------------
/python/.ipynb_checkpoints/Untitled1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/python/.ipynb_checkpoints/Untitled2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/python/.ipynb_checkpoints/Untitled3-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 


--------------------------------------------------------------------------------
/python/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/python/__init__.py


--------------------------------------------------------------------------------
/python/countPysam.py:
--------------------------------------------------------------------------------
 1 | # requires python3
 2 | # requires pysam-0.11.2.1
 3 | import sys
 4 | import pysam
 5 | 
 6 | chromToUse = sys.argv[1]  # 0 for all chromosomes
 7 | norm_hetpsns = sys.argv[2]
 8 | bam_file = sys.argv[3]
 9 | #ref_file = sys.argv[4]
10 | base_quality = int(sys.argv[4])
11 | map_quality = int(sys.argv[5])
12 | vcf_quality = int(sys.argv[6])
13 | positions = {}
14 | 
15 | #  add (position,depth) from the normal hetpositions input file to a dictionary of lists
16 | #    indexed by chromosome
17 | for line in open(norm_hetpsns):
18 |     if not line.strip().startswith("#"):
19 |         chrom = line.split()[0] 
20 |         if chrom == chromToUse or chromToUse == 0:
21 |             position = int(line.strip().split()[1])
22 |             ref_base = line.strip().split()[3]
23 |             nref_base = line.strip().split()[4]
24 |             qual = line.strip().split()[5]
25 |             depth = line.split()[7].split(';')[0].replace('DP=', '') 
26 |             position_data = position, depth, ref_base, nref_base, qual
27 |             if chrom not in positions:
28 |                 positions[chrom] = []
29 |             positions[chrom].append(position_data)
30 | 
31 | sample = pysam.AlignmentFile(bam_file)
32 | #reference = pysam.FastaFile(ref_file)
33 | ## print header ##
34 | print ("Chr\tPosition\tRef\tRefCount\tNref\tNrefCount\tNormQuality")
35 | 
36 | for chrom in positions:
37 |   i = 0
38 |   for position_data in positions[chrom]:
39 |   	position = int(position_data[0])
40 |   	result = str(chrom) + "\t" + str(position)
41 |   	ref_base = position_data[2]
42 |   	nref_base = position_data[3]
43 |   	qual = float(position_data[4])
44 |   	if qual >= vcf_quality and qual != None:
45 |   		_p = sample.pileup(reference=chrom, start=position, end=position + 1)
46 |   		bases = list()
47 |   		for p in _p:
48 |   			if p.reference_pos == position:
49 |   				for r in p.pileups:
50 |   					if not r.is_del and not r.is_refskip:
51 |   						base = r.alignment.query_sequence[r.query_position-1]
52 |   						mapq = r.alignment.mapping_quality
53 |   						baseq = r.alignment.query_qualities[r.query_position-1]
54 |   						if mapq >= map_quality and baseq >= base_quality:
55 |   							bases.append(base)
56 |   		ref_count = 0
57 |   		depth = 0
58 |   		for base in bases:
59 |   			depth += 1
60 |   			if base == ref_base:
61 |   				ref_count += 1
62 |   		alt_count = depth - ref_count
63 |   		
64 |   		result += "\t" + ref_base + "\t" + str(ref_count) + "\t" + nref_base + "\t" + str(alt_count) + '\t' + str(qual)
65 |   		print(result)
66 |   		i += 1
67 | 						
68 |         
69 | 


--------------------------------------------------------------------------------
/python/glassfunc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | GLASS helper functions
 3 | """
 4 | 
 5 | import os, fnmatch
 6 | from configparser import ConfigParser
 7 | 
 8 | def touch_file(fname, mode=0o666, dir_fd=None, **kwargs):
 9 |     """
10 |     Touch function taken from stackoverflow
11 |     Link: https://stackoverflow.com/questions/1158076/implement-touch-using-python
12 |     """
13 |     flags = os.O_CREAT | os.O_APPEND
14 |     with os.fdopen(os.open(fname, flags=flags, mode=mode, dir_fd=dir_fd)) as f:
15 |         os.utime(f.fileno() if os.utime in os.supports_fd else fname,
16 |             dir_fd=None if os.supports_fd else dir_fd, **kwargs)
17 | 
18 | def build_dict(seq, key):
19 |     """
20 |     Turn an unnamed list of dicts into a nammed list of dicts
21 |     Taken from stackoverflow
22 |     https://stackoverflow.com/questions/4391697/find-the-index-of-a-dict-within-a-list-by-matching-the-dicts-value
23 |     """
24 |     return dict((d[key], dict(d, index=index)) for (index, d) in enumerate(seq))
25 | 
26 | def dbconfig(filename, section):
27 |     """
28 |     Loads db connection settings in text file
29 |     From http://www.postgresqltutorial.com/postgresql-python/connect/
30 |     """
31 |     # create a parser
32 |     parser = ConfigParser()
33 |     # read config file
34 |     parser.read(filename)
35 |  
36 |     # get section, default to postgresql
37 |     db = {}
38 |     if parser.has_section(section):
39 |         params = parser.items(section)
40 |         for param in params:
41 |             db[param[0]] = param[1]
42 |     else:
43 |         raise Exception('Section {0} not found in the {1} file'.format(section, filename))
44 |  
45 |     return db
46 | 
47 | def locate(pattern, root = os.curdir):
48 |     """
49 |     Locate all files matching supplied filename pattern in and below
50 |     supplied root directory.
51 |     Taken from: http://code.activestate.com/recipes/499305-locating-files-throughout-a-directory-tree/
52 |     """
53 |     for path, dirs, files in os.walk(os.path.abspath(root)):
54 |         for filename in fnmatch.filter(files, pattern):
55 |             yield os.path.join(path, filename)
56 | 
57 | ## END ##
58 | 


--------------------------------------------------------------------------------
/python/manifest_tester.py:
--------------------------------------------------------------------------------
 1 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 2 | ## Manifest tester
 3 | ## Authors: Floris Barthel
 4 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 5 | 
 6 | import os
 7 | import pandas as pd
 8 | import itertools
 9 | import yaml
10 | 
11 | ## Import manifest processing functions
12 | from python.glassfunc import dbconfig, locate
13 | from python.PostgreSQLManifestHandler import PostgreSQLManifestHandler
14 | from python.JSONManifestHandler import JSONManifestHandler
15 | 
16 | config = yaml.load(open('conf/config.yaml'))
17 | 
18 | ## Connect to database
19 | dbconf = dbconfig(config["db"]["configfile"], config["db"]["configsection"])
20 | 
21 | ## Instantiate manifest
22 | manifest = PostgreSQLManifestHandler(host = dbconf["servername"], port = dbconf["port"], user = dbconf["username"], password = dbconf["password"], database = dbconf["database"],
23 |     source_file_basepath = config["data"]["source_path"], aligned_file_basepath = config["data"]["realn_path"], from_source = config["from_source"])
24 | print(manifest)


--------------------------------------------------------------------------------
/snakemake/download.smk:
--------------------------------------------------------------------------------
 1 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 2 | ## Download BAM file from GDC into data/download/<uuid>/ (gdc-client -d must match the output prefix)
 3 | ## GDC key needs to be re-downloaded and updated from time to time
 4 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
 5 | 
 6 | rule download:
 7 |     output:
 8 |         "data/download/{uuid}/{filename}.bam"
 9 |     threads:
10 |         CLUSTER_META["download"]["ppn"]
11 |     message:
12 |         "Downloading from GDC\n"
13 |         "UUID {wildcards.uuid}\n"
14 |         "File {wildcards.filename}"
15 |     conda:
16 |         "../envs/gdc-client.yaml"
17 |     log:
18 |         "logs/download/{uuid}.{filename}.log"
19 |     benchmark:
20 |         "benchmarks/download/{uuid}.{filename}.txt"
21 |     shell:
22 |         "gdc-client download \
23 |             -d data/download \
24 |             -n {threads} \
25 |             -t {config[gdc_token]} \
26 |             {wildcards.uuid} \
27 |             > {log} 2>&1"


--------------------------------------------------------------------------------
/snakemake/fusorsv.smk:
--------------------------------------------------------------------------------
 1 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 2 | ## FusorSV
 3 | ## Prepares FusorSV input: re-emits the Delly, Lumpy and Manta calls of a pair as per-caller .vcf files via `bcftools view`
 4 | ## See: https://github.com/timothyjamesbecker/FusorSV
 5 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 6 | 
 7 | rule fusorsv_prepare:
 8 |     input:
 9 |         delly = "results/delly/filter/{pair_id}.prefilt.bcf",
10 |         lumpy = "results/lumpy/filter/{pair_id}.dict.svtyper.filtered.vcf",
11 |         manta = "results/manta/{pair_id}/results/variants/somaticSV.vcf.gz"
12 |     output:
13 |         delly = "results/fusorsv/prepare/{pair_id}/{pair_id}.delly.vcf",
14 |         lumpy = "results/fusorsv/prepare/{pair_id}/{pair_id}.lumpy.vcf",
15 |         manta = "results/fusorsv/prepare/{pair_id}/{pair_id}.manta.vcf"
16 |     params:
17 |         mem = CLUSTER_META["fusorsv_prepare"]["mem"]
18 |     threads:
19 |         CLUSTER_META["fusorsv_prepare"]["ppn"]
20 |     conda:
21 |         "../envs/fusorsv.yaml"
22 |     log:
23 |         "logs/fusorsv/prepare/{pair_id}.log"
24 |     benchmark:
25 |         "benchmarks/fusorsv/prepare/{pair_id}.txt"
26 |     message:
27 |         "Preparing for FusorSV by collecting VCF files from various callers\n"
28 |         "Pair: {wildcards.pair_id}"
29 |     shell:
30 |         "(bcftools view {input.delly} > {output.delly} && \
31 |             bcftools view {input.lumpy} > {output.lumpy} && \
32 |             bcftools view {input.manta} > {output.manta}) \
33 |             > {log} 2>&1"
34 | 
35 | #        "bcftools view {output.bcf} > {params.vcftmp} && \
36 | #            bgzip -i {params.vcftmp} && \
37 | #            bcftools index -t {output.vcf}"
38 | 
39 | ## END ##


--------------------------------------------------------------------------------
/snakemake/manta.smk:
--------------------------------------------------------------------------------
 1 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 2 | ## Manta: manta_config writes runWorkflow.py for a tumor/normal pair (BAM barcodes resolved via manifest); manta_execute runs it
 3 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 4 | 
 5 | rule manta_config:
 6 |     input:
 7 |         tumor = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getTumor(wildcards.pair_barcode)),
 8 |         normal = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getNormal(wildcards.pair_barcode))
 9 |     output:
10 |         script = "results/manta/{pair_barcode}/runWorkflow.py"
11 |     params:
12 |         rundir = "results/manta/{pair_barcode}",
13 |         mem = CLUSTER_META["manta_config"]["mem"]
14 |     threads:
15 |         CLUSTER_META["manta_config"]["ppn"]
16 |     conda:
17 |         "../envs/manta.yaml"
18 |     log:
19 |         "logs/manta/config/{pair_barcode}.log"
20 |     benchmark:
21 |         "benchmarks/manta/config/{pair_barcode}.txt"
22 |     message:
23 |         "Configuring Manta for tumor/normal pair\n"
24 |         "Pair: {wildcards.pair_barcode}"
25 |     shell:
26 |         "configManta.py \
27 |             --normalBam {input.normal} \
28 |             --tumorBam {input.tumor} \
29 |             --callRegions {config[svinclude_manta]} \
30 |             --referenceFasta {config[reference_fasta]} \
31 |             --runDir {params.rundir} \
32 |             > {log} 2>&1; "
33 | ## Runs the pair's runWorkflow.py in local mode with {threads} jobs; -g receives params.mem (NOTE(review): confirm the unit expected by runWorkflow.py)
34 | rule manta_execute:
35 |     input:
36 |         script = "results/manta/{pair_barcode}/runWorkflow.py",
37 |         tumor = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getTumor(wildcards.pair_barcode)),
38 |         normal = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getNormal(wildcards.pair_barcode))
39 |     output:
40 |         "results/manta/{pair_barcode}/results/variants/diploidSV.vcf.gz",
41 |         "results/manta/{pair_barcode}/results/variants/somaticSV.vcf.gz",
42 |         "results/manta/{pair_barcode}/results/variants/candidateSV.vcf.gz",
43 |         "results/manta/{pair_barcode}/results/variants/candidateSmallIndels.vcf.gz"
44 |     params:
45 |         mem = CLUSTER_META["manta_execute"]["mem"]
46 |     threads:
47 |         CLUSTER_META["manta_execute"]["ppn"]
48 |     conda:
49 |         "../envs/manta.yaml"
50 |     log:
51 |         "logs/manta/execute/{pair_barcode}.log"
52 |     benchmark:
53 |         "benchmarks/manta/execute/{pair_barcode}.txt"
54 |     message:
55 |         "Running Manta for tumor/normal pair\n"
56 |         "Pair: {wildcards.pair_barcode}"
57 |     shell:
58 |         "{input.script} \
59 |             -m local \
60 |             -j {threads} \
61 |             -g {params.mem} \
62 |             > {log} 2>&1; "
63 | 
64 | ## END ##


--------------------------------------------------------------------------------
/snakemake/somaticseq.smk:
--------------------------------------------------------------------------------
 1 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 2 | ## Snakefile for SomaticSeq Varscan2 and Mutect2 consensus calling
 3 | ## Authors: Floris Barthel
 4 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 5 | ## Rule name "somatiseq" (sic) is left unchanged so anything that refers to the rule by name keeps working
 6 | rule somatiseq:
 7 |     input:
 8 |         vs2snp = "results/varscan2/fpfilter/{pair_barcode}.snp.Somatic.hc.final.vcf",
 9 |         vs2indel = "results/varscan2/vs2-filter/{pair_barcode}.indel.Somatic.hc.filter.vcf",
10 |         mutect2 = "results/mutect2/final/{pair_barcode}.final.vcf",
11 |         tumorbam = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getTumor(wildcards.pair_barcode)),
12 |         normalbam = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getNormal(wildcards.pair_barcode))
13 |     output:
14 |         "results/somaticseq/{pair_barcode}/Consensus.sSNV.vcf",
15 |         "results/somaticseq/{pair_barcode}/Consensus.sINDEL.vcf",
16 |         "results/somaticseq/{pair_barcode}/Ensemble.sSNV.tsv",
17 |         "results/somaticseq/{pair_barcode}/Ensemble.sINDEL.tsv"
18 |     params:
19 |         outdir = "results/somaticseq/{pair_barcode}",
20 |         mem = CLUSTER_META["somaticseq"]["mem"]
21 |     threads:
22 |         CLUSTER_META["somaticseq"]["ppn"]
23 |     conda:
24 |         "../envs/somaticseq.yaml"
25 |     log:
26 |         "logs/somaticseq/{pair_barcode}.log"
27 |     benchmark:
28 |         "benchmarks/somaticseq/{pair_barcode}.txt"
29 |     message:
30 |         "Running SomaticSeq consensus calling\n"
31 |         "Pair: {wildcards.pair_barcode}"
32 |     shell:
33 |         "SomaticSeq.Wrapper.sh \
34 |             --output-directory {params.outdir} \
35 |             --genome-reference {config[reference_fasta]} \
36 |             paired \
37 |             --tumor-bam-file {input.tumorbam} \
38 | 			--normal-bam-file {input.normalbam} \
39 | 			--mutect2-vcf {input.mutect2} \
40 | 			--varscan-snv {input.vs2snp} \
41 | 			--varscan-indel {input.vs2indel} \
42 |             > {log} 2>&1; "


--------------------------------------------------------------------------------
/snakemake/telseq.smk:
--------------------------------------------------------------------------------
 1 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 2 | ## Telomere content estimates from BAM file
 3 | ## See: https://github.com/zd1/telseq
 4 | ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## 
 5 | 
 6 | rule telseq_run:
 7 |     input:
 8 |         "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam"
 9 |     output:
10 |         protected("results/telseq/{aliquot_barcode}.telseq.txt")
11 |     params:
12 |         mem = CLUSTER_META["telseq_run"]["mem"]
13 |     threads:
14 |         CLUSTER_META["telseq_run"]["ppn"]
15 |     #conda:  (disabled: the shell block gets telseq from `module load telseq` instead)
16 |     #    "../envs/telseq.yaml"
17 |     log:
18 |         "logs/telseq/{aliquot_barcode}.log"
19 |     benchmark:
20 |         "benchmarks/telseq/{aliquot_barcode}.txt"
21 |     message:
22 |         "Telomere content estimates from BAM file\n"
23 |         "Sample: {wildcards.aliquot_barcode}"
24 |     shell:"""
25 |         module load telseq
26 |         telseq -o {output} \
27 |             -r {config[telseq_r]} \
28 |             {input} \
29 |             > {log} 2>&1
30 |         """
31 | 
32 | ## END ##


--------------------------------------------------------------------------------
/sql/cnv/c710_status.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Use GATK-based arm level calls (Taylor method) to call the glioma event combined amplification of chr 7 and chr 10 deletion. Returns one row per aliquot_barcode with a c710 value of TRUE, FALSE or NULL.
 3 | */
 4 | WITH
 5 | t1 AS
 6 | (
 7 | 	SELECT
 8 | 		aliquot_barcode,
 9 | 		chrom,
10 | 		arm,
11 | 		arm_call,
12 | 		(CASE
13 | 		 WHEN chrom = 7 THEN arm_call = 1 -- a chr 7 arm shows the event when arm_call is 1
14 | 		 WHEN chrom = 10 THEN arm_call = -1 -- a chr 10 arm shows the event when arm_call is -1
15 | 		 END) AS bool_call -- per-arm boolean; NULL when arm_call is NULL
16 | 	FROM analysis.gatk_cnv_by_arm
17 | 	WHERE chrom IN (7,10)
18 | ),
19 | t2 AS
20 | (
21 | 	SELECT
22 | 		aliquot_barcode,
23 | 		chrom,
24 | 		COUNT(CASE WHEN bool_call IS TRUE THEN 1 END) AS count_true, -- number of chromosome arms with event
25 | 		COUNT(CASE WHEN bool_call IS FALSE THEN 1 END) AS count_false, -- number of chromosome arms lacking event
26 | 		COUNT(CASE WHEN bool_call IS NULL THEN 1 END) AS count_null -- number of chromosome arms unknown
27 | 	FROM t1
28 | 	GROUP BY 1, 2
29 | ),
30 | t3 AS
31 | (
32 | 	SELECT
33 | 		aliquot_barcode,
34 | 		(CASE WHEN bool_or(count_null = 2) THEN NULL ELSE bool_and(count_true > 0) AND bool_and(count_false = 0) END) AS c710 -- NULL if any chromosome has two unknown arms (presumably all of its arms; confirm); otherwise TRUE only when every chromosome row has at least one arm with the event and no arm lacking it
35 | 	FROM t2
36 | 	GROUP BY 1
37 | )
38 | SELECT * FROM t3


--------------------------------------------------------------------------------
/sql/cnv/call_10q25_26.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Call CNV for 10q25-26 region: per aliquot, the overlap-weighted mean copy ratio (wcr) across the chr 10 q25/q26 cytobands is compared with per-aliquot thresholds to give hlvl_call in {-2,-1,0,1,2} (NULL when no branch applies); TITAN cellular prevalence is averaged with the same weights. Both weighted means use NULLIF so an all-zero weight sum yields NULL instead of a division-by-zero error.
 3 | */
 4 | WITH
 5 | selected_regions AS
 6 | (
 7 | 	SELECT '10q25-26' AS region, * FROM ref.cytobands WHERE chrom = 10 AND substring(cytoband from 1 for 3) IN ('q25','q26') -- cytobands whose name starts with q25 or q26
 8 | ),
 9 | gene_seg_intersect AS
10 | (
11 |     SELECT aliquot_barcode, region, gs.chrom, (upper(t0.pos * gs.pos) - lower(t0.pos * gs.pos) -1) AS w, 2^log2_copy_ratio::decimal As cr -- w: width of the cytoband/segment range intersection; cr: copy ratio recovered from its log2
12 |     FROM variants.gatk_seg gs
13 |     INNER JOIN selected_regions t0 ON t0.chrom = gs.chrom AND t0.pos && gs.pos
14 | ),
15 | gene_sample_call AS
16 | (
17 |     SELECT aliquot_barcode, region, 
18 | 		sum(w * cr) / NULLIF(sum(w),0) AS wcr -- weighted mean copy ratio; NULL (not an error) when all weights are 0, matching gene_cp_agg
19 |     FROM gene_seg_intersect
20 |     GROUP BY aliquot_barcode, region
21 | ),
22 | seg_stats_optimized AS
23 | (
24 | 	SELECT
25 | 		gs.aliquot_barcode,
26 | 		LEAST(0.9, neu_fwmean - 2 * neu_fwsd) AS del_thres, -- never above 0.9
27 | 		GREATEST(1.1, neu_fwmean + 2 * neu_fwsd) AS amp_thres, -- never below 1.1
28 | 		(CASE
29 | 		 WHEN max_loss_arm_wmean < 0.9 AND max_loss_arm_n >= 3 THEN GREATEST(0,max_loss_arm_wmean - 2 * max_loss_arm_wsd)
30 | 		 WHEN del_fwmean < 0.9 AND del_n >= 3 THEN GREATEST(0,del_fwmean - 2 * del_fwsd)
31 | 		 ELSE NULL
32 | 		END) AS hldel_thres, -- NULL when neither source has a mean below 0.9 with n >= 3
33 | 		(CASE
34 | 		 WHEN max_gain_arm_wmean > 1.1 AND max_gain_arm_n >= 3 THEN max_gain_arm_wmean + 2 * max_gain_arm_wsd
35 | 		 WHEN amp_fwmean > 1.1 AND amp_n >= 3 THEN amp_fwmean + 2 * amp_fwsd
36 | 		 ELSE NULL
37 | 		END) AS hlamp_thres -- NULL when neither source has a mean above 1.1 with n >= 3
38 | 	FROM analysis.gatk_seg_stats gs
39 | 	LEFT JOIN analysis.gatk_aneuploidy gsa ON gsa.aliquot_barcode = gs.aliquot_barcode
40 | ),
41 | gene_cp AS
42 | (
43 | 	SELECT ts.aliquot_barcode, region, ts.chrom, (upper(t0.pos * ts.pos) - lower(t0.pos * ts.pos) -1) AS w, cellular_prevalence As cp
44 | 	FROM variants.titan_seg ts
45 | 	INNER JOIN selected_regions t0 ON t0.chrom = ts.chrom AND t0.pos && ts.pos
46 | ),
47 | gene_cp_agg AS
48 | (
49 | 	SELECT aliquot_barcode, region, 
50 | 		sum(w * cp) / NULLIF(sum(w),0) AS wcp -- weighted mean cellular prevalence; the former COALESCE(..., NULL) wrapper was a no-op
51 |     FROM gene_cp
52 |     GROUP BY 1, 2
53 | )
54 | SELECT
55 | 	gc.aliquot_barcode,
56 | 	gc.region,
57 | 	(CASE
58 | 	 WHEN gc.wcr >= del_thres AND gc.wcr <= amp_thres THEN 0
59 | 	 WHEN gc.wcr < hldel_thres THEN -2 -- tested before the plain deletion branch; skipped when hldel_thres is NULL
60 | 	 WHEN gc.wcr < del_thres THEN -1
61 | 	 WHEN gc.wcr > hlamp_thres THEN 2 -- tested before the plain amplification branch; skipped when hlamp_thres is NULL
62 | 	 WHEN gc.wcr > amp_thres THEN 1
63 | 	 ELSE NULL
64 | 	 END) hlvl_call,
65 | 	gc.wcr,
66 | 	wcp AS cellular_prevalence
67 | FROM gene_sample_call gc
68 | LEFT JOIN seg_stats_optimized ss ON ss.aliquot_barcode = gc.aliquot_barcode
69 | LEFT JOIN gene_cp_agg cp ON cp.aliquot_barcode = gc.aliquot_barcode AND cp.region = gc.region
70 | ORDER BY 3 -- i.e. by hlvl_call


--------------------------------------------------------------------------------
/sql/cnv/gistic_prepare.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | GISTIC input built from the GATK segments of gold-set tumors.
 3 | - segments of tumor_barcode_a are tagged 'P', segments of tumor_barcode_b are tagged 'R'
 4 | - chromosome 23 is reported as 'X'; "end" is upper(pos) - 1
 5 | - no per-patient filter is applied here; one sample per patient presumably follows from the gold set itself (confirm)
 6 | */
 7 | WITH seg AS
 8 | (
 9 |     SELECT
10 |         aliquot_barcode,
11 |         (CASE chrom WHEN 23 THEN 'X' ELSE chrom::varchar(2) END) AS chrom,
12 |         lower(pos) AS "start",
13 |         upper(pos) - 1 AS "end",
14 |         num_points AS num_snps,
15 |         log2_copy_ratio
16 |     FROM variants.gatk_seg
17 | )
18 | SELECT seg.*, 'P' AS sample_type
19 | FROM analysis.gold_set gold
20 | INNER JOIN seg ON seg.aliquot_barcode = gold.tumor_barcode_a
21 | 
22 | UNION
23 | 
24 | SELECT seg.*, 'R' AS sample_type
25 | FROM analysis.gold_set gold
26 | INNER JOIN seg ON seg.aliquot_barcode = gold.tumor_barcode_b


--------------------------------------------------------------------------------
/sql/cnv/prop_heterozygous_gatk.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Per tumor pair in analysis.gatk_seg_diff_call (chrom < 23 only): proportion of the segmented genome that changed.
 3 | prop_change = 1 - (summed length of segments with cnv_call = 0) / (summed length of all segments), rounded to 4 decimals
 4 | */
 5 | WITH pair_sizes AS
 6 | (
 7 |     SELECT
 8 |         gs.tumor_pair_barcode,
 9 |         gs.case_barcode,
10 |         gs.tumor_barcode_a,
11 |         gs.tumor_barcode_b,
12 |         sum(upper(pos) - lower(pos) - 1) AS total_len,
13 |         sum(CASE WHEN gs.cnv_call = 0 THEN upper(pos) - lower(pos) - 1 ELSE 0 END) AS unchanged_len
14 |     FROM analysis.gatk_seg_diff_call gs
15 |     WHERE chrom < 23
16 |     GROUP BY gs.tumor_pair_barcode, gs.case_barcode, gs.tumor_barcode_a, gs.tumor_barcode_b
17 | )
18 | SELECT
19 |     tumor_pair_barcode,
20 |     case_barcode,
21 |     tumor_barcode_a,
22 |     tumor_barcode_b,
23 |     round(1.0 - unchanged_len::decimal / total_len, 4) AS prop_change
24 | FROM pair_sizes
25 | ORDER BY case_barcode


--------------------------------------------------------------------------------
/sql/compare_seg_stats.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | For each `tumor_pair_barcode` in the `tumor_pairs` table:
 3 | - Compare the number of segments between (a) and (b)
 4 | - Compare the proportion of the genome that is heterozygous between (a) and (b)
 5 | Deltas are (b) minus (a); rows are sorted by delta_prop_het
 6 | */
 7 | SELECT
 8 |     tumor_pair_barcode,
 9 |     tumor_barcode_a,
10 |     tumor_barcode_b,
11 |     het_a.num_seg AS num_seg_a,
12 |     het_b.num_seg AS num_seg_b,
13 |     het_a.prop_het AS prop_het_a,
14 |     het_b.prop_het AS prop_het_b,
15 |     het_b.num_seg - het_a.num_seg AS delta_num_seg,
16 |     het_b.prop_het - het_a.prop_het AS delta_prop_het
17 | FROM analysis.tumor_pairs pa
18 | -- tumor (a): tumor barcode -> pair barcode -> TITAN segment statistics
19 | LEFT JOIN analysis.pairs pair_a ON pair_a.tumor_barcode = pa.tumor_barcode_a
20 | LEFT JOIN analysis.titan_seg_prop_het het_a ON het_a.pair_barcode = pair_a.pair_barcode
21 | -- tumor (b): same lookup chain
22 | LEFT JOIN analysis.pairs pair_b ON pair_b.tumor_barcode = pa.tumor_barcode_b
23 | LEFT JOIN analysis.titan_seg_prop_het het_b ON het_b.pair_barcode = pair_b.pair_barcode
24 | ORDER BY delta_prop_het


--------------------------------------------------------------------------------
/sql/compute_chr7_10.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Calculate chromosome 7/10 status: for the top-priority longitudinal tumor pair of each case, flag chr7 gain + chr10 loss in each tumor (log2 copy ratio > 0.1 on chr 7 and < -0.1 on chr 10) and label the pair shared / shed / acquired / no
 3 | */
 4 | WITH
 5 | selected_tumor_pairs AS
 6 | (
 7 | 	SELECT
 8 | 		tumor_pair_barcode,
 9 | 		case_barcode,
10 | 		tumor_barcode_a,
11 | 		tumor_barcode_b,
12 | 		row_number() OVER (PARTITION BY case_barcode ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC) AS priority -- 1 = pair with the longest surgical interval within the case
13 | 	FROM analysis.tumor_pairs ps
14 | 	LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
15 | 	LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
16 | 	WHERE
17 | 		comparison_type = 'longitudinal' AND
18 | 		sample_type_b <> 'M1' AND 													-- exclude metastatic samples here because this is outside the scope of our study
19 | 		b1.cnv_exclusion <> 'block' AND b2.cnv_exclusion <> 'block' -- same CNV blocklist column for both tumors (tumor a previously tested coverage_exclusion); pairs without a blocklist row are dropped because <> on NULL is not true
20 | ),
21 | t2 AS
22 | (
23 | 	SELECT
24 | 		*,
25 | 		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '7' AND aliquot_barcode = stp.tumor_barcode_a) AS chr7_logr_a,
26 | 		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '7' AND aliquot_barcode = stp.tumor_barcode_b) AS chr7_logr_b,
27 | 		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '10' AND aliquot_barcode = stp.tumor_barcode_a) AS chr10_logr_a,
28 | 		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '10' AND aliquot_barcode = stp.tumor_barcode_b) AS chr10_logr_b
29 | 	FROM selected_tumor_pairs stp
30 | 	WHERE priority = 1 -- one pair per case
31 | ),
32 | t3 AS
33 | (
34 | 	SELECT
35 | 		*,
36 | 		(CASE WHEN chr7_logr_a > 0.1 AND chr10_logr_a < -0.1 THEN 1 ELSE 0 END)::boolean AS chr7_10_a, -- a missing log ratio falls into ELSE, i.e. FALSE rather than NULL
37 | 		(CASE WHEN chr7_logr_b > 0.1 AND chr10_logr_b < -0.1 THEN 1 ELSE 0 END)::boolean AS chr7_10_b
38 | 	FROM t2
39 | )
40 | SELECT
41 | 	tumor_pair_barcode,
42 | 	case_barcode,
43 | 	tumor_barcode_a,
44 | 	tumor_barcode_b,
45 | 	(CASE WHEN chr7_10_a AND chr7_10_b THEN 'shared'
46 | 		  WHEN chr7_10_a AND NOT chr7_10_b THEN 'shed'
47 | 		  WHEN chr7_10_b AND NOT chr7_10_a THEN 'acquired'
48 | 		  WHEN NOT chr7_10_a AND NOT chr7_10_b THEN 'no' END) AS chr7_10_status
49 | FROM t3


--------------------------------------------------------------------------------
/sql/dndscv/dndscv_input_by_fraction.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with dNdS CV: one row per variants.pgeno entry of a gold-set tumor pair, restricted to pairs with hypermutator_status IS FALSE
 3 | NOTE(review): this header used to say "Remove contiguous sites using EXISTS", but the query contains no EXISTS clause, so no such filtering happens here
 4 | */
 5 | SELECT
 6 |   tp.case_barcode,
 7 |   (CASE WHEN pgeno.chrom = 23 THEN 'X' ELSE pgeno.chrom::varchar(2) END) AS chrom, -- chromosome 23 is written as 'X'
 8 |   lower(pgeno.pos) AS pos, -- lower bound of the pos range
 9 |   pgeno.ref,
10 |   pgeno.alt AS mut,
11 |   st.idh_codel_subtype AS subtype,
12 |     (CASE WHEN mutect2_call_a AND mutect2_call_b     THEN 'S' -- called in both tumor a and tumor b
13 |           WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P' -- called in tumor a only
14 |           WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R' END) AS fraction -- called in tumor b only
15 | FROM variants.pgeno
16 | INNER JOIN analysis.gold_set tp ON tp.tumor_pair_barcode = pgeno.tumor_pair_barcode
17 | LEFT JOIN clinical.subtypes st ON st.case_barcode = pgeno.case_barcode
18 | INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = tp.tumor_pair_barcode
19 | WHERE
20 |     (mutect2_call_a OR mutect2_call_b) AND hypermutator_status IS FALSE
21 |     
22 | -- END --


--------------------------------------------------------------------------------
/sql/dndscv/dndscv_input_by_fraction_hyperm.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with dNdS CV: one row per variants.pgeno entry of a gold-set tumor pair, restricted to pairs with hypermutator_status IS TRUE
 3 | NOTE(review): this header used to say "Remove contiguous sites using EXISTS", but the query contains no EXISTS clause, so no such filtering happens here
 4 | */
 5 | SELECT
 6 |   tp.case_barcode,
 7 |   (CASE WHEN pgeno.chrom = 23 THEN 'X' ELSE pgeno.chrom::varchar(2) END) AS chrom, -- chromosome 23 is written as 'X'
 8 |   lower(pgeno.pos) AS pos, -- lower bound of the pos range
 9 |   pgeno.ref,
10 |   pgeno.alt AS mut,
11 |   st.idh_codel_subtype AS subtype,
12 |     (CASE WHEN mutect2_call_a AND mutect2_call_b     THEN 'S' -- called in both tumor a and tumor b
13 |           WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P' -- called in tumor a only
14 |           WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R' END) AS fraction -- called in tumor b only
15 | FROM variants.pgeno
16 | INNER JOIN analysis.gold_set tp ON tp.tumor_pair_barcode = pgeno.tumor_pair_barcode
17 | LEFT JOIN clinical.subtypes st ON st.case_barcode = pgeno.case_barcode
18 | INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = tp.tumor_pair_barcode
19 | WHERE
20 |     (mutect2_call_a OR mutect2_call_b) AND hypermutator_status IS TRUE
21 |     
22 | -- END --


--------------------------------------------------------------------------------
/sql/dndscv/dndscv_input_by_sample.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with dNdS CV
 3 | NOTE(review): this line used to read "Remove contiguous sites using EXISTS", but the query contains no EXISTS clause; duplicates are removed with SELECT DISTINCT only
 4 | Modified for per sample analysis: one row per distinct passing variant of each gold-set aliquot from pairs with hypermutator_status IS FALSE
 5 | */
 6 | WITH
 7 | selected_aliquots AS
 8 | (
 9 |     SELECT gs.tumor_barcode_a AS aliquot_barcode, 'P' AS sample_type FROM analysis.gold_set gs -- tumor a of each pair is labelled 'P'
10 |     INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = gs.tumor_pair_barcode
11 |     WHERE hypermutator_status IS FALSE
12 |     UNION
13 |     SELECT gs.tumor_barcode_b AS aliquot_barcode, 'R' AS sample_type FROM analysis.gold_set gs -- tumor b of each pair is labelled 'R'
14 |     INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = gs.tumor_pair_barcode
15 |     WHERE hypermutator_status IS FALSE
16 | )
17 | 
18 | SELECT DISTINCT -- remove duplicate entries
19 |     gt.aliquot_barcode,
20 |     (CASE WHEN gt.chrom = 23 THEN 'X' ELSE gt.chrom::varchar(2) END) AS chrom, -- chromosome 23 is written as 'X'
21 |     lower(gt.pos) AS pos,
22 |     pa.ref, -- reference allele comes from the annotation table; NULL if the LEFT JOIN finds no annotation
23 |     gt.alt AS mut,
24 |     sample_type,
25 |     idh_codel_subtype AS subtype
26 | FROM variants.passgeno gt
27 | INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = gt.aliquot_barcode 
28 | LEFT JOIN variants.passanno pa ON pa.variant_id = gt.variant_id
29 | LEFT JOIN clinical.subtypes su ON su.case_barcode = substring(gt.aliquot_barcode from 1 for 12) -- first 12 characters of the aliquot barcode are used as the case barcode
30 | WHERE
31 |     ssm2_pass_call
32 |     
33 | -- END --


--------------------------------------------------------------------------------
/sql/dndscv/dndscv_input_by_sample_hyperm.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with dNdS CV
 3 | NOTE(review): this line used to read "Remove contiguous sites using EXISTS", but the query contains no EXISTS clause; duplicates are removed with SELECT DISTINCT only
 4 | Modified for per sample analysis: one row per distinct passing variant of each gold-set aliquot from pairs with hypermutator_status IS TRUE
 5 | */
 6 | WITH
 7 | selected_aliquots AS
 8 | (
 9 |     SELECT gs.tumor_barcode_a AS aliquot_barcode, 'P' AS sample_type FROM analysis.gold_set gs -- tumor a of each pair is labelled 'P'
10 |     INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = gs.tumor_pair_barcode
11 |     WHERE hypermutator_status IS TRUE
12 |     UNION
13 |     SELECT gs.tumor_barcode_b AS aliquot_barcode, 'R' AS sample_type FROM analysis.gold_set gs -- tumor b of each pair is labelled 'R'
14 |     INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = gs.tumor_pair_barcode
15 |     WHERE hypermutator_status IS TRUE
16 | )
17 | 
18 | SELECT DISTINCT -- remove duplicate entries
19 |     gt.aliquot_barcode,
20 |     (CASE WHEN gt.chrom = 23 THEN 'X' ELSE gt.chrom::varchar(2) END) AS chrom, -- chromosome 23 is written as 'X'
21 |     lower(gt.pos) AS pos,
22 |     pa.ref, -- reference allele comes from the annotation table; NULL if the LEFT JOIN finds no annotation
23 |     gt.alt AS mut,
24 |     sample_type,
25 |     idh_codel_subtype AS subtype
26 | FROM variants.passgeno gt
27 | INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = gt.aliquot_barcode 
28 | LEFT JOIN variants.passanno pa ON pa.variant_id = gt.variant_id
29 | LEFT JOIN clinical.subtypes su ON su.case_barcode = substring(gt.aliquot_barcode from 1 for 12) -- first 12 characters of the aliquot barcode are used as the case barcode
30 | WHERE
31 |     ssm2_pass_call
32 |     
33 | -- END --


--------------------------------------------------------------------------------
/sql/drivers/driver_status.sql:
--------------------------------------------------------------------------------
 1 | -- Driver summary per silver-set tumor pair: arm-, SNV- and CNV-level driver columns side by side,
 2 | -- plus the IDH/codel subtype of the case. Pairs without driver rows are kept (LEFT JOINs).
 3 | SELECT
 4 |     -- pair identity
 5 |     ss.tumor_pair_barcode,
 6 |     ss.case_barcode,
 7 |     ss.tumor_barcode_a,
 8 |     ss.tumor_barcode_b,
 9 |     st.idh_codel_subtype,
10 |     -- driver counts: total, shared, private to a, private to b
11 |     arm_driver_count,
12 |     snv_driver_count,
13 |     cnv_driver_count,
14 |     arm_driver_count_shared,
15 |     snv_driver_count_shared,
16 |     cnv_driver_count_shared,
17 |     arm_driver_count_private_a,
18 |     snv_driver_count_private_a,
19 |     cnv_driver_count_private_a,
20 |     arm_driver_count_private_b,
21 |     snv_driver_count_private_b,
22 |     cnv_driver_count_private_b,
23 |     -- shared drivers and stability
24 |     arm_driver_shared,
25 |     snv_driver_shared,
26 |     cnv_driver_shared,
27 |     arm_driver_stability,
28 |     snv_driver_stability,
29 |     cnv_driver_stability,
30 |     -- per-tumor changes
31 |     arm_driver_change_a,
32 |     snv_driver_change_a,
33 |     cnv_driver_change_a,
34 |     arm_driver_change_b,
35 |     snv_driver_change_b,
36 |     cnv_driver_change_b,
37 |     -- context and evolution
38 |     snv_driver_context_shared,
39 |     cnv_driver_context_shared,
40 |     cnv_driver_context_change,
41 |     snv_driver_context_change,
42 |     snv_driver_evolution
43 | FROM analysis.silver_set ss
44 | LEFT JOIN analysis.driver_status_snv dss ON dss.tumor_pair_barcode = ss.tumor_pair_barcode
45 | LEFT JOIN analysis.driver_status_cnv dsc ON dsc.tumor_pair_barcode = ss.tumor_pair_barcode
46 | LEFT JOIN analysis.driver_status_arm dsa ON dsa.tumor_pair_barcode = ss.tumor_pair_barcode
47 | LEFT JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode


--------------------------------------------------------------------------------
/sql/figures/mutsig_boxplot_fig1.sql:
--------------------------------------------------------------------------------
 1 | -- Mutational-signature scores per gold-set case and fraction.
 2 | -- rnk ranks signatures by rel_score within each (case, fraction); all_fractions_counts counts rows per (case, signature).
 3 | -- Every row is returned as-is; no aggregation over signature/fraction/hypermutator_status is applied here.
 4 | WITH scored AS
 5 | (
 6 |     SELECT
 7 |         gs.case_barcode,
 8 |         idh_codel_subtype,
 9 |         hypermutator_status::integer,
10 |         fraction,
11 |         signature,
12 |         mut_n,
13 |         abs_score,
14 |         rel_score,
15 |         RANK() OVER by_case_fraction AS rnk,
16 |         COUNT(*) OVER by_case_signature AS all_fractions_counts
17 |     FROM analysis.mut_sig_fraction_limited ms
18 |     INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode
19 |     INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = ms.tumor_pair_barcode
20 |     INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode
21 |     WINDOW
22 |         by_case_fraction AS (PARTITION BY gs.case_barcode, fraction ORDER BY rel_score DESC),
23 |         by_case_signature AS (PARTITION BY gs.case_barcode, signature)
24 | )
25 | SELECT *
26 | FROM scored


--------------------------------------------------------------------------------
/sql/figures/mutsig_corr.sql:
--------------------------------------------------------------------------------
1 | SELECT gs.case_barcode, case_age_diagnosis_years AS age, surgical_interval, idh_codel_subtype, hypermutator_status::integer, fraction, signature, mut_n, abs_score, rel_score, RANK() OVER (PARTITION BY gs.case_barcode,fraction ORDER BY rel_score DESC) AS rnk, COUNT(*) OVER (PARTITION BY gs.case_barcode, signature) AS all_fractions_counts
2 | FROM analysis.mut_sig_fraction_limited ms
3 | INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode
4 | INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = ms.tumor_pair_barcode
5 | INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode
6 | INNER JOIN clinical.cases ca ON ca.case_barcode = gs.case_barcode


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_aneuploidy.sql:
--------------------------------------------------------------------------------
 1 | -- Aneuploidy values for both tumors of every gold-set pair, with the case subtype.
 2 | -- qc_fail is 1 when either tumor's blocklist cnv_exclusion differs from 'allow', otherwise 0
 3 | -- (a missing blocklist row gives NULL comparisons and therefore 0).
 4 | SELECT
 5 |     tumor_pair_barcode,
 6 |     tumor_barcode_a,
 7 |     tumor_barcode_b,
 8 |     ss.case_barcode,
 9 |     idh_codel_subtype,
10 |     ane_a.prop_aneuploidy AS aneuploidy_a,
11 |     ane_b.prop_aneuploidy AS aneuploidy_b,
12 |     CAST(ane_a.aneuploidy_amp_score AS integer) AS aneuploidy_amp_score_a,
13 |     CAST(ane_b.aneuploidy_amp_score AS integer) AS aneuploidy_amp_score_b,
14 |     CAST(ane_a.aneuploidy_del_score AS integer) AS aneuploidy_del_score_a,
15 |     CAST(ane_b.aneuploidy_del_score AS integer) AS aneuploidy_del_score_b,
16 |     CAST(ane_a.aneuploidy_score AS integer) AS aneuploidy_score_a,
17 |     CAST(ane_b.aneuploidy_score AS integer) AS aneuploidy_score_b,
18 |     (CASE
19 |         WHEN blk_a.cnv_exclusion <> 'allow' OR blk_b.cnv_exclusion <> 'allow' THEN 1
20 |         ELSE 0
21 |      END) AS qc_fail
22 | FROM analysis.gold_set ss
23 | LEFT JOIN clinical.subtypes su ON su.case_barcode = ss.case_barcode
24 | LEFT JOIN analysis.gatk_aneuploidy ane_a ON ane_a.aliquot_barcode = ss.tumor_barcode_a
25 | LEFT JOIN analysis.gatk_aneuploidy ane_b ON ane_b.aliquot_barcode = ss.tumor_barcode_b
26 | LEFT JOIN analysis.blocklist blk_a ON blk_a.aliquot_barcode = ss.tumor_barcode_a
27 | LEFT JOIN analysis.blocklist blk_b ON blk_b.aliquot_barcode = ss.tumor_barcode_b


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_arm.sql:
--------------------------------------------------------------------------------
 1 | WITH -- arm-level CNV state and change for every (gold-set tumor pair, driver arm) combination
 2 | selected_tumor_pairs AS
 3 | (
 4 | 	SELECT ss.tumor_pair_barcode, ss.tumor_barcode_a, ss.tumor_barcode_b, ss.case_barcode, idh_codel_subtype, (CASE WHEN gs.tumor_pair_barcode IS NULL THEN 'Silver set' ELSE 'Gold set' END) AS gold_set
 5 | 	FROM analysis.gold_set ss
 6 | 	LEFT JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ss.tumor_pair_barcode -- NOTE(review): ss and gs are the same table, so every row finds a match and the label above is always 'Gold set'
 7 | 	INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
 8 | ),
 9 | selected_arms AS
10 | (
11 | 	SELECT chrom,arm,direction FROM ref.arm_drivers_subtype -- direction is compared against -1 and 1 below
12 | ),
13 | cnv_by_pair_arm AS
14 | (
15 | 	SELECT
16 | 		stp.tumor_pair_barcode,
17 | 		stp.case_barcode,
18 | 		stp.idh_codel_subtype,
19 | 		stp.tumor_barcode_a,
20 | 		stp.tumor_barcode_b,
21 | 		sa.chrom,
22 | 		sa.arm,
23 | 		c1.arm_call AS arm_a, -- arm call of tumor a
24 | 		c2.arm_call AS arm_b, -- arm call of tumor b
25 | 		(CASE
26 | 		 WHEN sa.direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) THEN 'del'
27 | 		 WHEN sa.direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) THEN 'amp'
28 | 		 WHEN sa.direction = -1 AND (c1.arm_call = 1 OR c2.arm_call = 1) THEN 'neut'
29 | 		 WHEN sa.direction = 1 AND (c1.arm_call = -1 OR c2.arm_call = -1) THEN 'neut'
30 | 		 WHEN (c1.arm_call = 0 OR c2.arm_call = 0) THEN 'neut'
31 | 		 ELSE NULL
32 | 		 END) cnv_state, -- 'del'/'amp' when either tumor carries the call matching the arm's direction; a call in the opposite direction or a 0 call gives 'neut'; NULL otherwise (e.g. both calls missing)
33 | 		(CASE
34 | 		 WHEN direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND c1.arm_call < c2.arm_call THEN 'P'
35 | 		 WHEN direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S'
36 | 		 WHEN direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND c1.arm_call > c2.arm_call THEN 'R'
37 | 		 WHEN direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND c1.arm_call > c2.arm_call THEN 'P'
38 | 		 WHEN direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S'
39 | 		 WHEN direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND c1.arm_call < c2.arm_call THEN 'R'
40 | 		 WHEN (c1.arm_call = 0 OR c2.arm_call = 0) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S'
41 | 		 WHEN direction = -1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S'
42 | 		 WHEN direction = 1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S'
43 | 		 ELSE NULL
44 | 		 END) cnv_change -- for the driver event: 'P' when tumor a's call is more extreme in the arm's direction than tumor b's, 'R' for the reverse, 'S' when the calls are equal or one is missing; non-event states get 'S' only when the calls agree or one is missing, otherwise NULL
45 | 	FROM selected_tumor_pairs stp
46 | 	CROSS JOIN selected_arms sa -- every pair is combined with every driver arm
47 | 	LEFT JOIN analysis.gatk_cnv_by_arm c1 ON c1.aliquot_barcode = stp.tumor_barcode_a AND c1.arm = sa.arm -- matched on arm only, not chrom; presumably arm labels are unique across chromosomes (TODO confirm)
48 | 	LEFT JOIN analysis.gatk_cnv_by_arm c2 ON c2.aliquot_barcode = stp.tumor_barcode_b AND c2.arm = sa.arm
49 | )
50 | SELECT * FROM cnv_by_pair_arm ORDER BY 1


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_c710.sql:
--------------------------------------------------------------------------------
 1 | -- chr7/chr10 (c710) status per gold-set tumor pair:
 2 | --   'S'  event in both tumors, 'P' only in tumor a, 'R' only in tumor b, 'WT' in neither,
 3 | --   NULL when the c710 flag of either tumor is unknown.
 4 | SELECT
 5 |     tumor_pair_barcode,
 6 |     ss.case_barcode,
 7 |     idh_codel_subtype,
 8 |     (CASE
 9 |         WHEN status_a.c710 IS NULL OR status_b.c710 IS NULL THEN NULL
10 |         WHEN status_a.c710 AND status_b.c710 THEN 'S'
11 |         WHEN status_a.c710 THEN 'P'
12 |         WHEN status_b.c710 THEN 'R'
13 |         ELSE 'WT'
14 |      END) AS c710_status
15 | FROM analysis.gold_set ss
16 | LEFT JOIN clinical.subtypes su ON su.case_barcode = ss.case_barcode
17 | LEFT JOIN analysis.gatk_c710_status status_a ON status_a.aliquot_barcode = ss.tumor_barcode_a
18 | LEFT JOIN analysis.gatk_c710_status status_b ON status_b.aliquot_barcode = ss.tumor_barcode_b


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_clinical.sql:
--------------------------------------------------------------------------------
 1 | -- Clinical annotations per gold-set tumor pair, recoded to 1 / 0 / NULL, plus the case subtype.
 2 | SELECT
 3 |     cp.tumor_pair_barcode,
 4 |     cp.case_barcode,
 5 |     (CASE recurrence_location WHEN 'Distal' THEN 1 WHEN 'Local' THEN 0 END) AS location_distal,
 6 |     -- NOTE(review): the previous form tested IN ('Grade up', 'Grade stable') for the 0 branch, where 'Grade up'
 7 |     -- was unreachable because it is already mapped to 1; possibly another value was intended there. Behaviour is unchanged.
 8 |     (CASE grade_change WHEN 'Grade up' THEN 1 WHEN 'Grade stable' THEN 0 END) AS grade_change,
 9 |     (CASE received_alk WHEN '1' THEN 1 WHEN '0' THEN 0 END) AS received_alk,
10 |     (CASE received_rt WHEN '1' THEN 1 WHEN '0' THEN 0 END) AS received_rt,
11 |     (CASE hypermutator_status WHEN '1' THEN 1 WHEN '0' THEN 0 END) AS is_hypermutator,
12 |     idh_codel_subtype
13 | FROM analysis.tumor_clinical_comparison cp
14 | INNER JOIN analysis.gold_set ss ON ss.tumor_pair_barcode = cp.tumor_pair_barcode
15 | LEFT JOIN clinical.subtypes st ON st.case_barcode = cp.case_barcode


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_drivers.sql:
--------------------------------------------------------------------------------
 1 | -- SNV and CNV driver summary per gold-set tumor pair (counts cast to integer), plus the case subtype.
 2 | -- Pairs without driver rows are kept (LEFT JOINs).
 3 | SELECT
 4 |     -- pair identity
 5 |     ss.tumor_pair_barcode,
 6 |     ss.case_barcode,
 7 |     ss.tumor_barcode_a,
 8 |     ss.tumor_barcode_b,
 9 |     st.idh_codel_subtype,
10 |     -- driver counts: total, shared, private to a, private to b
11 |     snv_driver_count::integer,
12 |     cnv_driver_count::integer,
13 |     snv_driver_count_shared::integer,
14 |     cnv_driver_count_shared::integer,
15 |     snv_driver_count_private_a::integer,
16 |     cnv_driver_count_private_a::integer,
17 |     snv_driver_count_private_b::integer,
18 |     cnv_driver_count_private_b::integer,
19 |     -- shared drivers, stability and per-tumor changes
20 |     snv_driver_shared,
21 |     cnv_driver_shared,
22 |     snv_driver_stability,
23 |     cnv_driver_stability,
24 |     snv_driver_change_a,
25 |     snv_driver_change_b,
26 |     cnv_driver_change_a,
27 |     cnv_driver_change_b,
28 |     -- context and evolution
29 |     snv_driver_context_shared,
30 |     cnv_driver_context_shared,
31 |     cnv_driver_context_change,
32 |     snv_driver_context_change,
33 |     snv_driver_evolution
34 | FROM analysis.gold_set ss
35 | LEFT JOIN analysis.driver_status_snv dss ON dss.tumor_pair_barcode = ss.tumor_pair_barcode
36 | LEFT JOIN analysis.driver_status_cnv dsc ON dsc.tumor_pair_barcode = ss.tumor_pair_barcode
37 | LEFT JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_evolution.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | 	tumor_pair_barcode, ss.case_barcode, idh_codel_subtype, tumor_barcode_a, tumor_barcode_b, 
3 | 	s1.most_probable_classification AS evolution_a,
4 | 	s2.most_probable_classification AS evolution_b,
5 | 	(CASE WHEN s1.most_probable_classification IS NOT NULL AND s2.most_probable_classification IS NOT NULL THEN s1.most_probable_classification || '-' || s2.most_probable_classification END) as evolution_ab
6 | FROM analysis.gold_set ss
7 | LEFT JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
8 | LEFT JOIN analysis.subclonalselection s1 ON s1.aliquot_barcode = ss.tumor_barcode_a
9 | LEFT JOIN analysis.subclonalselection s2 ON s2.aliquot_barcode = ss.tumor_barcode_b


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_mf.sql:
--------------------------------------------------------------------------------
 1 | -- Mutation counts and coverage-adjusted mutation frequencies per gold-set tumor pair.
 2 | -- Private frequencies divide by the tumor's own cumulative coverage, the shared frequency by the
 3 | -- smaller of the two coverages; each is scaled by 1e6 and rounded to 4 decimals.
 4 | SELECT
 5 |     tmc.tumor_pair_barcode,
 6 |     tmc.case_barcode,
 7 |     idh_codel_subtype,
 8 |     tmc.surgical_interval_mo,
 9 |     tmc.count_a,
10 |     tmc.count_b,
11 |     tmc.union_ab,
12 |     tmc.intersection_ab,
13 |     tmc.setdiff_a,
14 |     tmc.setdiff_b,
15 |     freq_a.cumulative_coverage AS cov_a,
16 |     freq_b.cumulative_coverage AS cov_b,
17 |     LEAST(freq_a.cumulative_coverage, freq_b.cumulative_coverage) AS min_cov,
18 |     ROUND(setdiff_a::decimal / freq_a.cumulative_coverage * 1e6, 4) AS mf_private_a,
19 |     ROUND(setdiff_b::decimal / freq_b.cumulative_coverage * 1e6, 4) AS mf_private_b,
20 |     ROUND(intersection_ab::decimal / LEAST(freq_a.cumulative_coverage, freq_b.cumulative_coverage) * 1e6, 4) AS mf_shared,
21 |     freq_a.coverage_adj_mut_freq AS mf_a,
22 |     freq_b.coverage_adj_mut_freq AS mf_b
23 | FROM analysis.tumor_mut_comparison tmc
24 | INNER JOIN analysis.gold_set stp ON stp.tumor_pair_barcode = tmc.tumor_pair_barcode
25 | LEFT JOIN clinical.subtypes su ON su.case_barcode = stp.case_barcode
26 | LEFT JOIN analysis.mut_freq freq_a ON freq_a.aliquot_barcode = tmc.tumor_barcode_a
27 | LEFT JOIN analysis.mut_freq freq_b ON freq_b.aliquot_barcode = tmc.tumor_barcode_b


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_purity.sql:
--------------------------------------------------------------------------------
 1 | SELECT -- purity estimates for both tumors of every gold-set pair: TITAN purity and Sequenza cellularity, plus the case subtype
 2 | 	ss.case_barcode,
 3 | 	tp1.purity AS purity_a,
 4 | 	tp2.purity AS purity_b,
 5 | 	sp1.cellularity AS seqz_purity_a,
 6 | 	sp2.cellularity AS seqz_purity_b, -- fixed: previously read sp1 (tumor a) again, so tumor b's value was never reported
 7 | 	idh_codel_subtype
 8 | FROM analysis.gold_set ss
 9 | LEFT JOIN analysis.pairs p1 ON p1.tumor_barcode = ss.tumor_barcode_a -- p1/tp1/sp1: pair, TITAN and Sequenza rows of tumor a
10 | LEFT JOIN variants.titan_params tp1 ON tp1.pair_barcode = p1.pair_barcode
11 | LEFT JOIN analysis.pairs p2 ON p2.tumor_barcode = ss.tumor_barcode_b -- p2/tp2/sp2: the same for tumor b
12 | LEFT JOIN variants.titan_params tp2 ON tp2.pair_barcode = p2.pair_barcode
13 | LEFT JOIN clinical.subtypes su ON su.case_barcode = ss.case_barcode
14 | LEFT JOIN variants.seqz_params sp1 ON sp1.pair_barcode = p1.pair_barcode
15 | LEFT JOIN variants.seqz_params sp2 ON sp2.pair_barcode = p2.pair_barcode


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_pyclone_clusters.sql:
--------------------------------------------------------------------------------
 1 | WITH -- PyClone cluster summary (size and min/max/mean of "mean") per gold-set case and cluster
 2 | selected_tumor_pairs AS
 3 | (
 4 |     SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype
 5 |     FROM analysis.gold_set ss
 6 |     INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
 7 | ),
 8 | selected_aliquots AS
 9 | (
10 |     SELECT tumor_barcode_a AS aliquot_barcode, idh_codel_subtype, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs -- tumor a of each pair is labelled 'P'
11 |     UNION
12 |     SELECT tumor_barcode_b AS aliquot_barcode, idh_codel_subtype, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs -- tumor b of each pair is labelled 'R'
13 | ),
14 | pyclone_clusters AS
15 | (
16 |     SELECT sa.case_barcode, sa.idh_codel_subtype, pc.cluster_id, COUNT(*) AS num_samples, min(size) as cluster_size, MIN(mean) as min_ccf, MAX(mean) AS max_ccf, sum(mean)/COUNT(mean) AS mean_ccf
17 |     FROM variants.pyclone_cluster pc
18 |     RIGHT JOIN selected_aliquots sa ON sa.aliquot_barcode = pc.aliquot_barcode -- RIGHT JOIN keeps every selected aliquot, even one without PyClone rows
19 |     GROUP BY 1,2,3
20 |     HAVING MIN(mean) > 0.1 OR MAX(mean) > 0.1 OR bool_and(mean IS NULL) -- keep groups where some row has mean > 0.1, plus groups whose mean is NULL on every row
21 |     ORDER BY 5 DESC -- the final SELECT re-sorts by columns 1,2
22 | )
23 | --SELECT DISTINCT case_barcode FROM pyclone_clusters
24 | SELECT * FROM pyclone_clusters ORDER BY 1,2


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_signatures.sql:
--------------------------------------------------------------------------------
 1 | -- Mutational-signature scores per gold-set case and fraction, read from analysis.mut_sig_fraction_limited.
 2 | -- rnk ranks signatures by rel_score (highest first) within each (case, fraction); all ranks are returned.
 3 | -- An earlier variant of this query read analysis.mut_sig_fraction restricted to signatures (1,3,11,15,26).
 4 | WITH ranked AS
 5 | (
 6 |     SELECT
 7 |         gs.case_barcode,
 8 |         idh_codel_subtype,
 9 |         fraction,
10 |         signature,
11 |         mut_n,
12 |         abs_score,
13 |         rel_score,
14 |         RANK() OVER by_case_fraction AS rnk
15 |     FROM analysis.mut_sig_fraction_limited ms
16 |     INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode
17 |     INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode
18 |     WINDOW by_case_fraction AS (PARTITION BY gs.case_barcode, fraction ORDER BY rel_score DESC)
19 | )
20 | SELECT *
21 | FROM ranked


--------------------------------------------------------------------------------
/sql/heatmap/heatmap_time.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Time axis for the heatmap, restricted to gold-set cases.
 3 | Two kinds of rows are stacked into the same four columns:
 4 | - surgery rows: surgery_number (as text) with surgical_interval_mo as time_mo
 5 | - case rows: case_vital_status in the surgery_number column with case_overall_survival_mo as time_mo
 6 | */
 7 | WITH timeline AS
 8 | (
 9 | 	-- surgery rows (only surgeries that have a subtype)
10 | 	SELECT case_barcode, surgery_number::varchar(255), surgical_interval_mo AS time_mo, idh_codel_subtype
11 | 	FROM clinical.surgeries
12 | 	WHERE idh_codel_subtype IS NOT NULL
13 | 
14 | 	UNION
15 | 
16 | 	-- case rows (vital status / overall survival)
17 | 	SELECT cas.case_barcode, case_vital_status::varchar(255) AS surgery_number, case_overall_survival_mo AS time_mo, idh_codel_subtype
18 | 	FROM clinical.cases cas
19 | 	LEFT JOIN clinical.subtypes sub ON cas.case_barcode = sub.case_barcode
20 | )
21 | SELECT
22 | 	timeline.*,
23 | 	case_source
24 | FROM timeline
25 | INNER JOIN analysis.gold_set gold ON timeline.case_barcode = gold.case_barcode
26 | LEFT JOIN clinical.cases src ON timeline.case_barcode = src.case_barcode


--------------------------------------------------------------------------------
/sql/id_multiple_aliquot_driver_change.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Find patients worth a closer look, based on the driver stable/change annotation:
 3 | - cases with many available aliquots (counted per aliquot_analysis_type)
 4 | - cases with a driver change
 5 | Output is sorted by driver_count, highest first.
 6 | */
 7 | WITH aliquot_counts AS
 8 | (
 9 | 	SELECT
10 | 		case_barcode,
11 | 		aliquot_analysis_type,
12 | 		COUNT(*) AS num_aliquots
13 | 	FROM biospecimen.aliquots ali
14 | 	LEFT JOIN biospecimen.samples smp ON ali.sample_barcode = smp.sample_barcode
15 | 	GROUP BY 1, 2
16 | 	ORDER BY num_aliquots DESC
17 | )
18 | SELECT
19 | 	drv.case_barcode,
20 | 	driver_count,
21 | 	driver_status,
22 | 	target,
23 | 	num_aliquots
24 | FROM analysis.driver_status drv
25 | LEFT JOIN aliquot_counts cnt ON drv.case_barcode = cnt.case_barcode
26 | ORDER BY driver_count DESC


--------------------------------------------------------------------------------
/sql/mf_longitudinal_analysis.sql:
--------------------------------------------------------------------------------
 1 | SELECT -- mutation frequency at three time points (birth, initial, recurrence) per silver-set tumor pair
 2 | 	tmc.tumor_pair_barcode,
 3 | 	tmc.case_barcode,
 4 | 	idh_codel_subtype,
 5 | 	received_alkylating_agent,
 6 | 	hypermutator_status,
 7 | 	0 AS time_birth, -- constant origin of the time axis
 8 | 	ca.case_age_diagnosis_years AS time_initial,
 9 | 	ROUND(ca.case_age_diagnosis_years + (tmc.surgical_interval_mo / 12.0),2) AS time_recurrence, -- age at diagnosis plus the surgical interval converted from months to years
10 | 	0 AS mf_birth, -- constant: mutation frequency is set to 0 at time_birth
11 | 	mf1.coverage_adj_mut_freq AS mf_initial, -- mf1 is joined on tumor_barcode_a
12 | 	mf2.coverage_adj_mut_freq AS mf_recurrence -- mf2 is joined on tumor_barcode_b
13 | 	/*tmc.count_a,
14 | 	tmc.count_b,
15 | 	tmc.union_ab,
16 | 	tmc.intersection_ab,
17 | 	tmc.setdiff_a,
18 | 	tmc.setdiff_b,
19 | 	mf1.cumulative_coverage AS cov_a,
20 | 	mf2.cumulative_coverage AS cov_b,
21 | 	LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) AS min_cov,
22 | 	ROUND(setdiff_a::decimal / mf1.cumulative_coverage * 1e6, 4) AS mf_private_a,
23 | 	ROUND(setdiff_b::decimal / mf2.cumulative_coverage * 1e6, 4) AS mf_private_b,
24 | 	ROUND(intersection_ab::decimal / LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) * 1e6, 4) AS mf_shared,*/
25 | FROM analysis.tumor_mut_comparison tmc
26 | INNER JOIN analysis.silver_set stp ON tmc.tumor_pair_barcode = stp.tumor_pair_barcode -- restricts the output to pairs present in analysis.silver_set
27 | LEFT JOIN clinical.clinical_by_tumor_pair ctp ON ctp.tumor_pair_barcode = stp.tumor_pair_barcode
28 | LEFT JOIN analysis.mutation_freq mf1 ON mf1.aliquot_barcode = tmc.tumor_barcode_a 
29 | LEFT JOIN analysis.mutation_freq mf2 ON mf2.aliquot_barcode = tmc.tumor_barcode_b 
30 | LEFT JOIN clinical.subtypes su ON su.case_barcode = stp.case_barcode
31 | LEFT JOIN clinical.cases ca ON ca.case_barcode = stp.case_barcode


--------------------------------------------------------------------------------
/sql/mut_freq/mut_freq.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Compute mutation frequencies for each aliquot_barcode
 3 | - Mutation frequencies is output in mutations per megabase (1e6 basepairs)
 4 | - Only mutations with >= 15x are counted
 5 | - Mutation counts are divided by the number of basepairs with at least 15x coverage
 6 | - NULLIF turns a zero denominator into NULL (avoiding a "division by zero" error) and COALESCE then reports the frequency as 0
 7 | - NOTE(review): no JOIN to the blocklist is present in this query; confirm that aliquots excluded based on fingerprinting or coverage are filtered elsewhere
 8 | 
 9 | Note:
10 | - for the ssm2_count table I counted mutation using greater than (>) --> 14 threshold
11 | - for the coverage table I counted coverage using greater than or equal to (>=) --> 15 threshold
12 | */
13 | 
14 | SELECT 
15 | 	m2.aliquot_barcode,
16 | 	cumulative_coverage,
17 | 	ssm2_call_count AS mutation_count,
18 | 	COALESCE(ROUND(ssm2_call_count::numeric / NULLIF(cumulative_coverage, 0)::numeric * 1e6, 4), 0::numeric) AS coverage_adj_mut_freq
19 | FROM variants.ssm2_count m2
20 | INNER JOIN analysis.coverage cov ON cov.aliquot_barcode = m2.aliquot_barcode
21 | WHERE m2.ad_depth = 14 AND cov.coverage = 15
22 | 
23 | 


--------------------------------------------------------------------------------
/sql/mut_sig/archive/mut_sig_aliquot.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Calculate mutational signatures
 3 | By Aliquot
 4 | */
 5 | WITH selected_aliquots AS -- aliquots whose exclusion flags are both 'allow' and whose sample_type is not NM/NB
 6 | (
 7 | 	SELECT ba.aliquot_barcode
 8 | 	FROM biospecimen.aliquots ba
 9 | 	LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = ba.aliquot_barcode -- presumably supplies the *_exclusion columns; if so the WHERE filter drops aliquots without a blocklist row
10 | 	LEFT JOIN biospecimen.samples bs ON bs.sample_barcode = ba.sample_barcode
11 | 	WHERE fingerprint_exclusion = 'allow' AND coverage_exclusion = 'allow' AND sample_type NOT IN ('NM','NB') --AND ba.aliquot_barcode IN ('GLSS-HK-0002-R1-01D-WGS-S3QETN') --,'GLSS-DK-0008-R1-01D-WXS-DDD4B8','TCGA-06-0190-R1-01D-WGS-P20F5P','TCGA-14-1402-R1-01D-WGS-2EHMQ2')--('GLSS-CU-R008-TP-01D-WXS-0238UJ','GLSS-HK-0004-R1-01D-WGS-RYFPEB')
12 | ),
13 | variant_contexts AS -- every distinct (ref_context, alt) combination listed in ref.signature_proba
14 | (
15 | 	SELECT DISTINCT ref_context AS trinucleotide_context, alt
16 | 	FROM ref.signature_proba sp
17 | ),
18 | variant_contexts_aliquots AS -- cross join: one row per selected aliquot per context
19 | (
20 | 	SELECT *
21 | 	FROM selected_aliquots, variant_contexts
22 | ),
23 | variant_context_counts AS -- observed counts of passing SNP calls per aliquot / context / alt
24 | (	
25 | 	SELECT aliquot_barcode, trinucleotide_context, pa.alt, COUNT(*) AS mut_n
26 | 	FROM variants.passgeno pg
27 | 	INNER JOIN variants.passanno pa ON pa.variant_id = pg.variant_id
28 | 	WHERE ssm2_pass_call IS TRUE AND variant_type = 'SNP' AND ad_alt + ad_ref >= 15 -- only calls with at least 15 reads (ref + alt)
29 | 	GROUP BY 1,2,3
30 | ),
31 | variant_context_counts_aliquots AS -- full aliquot x context grid; contexts without calls get mut_n = 0
32 | (
33 | 	SELECT vca.*, COALESCE(mut_n,0) AS mut_n, SUM(COALESCE(mut_n,0)) OVER (PARTITION BY vca.aliquot_barcode) AS mut_n_total
34 | 	FROM variant_contexts_aliquots vca
35 | 	LEFT JOIN variant_context_counts vcc ON vcc.aliquot_barcode = vca.aliquot_barcode AND vcc.trinucleotide_context = vca.trinucleotide_context AND vcc.alt = vca.alt
36 | ),
37 | ref_context_array AS -- single row: array (ordered by signature) of per-signature proba arrays (ordered by ref_context, alt)
38 | (
39 | 	SELECT array_agg(a ORDER BY signature) AS ref_context_arr
40 | 	FROM (SELECT signature, array_agg(proba ORDER BY ref_context,alt) a FROM ref.signature_proba sp GROUP BY 1) t
41 | ),
42 | context_reconstruction AS -- per aliquot: pass the ordered count vector and the reference array to lsqnonneg
43 | (
44 | 	SELECT aliquot_barcode,ref_context_arr,sum(mut_n) AS mut_n, array_agg(mut_n ORDER BY trinucleotide_context,alt), lsqnonneg(ref_context_arr, array_agg(mut_n ORDER BY trinucleotide_context,alt)) AS mut_sigs -- lsqnonneg is defined outside this file; presumably a non-negative least-squares solver
45 | 	FROM variant_context_counts_aliquots, ref_context_array
46 | 	WHERE mut_n_total > 1 -- skip aliquots with fewer than 2 counted mutations
47 | 	GROUP BY 1,2
48 | )
49 | SELECT aliquot_barcode, generate_series(1,30) AS signature, mut_n, unnest(mut_sigs) AS abs_score, UNNEST(mut_sigs) / (SELECT SUM(s) FROM UNNEST(mut_sigs) s) AS rel_score -- assumes mut_sigs has 30 elements so generate_series(1,30) lines up with unnest (TODO confirm); rel_score = abs_score / sum of all scores
50 | FROM context_reconstruction


--------------------------------------------------------------------------------
/sql/mut_sig/archive/mut_sig_gene.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Calculate mutational signatures
 3 | By Gene
 4 | */
 5 | WITH selected_genes AS -- every distinct gene_symbol found in variants.anno
 6 | (
 7 | 	SELECT DISTINCT gene_symbol
 8 | 	FROM variants.anno
 9 | 	ORDER BY 1
10 | ),
11 | variant_contexts AS -- every distinct (ref_context, alt) combination listed in ref.signature_proba
12 | (
13 | 	SELECT DISTINCT ref_context AS trinucleotide_context, alt
14 | 	FROM ref.signature_proba sp
15 | ),
16 | variant_contexts_genes AS -- cross join: one row per gene per context
17 | (
18 | 	SELECT *
19 | 	FROM selected_genes, variant_contexts
20 | ),
21 | variant_context_counts AS -- observed counts of passing MISSENSE SNP calls per gene / context / alt
22 | (	
23 | 	SELECT gene_symbol, trinucleotide_context, pa.alt, COUNT(*) AS mut_n
24 | 	FROM variants.passgeno pg
25 | 	INNER JOIN variants.passanno pa ON pa.variant_id = pg.variant_id
26 | 	LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = pg.aliquot_barcode -- presumably supplies the *_exclusion columns filtered on in the WHERE clause
27 | 	WHERE ssm2_pass_call IS TRUE AND variant_type = 'SNP' AND ad_alt + ad_ref >= 15 AND fingerprint_exclusion = 'allow' AND coverage_exclusion = 'allow' AND variant_classification = 'MISSENSE'
28 | 	GROUP BY 1,2,3
29 | ),
30 | variant_context_counts_genes AS -- full gene x context grid; contexts without calls get mut_n = 0
31 | (
32 | 	SELECT vca.*, COALESCE(mut_n,0) AS mut_n, SUM(COALESCE(mut_n,0)) OVER (PARTITION BY vca.gene_symbol) AS mut_n_total
33 | 	FROM variant_contexts_genes vca
34 | 	LEFT JOIN variant_context_counts vcc ON vcc.gene_symbol = vca.gene_symbol AND vcc.trinucleotide_context = vca.trinucleotide_context AND vcc.alt = vca.alt
35 | ),
36 | ref_context_array AS -- single row: array (ordered by signature) of per-signature proba arrays (ordered by ref_context, alt)
37 | (
38 | 	SELECT array_agg(a ORDER BY signature) AS ref_context_arr
39 | 	FROM (SELECT signature, array_agg(proba ORDER BY ref_context,alt) a FROM ref.signature_proba sp GROUP BY 1) t
40 | ),
41 | context_reconstruction AS -- per gene: pass the ordered count vector and the reference array to lsqnonneg
42 | (
43 | 	SELECT gene_symbol,ref_context_arr,sum(mut_n) AS mut_n, array_agg(mut_n ORDER BY trinucleotide_context,alt), lsqnonneg(ref_context_arr, array_agg(mut_n ORDER BY trinucleotide_context,alt)) AS mut_sigs -- lsqnonneg is defined outside this file; presumably a non-negative least-squares solver
44 | 	FROM variant_context_counts_genes, ref_context_array
45 | 	WHERE mut_n_total > 9 -- only genes with at least 10 counted mutations
46 | 	GROUP BY 1,2
47 | )
48 | SELECT gene_symbol, generate_series(1,30) AS signature, mut_n, unnest(mut_sigs) AS abs_score, UNNEST(mut_sigs) / (SELECT SUM(s) FROM UNNEST(mut_sigs) s) AS rel_score -- assumes mut_sigs has 30 elements so generate_series(1,30) lines up with unnest (TODO confirm); rel_score = abs_score / sum of all scores
49 | FROM context_reconstruction


--------------------------------------------------------------------------------
/sql/mut_sig/archive/mut_sig_variant_classification.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Calculate mutational signatures
 3 | By Variant Classification
 4 | */
 5 | WITH variant_classifications AS -- every distinct variant_classification in variants.variant_classifications
 6 | (
 7 | 	SELECT DISTINCT variant_classification
 8 | 	FROM variants.variant_classifications
 9 | 	ORDER BY 1
10 | ),
11 | variant_contexts AS -- every distinct (ref_context, alt) combination listed in ref.signature_proba
12 | (
13 | 	SELECT DISTINCT ref_context AS trinucleotide_context, alt
14 | 	FROM ref.signature_proba sp
15 | ),
16 | variant_contexts_classifications AS -- cross join: one row per classification per context
17 | (
18 | 	SELECT *
19 | 	FROM variant_classifications, variant_contexts
20 | ),
21 | variant_context_counts AS -- observed counts of passing SNP calls per classification / context / alt
22 | (	
23 | 	SELECT variant_classification, trinucleotide_context, pa.alt, COUNT(*) AS mut_n
24 | 	FROM variants.passgeno pg
25 | 	INNER JOIN variants.passanno pa ON pa.variant_id = pg.variant_id
26 | 	LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = pg.aliquot_barcode -- presumably supplies the *_exclusion columns filtered on in the WHERE clause
27 | 	WHERE ssm2_pass_call IS TRUE AND variant_type = 'SNP' AND ad_alt + ad_ref >= 15 AND fingerprint_exclusion = 'allow' AND coverage_exclusion = 'allow'
28 | 	GROUP BY 1,2,3
29 | ),
30 | variant_context_counts_aliquots AS -- despite the name, this grid is per variant_classification x context; missing counts become 0
31 | (
32 | 	SELECT vca.*, COALESCE(mut_n,0) AS mut_n, SUM(COALESCE(mut_n,0)) OVER (PARTITION BY vca.variant_classification) AS mut_n_total
33 | 	FROM variant_contexts_classifications vca
34 | 	LEFT JOIN variant_context_counts vcc ON vcc.variant_classification = vca.variant_classification AND vcc.trinucleotide_context = vca.trinucleotide_context AND vcc.alt = vca.alt
35 | ),
36 | ref_context_array AS -- single row: array (ordered by signature) of per-signature proba arrays (ordered by ref_context, alt)
37 | (
38 | 	SELECT array_agg(a ORDER BY signature) AS ref_context_arr
39 | 	FROM (SELECT signature, array_agg(proba ORDER BY ref_context,alt) a FROM ref.signature_proba sp GROUP BY 1) t
40 | ),
41 | context_reconstruction AS -- per classification: pass the ordered count vector and the reference array to lsqnonneg
42 | (
43 | 	SELECT variant_classification,ref_context_arr,sum(mut_n) AS mut_n, array_agg(mut_n ORDER BY trinucleotide_context,alt), lsqnonneg(ref_context_arr, array_agg(mut_n ORDER BY trinucleotide_context,alt)) AS mut_sigs -- lsqnonneg is defined outside this file; presumably a non-negative least-squares solver
44 | 	FROM variant_context_counts_aliquots, ref_context_array
45 | 	WHERE mut_n_total > 1 -- skip classifications with fewer than 2 counted mutations
46 | 	GROUP BY 1,2
47 | )
48 | SELECT variant_classification, generate_series(1,30) AS signature, mut_n, unnest(mut_sigs) AS abs_score, UNNEST(mut_sigs) / (SELECT SUM(s) FROM UNNEST(mut_sigs) s) AS rel_score -- assumes mut_sigs has 30 elements so generate_series(1,30) lines up with unnest (TODO confirm); rel_score = abs_score / sum of all scores
49 | FROM context_reconstruction


--------------------------------------------------------------------------------
/sql/neoag/neoantigen_peptide_counts.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Creates Supplementary Table 6 (?): List of all unique neoantigens in the GLASS cohort and the number of initial/recurrent tumors harboring each one
 3 | A separate R script is used to save the table for publication: R/neoantigens/analysis/SuppTable6_writetotext.r
 4 | 
 5 | Notes:
 6 | - neoag_by_ali de-duplicates the passing neoantigen rows of each aliquot
 7 | - ini_counts / rec_counts use INNER JOINs so that gold-set tumors without any neoantigen do not produce an all-NULL "neoantigen" row
 8 | - The final FULL OUTER JOIN keeps neoantigens found only in initial tumors as well as those found only in recurrent tumors
 9 |   (a LEFT JOIN from ini_counts would silently drop the recurrence-only ones)
10 | - Keys are compared with "=", so a neoantigen with a NULL key column is never matched between the two sides
11 | */
12 | 
13 | WITH neoag_by_ali AS
14 | (
15 | 	SELECT aliquot_barcode, variant_id, gene_name, mutation, pvacseq_protein_position, peptide_length, sub_peptide_position, mt_epitope_seq 
16 | 	FROM analysis.neoantigens_by_aliquot neo
17 | 	WHERE ssm2_pass_call = TRUE
18 | 	GROUP BY aliquot_barcode, variant_id, gene_name, mutation, pvacseq_protein_position, peptide_length, sub_peptide_position, mt_epitope_seq
19 | ),
20 | ini_counts AS
21 | (
22 | 	-- counts over initial tumors (tumor_barcode_a of the gold set)
23 | 	SELECT neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq, COUNT(*) AS total
24 | 	FROM analysis.gold_set gs 
25 | 	INNER JOIN neoag_by_ali neo ON neo.aliquot_barcode = gs.tumor_barcode_a
26 | 	GROUP BY neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq
27 | ),
28 | rec_counts AS
29 | (
30 | 	-- counts over recurrent tumors (tumor_barcode_b of the gold set)
31 | 	SELECT neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq, COUNT(*) AS total
32 | 	FROM analysis.gold_set gs 
33 | 	INNER JOIN neoag_by_ali neo ON neo.aliquot_barcode = gs.tumor_barcode_b
34 | 	GROUP BY neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq
35 | )
36 | SELECT
37 | 	-- take each key from whichever side is present
38 | 	COALESCE(ini.gene_name, rec.gene_name) AS gene_name,
39 | 	COALESCE(ini.pvacseq_protein_position, rec.pvacseq_protein_position) AS pvacseq_protein_position,
40 | 	COALESCE(ini.mutation, rec.mutation) AS mutation,
41 | 	COALESCE(ini.mt_epitope_seq, rec.mt_epitope_seq) AS mt_epitope_seq,
42 | 	COALESCE(ini.total,0) AS initial_total, 
43 | 	COALESCE(rec.total,0) AS recurrent_total, 
44 | 	COALESCE(ini.total,0) + COALESCE(rec.total,0) AS total_tumors
45 | FROM ini_counts ini
46 | FULL OUTER JOIN rec_counts rec ON rec.gene_name = ini.gene_name AND 
47 | 	rec.pvacseq_protein_position = ini.pvacseq_protein_position AND
48 | 	rec.mutation = ini.mutation AND
49 | 	rec.mt_epitope_seq = ini.mt_epitope_seq
50 | ORDER BY total_tumors DESC
39 | 


--------------------------------------------------------------------------------
/sql/neutrality/neutralitytestr-subtype.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with neutralitytestr using SNVs in non-altered regions.
 3 | Also, retain information about clonality AND variant_classification and variant_type.
 4 | */
 5 | 
 6 | WITH t1 AS (SELECT 
 7 |     pg.tumor_pair_barcode,
 8 |     pg.case_barcode,
 9 |     pg.tumor_barcode_a,
10 |     pg.tumor_barcode_b,
11 |     pg.chrom, 
12 |     pg.pos,
13 |     pg.variant_id,
14 |     pg.variant_type,
15 |     pg.variant_classification,
16 |     pg.mutect2_call_a,
17 |     pg.mutect2_call_b, 
18 |     pl1.cellular_prevalence AS cellular_prevalence_a, 
19 |     pl1.variant_allele_frequency AS variant_allele_frequency_a, 
20 |     (CASE WHEN pl1.cellular_prevalence >= 0.5 THEN 'C' WHEN pl1.cellular_prevalence < 0.5 THEN 'S' END) AS clonality_a, -- 'C' at prevalence >= 0.5, 'S' below, NULL when prevalence is NULL
21 |     pl2.cellular_prevalence AS cellular_prevalence_b, 
22 |     pl2.variant_allele_frequency AS variant_allele_frequency_b, 
23 |     (CASE WHEN pl2.cellular_prevalence >= 0.5 THEN 'C' WHEN pl2.cellular_prevalence < 0.5 THEN 'S' END) AS clonality_b, -- same rule applied to tumor b
24 |     (CASE WHEN mutect2_call_a AND mutect2_call_b THEN 'S' 
25 |     WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P' 
26 |     WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R' END) AS fraction -- 'S' = called in both tumors, 'P' = only in a, 'R' = only in b, NULL otherwise
27 | FROM variants.pgeno pg
28 | LEFT JOIN variants.pyclone_loci pl1 ON pl1.variant_id = pg.variant_id AND pl1.aliquot_barcode = pg.tumor_barcode_a 
29 | LEFT JOIN variants.pyclone_loci pl2 ON pl2.variant_id = pg.variant_id AND pl2.aliquot_barcode= pg.tumor_barcode_b
30 | INNER JOIN analysis.gold_set gs ON pg.tumor_pair_barcode = gs.tumor_pair_barcode -- restricts to gold-set tumor pairs
31 | WHERE pl1.cellular_prevalence IS NOT NULL) 
32 | 
33 | SELECT -- t1 columns plus the cnv_call of the segment overlapping the variant in each tumor
34 |     t1.tumor_pair_barcode,
35 |     t1.case_barcode,
36 |     t1.tumor_barcode_a,
37 |     t1.tumor_barcode_b,
38 |     t1.chrom, 
39 |     t1.pos,
40 |     t1.variant_id,
41 |     t1.variant_type,
42 |     t1.variant_classification,
43 |     t1.mutect2_call_a,
44 |     t1.mutect2_call_b,
45 |     t1.cellular_prevalence_a,
46 |     t1.cellular_prevalence_b,
47 |     t1.variant_allele_frequency_a,
48 |     t1.variant_allele_frequency_b,
49 |     t1.clonality_a,
50 |     t1.clonality_b,
51 |     t1.fraction,
52 |     gs1.cnv_call AS cnv_call_a, 
53 |     gs2.cnv_call AS cnv_call_b
54 | FROM t1 
55 | LEFT JOIN variants.gatk_seg gs1 ON gs1.aliquot_barcode = t1.tumor_barcode_a AND gs1.chrom = t1.chrom AND gs1.pos && t1.pos 
56 | LEFT JOIN variants.gatk_seg gs2 ON gs2.aliquot_barcode = t1.tumor_barcode_b AND gs2.chrom = t1.chrom AND gs2.pos && t1.pos 
57 | WHERE t1.fraction IS NOT NULL -- drops variants that got no S/P/R label above
58 | 
59 | -- END -- NOTE(review): no filter on cnv_call or variant_type is applied here; presumably "non-altered regions" / SNVs are selected downstream


--------------------------------------------------------------------------------
/sql/neutrality/original_submission/neutrality-testr-input-mutect2.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with Neutrality Testr (status derived from the Mutect2 calls)
 3 | - Source: analysis.master_genotype_comparison
 4 | - Kept: variants called by Mutect2 in tumor a or b, with >= 30 reads (ref + alt) in both
 5 |   tumors and at least one alt read in either tumor
 6 | - vaf_a / vaf_b / vaf_ab: alt / (alt + ref), rounded to 4 decimals
 7 | - status: 'S' = called in both tumors, 'P' = called only in a, 'R' = called only in b
 8 | */
 9 | SELECT
10 |     gc.case_barcode,
11 |     gc.tumor_barcode_a,
12 |     gc.tumor_barcode_b,
13 |     gc.chrom,
14 |     gc.pos,
15 |     gc.alt,
16 |     gc.ref_count_a,
17 |     gc.ref_count_b,
18 |     gc.alt_count_a,
19 |     gc.alt_count_b,
20 |     gc.ref_count_a + gc.ref_count_b AS ref_count_ab,
21 |     gc.alt_count_a + gc.alt_count_b AS alt_count_ab,
22 |     ROUND(CAST(gc.alt_count_a AS decimal) / (gc.alt_count_a + gc.ref_count_a), 4) AS vaf_a,
23 |     ROUND(CAST(gc.alt_count_b AS decimal) / (gc.alt_count_b + gc.ref_count_b), 4) AS vaf_b,
24 |     ROUND(
25 |         (CAST(gc.alt_count_a AS decimal) + CAST(gc.alt_count_b AS decimal))
26 |         / (gc.alt_count_a + gc.alt_count_b + gc.ref_count_a + gc.ref_count_b),
27 |         4
28 |     ) AS vaf_ab,
29 |     CASE
30 |         WHEN mutect2_call_a AND mutect2_call_b THEN 'S'
31 |         WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P'
32 |         WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R'
33 |     END AS status
34 | FROM analysis.master_genotype_comparison gc
35 | LEFT JOIN analysis.snvs sv
36 |     ON sv.chrom = gc.chrom
37 |     AND sv.pos = gc.pos
38 |     AND sv.alt = gc.alt
39 | WHERE (mutect2_call_a OR mutect2_call_b)
40 |     AND (gc.alt_count_a + gc.ref_count_a) >= 30
41 |     AND (gc.alt_count_b + gc.ref_count_b) >= 30
42 |     AND (gc.alt_count_a > 0 OR gc.alt_count_b > 0)
43 | 
44 | -- END --


--------------------------------------------------------------------------------
/sql/neutrality/original_submission/neutrality_testr_input.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Prepare input for use with Neutrality Testr (status derived from the alt read counts)
 3 | - Source: analysis.master_genotype_comparison
 4 | - Kept: variants called by Mutect2 in tumor a or b, with >= 30 reads (ref + alt) in both
 5 |   tumors and at least one alt read in either tumor
 6 | - vaf_a / vaf_b / vaf_ab: alt / (alt + ref), rounded to 4 decimals
 7 | - status: 'S' = alt reads in both tumors, 'P' = alt reads only in a, 'R' = alt reads only in b
 8 | */
 9 | SELECT
10 |     gc.case_barcode,
11 |     gc.tumor_barcode_a,
12 |     gc.tumor_barcode_b,
13 |     gc.chrom,
14 |     gc.pos,
15 |     gc.alt,
16 |     gc.ref_count_a,
17 |     gc.ref_count_b,
18 |     gc.alt_count_a,
19 |     gc.alt_count_b,
20 |     gc.ref_count_a + gc.ref_count_b AS ref_count_ab,
21 |     gc.alt_count_a + gc.alt_count_b AS alt_count_ab,
22 |     ROUND(CAST(gc.alt_count_a AS decimal) / (gc.alt_count_a + gc.ref_count_a), 4) AS vaf_a,
23 |     ROUND(CAST(gc.alt_count_b AS decimal) / (gc.alt_count_b + gc.ref_count_b), 4) AS vaf_b,
24 |     ROUND(
25 |         (CAST(gc.alt_count_a AS decimal) + CAST(gc.alt_count_b AS decimal))
26 |         / (gc.alt_count_a + gc.alt_count_b + gc.ref_count_a + gc.ref_count_b),
27 |         4
28 |     ) AS vaf_ab,
29 |     CASE
30 |         WHEN gc.alt_count_a > 0 AND gc.alt_count_b > 0 THEN 'S'
31 |         WHEN gc.alt_count_a > 0 AND NOT (gc.alt_count_b > 0) THEN 'P'
32 |         WHEN gc.alt_count_b > 0 AND NOT (gc.alt_count_a > 0) THEN 'R'
33 |     END AS status
34 | FROM analysis.master_genotype_comparison gc
35 | LEFT JOIN analysis.snvs sv
36 |     ON sv.chrom = gc.chrom
37 |     AND sv.pos = gc.pos
38 |     AND sv.alt = gc.alt
39 | WHERE (mutect2_call_a OR mutect2_call_b)
40 |     AND (gc.alt_count_a + gc.ref_count_a) >= 30
41 |     AND (gc.alt_count_b + gc.ref_count_b) >= 30
42 |     AND (gc.alt_count_a > 0 OR gc.alt_count_b > 0)
43 | 
44 | -- END --


--------------------------------------------------------------------------------
/sql/neutrality/original_submission/neutralitytestr-input-aliquot-level.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Aliquot-level input for Neutrality Testr
 3 | - Source: analysis.genotypes
 4 | - Kept: Mutect2-called variants with at least 30 reads (ref + alt)
 5 | - vaf: alt_count / (alt_count + ref_count), rounded to 4 decimals
 6 | */
 7 | SELECT
 8 |     gt.aliquot_barcode,
 9 |     gt.case_barcode,
10 |     gt.chrom::varchar(2),
11 |     gt.pos,
12 |     gt.alt,
13 |     gt.ref_count,
14 |     gt.alt_count,
15 |     ROUND(CAST(gt.alt_count AS decimal) / (gt.alt_count + gt.ref_count), 4) AS vaf
16 | FROM analysis.genotypes gt
17 | WHERE mutect2_call
18 |     AND (gt.alt_count + gt.ref_count) >= 30
19 | -- END -- 
18 | 


--------------------------------------------------------------------------------
/sql/pyclone/pyclone_aliquots.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | List the aliquots (with purity rounded to 2 decimals) of every case that has more than one
 3 | usable tumor aliquot.
 4 | - Usable: fingerprint_exclusion and coverage_exclusion are both 'allow', sample_type is not NB/NM
 5 | - num_samples is counted per sa.case_barcode, the same column that is reported. Partitioning by
 6 |   the LEFT-JOINed su.case_barcode would lump every aliquot without a surgery row into one NULL
 7 |   partition and let single-aliquot cases slip through the num_samples > 1 filter.
 8 | - NOTE(review): num_samples counts joined rows; if an aliquot matches several rows in
 9 |   analysis.pairs / variants.titan_params it is counted (and returned) more than once.
10 | */
11 | WITH selected_aliquots
12 | AS (
13 |   SELECT
14 |     sa.case_barcode,
15 |     al.aliquot_barcode,
16 |     round(tp.purity::numeric, 2) AS purity,
17 |     COUNT(*) OVER (PARTITION BY sa.case_barcode) AS num_samples
18 |   FROM biospecimen.aliquots al
19 |   LEFT JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode
20 |   LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = al.aliquot_barcode
21 |   LEFT JOIN clinical.surgeries su ON al.sample_barcode = su.sample_barcode
22 |   LEFT JOIN analysis.pairs pa ON al.aliquot_barcode = pa.tumor_barcode
23 |   LEFT JOIN variants.titan_params tp ON tp.pair_barcode = pa.pair_barcode
24 |   WHERE 
25 |     bl.fingerprint_exclusion = 'allow' AND
26 |     bl.coverage_exclusion = 'allow' AND
27 |     sa.sample_type NOT IN ('NB','NM')
28 |   ORDER BY su.case_barcode, su.surgery_number, al.aliquot_portion
29 | )
30 | SELECT
31 |   case_barcode,
32 |   aliquot_barcode,
33 |   purity
34 | FROM selected_aliquots
35 | WHERE num_samples > 1


--------------------------------------------------------------------------------
/sql/pyclone/pyclone_cluster_pairs.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | PyClone clusters paired across the two tumors of each gold-set pair.
 3 | - A cluster is matched between tumor a and tumor b by cluster_id
 4 | - Kept: clusters with size > 1 whose mean exceeds 0.1 in at least one of the two tumors
 5 | - rank_a / rank_b: rank of the cluster mean within the case (highest first) for tumor a / b
 6 | */
 7 | WITH
 8 | gold_pairs AS
 9 | (
10 |     SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, gold.case_barcode, idh_codel_subtype
11 |     FROM analysis.gold_set gold
12 |     INNER JOIN clinical.subtypes sub ON sub.case_barcode = gold.case_barcode
13 | ),
14 | paired_clusters AS
15 | (
16 |     SELECT
17 |         gp.case_barcode,
18 |         gp.idh_codel_subtype,
19 |         clu_a.cluster_id,
20 |         clu_a.size AS size,
21 |         clu_a.mean AS ccf_a,
22 |         clu_b.mean AS ccf_b,
23 |         CAST(RANK() OVER (PARTITION BY gp.case_barcode ORDER BY clu_a.mean DESC) AS integer) AS rank_a,
24 |         CAST(RANK() OVER (PARTITION BY gp.case_barcode ORDER BY clu_b.mean DESC) AS integer) AS rank_b
25 |     FROM gold_pairs gp
26 |     INNER JOIN variants.pyclone_cluster clu_a
27 |         ON clu_a.aliquot_barcode = gp.tumor_barcode_a
28 |     INNER JOIN variants.pyclone_cluster clu_b
29 |         ON clu_b.aliquot_barcode = gp.tumor_barcode_b
30 |         AND clu_b.cluster_id = clu_a.cluster_id
31 |     WHERE clu_a.size > 1
32 |         AND (clu_a.mean > 0.1 OR clu_b.mean > 0.1)
33 | )
34 | SELECT * FROM paired_clusters


--------------------------------------------------------------------------------
/sql/pyclone/pyclone_cluster_pairs_anno_drivers.sql:
--------------------------------------------------------------------------------
 1 | WITH
 2 | selected_tumor_pairs AS -- gold-set tumor pairs annotated with the idh_codel_subtype of their case
 3 | (
 4 |     SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype
 5 |     FROM analysis.gold_set ss
 6 |     INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
 7 | ),
 8 | pyclone_clusters AS -- clusters matched by cluster_id between tumor a (pc1) and tumor b (pc2) of each pair
 9 | (
10 |     SELECT stp.case_barcode, stp.idh_codel_subtype, pc1.cluster_id, pc1.size AS size, pc1.mean AS ccf_a, pc2.mean AS ccf_b,
11 |         (RANK() OVER (PARTITION BY stp.case_barcode ORDER BY pc1.mean DESC))::integer AS rank_a, -- rank within the case by the tumor a mean, highest first
12 |         (RANK() OVER (PARTITION BY stp.case_barcode ORDER BY pc2.mean DESC))::integer AS rank_b -- rank within the case by the tumor b mean, highest first
13 |     FROM selected_tumor_pairs stp
14 |     INNER JOIN variants.pyclone_cluster pc1 ON pc1.aliquot_barcode = stp.tumor_barcode_a
15 |     INNER JOIN variants.pyclone_cluster pc2 ON pc2.aliquot_barcode = stp.tumor_barcode_b AND pc2.cluster_id = pc1.cluster_id
16 |     WHERE pc1.size > 1 AND (pc1.mean > 0.1 OR pc2.mean > 0.1) -- clusters of size > 1 whose mean exceeds 0.1 in at least one tumor
17 | ),
18 | selected_genes AS -- annotated variants in ref.driver_genes genes, with gene-specific rules for TERT and IDH1
19 | (
20 |     SELECT DISTINCT sn.gene_symbol, variant_id, chrom, pos, alt, sn.variant_classification, variant_classification_priority, protein_change
21 |     FROM variants.passanno sn
22 |     INNER JOIN ref.driver_genes ds ON ds.gene_symbol = sn.gene_symbol
23 |     LEFT JOIN variants.variant_classifications vc ON sn.variant_classification = vc.variant_classification
24 |     WHERE
25 |         has_mut IS TRUE AND
26 |         ((sn.gene_symbol NOT IN ('TERT','IDH1') AND variant_classification_priority IS NOT NULL) OR -- other genes: any classification that has a priority
27 |         (sn.gene_symbol = 'TERT' AND sn.variant_classification = '5''Flank' AND lower(sn.pos) IN (1295228,1295250)) OR -- TERT: only 5'Flank variants whose pos lower bound is one of two listed positions
28 |         (sn.gene_symbol = 'IDH1' AND sn.protein_change IN ('p.R132C','p.R132G','p.R132H','p.R132S'))) -- IDH1: only the listed R132 protein changes
29 | ),
30 | selected_genes_geno AS -- per case and cluster: comma-separated list of distinct driver gene symbols
31 | (
32 |     SELECT DISTINCT case_barcode, cluster_id, string_agg(DISTINCT gene_symbol, ', ') AS drivers
33 |     FROM selected_genes sg
34 |     INNER JOIN variants.passgeno pg ON pg.variant_id = sg.variant_id
35 |     INNER JOIN variants.pyclone_loci pl ON pl.variant_id = sg.variant_id AND pl.aliquot_barcode = pg.aliquot_barcode
36 |     WHERE ssm2_pass_call IS TRUE
37 |     GROUP BY 1,2
38 | )
39 | --SELECT * FROM selected_genes_geno ORDER BY 3 DESC
40 | SELECT pc.case_barcode, idh_codel_subtype, pc.cluster_id, size, ccf_a, ccf_b, rank_a, rank_b, drivers
41 | FROM pyclone_clusters pc
42 | LEFT JOIN selected_genes_geno sgg ON sgg.case_barcode = pc.case_barcode AND sgg.cluster_id = pc.cluster_id -- drivers is NULL for clusters without a selected driver variant


--------------------------------------------------------------------------------
/sql/pyclone/pyclone_cluster_stats.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Per-case PyClone cluster statistics for gold-set tumors
 3 | - pyclone_clusters keeps clusters whose mean exceeds 0.1 in at least one selected aliquot
 4 | - num_clust: number of such clusters per case
 5 | - num_clonal: clusters whose minimum mean across the selected aliquots is > 0.25
 6 | - any_clonal_drivers / any_subclonal_drivers: 1 when a cluster with min_ccf > 0.25 (resp. <= 0.25)
 7 |   has num_drivers > 0; otherwise 0, or NULL when no cluster of the case has a driver_counts row
 8 | - driver_counts counts pyclone_loci rows (not distinct genes) in ref.driver_genes genes
 9 | 
10 | HAVING has to precede ORDER BY inside pyclone_clusters; the reverse order is a syntax error.
11 | */
12 | WITH
13 | selected_tumor_pairs AS
14 | (
15 |     SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype
16 |     FROM analysis.gold_set ss
17 |     INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
18 | ),
19 | selected_aliquots AS
20 | (
21 |     SELECT tumor_barcode_a AS aliquot_barcode, idh_codel_subtype AS subtype, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs
22 |     UNION
23 |     SELECT tumor_barcode_b AS aliquot_barcode, idh_codel_subtype AS subtype, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs
24 | ),
25 | driver_counts AS
26 | (
27 |     SELECT case_barcode, cluster_id, COUNT(*) AS num_drivers
28 |     FROM variants.passanno pa
29 |     INNER JOIN ref.driver_genes dg ON dg.gene_symbol = pa.gene_symbol
30 |     INNER JOIN variants.pyclone_loci pl ON pl.variant_id = pa.variant_id
31 |     INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = pl.aliquot_barcode
32 |     INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode
33 |     WHERE has_mut IS TRUE AND variant_allele_frequency > 0
34 |     GROUP BY 1,2
35 | ),
36 | pyclone_clusters AS
37 | (
38 |     SELECT sa.case_barcode, pc.cluster_id, COUNT(*) AS num_samples, min(size) as cluster_size, MIN(mean) as min_ccf, MAX(mean) AS max_ccf, sum(mean)/COUNT(mean) AS mean_ccf
39 |     FROM variants.pyclone_cluster pc
40 |     --INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = pc.aliquot_barcode
41 |     INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = pc.aliquot_barcode
42 |     --INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode
43 |     GROUP BY 1,2
44 |     HAVING MIN(mean) > 0.1 OR MAX(mean) > 0.1
45 |     ORDER BY 5 DESC
46 | ),
47 | t2 AS
48 | (
49 |     SELECT t1.case_barcode,
50 |         COUNT(t1.cluster_id) AS num_clust,
51 |         COUNT(CASE WHEN min_ccf > 0.25 THEN 1 END) AS num_clonal,
52 |         bool_or(min_ccf > 0.25 AND num_drivers > 0)::integer AS any_clonal_drivers,
53 |         bool_or(min_ccf <= 0.25 AND num_drivers > 0)::integer AS any_subclonal_drivers
54 |         --COUNT(CASE WHEN min_ccf > 0.25 THEN num_drivers END) AS num_drivers_clonal,
55 |         --COUNT(CASE WHEN min_ccf <= 0.25 THEN num_drivers END) AS num_drivers_subclonal
56 |     FROM pyclone_clusters t1
57 |     LEFT JOIN driver_counts t0 ON t0.case_barcode = t1.case_barcode AND t0.cluster_id = t1.cluster_id
58 |     GROUP BY 1
59 |     ORDER BY 3 DESC
60 | )
61 | SELECT * FROM t2


--------------------------------------------------------------------------------
/sql/pyclone/pyclone_cluster_stats2.sql:
--------------------------------------------------------------------------------
 1 | WITH
 2 | selected_tumor_pairs AS -- gold-set tumor pairs annotated with the idh_codel_subtype of their case
 3 | (
 4 |     SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype
 5 |     FROM analysis.gold_set ss
 6 |     INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
 7 | ),
 8 | selected_aliquots AS -- one row per aliquot: tumor_barcode_a is tagged 'P', tumor_barcode_b is tagged 'R'
 9 | (
10 |     SELECT tumor_barcode_a AS aliquot_barcode, idh_codel_subtype, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs
11 |     UNION
12 |     SELECT tumor_barcode_b AS aliquot_barcode, idh_codel_subtype, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs
13 | ),
14 | driver_counts AS -- per case and cluster: number of pyclone_loci rows in ref.driver_genes genes (rows, not distinct genes)
15 | (
16 |     SELECT case_barcode, cluster_id, COUNT(*) AS num_drivers
17 |     FROM variants.passanno pa
18 |     INNER JOIN ref.driver_genes dg ON dg.gene_symbol = pa.gene_symbol
19 |     INNER JOIN variants.pyclone_loci pl ON pl.variant_id = pa.variant_id
20 |     INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = pl.aliquot_barcode
21 |     INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode
22 |     WHERE has_mut IS TRUE AND variant_allele_frequency > 0
23 |     GROUP BY 1,2    
24 | ),
25 | pyclone_clusters AS -- one row per selected aliquot per cluster (no aggregation), with the case-level driver count attached
26 | (
27 |     SELECT pc.aliquot_barcode, sa.case_barcode, pc.cluster_id, sample_type, idh_codel_subtype, pc.size, pc.mean, pc.std, num_drivers
28 | 	FROM variants.pyclone_cluster pc
29 | 	INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = pc.aliquot_barcode
30 | 	LEFT JOIN driver_counts t0 ON t0.case_barcode = sa.case_barcode AND t0.cluster_id = pc.cluster_id -- num_drivers is NULL for clusters without a driver_counts row
31 | 	WHERE pc.size > 1 -- clusters of size 1 are dropped
32 | )/*,
33 | t2 AS
34 | (
35 |     SELECT t1.case_barcode,
36 |         COUNT(t1.cluster_id) AS num_clust,
37 |         COUNT(CASE WHEN min_ccf > 0.25 THEN 1 END) AS num_clonal,
38 |         bool_or(min_ccf > 0.25 AND num_drivers > 0)::integer AS any_clonal_drivers,
39 |         bool_or(min_ccf <= 0.25 AND num_drivers > 0)::integer AS any_subclonal_drivers
40 |         --COUNT(CASE WHEN min_ccf > 0.25 THEN num_drivers END) AS num_drivers_clonal,
41 |         --COUNT(CASE WHEN min_ccf <= 0.25 THEN num_drivers END) AS num_drivers_subclonal
42 |     FROM pyclone_clusters t1
43 |     LEFT JOIN driver_counts t0 ON t0.case_barcode = t1.case_barcode AND t0.cluster_id = t1.cluster_id
44 |     GROUP BY 1
45 |     ORDER BY 3 DESC
46 | )*/
47 | SELECT * FROM pyclone_clusters -- the per-case aggregation (t2) above is commented out; per-aliquot rows are returned


--------------------------------------------------------------------------------
/sql/pyclone/pyclone_create_tsv.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     PyClone input rows for one case (case barcode bound to the `?` placeholder).
 3 |     Emits one row per aliquot and variant with read counts and copy number,
 4 |     keeping only variants whose row count within the case equals the number of
 5 |     distinct aliquots that passed the filters, and only when that number of
 6 |     aliquots is greater than one.
 7 | */
 8 | WITH
 9 | candidate_loci AS
10 | (
11 |     SELECT
12 |         gt.case_barcode,
13 |         gt.aliquot_barcode,
14 |         gt.variant_id::integer AS mutation_id,
15 |         ad_ref AS ref_counts,
16 |         ad_alt AS var_counts,
17 |         -- normal copy number is 1 only for chromosome 23 in male cases, otherwise 2
18 |         (CASE WHEN case_sex = 'male' AND gt.chrom = 23 THEN 1 ELSE 2 END) AS normal_cn,
19 |         minor_cn,
20 |         major_cn,
21 |         -- number of rows carrying this variant within the case
22 |         COUNT(*) OVER (PARTITION BY gt.case_barcode, gt.variant_id) AS num_aliquots_variants
23 |     FROM variants.passgeno gt
24 |     INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = gt.aliquot_barcode
25 |     INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode
26 |     INNER JOIN analysis.blocklist bl ON bl.aliquot_barcode = al.aliquot_barcode
27 |     INNER JOIN analysis.pairs ps ON ps.tumor_barcode = gt.aliquot_barcode
28 |     INNER JOIN clinical.cases cs ON cs.case_barcode = gt.case_barcode
29 |     -- TITAN segment of the same pair and chromosome whose pos overlaps (&&) the variant pos
30 |     INNER JOIN variants.titan_seg ts
31 |         ON ts.pair_barcode = ps.pair_barcode
32 |         AND ts.chrom = gt.chrom
33 |         AND ts.pos && gt.pos
34 |     WHERE
35 |         gt.case_barcode = ?
36 |         AND sa.sample_type NOT IN ('NB','NM')
37 |         AND bl.fingerprint_exclusion = 'allow'
38 |         AND bl.coverage_exclusion = 'allow'
39 |         -- chromosome 23 rows need a known sex to assign normal_cn
40 |         AND (case_sex IS NOT NULL OR gt.chrom <> 23)
41 |         AND ad_ref + ad_alt >= 30
42 |         AND minor_cn IS NOT NULL
43 |         AND major_cn IS NOT NULL
44 |         AND major_cn > 0
45 | ),
46 | aliquots_per_case AS
47 | (
48 |     SELECT case_barcode, COUNT(DISTINCT aliquot_barcode) AS num_aliquots
49 |     FROM candidate_loci
50 |     GROUP BY case_barcode
51 | ),
52 | shared_loci AS
53 | (
54 |     SELECT
55 |         cl.case_barcode,
56 |         aliquot_barcode,
57 |         mutation_id,
58 |         ref_counts,
59 |         var_counts,
60 |         normal_cn,
61 |         minor_cn,
62 |         major_cn
63 |     FROM candidate_loci cl
64 |     LEFT JOIN aliquots_per_case ac ON cl.case_barcode = ac.case_barcode
65 |     WHERE num_aliquots > 1 AND num_aliquots_variants = num_aliquots
66 | ),
67 | -- per-aliquot row tally; not referenced by the final SELECT
68 | loci_per_aliquot AS
69 | (
70 |     SELECT aliquot_barcode, COUNT(*)
71 |     FROM shared_loci
72 |     GROUP BY aliquot_barcode
73 | )
74 | SELECT * FROM shared_loci-- ORDER BY 1,2 DESC


--------------------------------------------------------------------------------
/sql/roeltable.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Pivot of distinct case counts per case source, project and analysis type,
 3 |     with one output column per sample type (TP, R1, R2, R3, R4), built with
 4 |     crosstab(source_sql, category_sql).
 5 |     crosstab fills the output columns in the order in which the category query
 6 |     returns its rows, so the category query is ordered explicitly (TP, R1, R2,
 7 |     R3, R4) to line up with "Primary" .. "4th Recurrence". Without an ORDER BY
 8 |     that order is unspecified and counts could land under the wrong heading.
 9 |     NOTE(review): the source query is ordered by 2,3,1 rather than by the row
10 |     name alone; this looks intended to keep one output row per
11 |     (source, project, analysis type) - confirm before changing.
12 | */
13 |  SELECT crosstab.case_source_description,
14 |     crosstab.case_project,
15 |     crosstab.aliquot_analysis_type,
16 |     crosstab."Primary",
17 |     crosstab."1st Recurrence",
18 |     crosstab."2nd Recurrence",
19 |     crosstab."3rd Recurrence",
20 |     crosstab."4th Recurrence"
21 |    FROM crosstab('
22 | 	SELECT case_source_description, case_project, aliquot_analysis_type, sample_type, COUNT( DISTINCT ca.case_barcode )
23 | 	FROM biospecimen.aliquots AS al
24 | 	INNER JOIN biospecimen.samples AS sa ON sa.sample_barcode = al.sample_barcode
25 | 	INNER JOIN clinical.cases AS ca ON ca.case_barcode = sa.case_barcode
26 | 	INNER JOIN clinical.case_sources AS cs ON ca.case_source = cs.case_source
27 | 	WHERE sa.sample_type IN (''TP'', ''R1'', ''R2'', ''R3'', ''R4'')
28 | 	GROUP BY case_source_description, case_project, aliquot_analysis_type, sample_type
29 | 	ORDER BY 2,3,1
30 | 	'::text, '
31 | 	SELECT sample_type FROM biospecimen.sample_types WHERE sample_type IN (''TP'', ''R1'', ''R2'', ''R3'', ''R4'')
32 | 	ORDER BY CASE sample_type WHEN ''TP'' THEN 1 WHEN ''R1'' THEN 2 WHEN ''R2'' THEN 3 WHEN ''R3'' THEN 4 WHEN ''R4'' THEN 5 END
33 | 	'::text) crosstab(case_source_description character varying, case_project character(4), aliquot_analysis_type character(3), "Primary" integer, "1st Recurrence" integer, "2nd Recurrence" integer, "3rd Recurrence" integer, "4th Recurrence" integer);


--------------------------------------------------------------------------------
/sql/selected_aliquots.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | Select aliquots
 3 | - Stringent blocklist filtering (diamond set)
 4 | 	* Fingerprinting
 5 | 	* Coverage
 6 | 	* CNV
 7 | - Create sample short names from the case barcode and the aliquot analysis type
 8 | - Drop (case, analysis type) groups with fewer than 2 aliquots
 9 | - Return rows ordered by case, surgery number and aliquot portion
10 |   (the ORDER BY lives in the outer query: an ORDER BY inside a CTE does not
11 |   guarantee the order of the final result)
12 | */
13 | WITH
14 | selected_aliquots AS
15 | (
16 | 	SELECT
17 | 		case_barcode,
18 | 		aliquot_analysis_type,
19 | 		al.aliquot_barcode,
20 | 		ROUND(purity::decimal,2) AS purity,
21 | 		case_barcode || '-' || aliquot_analysis_type AS short_name,
22 | 		-- aliquots passing the filters for this case and analysis type
23 | 		COUNT(*) OVER (PARTITION BY case_barcode, aliquot_analysis_type) AS num_samples,
24 | 		-- carried along only so the outer query can sort on them
25 | 		su.surgery_number,
26 | 		al.aliquot_portion
27 | 	FROM biospecimen.aliquots al
28 | 	LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = al.aliquot_barcode
29 | 	LEFT JOIN clinical.surgeries su ON al.sample_barcode = su.sample_barcode
30 | 	LEFT JOIN analysis.pairs pa ON al.aliquot_barcode = pa.tumor_barcode
31 | 	LEFT JOIN analysis.titan_params tp ON tp.pair_barcode = pa.pair_barcode
32 | 	WHERE
33 | 		bl.fingerprint_exclusion = 'allow' AND
34 | 		bl.coverage_exclusion = 'allow' AND
35 | 		bl.cnv_exclusion = 'allow'
36 | )
37 | SELECT case_barcode, aliquot_analysis_type, aliquot_barcode, purity, short_name, num_samples
38 | FROM selected_aliquots
39 | WHERE num_samples > 1
40 | ORDER BY case_barcode, surgery_number, aliquot_portion


--------------------------------------------------------------------------------
/sql/selected_tumor_pairs_silver.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Top-ranked longitudinal pair per case under the "gold" filters
 3 |     (coverage = allow, CNV = allow or review) EXCEPT the top-ranked pair per
 4 |     case under the "silver" filters (coverage = allow only).
 5 |     Within a case, pairs are ranked by surgical interval (longest first), then
 6 |     portion_a, portion_b and characters 27-29 of the pair barcode.
 7 | */
 8 | WITH
 9 | eligible_pairs AS
10 | (
11 |     -- filters common to both sets, plus a flag for the extra CNV filter
12 |     SELECT
13 |         tumor_pair_barcode,
14 |         case_barcode,
15 |         tumor_barcode_a,
16 |         tumor_barcode_b,
17 |         surgical_interval_mo,
18 |         portion_a,
19 |         portion_b,
20 |         (b1.cnv_exclusion IN ('allow','review') AND b2.cnv_exclusion IN ('allow','review')) AS cnv_pass
21 |     FROM analysis.tumor_pairs ps
22 |     LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
23 |     LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
24 |     WHERE
25 |         comparison_type = 'longitudinal'
26 |         AND sample_type_b <> 'M1'
27 |         AND b1.coverage_exclusion = 'allow'
28 |         AND b2.coverage_exclusion = 'allow'
29 | ),
30 | silver_set AS
31 | (
32 |     SELECT
33 |         tumor_pair_barcode,
34 |         case_barcode,
35 |         tumor_barcode_a,
36 |         tumor_barcode_b,
37 |         row_number() OVER (
38 |             PARTITION BY case_barcode
39 |             ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC
40 |         ) AS priority
41 |     FROM eligible_pairs
42 | ),
43 | gold_set AS
44 | (
45 |     SELECT
46 |         tumor_pair_barcode,
47 |         case_barcode,
48 |         tumor_barcode_a,
49 |         tumor_barcode_b,
50 |         row_number() OVER (
51 |             PARTITION BY case_barcode
52 |             ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC
53 |         ) AS priority
54 |     FROM eligible_pairs
55 |     WHERE cnv_pass
56 | )
57 | SELECT * FROM gold_set WHERE priority = 1
58 | EXCEPT
59 | SELECT * FROM silver_set WHERE priority = 1


--------------------------------------------------------------------------------
/sql/set/gold_set.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 | This is the initial definition but was deprecated out of confusion
 3 | This definition could include cases (patients) that are also in the silver set, but using a different combination of primary and recurrence
 4 | The new definition (not commented, below) instead takes the subset of the silver set
 5 | Note that for GISTIC we instead took the deprecated gold set to define a set of unique primaries and unique recurrences
 6 | ===
 7 | WITH
 8 | selected_tumor_pairs AS
 9 | (
10 |     SELECT
11 |         ps.tumor_pair_barcode,
12 |         ps.case_barcode,
13 |         ps.tumor_barcode_a,
14 |         ps.tumor_barcode_b,
15 |         row_number() OVER (PARTITION BY ps.case_barcode ORDER BY ps.surgical_interval_mo DESC, ps.portion_a, ps.portion_b, ("substring"(ps.tumor_pair_barcode, 27, 3))) AS priority
16 |     FROM analysis.tumor_pairs ps
17 |     LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
18 |     LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
19 |     WHERE ps.comparison_type = 'longitudinal' AND ps.sample_type_b <> 'M1' AND b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 'allow' AND b1.cnv_exclusion != 'block' AND b2.cnv_exclusion != 'block'
20 | )
21 | SELECT
22 |     tumor_pair_barcode,
23 |     case_barcode,
24 |     tumor_barcode_a,
25 |     tumor_barcode_b
26 | FROM selected_tumor_pairs
27 | WHERE selected_tumor_pairs.priority = 1*/
28 | 
29 | -- Gold set: the silver-set pairs in which neither aliquot has cnv_exclusion = 'block'.
30 | -- Only the silver_set columns are returned (ss.*): a bare * would also emit every
31 | -- blocklist column twice under duplicate names, so the result would no longer be a
32 | -- plain subset of the silver set.
33 | SELECT ss.*
34 | FROM analysis.silver_set ss
35 | INNER JOIN analysis.blocklist bl1 ON bl1.aliquot_barcode = ss.tumor_barcode_a
36 | INNER JOIN analysis.blocklist bl2 ON bl2.aliquot_barcode = ss.tumor_barcode_b
37 | WHERE bl1.cnv_exclusion != 'block' AND bl2.cnv_exclusion != 'block'


--------------------------------------------------------------------------------
/sql/set/silver_set.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Silver set: for each case, the single longitudinal tumor pair ranked first
 3 |     among pairs whose two aliquots both have coverage_exclusion = 'allow'.
 4 |     Pairs with sample_type_b = 'M1' are left out.
 5 |     Ranking: surgical interval (longest first), portion_a, portion_b, then
 6 |     characters 27-29 of the pair barcode.
 7 | */
 8 | WITH
 9 | ranked_pairs AS
10 | (
11 |     SELECT
12 |         tp.tumor_pair_barcode,
13 |         tp.case_barcode,
14 |         tp.tumor_barcode_a,
15 |         tp.tumor_barcode_b,
16 |         row_number() OVER (
17 |             PARTITION BY tp.case_barcode
18 |             ORDER BY
19 |                 tp.surgical_interval_mo DESC,
20 |                 tp.portion_a,
21 |                 tp.portion_b,
22 |                 substring(tp.tumor_pair_barcode from 27 for 3)
23 |         ) AS priority
24 |     FROM analysis.tumor_pairs tp
25 |     LEFT JOIN analysis.blocklist bl_a ON bl_a.aliquot_barcode = tp.tumor_barcode_a
26 |     LEFT JOIN analysis.blocklist bl_b ON bl_b.aliquot_barcode = tp.tumor_barcode_b
27 |     WHERE
28 |         tp.comparison_type = 'longitudinal'
29 |         AND tp.sample_type_b <> 'M1'
30 |         AND bl_a.coverage_exclusion = 'allow'
31 |         AND bl_b.coverage_exclusion = 'allow'
32 | )
33 | SELECT
34 |     tumor_pair_barcode,
35 |     case_barcode,
36 |     tumor_barcode_a,
37 |     tumor_barcode_b
38 | FROM ranked_pairs
39 | WHERE priority = 1


--------------------------------------------------------------------------------
/sql/snv/tumor_mut_comparison.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Mutation overlap statistics for every row of analysis.tumor_pairs.
 3 |     A genotype row of variants.passgeno qualifies when ad_ref + ad_alt > 14 and
 4 |     ssm2_pass_call IS TRUE. Each statistic is a correlated scalar subquery:
 5 |       count_a / count_b   qualifying rows in tumor A / tumor B
 6 |       union_ab            variant_ids qualifying in A or B
 7 |       intersection_ab     variant_ids qualifying in both
 8 |       setdiff_a           variant_ids qualifying in A but not in B
 9 |       setdiff_b           variant_ids qualifying in B but not in A
10 | */
11 | SELECT
12 |     tp.tumor_pair_barcode,
13 |     tp.case_barcode,
14 |     tp.tumor_barcode_a,
15 |     tp.tumor_barcode_b,
16 |     tp.sample_type_a,
17 |     tp.sample_type_b,
18 |     tp.portion_a,
19 |     tp.portion_b,
20 |     tp.comparison_type,
21 |     tp.surgical_interval_mo,
22 |     (
23 |         SELECT COUNT(*)
24 |         FROM variants.passgeno g
25 |         WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
26 |     ) AS count_a,
27 |     (
28 |         SELECT COUNT(*)
29 |         FROM variants.passgeno g
30 |         WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
31 |     ) AS count_b,
32 |     (
33 |         SELECT COUNT(*)
34 |         FROM (
35 |             SELECT variant_id FROM variants.passgeno g
36 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
37 |             UNION
38 |             SELECT variant_id FROM variants.passgeno g
39 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
40 |         ) either_tumor
41 |     ) AS union_ab,
42 |     (
43 |         SELECT COUNT(*)
44 |         FROM (
45 |             SELECT variant_id FROM variants.passgeno g
46 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
47 |             INTERSECT
48 |             SELECT variant_id FROM variants.passgeno g
49 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
50 |         ) both_tumors
51 |     ) AS intersection_ab,
52 |     (
53 |         SELECT COUNT(*)
54 |         FROM (
55 |             SELECT variant_id FROM variants.passgeno g
56 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
57 |             EXCEPT
58 |             SELECT variant_id FROM variants.passgeno g
59 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
60 |         ) only_a
61 |     ) AS setdiff_a,
62 |     (
63 |         SELECT COUNT(*)
64 |         FROM (
65 |             SELECT variant_id FROM variants.passgeno g
66 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
67 |             EXCEPT
68 |             SELECT variant_id FROM variants.passgeno g
69 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.ad_ref + g.ad_alt > 14 AND ssm2_pass_call IS TRUE
70 |         ) only_b
71 |     ) AS setdiff_b
72 | FROM analysis.tumor_pairs tp


--------------------------------------------------------------------------------
/sql/snv/tumor_mut_comparison_anno.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     analysis.tumor_mut_comparison restricted to silver-set pairs and annotated
 3 |     with subtype, clinical comparison fields, per-aliquot mutation frequency
 4 |     and coverage.
 5 |       time_recurrence  age at diagnosis + surgical interval (months / 12), 2 decimals
 6 |       mf_private_a/b   private variant count per 1e6 of that aliquot's cumulative coverage
 7 |       mf_shared        shared variant count per 1e6 of the smaller cumulative coverage
 8 | */
 9 | SELECT
10 |     tmc.tumor_pair_barcode,
11 |     tmc.case_barcode,
12 |     tmc.tumor_barcode_a,
13 |     tmc.tumor_barcode_b,
14 |     idh_codel_subtype,
15 |     received_alk,
16 |     hypermutator_status,
17 |     -- time axis: birth, initial tumor, recurrence
18 |     0 AS time_birth,
19 |     ca.case_age_diagnosis_years AS time_initial,
20 |     ROUND(ca.case_age_diagnosis_years + (tmc.surgical_interval_mo / 12.0),2) AS time_recurrence,
21 |     -- mutation frequency at the same three time points
22 |     0 AS mf_birth,
23 |     mf_a.coverage_adj_mut_freq AS mf_initial,
24 |     mf_b.coverage_adj_mut_freq AS mf_recurrence,
25 |     -- raw overlap counts
26 |     tmc.count_a,
27 |     tmc.count_b,
28 |     tmc.union_ab,
29 |     tmc.intersection_ab,
30 |     tmc.setdiff_a,
31 |     tmc.setdiff_b,
32 |     -- coverage and coverage-scaled frequencies
33 |     mf_a.cumulative_coverage AS cov_a,
34 |     mf_b.cumulative_coverage AS cov_b,
35 |     LEAST(mf_a.cumulative_coverage, mf_b.cumulative_coverage) AS min_cov,
36 |     ROUND(setdiff_a::decimal / mf_a.cumulative_coverage * 1e6, 4) AS mf_private_a,
37 |     ROUND(setdiff_b::decimal / mf_b.cumulative_coverage * 1e6, 4) AS mf_private_b,
38 |     ROUND(intersection_ab::decimal / LEAST(mf_a.cumulative_coverage, mf_b.cumulative_coverage) * 1e6, 4) AS mf_shared
39 | FROM analysis.tumor_mut_comparison tmc
40 | INNER JOIN analysis.silver_set ss
41 |     ON tmc.tumor_pair_barcode = ss.tumor_pair_barcode
42 | LEFT JOIN analysis.tumor_clinical_comparison tcc
43 |     ON tcc.tumor_pair_barcode = ss.tumor_pair_barcode
44 | LEFT JOIN analysis.mut_freq mf_a
45 |     ON mf_a.aliquot_barcode = tmc.tumor_barcode_a
46 | LEFT JOIN analysis.mut_freq mf_b
47 |     ON mf_b.aliquot_barcode = tmc.tumor_barcode_b
48 | LEFT JOIN clinical.subtypes st
49 |     ON st.case_barcode = ss.case_barcode
50 | LEFT JOIN clinical.cases ca
51 |     ON ca.case_barcode = ss.case_barcode


--------------------------------------------------------------------------------
/sql/tel.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Telomere length and aneuploidy per tumor pair of
 3 |     analysis.tumor_mut_comparison_anno.
 4 |       p_len / r_len  telseq length of tumor A / tumor B
 5 |       n_len          telseq length of the normal aliquot paired with tumor A
 6 |       pr_len_ratio   p_len divided by r_len
 7 |     Aneuploidy columns come from analysis.gatk_aneuploidy, once per tumor.
 8 | */
 9 | SELECT
10 |     tmc.case_barcode,
11 |     tumor_barcode_a,
12 |     tumor_barcode_b,
13 |     aliquot_analysis_type,
14 |     case_source,
15 |     idh_codel_subtype,
16 |     received_alk,
17 |     hypermutator_status,
18 |     tel_a.length AS p_len,
19 |     tel_b.length AS r_len,
20 |     tel_n.length AS n_len,
21 |     tel_a.length / tel_b.length AS pr_len_ratio,
22 |     an_a.prop_aneuploidy AS aneuploidy_a,
23 |     an_b.prop_aneuploidy AS aneuploidy_b,
24 |     an_a.aneuploidy_amp_score::integer AS aneuploidy_amp_score_a,
25 |     an_b.aneuploidy_amp_score::integer AS aneuploidy_amp_score_b,
26 |     an_a.aneuploidy_del_score::integer AS aneuploidy_del_score_a,
27 |     an_b.aneuploidy_del_score::integer AS aneuploidy_del_score_b,
28 |     an_a.aneuploidy_score::integer AS aneuploidy_score_a,
29 |     an_b.aneuploidy_score::integer AS aneuploidy_score_b
30 | FROM analysis.tumor_mut_comparison_anno tmc
31 | LEFT JOIN analysis.gatk_aneuploidy an_a ON an_a.aliquot_barcode = tmc.tumor_barcode_a
32 | LEFT JOIN analysis.gatk_aneuploidy an_b ON an_b.aliquot_barcode = tmc.tumor_barcode_b
33 | -- the pair of tumor A supplies the matched normal barcode
34 | LEFT JOIN analysis.pairs pr_a ON pr_a.tumor_barcode = tmc.tumor_barcode_a
35 | -- aliquot -> sample -> case chain, all keyed on tumor A
36 | LEFT JOIN biospecimen.aliquots al_a ON al_a.aliquot_barcode = tmc.tumor_barcode_a
37 | LEFT JOIN biospecimen.samples sm_a ON sm_a.sample_barcode = al_a.sample_barcode
38 | LEFT JOIN clinical.cases cs_a ON cs_a.case_barcode = sm_a.case_barcode
39 | LEFT JOIN analysis.telseq tel_n ON tel_n.aliquot_barcode = pr_a.normal_barcode
40 | LEFT JOIN analysis.telseq tel_a ON tel_a.aliquot_barcode = tmc.tumor_barcode_a
41 | LEFT JOIN analysis.telseq tel_b ON tel_b.aliquot_barcode = tmc.tumor_barcode_b


--------------------------------------------------------------------------------
/sql/test.sql:
--------------------------------------------------------------------------------
1 | hey
2 | 


--------------------------------------------------------------------------------
/sql/timing/ccf_shared.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Per-variant cellular prevalence in both tumors of each gold-set pair, for
 3 |     variants called by Mutect2 in both tumors that have a PyClone cellular
 4 |     prevalence in tumor A and a non-NULL variant classification priority.
 5 |     clonality_a / clonality_b: 'C' when cellular prevalence >= 0.5, 'S' when it is
 6 |     in [0.1, 0.5), 'ND' otherwise (including when it is NULL).
 7 |     rank: position of the variant within its pair and gene, ordered by
 8 |     classification priority and then by summed cellular prevalence, highest first.
 9 | */
10 | SELECT 
11 | 		pg.tumor_pair_barcode,
12 | 		pg.case_barcode,
13 | 		st.idh_codel_subtype,
14 | 		pg.tumor_barcode_a,
15 | 		pg.tumor_barcode_b,
16 | 		hypermutator_status,
17 | 		pg.chrom, 
18 | 		pg.pos,
19 | 		pg.variant_id,
20 | 		pg.gene_symbol,
21 | 		pg.variant_classification,
22 | 		vc.variant_effect,
23 | 		vc.variant_classification_vep,
24 | 		pl1.cellular_prevalence AS cellular_prevalence_a, 
25 | 		pl1.variant_allele_frequency AS variant_allele_frequency_a, 
26 | 		(CASE WHEN pl1.cellular_prevalence >= 0.5 THEN 'C' WHEN pl1.cellular_prevalence >= 0.1 AND pl1.cellular_prevalence < 0.5 THEN 'S' ELSE 'ND' END) AS clonality_a,
27 | 		pl2.cellular_prevalence AS cellular_prevalence_b, 
28 | 		pl2.variant_allele_frequency AS variant_allele_frequency_b, 
29 | 		(CASE WHEN pl2.cellular_prevalence >= 0.5 THEN 'C' WHEN pl2.cellular_prevalence >= 0.1 AND pl2.cellular_prevalence < 0.5 THEN 'S' ELSE 'ND' END) AS clonality_b,
30 | 		-- pl2 is LEFT JOINed, so the sum can be NULL; DESC puts NULLs first by default,
31 | 		-- which would rank such a variant ahead of every measured one - hence NULLS LAST
32 | 		rank() OVER (PARTITION BY pg.tumor_pair_barcode, pg.gene_symbol ORDER BY variant_classification_priority, pl1.cellular_prevalence + pl2.cellular_prevalence DESC NULLS LAST)
33 | 	FROM variants.pgeno pg
34 | 	LEFT JOIN variants.pyclone_loci pl1 ON pl1.variant_id = pg.variant_id AND pl1.aliquot_barcode = pg.tumor_barcode_a 
35 | 	LEFT JOIN variants.pyclone_loci pl2 ON pl2.variant_id = pg.variant_id AND pl2.aliquot_barcode= pg.tumor_barcode_b
36 | 	LEFT JOIN variants.variant_classifications vc ON vc.variant_classification = pg.variant_classification
37 | 	INNER JOIN analysis.gold_set ss ON pg.tumor_pair_barcode = ss.tumor_pair_barcode
38 | 	INNER JOIN analysis.tumor_mut_comparison_anno tmc ON tmc.tumor_pair_barcode = ss.tumor_pair_barcode
39 | 	INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode
40 | 	WHERE pl1.cellular_prevalence IS NOT NULL AND mutect2_call_a AND mutect2_call_b AND variant_classification_priority IS NOT NULL


--------------------------------------------------------------------------------
/sql/timing/timing_cnv.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Mean cellular prevalence of copy number calls in driver genes, grouped by
 3 |     gene, IDH/codel subtype, sample type ('P' = tumor A, 'R' = tumor B of a
 4 |     silver-set pair) and hlvl_call.
 5 |       num_cp   number of non-NULL cellular prevalence values in the group
 6 |       mean_cp  their sum divided by num_cp
 7 | */
 8 | WITH
 9 | selected_aliquots AS
10 | (
11 |     -- UNION (not UNION ALL): identical rows are collapsed
12 |     SELECT tumor_barcode_a AS aliquot_barcode, case_barcode, 'P' AS sample_type
13 |     FROM analysis.silver_set
14 |     UNION
15 |     SELECT tumor_barcode_b AS aliquot_barcode, case_barcode, 'R' AS sample_type
16 |     FROM analysis.silver_set
17 | ),
18 | cnv_timing AS
19 | (
20 |     SELECT
21 |         gc.gene_symbol,
22 |         idh_codel_subtype,
23 |         sample_type,
24 |         hlvl_call,
25 |         COUNT(cellular_prevalence) AS num_cp,
26 |         SUM(cellular_prevalence)/COUNT(cellular_prevalence) AS mean_cp
27 |     FROM analysis.gatk_cnv_by_gene gc
28 |     INNER JOIN ref.driver_genes dg ON dg.gene_symbol = gc.gene_symbol
29 |     INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = gc.aliquot_barcode
30 |     INNER JOIN clinical.subtypes st ON st.case_barcode = sa.case_barcode
31 |     GROUP BY gc.gene_symbol, idh_codel_subtype, sample_type, hlvl_call
32 | )
33 | SELECT * FROM cnv_timing


--------------------------------------------------------------------------------
/sql/timing/timing_pairs.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     All pairs of driver events within each IDH/codel subtype, as text labels:
 3 |       "<gene> mut"       SNV drivers (ref.snv_drivers_subtype)
 4 |       "<gene> del|amp"   gene-level CNV drivers, direction -2 / 2 (ref.cnv_drivers_subtype)
 5 |       "<arm> del|amp"    arm-level drivers, direction -1 / 1 (ref.arm_drivers_subtype)
 6 |     Same-kind pairs are emitted once (first name sorts before the second);
 7 |     arm x gene-CNV pairs are the full cross product within a subtype.
 8 |     A direction outside the listed values yields a NULL label, because the
 9 |     CASE has no matching branch and NULL propagates through ||.
10 | */
11 | WITH
12 | snv_pairs AS
13 | (
14 |     SELECT
15 |         a.gene_symbol || ' mut' AS evnt_a,
16 |         b.gene_symbol || ' mut' AS evnt_b,
17 |         a.idh_codel_subtype AS idh_codel_subtype
18 |     FROM ref.snv_drivers_subtype a
19 |     INNER JOIN ref.snv_drivers_subtype b ON a.idh_codel_subtype = b.idh_codel_subtype
20 |     WHERE a.gene_symbol < b.gene_symbol
21 | ),
22 | cnv_pairs AS
23 | (
24 |     SELECT
25 |         a.gene_symbol || (CASE a.direction WHEN -2 THEN ' del' WHEN 2 THEN ' amp' END) AS evnt_a,
26 |         b.gene_symbol || (CASE b.direction WHEN -2 THEN ' del' WHEN 2 THEN ' amp' END) AS evnt_b,
27 |         a.idh_codel_subtype AS idh_codel_subtype
28 |     FROM ref.cnv_drivers_subtype a
29 |     INNER JOIN ref.cnv_drivers_subtype b ON a.idh_codel_subtype = b.idh_codel_subtype
30 |     WHERE a.gene_symbol < b.gene_symbol
31 | ),
32 | arm_pairs AS
33 | (
34 |     SELECT
35 |         a.arm || (CASE a.direction WHEN -1 THEN ' del' WHEN 1 THEN ' amp' END) AS evnt_a,
36 |         b.arm || (CASE b.direction WHEN -1 THEN ' del' WHEN 1 THEN ' amp' END) AS evnt_b,
37 |         a.idh_codel_subtype AS idh_codel_subtype
38 |     FROM ref.arm_drivers_subtype a
39 |     INNER JOIN ref.arm_drivers_subtype b ON a.idh_codel_subtype = b.idh_codel_subtype
40 |     WHERE a.arm < b.arm
41 | ),
42 | arm_cnv_pairs AS
43 | (
44 |     -- arm event x gene-level CNV event; no ordering filter between the two
45 |     SELECT
46 |         a.arm || (CASE a.direction WHEN -1 THEN ' del' WHEN 1 THEN ' amp' END) AS evnt_a,
47 |         g.gene_symbol || (CASE g.direction WHEN -2 THEN ' del' WHEN 2 THEN ' amp' END) AS evnt_b,
48 |         a.idh_codel_subtype AS idh_codel_subtype
49 |     FROM ref.arm_drivers_subtype a
50 |     INNER JOIN ref.cnv_drivers_subtype g ON a.idh_codel_subtype = g.idh_codel_subtype
51 | ),
52 | pairs AS
53 | (
54 |     SELECT evnt_a, evnt_b, idh_codel_subtype FROM snv_pairs
55 |     UNION
56 |     SELECT evnt_a, evnt_b, idh_codel_subtype FROM cnv_pairs
57 |     UNION
58 |     SELECT evnt_a, evnt_b, idh_codel_subtype FROM arm_pairs
59 |     UNION
60 |     SELECT evnt_a, evnt_b, idh_codel_subtype FROM arm_cnv_pairs
61 | )
62 | SELECT * FROM pairs
63 | ORDER BY idh_codel_subtype, evnt_a, evnt_b


--------------------------------------------------------------------------------
/sql/timing/timing_snv.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Average within-aliquot rank of driver-gene mutations, by gene, IDH/codel
 3 |     subtype and sample type ('P' = tumor A, 'R' = tumor B of a silver-set pair).
 4 |     Within each aliquot the selected mutations are ranked by cellular prevalence
 5 |     (highest first); the final query returns, per group, the sum of ranks divided
 6 |     by their count, and the count.
 7 | */
 8 | WITH
 9 | selected_aliquots AS
10 | (
11 |     SELECT tumor_barcode_a AS aliquot_barcode, case_barcode, 'P' AS sample_type
12 |     FROM analysis.silver_set
13 |     UNION
14 |     SELECT tumor_barcode_b AS aliquot_barcode, case_barcode, 'R' AS sample_type
15 |     FROM analysis.silver_set
16 | ),
17 | selected_genes AS
18 | (
19 |     SELECT DISTINCT
20 |         sn.gene_symbol,
21 |         ensembl_gene_id,
22 |         variant_id,
23 |         chrom,
24 |         pos,
25 |         alt,
26 |         sn.variant_classification,
27 |         variant_classification_priority,
28 |         protein_change
29 |     FROM variants.passanno sn
30 |     INNER JOIN ref.driver_genes ds ON ds.gene_symbol = sn.gene_symbol
31 |     INNER JOIN ref.ensembl_gene_mapping gm ON gm.gene_symbol = sn.gene_symbol
32 |     LEFT JOIN variants.variant_classifications vc ON sn.variant_classification = vc.variant_classification
33 |     WHERE
34 |         has_mut IS TRUE
35 |         AND (
36 |             -- generic rule: any variant with a classification priority
37 |             (sn.gene_symbol NOT IN ('TERT','IDH1') AND variant_classification_priority IS NOT NULL)
38 |             -- TERT: 5' flank variants at the two listed positions only
39 |             OR (sn.gene_symbol = 'TERT' AND sn.variant_classification = 'FIVE_PRIME_FLANK' AND lower(sn.pos) IN (1295228,1295250))
40 |             -- IDH1: the listed R132 protein changes only
41 |             OR (sn.gene_symbol = 'IDH1' AND sn.protein_change IN ('p.R132C','p.R132G','p.R132H','p.R132S'))
42 |         )
43 | ),
44 | timing_snv AS
45 | (
46 |     SELECT
47 |         pl.aliquot_barcode,
48 |         idh_codel_subtype,
49 |         sample_type,
50 |         gene_symbol,
51 |         variant_classification,
52 |         protein_change,
53 |         cellular_prevalence,
54 |         titan_ccf,
55 |         pyclone_ccf,
56 |         rank() OVER (PARTITION BY pl.aliquot_barcode ORDER BY cellular_prevalence DESC) AS mut_order
57 |     FROM variants.pyclone_loci pl
58 |     INNER JOIN selected_genes sg ON sg.variant_id = pl.variant_id
59 |     INNER JOIN selected_aliquots sq ON sq.aliquot_barcode = pl.aliquot_barcode
60 |     INNER JOIN variants.passgeno pg ON pg.variant_id = pl.variant_id AND pg.aliquot_barcode = pl.aliquot_barcode
61 |     INNER JOIN clinical.subtypes st ON st.case_barcode = pg.case_barcode
62 |     WHERE ssm2_pass_call
63 | )
64 | SELECT
65 |     gene_symbol,
66 |     idh_codel_subtype,
67 |     sample_type,
68 |     SUM(mut_order)/COUNT(mut_order),
69 |     COUNT(mut_order)
70 | FROM timing_snv
71 | GROUP BY 1,2,3
72 | ORDER BY 2,3,4
73 | -- Alternative summary by cellular prevalence (disabled):
74 | --SELECT gene_symbol, idh_codel_subtype, sample_type, COUNT(cellular_prevalence) AS num_mut, SUM(cellular_prevalence)/COUNT(cellular_prevalence) AS mean_cp, SUM(cellular_prevalence_sd)/COUNT(cellular_prevalence_sd) AS mean_cp_sd FROM timing_snv
75 | --GROUP BY 1,2,3
76 | --ORDER BY 1,2,3


--------------------------------------------------------------------------------
/sql/titan_vs_seqz.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Side-by-side cellularity and purity columns from variants.titan_params and
 3 |     variants.seqz_params for every tumor aliquot (A or B) of the diamond set,
 4 |     matched through the aliquot's pair barcode.
 5 | */
 6 | WITH
 7 | selected_samples AS
 8 | (
 9 |     SELECT tumor_barcode_a AS aliquot_barcode, pair_barcode
10 |     FROM analysis.diamond_set dia
11 |     INNER JOIN analysis.pairs pr ON pr.tumor_barcode = dia.tumor_barcode_a
12 |     UNION
13 |     SELECT tumor_barcode_b AS aliquot_barcode, pair_barcode
14 |     FROM analysis.diamond_set dia
15 |     INNER JOIN analysis.pairs pr ON pr.tumor_barcode = dia.tumor_barcode_b
16 | )
17 | SELECT
18 |     smp.aliquot_barcode,
19 |     cellularity,
20 |     purity
21 | FROM selected_samples smp
22 | INNER JOIN variants.titan_params titan ON titan.pair_barcode = smp.pair_barcode
23 | INNER JOIN variants.seqz_params seqz ON seqz.pair_barcode = smp.pair_barcode


--------------------------------------------------------------------------------
/sql/tumor_mut_comparison.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Mutation overlap statistics per tumor pair, computed from
 3 |     analysis.called_genotypes rows with read_depth > 14. Variants are compared
 4 |     on (chrom, start, end, alt). Only pairs whose two aliquots both have
 5 |     coverage_exclusion = 'allow' are returned.
 6 |       count_a / count_b   qualifying rows in tumor A / tumor B
 7 |       union_ab            variants found in A or B
 8 |       intersection_ab     variants found in both
 9 |       setdiff_a           variants found in A but not in B
10 |       setdiff_b           variants found in B but not in A
11 | */
12 | SELECT
13 |     tp.tumor_pair_barcode,
14 |     tp.case_barcode,
15 |     tp.tumor_barcode_a,
16 |     tp.tumor_barcode_b,
17 |     tp.sample_type_a,
18 |     tp.sample_type_b,
19 |     tp.portion_a,
20 |     tp.portion_b,
21 |     tp.comparison_type,
22 |     tp.surgical_interval_mo,
23 |     (
24 |         SELECT COUNT(*)
25 |         FROM analysis.called_genotypes g
26 |         WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.read_depth > 14
27 |     ) AS count_a,
28 |     (
29 |         SELECT COUNT(*)
30 |         FROM analysis.called_genotypes g
31 |         WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.read_depth > 14
32 |     ) AS count_b,
33 |     (
34 |         SELECT COUNT(*)
35 |         FROM (
36 |             SELECT g.chrom, g.start, g."end", g.alt
37 |             FROM analysis.called_genotypes g
38 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.read_depth > 14
39 |             UNION
40 |             SELECT g.chrom, g.start, g."end", g.alt
41 |             FROM analysis.called_genotypes g
42 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.read_depth > 14
43 |         ) either_tumor
44 |     ) AS union_ab,
45 |     (
46 |         SELECT COUNT(*)
47 |         FROM (
48 |             SELECT g.chrom, g.start, g."end", g.alt
49 |             FROM analysis.called_genotypes g
50 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.read_depth > 14
51 |             INTERSECT
52 |             SELECT g.chrom, g.start, g."end", g.alt
53 |             FROM analysis.called_genotypes g
54 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.read_depth > 14
55 |         ) both_tumors
56 |     ) AS intersection_ab,
57 |     (
58 |         SELECT COUNT(*)
59 |         FROM (
60 |             SELECT g.chrom, g.start, g."end", g.alt
61 |             FROM analysis.called_genotypes g
62 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.read_depth > 14
63 |             EXCEPT
64 |             SELECT g.chrom, g.start, g."end", g.alt
65 |             FROM analysis.called_genotypes g
66 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.read_depth > 14
67 |         ) only_a
68 |     ) AS setdiff_a,
69 |     (
70 |         SELECT COUNT(*)
71 |         FROM (
72 |             SELECT g.chrom, g.start, g."end", g.alt
73 |             FROM analysis.called_genotypes g
74 |             WHERE g.aliquot_barcode = tp.tumor_barcode_b AND g.read_depth > 14
75 |             EXCEPT
76 |             SELECT g.chrom, g.start, g."end", g.alt
77 |             FROM analysis.called_genotypes g
78 |             WHERE g.aliquot_barcode = tp.tumor_barcode_a AND g.read_depth > 14
79 |         ) only_b
80 |     ) AS setdiff_b
81 | FROM analysis.tumor_pairs tp
82 | LEFT JOIN analysis.blocklist bl_a ON bl_a.aliquot_barcode = tp.tumor_barcode_a
83 | LEFT JOIN analysis.blocklist bl_b ON bl_b.aliquot_barcode = tp.tumor_barcode_b
84 | WHERE
85 |     bl_a.coverage_exclusion = 'allow'::bpchar
86 |     AND bl_b.coverage_exclusion = 'allow'::bpchar;
87 | 


--------------------------------------------------------------------------------
/sql/vaf_compare.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Variant allele fraction in tumor A versus tumor B for one variant per gene
 3 |     and case, over silver-set pairs.
 4 |     A row qualifies only when ALL of the following hold:
 5 |       - Mutect2 called the variant in tumor A or tumor B
 6 |       - the gene has qglobal_cv < 0.05 or is one of the listed genes
 7 |       - total read count is at least 5 in both tumors
 8 |       - it satisfies the generic rule (classification priority present) or the
 9 |         gene-specific hotspot rule for TERT, IDH1, IDH2, BRAF or H3F3A
10 |     The gene rules are wrapped in one parenthesised group: AND binds tighter than
11 |     OR, so without the parentheses the hotspot branches would bypass the call,
12 |     significance and depth filters above.
13 |     Per gene and case the kept variant (priority = 1) is chosen by classification
14 |     priority, then number of tumors with a Mutect2 call, then total depth.
15 | */
16 | WITH
17 | variants_by_case_and_gene AS
18 | (
19 | 	SELECT
20 | 		gtc.gene_symbol,
21 | 		gtc.case_barcode,
22 | 		gtc.variant_classification,
23 | 		sn.hgvs_p,
24 | 		ROUND(alt_count_a::decimal / (alt_count_a + ref_count_a),4) AS vaf_a,
25 | 		ROUND(alt_count_b::decimal / (alt_count_b + ref_count_b),4) AS vaf_b,
26 | 		row_number() OVER (PARTITION BY gtc.gene_symbol, gtc.case_barcode ORDER BY vc.variant_classification_priority, mutect2_call_a::integer + mutect2_call_b::integer DESC, (alt_count_a + ref_count_a) + (alt_count_b + ref_count_b) DESC) AS priority
27 | 	FROM analysis.master_genotype_comparison gtc
28 | 	INNER JOIN analysis.silver_set stp ON stp.tumor_pair_barcode = gtc.tumor_pair_barcode
29 | 	INNER JOIN analysis.dnds_fraction_sel_cv ds ON ds.gene_symbol = gtc.gene_symbol --AND (ds.qglobal_cv < 0.05 OR gtc.gene_symbol IN ('TERT','IDH2','NOTCH1','PDGFRA','PIK3CG','BRAF','H3F3A'))
30 | 	LEFT JOIN analysis.variant_classifications vc ON gtc.variant_classification = vc.variant_classification
31 | 	INNER JOIN analysis.snvs sn ON sn.chrom = gtc.chrom AND sn.pos = gtc.pos AND sn.alt = gtc.alt
32 | 	WHERE
33 | 		(mutect2_call_a OR mutect2_call_b) AND
34 | 		(ds.qglobal_cv < 0.05 OR ds.gene_symbol IN ('TERT','IDH2','NOTCH1','PDGFRA','PIK3CG','BRAF','H3F3A')) AND
35 | 		(alt_count_a + ref_count_a) >= 5 AND (alt_count_b + ref_count_b) >= 5 AND
36 | 		(
37 | 			(gtc.gene_symbol NOT IN ('TERT','IDH1','IDH2','BRAF','H3F3A') AND variant_classification_priority IS NOT NULL) OR
38 | 			(gtc.gene_symbol = 'TERT' AND gtc.variant_classification = '5''Flank' AND lower(sn.pos) IN (1295228,1295250)) OR
39 | 			(gtc.gene_symbol = 'IDH1' AND sn.hgvs_p IN ('p.R132C','p.R132G','p.R132H','p.R132S')) OR
40 | 			(gtc.gene_symbol = 'IDH2' AND sn.hgvs_p = 'p.R172K') OR
41 | 			(gtc.gene_symbol = 'BRAF' AND sn.hgvs_p = 'p.V600E') OR
42 | 			(gtc.gene_symbol = 'H3F3A' AND sn.hgvs_p = 'p.G35R')
43 | 		)
44 | )
45 | SELECT gene_symbol, case_barcode, variant_classification, hgvs_p, vaf_a, vaf_b
46 | FROM variants_by_case_and_gene vg
47 | WHERE priority = 1


--------------------------------------------------------------------------------
/sql/variant_status_leeds.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     Variant-level export from analysis.master_genotype_comparison: every variant
 3 |     called by Mutect2 in tumor A or tumor B with a total read count of at least
 4 |     10 in both tumors.
 5 |     variant_status: 'S' called in both tumors, 'P' called in A only, 'R' called
 6 |     in B only.
 7 |     The selected_tumor_pairs CTE is defined but not used: the join to it is
 8 |     commented out at the end of the query.
 9 | */
10 | WITH
11 | selected_tumor_pairs AS
12 | (
13 |     SELECT
14 |         tumor_pair_barcode,
15 |         case_barcode,
16 |         tumor_barcode_a,
17 |         tumor_barcode_b,
18 |         row_number() OVER (
19 |             PARTITION BY case_barcode
20 |             ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC
21 |         ) AS priority
22 |     FROM analysis.tumor_pairs ps
23 |     LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
24 |     LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
25 |     WHERE
26 |         comparison_type = 'longitudinal'
27 |         -- exclude metastatic samples here because this is outside the scope of our study
28 |         AND sample_type_b <> 'M1'
29 |         AND b1.fingerprint_exclusion = 'allow'
30 |         AND b2.fingerprint_exclusion = 'allow'
31 |         AND b1.coverage_exclusion = 'allow'
32 |         AND b2.coverage_exclusion = 'allow'
33 | )
34 | SELECT
35 |     mgt.tumor_pair_barcode,
36 |     (CASE
37 |         WHEN mutect2_call_a AND mutect2_call_b THEN 'S'
38 |         WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P'
39 |         WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R'
40 |     END) AS variant_status,
41 |     mgt.case_barcode,
42 |     mgt.tumor_barcode_a,
43 |     mgt.tumor_barcode_b,
44 |     mgt.gene_symbol,
45 |     mgt.variant_type,
46 |     mgt.variant_classification,
47 |     mgt.chrom::varchar(2),
48 |     lower(mgt.pos) AS start_pos,
49 |     upper(mgt.pos) -1 AS end_pos,
50 |     ref,
51 |     mgt.alt,
52 |     ref_count_a,
53 |     ref_count_b,
54 |     alt_count_a,
55 |     alt_count_b,
56 |     mutect2_call_a,
57 |     mutect2_call_b,
58 |     vaf_corrected_call_a,
59 |     vaf_corrected_call_b,
60 |     logr_copy_number_a,
61 |     logr_copy_number_b,
62 |     corrected_copy_number_a,
63 |     corrected_copy_number_b,
64 |     corrected_call_a::varchar(5),
65 |     corrected_call_b::varchar(5)
66 | FROM analysis.master_genotype_comparison mgt
67 | LEFT JOIN analysis.snvs snvs
68 |     ON snvs.chrom = mgt.chrom
69 |     AND snvs.pos = mgt.pos
70 |     AND snvs.alt = mgt.alt
71 | WHERE
72 |     (mutect2_call_a OR mutect2_call_b)
73 |     AND (ref_count_a + alt_count_a) >= 10
74 |     AND (ref_count_b + alt_count_b) >= 10
75 | --INNER JOIN selected_tumor_pairs stp ON stp.tumor_pair_barcode = mgt.tumor_pair_barcode


--------------------------------------------------------------------------------
/sql/variants/passanno.sql:
--------------------------------------------------------------------------------
 1 | /*
 2 |     - - - - - - - - - -
 3 |     variants.passanno
 4 |     - - - - - - - - - -
 5 |     Annotations for a restricted set of variants.
 6 |     A variant is retained when its filter value is 'PASS', or when it starts at
 7 |     one of the hard-coded positions on chromosomes 2, 5 and 15 listed below
 8 |     (the IDH/TERT sites). Those sites are kept regardless of filter status
 9 |     because of a GATK 4.1.0.0 bug with force-calling.
10 | */
11 | WITH retained_variants AS (
12 |     -- Identifiers of every variant whose annotation should be preserved
13 |     SELECT DISTINCT vi.variant_id
14 |     FROM variants.info AS vi
15 |     WHERE
16 |         vi.filter = 'PASS'
17 |         OR (vi.chrom = 2 AND lower(vi.pos) IN (209113112, 209113113))
18 |         OR (vi.chrom = 5 AND lower(vi.pos) IN (1295169, 1295228, 1295242, 1295250))
19 |         OR (vi.chrom = 15 AND lower(vi.pos) IN (90631837, 90631838, 90631839))
20 | )
21 | SELECT
22 |     va.variant_id,
23 |     va.chrom,
24 |     va.pos,
25 |     va.ref,
26 |     va.alt,
27 |     va.gene_symbol,
28 |     va.variant_classification,
29 |     va.secondary_variant_classification,
30 |     va.variant_type,
31 |     va.genome_change,
32 |     va.transcript,
33 |     va.transcript_strand,
34 |     va.transcript_exon,
35 |     va.transcript_position,
36 |     va.cdna_change,
37 |     va.cds_change,
38 |     va.protein_change,
39 |     va.gc_content,
40 |     va.reference_context,
41 |     -- three characters of the reference context, starting at character 10
42 |     substring(va.reference_context::text FROM 10 FOR 3) AS trinucleotide_context
43 | FROM retained_variants AS rv
44 | INNER JOIN variants.anno AS va ON va.variant_id = rv.variant_id


--------------------------------------------------------------------------------