├── .gitattributes ├── .gitignore ├── GLASS-WG.Rproj ├── LICENSE ├── R ├── cnv │ ├── README.md │ ├── aneuploidy.R │ ├── archive │ │ ├── check-gatk-modelsegments-hets.R │ │ ├── cnv-sig.R │ │ ├── compare-CNV.R │ │ ├── gatk_write_seg_to_db.R │ │ ├── loh_to_igv.R │ │ ├── noncodel-cell-cycle-aneuploidy.R │ │ ├── paired_delta_seg_to_igv.R │ │ └── prop_het_plot.R │ ├── copy-number-optimization.R │ ├── copynumber-seg-gene-level.R │ ├── determine-codel-status.R │ ├── fig3-aneuploidy.R │ ├── figEDF-cnv-arm-heatmap.R │ ├── prepare_gistic.R │ ├── titan-assessment.R │ └── titan-results-enumerate-events.R ├── figures │ ├── EDF2-fractionmf.R │ ├── F1-F3-GLASS-heatmap.R │ ├── F3b_d-cell-cycle-analysis.R │ ├── fig-PyClone-ccf-shared.R │ └── fig-mutational-signatures.R ├── manifest │ ├── README.md │ ├── case-western-create-manifest.R │ ├── create-stjude-barcodes.R │ ├── dfci-create-wxs-readgroups.R │ ├── dkfz-create-manifest.R │ ├── gdc-create-manifest.R │ ├── glass-LU-create-manifest.R │ ├── glass-stjude-create-manifest.R │ ├── glass-wxs-create-manifest.R │ ├── hf-create-manifest.R │ ├── hk-create-manifest.R │ ├── hongkong-seq-json-metadata.R │ ├── jdg-create-manifest.R │ ├── life-history-barcode-generation.R │ ├── make-manifest-K2.R │ ├── mda-create-manifest.R │ ├── merge-manifest.R │ ├── mgh-create-manifest.R │ └── nested-list-example.R ├── misc │ ├── README.md │ ├── blocklist2db.R │ ├── cytoband2DB.R │ ├── dashboard.R │ ├── geneTable2DB.R │ ├── roel-grant.R │ ├── seqz2DB.R │ ├── st-jude-life-history-identification.R │ ├── table-to-json-example.R │ ├── titan2DB.R │ └── titanparams2db.R ├── neoantigens │ ├── analysis │ │ ├── SuppTable6_writetotext.r │ │ ├── neoag_depletion_hla_count.r │ │ ├── neoantigen_depletion_subclonal_selection.r │ │ └── neoantigen_depletion_survival_cox.r │ ├── figures │ │ ├── ExtendedDataFig_neoag_ccf_shared.r │ │ ├── ExtendedDataFig_neoag_depletion_CIBERSORT_barplots.r │ │ ├── Fig4_neoag_depletion_clonality_timepoint.r │ │ ├── 
Fig4_neoag_depletion_subtype_timepoint_hm.r │ │ ├── Fig4_neoag_nonsyn_rate.r │ │ └── neoag_depletion_fraction_subtype_hm_boxplots.r │ └── upload │ │ ├── cibersort_table.r │ │ └── combine_neoag_tables.r ├── preprocess │ ├── Novogene-MDACC-sample-status.R │ ├── README.md │ ├── add_aligned_bam_to_files.R │ ├── aliquots-coverage-metrics.R │ ├── crosscheck-mismatch-identification.R │ ├── crosscheckmetricscluster.R │ ├── glass-surgery-clinical-data.R │ ├── metrics.R │ ├── ucsf-clinical-update.R │ └── vcf_aliquot_qc.R ├── pyclone_paired.R ├── shiny │ ├── RShinyDBFrontend.R │ └── shinyvaf.R ├── snakemake │ ├── cov2db.R │ ├── geno2db.R │ ├── pyclone_create_tsv.R │ ├── runSeqz.R │ ├── seg2db.R │ ├── snv2db.R │ └── vep_upload.r ├── snv │ ├── Fig2b-ccf-rank-kendall.R │ ├── GLASS_gene_comparison_clean.R │ ├── README.md │ ├── archive │ │ ├── allMutationDbToGRanges.R │ │ ├── called_mut_db_as_granges.R │ │ ├── ensembl_genes_to_db.R │ │ ├── heatmap-snv.R │ │ ├── mf_longitudinal.R │ │ ├── mf_private_shared_time.R │ │ ├── mut-freq.R │ │ ├── mutfreq_interval.R │ │ ├── pri_vs_rec_muts.R │ │ ├── privateVsSharedMutationDbToGRanges.R │ │ ├── sample_variants.R │ │ ├── shared_private_to_vcf.R │ │ ├── signature1_by_age_and_interval.R │ │ ├── sigproba2db.R │ │ ├── test-gene-vaf.R │ │ └── vaf-freq.R │ ├── clonevol.R │ ├── dndscv.R │ ├── driver-evolution-associations.R │ ├── fig1-hypermutators-survival.R │ ├── fig1-temporal-somatic-changes.R │ ├── fig2-neutralitytestR-subtype.R │ ├── fig2-subclonal-selection.R │ ├── figEDF-clonal-dynamics.R │ ├── figEDF-hypermutation-clonality.R │ ├── figEDF-longitudinal-gene-comparison.R │ ├── figR-purity-mf.R │ ├── figR-somatic-burden-survival.R │ ├── gene-driver-evolution.R │ ├── hypermutators-overall-survival.R │ ├── longitudinal-mutational-frequency.R │ ├── longitudinal-vaf-analyses.R │ ├── maf-comparison-tcga-pcwag-glass.R │ ├── mutect2-varscan2-results.R │ ├── neutral-evolution-mutect2.R │ ├── neutral-evolution.R │ ├── neutralitytestr-aliquots-results.R 
│ ├── neutralitytestr-aliquots.R │ ├── neutralitytestr-fractionated-results.R │ ├── notch1-mutations-glass.R │ ├── subclonalselection-neutralitytestr-integration.R │ └── temporal-mutation-analyses.R ├── telseq │ └── telseq.R ├── timing │ ├── fig3c-cdkn2a-aneuploidy-timing.R │ ├── pyclone_cluster_ccf.R │ └── timing.R └── vcf_filter.R ├── README.md ├── Snakefile ├── bin ├── bam-rg-insert-size-calc.pl ├── bamtofastq-rename.sh ├── bedtovcf.sh ├── create_fake_wgs_bams.sh ├── extractSplitReads_BwaMem ├── get-readgroups.sh ├── gistic_run.pbs ├── preprocess-intervals.sh ├── reset_directory_structure.sh ├── scatter-interval-list-to-bed.sh ├── select-germline-variants.sh ├── snakemake-run.sh └── svaba-test-parameters.sh ├── conf ├── cluster.json ├── config.yaml └── optitype_config.ini ├── dag ├── align.rulegraph.png ├── fingerprint.rulegraph.png ├── gatk-cnv.rulegraph.png ├── mt2.rulegraph.png ├── svdetect.rulegraph.png └── vs2.rulegraph.png ├── dbm ├── glass-rearranged.png ├── glass.dbm ├── glass.png └── glass.svg ├── envs ├── absolute.yaml ├── align.yaml ├── bcftools.yaml ├── delly.yaml ├── freebayes.yaml ├── gatk4.yaml ├── gdc-client.yaml ├── glass.yaml ├── glass.yml ├── haplotype.yaml ├── lumpy-sv.yaml ├── manta.yaml ├── optitype.yaml ├── pvacseq.yaml ├── pyclone.yaml ├── r.yaml ├── sequenza.yaml ├── somaticseq.yaml ├── telseq.yaml ├── titan.yaml ├── varscan2.yaml ├── vcf2maf.yaml └── vep.yaml ├── jar ├── VarScan.v2.4.2.jar └── VarScan.v2.4.3.jar ├── julia ├── README.md ├── extract_vafs.py ├── runsubclonalselection.jl.txt ├── subclonalselection.qsubsec.txt └── subclonalselection.tff.txt ├── python ├── .ipynb_checkpoints │ ├── LearnRegexp-checkpoint.ipynb │ ├── LearningJSON-1-checkpoint.ipynb │ ├── LearningJSON-2-checkpoint.ipynb │ ├── LearningJSON-3-checkpoint.ipynb │ ├── LearningJSON-4-checkpoint.ipynb │ ├── SoftwareCarpentryExercises-checkpoint.ipynb │ ├── Untitled-checkpoint.ipynb │ ├── Untitled1-checkpoint.ipynb │ ├── Untitled2-checkpoint.ipynb │ └── 
Untitled3-checkpoint.ipynb ├── JSONManifestHandler.py ├── ManifestHandler.py ├── PostgreSQLManifestHandler.py ├── __init__.py ├── countPysam.py ├── dexseq_prepare_annotation.py ├── glassfunc.py ├── manifest_tester.py └── map_building_functions.py ├── snakemake ├── align.smk ├── batches2db.R ├── cnv-post.smk ├── cnv.smk ├── cnvnator.smk ├── delly.smk ├── download.smk ├── fingerprinting.smk ├── fusorsv.smk ├── haplotype-map.smk ├── lumpy.smk ├── manta.smk ├── mutect2-post.smk ├── mutect2.smk ├── optitype.smk ├── pvacseq.smk ├── pyclone.smk ├── sequenza.smk ├── somaticseq.smk ├── telseq.smk ├── titan.smk └── varscan2.smk └── sql ├── clinical ├── clinical-tumor-pairs-db2.sql └── clinical_by_tumor_pair.sql ├── cnv ├── c710_status.sql ├── call_10q25_26.sql ├── cdkn2a_genome_ccf.sql ├── cnv_by_gene_gatk.sql ├── gatk_seg_diff_call.sql ├── gistic_prepare.sql ├── prop_heterozygous_gatk.sql ├── recapseg_postgres.sql └── taylor_aneuploidy.sql ├── compare_seg_stats.sql ├── compute_chr7_10.sql ├── dndscv ├── dndscv_input_by_fraction.sql ├── dndscv_input_by_fraction_hyperm.sql ├── dndscv_input_by_fraction_triplet.sql ├── dndscv_input_by_sample.sql └── dndscv_input_by_sample_hyperm.sql ├── drivers ├── driver_status.sql ├── driver_status_arm.sql ├── driver_status_cnv.sql ├── driver_status_snv.sql └── driver_status_snv_neoag.sql ├── figures ├── mutsig_boxplot_fig1.sql └── mutsig_corr.sql ├── fred_cnv.sql ├── fred_mutation.sql ├── fred_mutation2.sql ├── get_gene_variant_coverage_sample.sql ├── heatmap ├── heatmap_aneuploidy.sql ├── heatmap_arm.sql ├── heatmap_arm_by_arm.sql ├── heatmap_c710.sql ├── heatmap_clinical.sql ├── heatmap_cnv.sql ├── heatmap_cnv_by_gene.sql ├── heatmap_drivers.sql ├── heatmap_evolution.sql ├── heatmap_mf.sql ├── heatmap_purity.sql ├── heatmap_pyclone_clusters.sql ├── heatmap_signatures.sql ├── heatmap_snv.sql ├── heatmap_snv_by_gene.sql └── heatmap_time.sql ├── id_multiple_aliquot_driver_change.sql ├── mf_longitudinal_analysis.sql ├── mut_freq └── 
mut_freq.sql ├── mut_sig ├── archive │ ├── mut_sig_aliquot.sql │ ├── mut_sig_driver_genes.sql │ ├── mut_sig_drivers.sql │ ├── mut_sig_effect.sql │ ├── mut_sig_fraction_subtype.sql │ ├── mut_sig_fraction_subtype_hypermutation.sql │ ├── mut_sig_gene.sql │ └── mut_sig_variant_classification.sql ├── mut_sig_class.sql ├── mut_sig_fraction.sql └── mut_sig_fraction_limited.sql ├── neoag ├── cibersort_depletion.sql ├── cibersort_depletion_fraction.sql ├── neoag_ccf_shared.sql ├── neoag_depletion_aliquot.sql ├── neoag_depletion_fraction.sql ├── neoag_freq.sql ├── neoantigen_depletion.sql ├── neoantigen_depletion_clonality.sql ├── neoantigen_depletion_fraction.sql ├── neoantigen_peptide_counts.sql ├── neoantigens_by_aliquot.sql └── neoantigens_by_pair.sql ├── neutrality ├── neutralitytestr-subtype.sql └── original_submission │ ├── neutrality-testr-input-mutect2.sql │ ├── neutrality_testr_input.sql │ └── neutralitytestr-input-aliquot-level.sql ├── prop_equal_paired_seg.sql ├── pyclone ├── pyclone_aliquots.sql ├── pyclone_cluster_pairs.sql ├── pyclone_cluster_pairs_anno_drivers.sql ├── pyclone_cluster_stats.sql ├── pyclone_cluster_stats2.sql └── pyclone_create_tsv.sql ├── roeltable.sql ├── selected_aliquots.sql ├── selected_tumor_pairs_silver.sql ├── set ├── gold_set.sql └── silver_set.sql ├── snv ├── longitudinal_gene_comparison_snv_all_genes.sql ├── longitudinal_gene_comparison_snv_smg.sql ├── tumor_mut_comparison.sql └── tumor_mut_comparison_anno.sql ├── tel.sql ├── test.sql ├── timing ├── ccf_shared.sql ├── timing.sql ├── timing_cnv.sql ├── timing_pairs.sql ├── timing_snv.sql └── timing_tp53_idh1_atrx.sql ├── titan_vs_seqz.sql ├── tumor_mut_comparison.sql ├── vaf_compare.sql ├── variant_status_leeds.sql └── variants ├── passanno.sql └── passgeno.sql /.gitattributes: -------------------------------------------------------------------------------- 1 | *.smk linguist-language=Python 2 | Snakefile linguist-language=Python 3 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | benchmarks/** 2 | data/** 3 | results/** 4 | logs/** 5 | dbm/glass-dbdump-20181004.sql 6 | R/md-anderson-clinical.R 7 | R/manifest/yung-create-manifest.R 8 | R/RData/** 9 | sandbox/** 10 | archive/** 11 | documents/** 12 | tmp/ 13 | .snakemake/** 14 | .Rhistory 15 | *.Rdata 16 | *.RData 17 | *.Rda 18 | *.RDa 19 | .Rproj.user 20 | .Rproj.user/** 21 | .** 22 | *.pyc 23 | *.pyo 24 | figures/** 25 | -------------------------------------------------------------------------------- /GLASS-WG.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Floris P Barthel, Kevin C Johnson and Collaborators 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /R/cnv/README.md: -------------------------------------------------------------------------------- 1 | ### Copy Number Variation analyses 2 | 3 | For copy number enumeration as well as determining 1p19q status in the WXS/WGS samples. Some temporary files processed in these scripts are no longer available as they have been replaced by the final calls. 4 | -------------------------------------------------------------------------------- /R/cnv/archive/check-gatk-modelsegments-hets.R: -------------------------------------------------------------------------------- 1 | files = list.files("results/cnv/modelsegments/", pattern = "hets.tsv|hets.normal.tsv", recursive = T, full.names = T) 2 | df = data.frame(fn = files, case = substr(basename(files),1,12), analysis = substr(basename(files),21,23), size = file.size(files)) 3 | 4 | library(tidyverse) 5 | 6 | df2 <- df %>% group_by(case, analysis) %>% mutate(var = var(size)) %>% ungroup() 7 | 8 | 9 | thets <- read_tsv("results/cnv/modelsegments/TCGA-DU-7304-R1-12D-WGS-TNHHDG/TCGA-DU-7304-R1-12D-WGS-TNHHDG.hets.tsv", comment = "@", col_types = "ciiicc") 10 | nhets <- read_tsv("results/cnv/modelsegments/TCGA-DU-7304-R1-12D-WGS-TNHHDG/TCGA-DU-7304-R1-12D-WGS-TNHHDG.hets.normal.tsv", comment = "@", col_types = "ciiicc") 11 | 12 | nhets <- nhets %>% 13 | mutate(ct = REF_COUNT + ALT_COUNT, vaf = ALT_COUNT / ct) 14 | 15 | thets <- thets %>% 16 | 
mutate(ct = REF_COUNT + ALT_COUNT, vaf = ALT_COUNT / ct) 17 | 18 | plot(density(nhets$vaf)) 19 | plot(density(thets$vaf)) 20 | -------------------------------------------------------------------------------- /R/cnv/archive/cnv-sig.R: -------------------------------------------------------------------------------- 1 | devtools::install_github("ShixiangWang/VSHunter", build_vignettes = TRUE) 2 | library(VSHunter) 3 | vignette('VSHunter') 4 | load(system.file("extdata/example_cn_list.RData", package = "VSHunter")) 5 | tcga_frac = cnv_getLengthFraction(tcga_segTabs) 6 | tcga_features = cnv_derivefeatures(CN_data = tcga_segTabs, cores = 1, genome_build = "hg19") 7 | tcga_components = cnv_fitMixModels(CN_features = tcga_features, cores = 4) 8 | tcga_sample_component_matrix = cnv_generateSbCMatrix(tcga_features, tcga_components, cores = 4) 9 | tcga_sig_choose = cnv_chooseSigNumber(tcga_sample_component_matrix, nrun = 10, cores = 4) 10 | tcga_sig_choose2 = cnv_chooseSigNumber(tcga_sample_component_matrix, nrun = 10, cores = 4, testRandom = FALSE) 11 | tcga_signatures = cnv_extractSignatures(tcga_sample_component_matrix, nsig = 3, cores = 4) 12 | w = NMF::basis(tcga_signatures) 13 | tcga_exposure = cnv_quantifySigExposure(sample_by_component = tcga_sample_component_matrix, component_by_signature = w) 14 | tcga_results = cnv_autoCaptureSignatures(tcga_sample_component_matrix, nrun=10, cores = 4) 15 | cnv_plotDistributionProfile(tcga_frac) 16 | cnv_plotDistributionProfile(tcga_frac, mode = "cd") 17 | cnv_plotDistributionProfile(tcga_frac, mode = "cd" , fill = TRUE) 18 | 19 | cnv_plotDistributionProfile() 20 | cnv_plotFeatureDistribution() 21 | cnv_plotMixComponents() 22 | cnv_plotSignatures() -------------------------------------------------------------------------------- /R/cnv/archive/gatk_write_seg_to_db.R: -------------------------------------------------------------------------------- 1 | library(DBI) 2 | library(tidyverse) 3 | 4 | con <- DBI::dbConnect(odbc::odbc(), 
"VerhaakDB") 5 | 6 | segfiles <- list.files("results/cnv/callsegments", full.names = TRUE) 7 | segs <- parallel::mclapply(segfiles, function(f) { 8 | dat <- read.delim(f, comment.char = "@", as.is= TRUE) 9 | dat <- dat %>% 10 | mutate(aliquot_barcode = substr(basename(f),1,30), pos = sprintf("[%s,%s]", START, END)) %>% 11 | select(aliquot_barcode, chrom = CONTIG, pos, num_points = NUM_POINTS_COPY_RATIO, log2_copy_ratio = MEAN_LOG2_COPY_RATIO, call = CALL) 12 | return(dat) 13 | }, mc.cores = 8) 14 | segs <- data.table::rbindlist(segs) %>% as.data.frame() 15 | 16 | dbWriteTable(con, Id(schema="analysis",table="gatk_seg"), segs) 17 | -------------------------------------------------------------------------------- /R/cnv/archive/loh_to_igv.R: -------------------------------------------------------------------------------- 1 | library(DBI) 2 | library(tidyverse) 3 | library(ggplot2) 4 | 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 6 | 7 | q <- "SELECT 8 | pair_barcode, 9 | chrom::varchar(2), 10 | lower(pos) as \"start\", 11 | upper(pos)-1 as \"end\", 12 | num_snp, 13 | 2*(median_ratio-0.75) AS median_ratio 14 | FROM analysis.titan_seg" 15 | 16 | qres <- dbGetQuery(con, q) 17 | qres = qres %>% filter(complete.cases(start,end,num_snp,median_ratio)) 18 | 19 | write.table(qres, file = "loh.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE) 20 | -------------------------------------------------------------------------------- /R/cnv/archive/noncodel-cell-cycle-aneuploidy.R: -------------------------------------------------------------------------------- 1 | # To get mutations in cell cycle genes from Figure 1. 2 | mutation_genes = read_file("sql/heatmap/build_heatmap_data_mutation.sql") 3 | mutations_selected = dbGetQuery(con, mutation_genes) 4 | 5 | # To query genes that are found to be altered in the cell cycle. 
6 | cell_cycle_cnv_titan = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene WHERE gene_symbol IN ('CDK4','CCND2','CDK6','CDKN2A','RB1')") 7 | cell_cycle_cnv_gatk = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene_gatk WHERE gene_symbol IN ('CDK4','CCND2','CDK6','CDKN2A','RB1')") 8 | cell_cycle_cnv_titan = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene WHERE gene_symbol = 'CDKN2A'") 9 | cell_cycle_cnv_gatk = dbGetQuery(con, "SELECT * FROM analysis.cnv_by_gene_gatk WHERE gene_symbol = 'CDKN2A'") 10 | 11 | cnv_titan = cell_cycle_cnv_titan %>% 12 | inner_join(pairs, by="pair_barcode") %>% 13 | select(tumor_barcode, gene_symbol, copy_number, corrected_cn, titan_call) 14 | 15 | cell_cycle_cnv_merged = cell_cycle_cnv_gatk %>% 16 | select(aliquot_barcode, corrected_cn_gatk = corrected_cn, cn_call_gatk = cn_call) %>% 17 | inner_join(cnv_titan, by=c("aliquot_barcode"="tumor_barcode")) 18 | 19 | table(cell_cycle_cnv_merged$cn_call_gatk, cell_cycle_cnv_merged$titan_call) -------------------------------------------------------------------------------- /R/cnv/archive/paired_delta_seg_to_igv.R: -------------------------------------------------------------------------------- 1 | library(DBI) 2 | library(tidyverse) 3 | library(ggplot2) 4 | 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 6 | 7 | q<- "SELECT 8 | tumor_pair_barcode, 9 | chrom::varchar(2), 10 | lower(pos) as \"start\", 11 | upper(pos)-1 as \"end\", 12 | 0 as num_snps, 13 | delta_cn 14 | FROM analysis.titan_seg_paired_delta pa" 15 | 16 | qres <- dbGetQuery(con, q) 17 | seg = qres %>% filter(complete.cases(start,end,num_snps)) 18 | 19 | #write.table(seg, file = "diff.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE) 20 | 21 | q <- "SELECT tumor_pair_barcode, diamond_set::integer 22 | FROM analysis.titan_seg_paired_comparison ts 23 | LEFT JOIN biospecimen.aliquots al ON al.aliquot_barcode = ts.tumor_barcode_a 24 | LEFT JOIN clinical.surgeries cl ON cl.sample_barcode = al.sample_barcode" 
25 | 26 | qres <- dbGetQuery(con, q) 27 | 28 | seg <- seg %>% 29 | left_join(qres) %>% 30 | filter(diamond_set==1) %>% 31 | select(-diamond_set) 32 | 33 | write.table(seg, file = "results/cnv/gistic/input.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE) 34 | 35 | markers = data.frame(id = 1:(2*nrow(seg)), chr = c(seg$chrom, seg$chrom), pos = c(seg$start, seg$end), stringsAsFactors = FALSE) %>% distinct() 36 | write.table(markers, file = "results/cnv/gistic/markers.txt", sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE) 37 | 38 | #write.table(qres, file = "diff.annoseg.txt", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE) 39 | -------------------------------------------------------------------------------- /R/cnv/archive/prop_het_plot.R: -------------------------------------------------------------------------------- 1 | library(DBI) 2 | library(tidyverse) 3 | library(ggplot2) 4 | 5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 6 | q <- "SELECT ts.*, cl.idh_codel_subtype 7 | FROM analysis.titan_seg_paired_comparison ts 8 | LEFT JOIN biospecimen.aliquots al ON al.aliquot_barcode = ts.tumor_barcode_a 9 | LEFT JOIN clinical.surgeries cl ON cl.sample_barcode = al.sample_barcode" 10 | 11 | qres <- dbGetQuery(con, q) 12 | 13 | ggplot(qres, aes(x=delta_prop_het, color = idh_codel_subtype)) + geom_density() + coord_cartesian(xlim=c(-1,1)) + labs(x="Heterozygous proportion of the genome\n(recurrence-primary)") 14 | ggplot(qres, aes(x=prop_delta_eq, color = idh_codel_subtype)) + geom_density() + labs(x="Proportion of the genome with identical copy number states\n(recurrence-primary)") 15 | 16 | + geom_smooth(method = "lm") + facet_wrap(~mutation_status) 17 | ggplot(qres, aes(x=case_age_diagnosis_years, y = relative_contribution)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~mutation_status + idh_codel_subtype) 18 | 19 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = 
qres$mutation_status=="shared") 20 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="primary") 21 | cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="recurrent") 22 | 23 | ggplot(qres, aes(x=surgical_interval_mo, y = relative_contribution)) + 24 | geom_point() + 25 | geom_smooth(method = "lm") + 26 | facet_wrap(~mutation_status + idh_codel_subtype, scales = "free_x") 27 | -------------------------------------------------------------------------------- /R/cnv/prepare_gistic.R: -------------------------------------------------------------------------------- 1 | ################################################## 2 | # Prepare input files for running GISTIC separately for primaries and recurrences 3 | # Ignores multi-sector samples (one sample per patient and timepoint) 4 | # Updated: 2019.04.19 5 | # Author: Floris B 6 | ################################################## 7 | 8 | library(DBI) 9 | library(tidyverse) 10 | library(ggplot2) 11 | 12 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2") 13 | 14 | q <- read_file("sql/cnv/gistic_prepare.sql") 15 | 16 | qres <- dbGetQuery(con, q) 17 | seg = qres %>% filter(complete.cases(start,end,num_snps)) 18 | 19 | seg_p = seg %>% filter(sample_type == "P") %>% select(-sample_type) 20 | seg_r = seg %>% filter(sample_type == "R") %>% select(-sample_type) 21 | 22 | write.table(seg_p, file = "results/gistic2/primary.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE) 23 | write.table(seg_r, file = "results/gistic2/recurrence.seg", sep="\t", quote = FALSE, row.names = FALSE, col.names = TRUE) 24 | 25 | markers = data.frame(id = 1:(2*nrow(seg)), chr = c(seg$chrom, seg$chrom), pos = c(seg$start, seg$end), stringsAsFactors = FALSE) %>% distinct() 26 | write.table(markers, file = "results/gistic2/markers.txt", sep="\t", quote = FALSE, row.names = FALSE, col.names = FALSE) 
-------------------------------------------------------------------------------- /R/cnv/titan-results-enumerate-events.R: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # Use the segmented copy number calls to derive gene-level gains/losses 3 | # Date: 2018.11.01 4 | # Author: Kevin J. 5 | ####################################################### 6 | 7 | # Directory for GLASS analysis. 8 | mybasedir = 'Volumes/verhaak-lab/GLASS-analysis/' 9 | datadir = 'results/cnv/' 10 | pattern = '.called.seg$' 11 | 12 | ####################################################### 13 | 14 | # Necessary packages: 15 | library(parallel) 16 | library(tidyverse) 17 | library(data.table) 18 | library(TxDb.Hsapiens.UCSC.hg19.knownGene) 19 | library(org.Hs.eg.db) 20 | library(DBI) 21 | 22 | ####################################################### 23 | # Establish connection with GLASS database. 24 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 25 | 26 | # Downloaded the UCSC cytoband file for hg19. 27 | cytoband_file = "/Users/johnsk/Documents/Life-History/GLASS-WG/data/ref/human_grch37_hg19_ucsc_cytoBand.txt" 28 | cytobands = read.delim(cytoband_file, header=FALSE) 29 | 30 | # Summarize the number of cytobands per chromosomal arm. 31 | cytobands %>% 32 | mutate(arm = substring(V4, 1, 1), 33 | chr_cyto = paste(V1, arm, sep='.')) %>% 34 | group_by(chr_cyto) %>% 35 | summarise(cyto_per_chr = n()) 36 | 37 | 38 | # Retrieve cytoband-specific copy number calls 39 | cytoband = dbGetQuery(con,"SELECT * FROM analysis.cnv_by_cytoband") 40 | 41 | # MERGE cytoband calls with tumor_purity and tumor_ploidy. 42 | # DETERMINE status based on whether its copy number was greater, smaller, or equal to the sample's ploidy. 
43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /R/manifest/README.md: -------------------------------------------------------------------------------- 1 | ### Generating manifest files for Snakemake pipeline 2 | 3 | There were a large number of differences in how metadata was stored for each of the GLASS cohorts. We sought to standardize metadata and sequencing information. Each R script in this directory represents our attempt to wrangle the data into a structure that works with the Snakemake pipeline. We later migrated to a PostgreSQL format, but these scripts may be helpful to others trying to implement the GLASS workflow. 4 | -------------------------------------------------------------------------------- /R/manifest/merge-manifest.R: -------------------------------------------------------------------------------- 1 | ## Merge manifest 2 | ## Author: Floris Barthel 3 | ## Date: Jun 24 2018 4 | 5 | setwd("~/projects/GLASS-WG/") 6 | 7 | library(tidyverse) 8 | 9 | manifest_dir = "data/manifest" 10 | cases_prefix = "cases" 11 | samples_prefix = "samples" 12 | aliquots_prefix = "aliquots" 13 | readgroups_prefix = "readgroups" 14 | files_prefix = "files" 15 | pairs_prefix = "pairs" 16 | 17 | cases = list.files(manifest_dir, pattern = sprintf("%s.tsv", cases_prefix), recursive = T, full.names = T) %>% 18 | map(read.delim, as.is=T) %>% 19 | reduce(bind_rows) %>% ## map_dfr maybe better 20 | distinct() ## NEED TO REMOVE DUPLICATE ROWS IN SOURCE FILES 21 | 22 | samples = list.files(manifest_dir, pattern = sprintf("%s.tsv", samples_prefix), recursive = T, full.names = T) %>% 23 | map(read.delim, as.is=T) %>% 24 | reduce(bind_rows) 25 | 26 | aliquots = list.files(manifest_dir, pattern = sprintf("%s.tsv", aliquots_prefix), recursive = T, full.names = T) %>% 27 | map(read.delim, as.is=T) %>% 28 | reduce(bind_rows) 29 | 30 | readgroups = list.files(manifest_dir, pattern = sprintf("%s.tsv", readgroups_prefix), 
recursive = T, full.names = T) %>% 31 | map(read.delim, as.is=T) %>% 32 | reduce(bind_rows) 33 | 34 | files = list.files(manifest_dir, pattern = sprintf("%s.tsv", files_prefix), recursive = T, full.names = T) %>% 35 | map(read.delim, as.is=T) %>% 36 | reduce(bind_rows) 37 | 38 | pairs = list.files(manifest_dir, pattern = sprintf("%s.tsv", pairs_prefix), recursive = T, full.names = T) %>% 39 | map(read.delim, as.is=T) %>% 40 | reduce(bind_rows) 41 | 42 | print(sprintf("Exporting manifest as json files for snakemake use.")) 43 | write(jsonlite::toJSON(aliquots, pretty = T), file = sprintf("%s/%s.json", manifest_dir, aliquots_prefix)) 44 | write(jsonlite::toJSON(files, pretty = T), file = sprintf("%s/%s.json", manifest_dir, files_prefix)) 45 | write(jsonlite::toJSON(cases, pretty = T), file = sprintf("%s/%s.json", manifest_dir, cases_prefix)) 46 | write(jsonlite::toJSON(pairs, pretty = T), file = sprintf("%s/%s.json", manifest_dir, pairs_prefix)) 47 | write(jsonlite::toJSON(readgroups, pretty = T), file = sprintf("%s/%s.json", manifest_dir, readgroups_prefix)) 48 | write(jsonlite::toJSON(samples, pretty = T), file = sprintf("%s/%s.json", manifest_dir, samples_prefix)) 49 | 50 | ## END ## -------------------------------------------------------------------------------- /R/misc/README.md: -------------------------------------------------------------------------------- 1 | ### Random R 2 | 3 | An assortment of analyses tangentially related to the GLASS project. 
4 | -------------------------------------------------------------------------------- /R/misc/blocklist2db.R: -------------------------------------------------------------------------------- 1 | ## push blocklist to db 2 | 3 | library(tidyverse) 4 | library(DBI) 5 | library(odbc) 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 7 | 8 | tmp = read.delim("data/ref/block_review_allow_quality_lists_20181206.txt", as.is = TRUE) 9 | 10 | df <- tmp %>% 11 | select(aliquot_barcode, 12 | fingerprint_exclusion, 13 | coverage_exclusion = coverage_mut_exclusion, 14 | cnv_exclusion = manual_cn_exlusion, 15 | clinical_exclusion = surgical_interval_exclusion, 16 | fingerprint_exclusion_reason, 17 | coverage_exclusion_reason = mut_exclusion_reason, 18 | cnv_exclusion_reason = cn_exclusion_reason, 19 | clinical_exclusion_reason = surgical_interval_exclusion_reason) %>% 20 | mutate(clinical_exclusion_reason = ifelse(clinical_exclusion_reason == "", NA, clinical_exclusion_reason)) 21 | 22 | dbWriteTable(con, Id(schema="analysis",table="blocklist"), df, append = FALSE) 23 | -------------------------------------------------------------------------------- /R/misc/cytoband2DB.R: -------------------------------------------------------------------------------- 1 | ## export gene table to db 2 | 3 | library(tidyverse) 4 | library(DBI) 5 | 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 7 | 8 | setwd("/Volumes/Helix-Common/GLASS-analysis/") 9 | 10 | cbref = read.delim(file = "data/ref/human_grch37_hg19_ucsc_cytoBand.txt", as.is = TRUE, header = FALSE) 11 | colnames(cbref) = c("chrom", "start", "end", "cytoband", "giestain") 12 | #genecov = read_tsv(file = "data/ref/gene.covariates.txt") 13 | 14 | df <- cbref %>% 15 | transmute(cytoband = cytoband, 16 | chrom = gsub("chr","",chrom), 17 | pos = sprintf("[%s,%s)",start,end), 18 | gie_stain = giestain) 19 | 20 | dbWriteTable(con, Id(schema="ref",table="cytobands"), df, append=T) 21 | 
-------------------------------------------------------------------------------- /R/misc/dashboard.R: -------------------------------------------------------------------------------- 1 | library(shinydashboard) 2 | library(shiny) 3 | library(DBI) 4 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 5 | tables <- dbGetQuery(con, "SELECT table_schema, table_name FROM information_schema.tables WHERE table_schema != 'pg_catalog' AND table_schema !='information_schema'") 6 | ui <- dashboardPage( 7 | dashboardHeader(), 8 | dashboardSidebar( 9 | sidebarMenu( 10 | lapply(unique(tables$table_schema), function(schema) menuItem(schema, 11 | lapply(tables$table_name[tables$table_schema==schema], 12 | function(table) menuSubItem(table, tabName = table, icon = icon("th"))), 13 | tabName = schema, icon = icon("dashboard"))) 14 | ) 15 | ), 16 | dashboardBody( 17 | do.call(tabItems, 18 | lapply(tables$table_name, function(table) { 19 | tabItem(tabName = table, 20 | h2(table), DT::dataTableOutput(outputId = table)) 21 | }) 22 | ) 23 | ) 24 | ) 25 | server <- function(input, output) { 26 | lapply(tables$table_name, function(table) { 27 | output[[table]] <- DT::renderDataTable(dbReadTable(con, Id(schema=tables$table_schema[tables$table_name==table], table=table))) 28 | }) 29 | } 30 | runApp(shinyApp(ui, server), host = "10.7.0.151", port = 2018) -------------------------------------------------------------------------------- /R/misc/geneTable2DB.R: -------------------------------------------------------------------------------- 1 | ## export gene table to db 2 | 3 | library(tidyverse) 4 | library(DBI) 5 | 6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB") 7 | 8 | setwd("/Volumes/Helix-Common/GLASS-analysis/") 9 | 10 | generef = read.delim(file = "data/ref/ncbiRefSeqCurated_hg19.tsv", as.is = TRUE) 11 | genecov = read_tsv(file = "data/ref/gene.covariates.txt") 12 | 13 | df = generef %>% 14 | group_by(name2) %>% 15 | filter(row_number()==1) %>% 16 | ungroup() %>% 17 | 
transmute(gene_symbol = name2,
18 |             transcript_id = name,
19 |             chrom = gsub("chr","",chrom),
20 |             pos = sprintf("[%s,%s)",txStart,txEnd),
21 |             strand = strand,
22 |             exons = exonCount,
23 |             tx_size = txEnd - txStart,
24 |             cds_size = sapply(mapply("-", lapply(str_split(exonEnds, ","), as.numeric), lapply(str_split(exonStarts, ","), as.numeric), SIMPLIFY = FALSE), sum, na.rm = TRUE) ) %>% # per gene: sum of (exonEnd - exonStart) over the comma-separated exon lists; na.rm skips pieces that do not parse as numbers. NOTE(review): this sums whole exons, which may include untranslated sequence despite the column name - confirm
25 |   filter(chrom %in% c(1:22,"X","Y")) %>%
26 |   left_join(genecov, by = c("gene_symbol" = "gene")) %>%
27 |   mutate(expr = ifelse(is.nan(expr), 0, expr), # NaN covariates become 0; NA (e.g. no covariate row matched) is not NaN and stays NA
28 |          reptime = ifelse(is.nan(reptime), 0, reptime),
29 |          hic = ifelse(is.nan(hic), 0, hic)) %>%
30 |   arrange(gene_symbol)
31 | 
32 | dbWriteTable(con, Id(schema="ref",table="genes"), df, append=T)
33 | 
--------------------------------------------------------------------------------
/R/misc/seqz2DB.R:
--------------------------------------------------------------------------------
1 | ### push sequenza segments and purity/ploidy estimates into db
2 | 
3 | library(tidyverse)
4 | library(DBI)
5 | library(odbc)
6 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")
7 | 
8 | seg <- read.delim("results/sequenza/glass_seqz_segments.tsv", as.is = TRUE)
9 | pp <- read.delim("results/sequenza/glass_seqz_purity_ploidy.tsv", as.is = TRUE)
10 | # Rename the input columns to the database column names.
11 | pp <- pp %>% select(pair_barcode, cellularity, ploidy, slpp = SLPP)
12 | seg <- seg %>% transmute(pair_barcode,
13 |                          chrom = ifelse(chromosome=='X',23,as.integer(chromosome)), # X is recoded as 23; any other non-numeric label becomes NA through as.integer
14 |                          pos = sprintf("[%s,%s]",start.pos,end.pos),
15 |                          baf = Bf,
16 |                          baf_n = N.BAF,
17 |                          baf_sd = sd.BAF,
18 |                          ratio = depth.ratio,
19 |                          ratio_n = N.ratio,
20 |                          ratio_sd = sd.ratio,
21 |                          copy_number = CNt,
22 |                          major_cn = A,
23 |                          minor_cn = B,
24 |                          log_posterior_proba = LPP)
25 | 
26 | dbWriteTable(con, Id(schema="variants",table="seqz_seg"), seg, append=T)
27 | dbWriteTable(con, Id(schema="variants",table="seqz_params"), pp, append=T)
28 | # The commented-out block below is an inactive TITAN segment upload.
29 | # segfiles <- list.files('results/cnv/titanfinal/seg', full.names = TRUE)
30 | #
31 | # lapply(segfiles, function(f){
32 | # 
message(f)
33 | # dat <- read.delim(f, as.is=T, header=T, row.names = NULL)
34 | # df <- dat %>%
35 | # transmute(pair_barcode = Sample,
36 | # chrom = Chromosome,
37 | # pos = sprintf("[%s,%s]",Start_Position.bp.,End_Position.bp.),
38 | # num_snp = Length.snp.,
39 | # median_ratio = Median_Ratio,
40 | # median_logr = Median_logR,
41 | # titan_state = TITAN_state,
42 | # titan_call = TITAN_call,
43 | # copy_number = Copy_Number,
44 | # major_cn = MajorCN,
45 | # minor_cn = MinorCN,
46 | # clonal_cluster = Clonal_Cluster,
47 | # cellular_prevalence = Cellular_Prevalence,
48 | # logr_copy_number = logR_Copy_Number,
49 | # corrected_copy_number = Corrected_Copy_Number,
50 | # corrected_call = Corrected_Call)
51 | #
52 | # dbWriteTable(con, Id(schema="analysis",table="titan_seg"), df, append=T)
53 | # Sys.sleep(1)
54 | # })
--------------------------------------------------------------------------------
/R/misc/st-jude-life-history-identification.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | # Identify the St. Jude pediatric brain tumors that have
3 | # available germline, diagnosis, and recurrent tumors.
4 | # Date: 2018.05.14
5 | # Author: Kevin J
6 | #######################################################
7 | 
8 | # Project directory; the relative paths below are resolved against it.
9 | setwd("/Users/johnsk/Documents/Life-History/GLASS-WG/")
10 | StJude_dataset_path = "data/st-jude-data/StJude.20180511.xlsx"
11 | 
12 | #######################################################
13 | 
14 | library(tidyverse)
15 | library(openxlsx)
16 | 
17 | #######################################################
18 | 
19 | # Roel provided the life history working group with a file containing the St. Jude data set to which we were
20 | # granted access. 
21 | StJude_avail_data = readWorkbook(StJude_dataset_path, sheet = 1, startRow = 1, colNames = TRUE)
22 | 
23 | ################################
24 | # The goal is to identify paired primary-recurrent samples that have WGS data.
25 | ################################
26 | # What datasets are available?
27 | table(StJude_avail_data$sj_diseases)
28 | 
29 | # HGG = High Grade Glioma.
30 | StJude_HGG = StJude_avail_data %>%
31 |   filter(sequencing_type=="WGS" & sj_diseases=="HGG" & file_type=="BAM")
32 | StJude_HGG_bams = filter(StJude_HGG, !grepl('bai', file_path)) # drop rows whose path contains "bai" (presumably the BAM index files - confirm)
33 | table(StJude_HGG_bams$subject_name, StJude_HGG_bams$sample_type) # subjects x sample types
34 | # Looks like 7 pairs: 3 with sample at autopsy; 4 with samples at recurrence.
35 | 
36 | # LGG = Low Grade Glioma.
37 | StJude_LGG = StJude_avail_data %>%
38 |   filter(sequencing_type=="WGS" & sj_diseases=="LGG" & file_type=="BAM")
39 | StJude_LGG_bams = filter(StJude_LGG, !grepl('bai', file_path))
40 | table(StJude_LGG_bams$subject_name, StJude_LGG_bams$sample_type)
41 | # 3 LGGs with primary and relapse.
42 | 
43 | # Might there be patients that started off with LGG and progressed to HGG?
44 | StJude_glioma = StJude_avail_data %>%
45 |   filter(sequencing_type=="WGS" & sj_diseases%in% c("LGG","HGG") & file_type=="BAM")
46 | StJude_glioma_bams = filter(StJude_glioma, !grepl('bai', file_path))
47 | StJude_glioma_table = table(StJude_glioma_bams$subject_name, StJude_glioma_bams$sample_type)
48 | 
49 | # Gather the names of possible trios: subjects with more than two distinct sample types.
50 | StJude_glioma_trios = StJude_glioma_bams %>%
51 |   group_by(subject_name) %>% summarise(Trio = n_distinct(sample_type)) %>% filter(Trio>2)
52 | StJude_glioma_trio_names = StJude_glioma_trios$subject_name
53 | 
54 | # Check available data for all trio names. 
55 | StJude_glioma_trio_data = StJude_avail_data[StJude_avail_data$subject_name%in%StJude_glioma_trio_names, ]
56 | StJude_glioma_trio_data_wgs = StJude_glioma_trio_data %>% filter(sequencing_type=="WGS" & file_type=="BAM")
57 | # NOTE(review): the "bai" path filter used earlier in this script is not applied here, so such rows stay in the download list - confirm this is intended.
58 | # Write out files to be downloaded from St. Jude Cloud.
59 | write.csv(StJude_glioma_trio_data_wgs, "data/st-jude-data/st-jude_glioma_trio_data_wgs.csv")
60 | 
61 | 
--------------------------------------------------------------------------------
/R/misc/table-to-json-example.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | # Generate a metadata json file for Life-History sequencing samples.
3 | # Date: 2018.05.21
4 | # Authors: Kevin, Samir.
5 | #######################################################
6 | 
7 | library(tidyverse)
8 | library(jsonlite)
9 | library(purrr)
10 | library(listviewer)
11 | 
12 | #######################################################
13 | # An example of tabular data containing read groups, file locations, and basic subject covariate information.
14 | metadf <- read_tsv("/Users/johnsk/Documents/Life-History/GLASS-WG/data/ref/table_to_json_test.tsv")
15 | 
16 | # Some columns did not need to be included in the JSON.
17 | # This is a fabricated test that included a few BAM files
18 | # from these libraries to test parsing ability. 
19 | metajson <- metadf %>%
20 |   select(-one_of("Mate_ID")) %>%
21 |   group_by(Patient_ID, Cohort, Sex, Age) %>%
22 |   rename(fileNames = Fastq_Filenames) %>%
23 |   group_by(FlowCell_ID, Lane_ID, add = TRUE) %>% # extend the patient-level grouping by flowcell and lane. NOTE(review): `add =` here and `.key =` below are the older dplyr/tidyr argument names - confirm against the installed versions
24 |   mutate(files = list(list(fileType, fileNames))) %>% # per group, bundle the file types and file names into one list cell
25 |   ungroup() %>%
26 |   select(-one_of("fileType", "fileNames")) %>%
27 |   filter(!duplicated(files)) %>% # keep one row per distinct file bundle
28 |   nest(-Patient_ID, -Cohort, -Sex, -Age, .key = PatientLevel) %>% # everything except the patient covariates is nested under PatientLevel
29 |   mutate(PatientLevel = purrr::map(PatientLevel, ~ .x %>%
30 |                                      group_by(Sample_Type) %>%
31 |                                      nest(.key = SampleLevel))) # inside each patient, nest again by sample type
32 | # Interactively assess list tree structure.
33 | listviewer::jsonedit(metajson)
34 | 
35 | # If opening in SublimeText use app (shift + cmd + p) to get in JSON pretty format.
36 | write_json(metajson, "/Users/johnsk/Documents/Life-History/GLASS-WG/data/metadata-json.json")
37 | 
38 | 
--------------------------------------------------------------------------------
/R/misc/titan2DB.R:
--------------------------------------------------------------------------------
1 | ### push titan seg into db
2 | 
3 | library(tidyverse)
4 | library(DBI)
5 | library(odbc)
6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
7 | 
8 | segfiles <- list.files('results/cnv/titanfinal/seg', full.names = TRUE)
9 | # Each segment file is read, its columns renamed to the database names, and its rows appended to analysis.titan_seg.
10 | lapply(segfiles, function(f){
11 |   message(f)
12 |   dat <- read.delim(f, as.is=T, header=T, row.names = NULL)
13 |   df <- dat %>%
14 |     transmute(pair_barcode = Sample,
15 |               chrom = Chromosome,
16 |               pos = sprintf("[%s,%s]",Start_Position.bp.,End_Position.bp.),
17 |               num_snp = Length.snp.,
18 |               median_ratio = Median_Ratio,
19 |               median_logr = Median_logR,
20 |               titan_state = TITAN_state,
21 |               titan_call = TITAN_call,
22 |               copy_number = Copy_Number,
23 |               major_cn = MajorCN,
24 |               minor_cn = MinorCN,
25 |               clonal_cluster = Clonal_Cluster,
26 |               cellular_prevalence = Cellular_Prevalence,
27 |               logr_copy_number = logR_Copy_Number,
28 |               corrected_copy_number = Corrected_Copy_Number,
29 |               corrected_call = Corrected_Call)
30 | 
31 | 
dbWriteTable(con, Id(schema="analysis",table="titan_seg"), df, append=T)
32 |   Sys.sleep(1) # pause one second before the next file
33 | })
--------------------------------------------------------------------------------
/R/misc/titanparams2db.R:
--------------------------------------------------------------------------------
1 | ### push titan params to db
2 | 
3 | library(tidyverse)
4 | library(DBI)
5 | library(odbc)
6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
7 | 
8 | paramfiles <- list.files('results/cnv/titanfinal/params', full.names = TRUE)
9 | 
10 | ## taken from Gavin's script. Takes one params table (row names = parameter labels, a single column named after the file) and returns a one-row data frame of the extracted values.
11 | formatParams <- function(params){
12 |   id <- colnames(params)
13 |   barcode <- strsplit(id, "_cluster")[[1]][1] # the part of the column name before "_cluster"
14 |   cellPrev <- strsplit(params[grepl("Clonal cluster cellular prevalence",
15 |                                     rownames(params)), 1], " ")[[1]] # space-separated prevalences, split into one string per cluster
16 |   numClust <- length(cellPrev)
17 |   cellPrev <- paste0(format(cellPrev, digits=4), collapse=",") # NOTE(review): cellPrev is still character here, so digits=4 presumably does no rounding - confirm
18 |   norm <- as.numeric(params[grepl("Normal contamination estimate", rownames(params)), 1])
19 |   purity <- 1 - norm
20 |   ploidy <- as.numeric(params[grepl("Average tumour ploidy estimate", rownames(params)), 1])
21 |   loglik <- as.numeric(params[grepl("likelihood", rownames(params)), 1])
22 |   sdbw <- as.numeric(params[grepl("S_Dbw validity index \\(Both\\)", rownames(params)), 1])
23 |   return(data.frame(id=id, barcode=barcode, numClust=numClust, cellPrev=cellPrev,
24 |                     purity=purity, norm=norm, ploidy=ploidy, loglik=loglik, sdbw=sdbw,
25 |                     stringsAsFactors = FALSE))
26 | }
27 | 
28 | datlist = lapply(paramfiles, function(f) {
29 |   phi <- read.delim(f, header=F, row.names=1, stringsAsFactors=F, sep="\t") # first column becomes the row names (the parameter labels)
30 |   colnames(phi) <- gsub(".params.txt", "", basename(f)) # name the value column after the file so formatParams can derive id and barcode
31 |   return(formatParams(phi))
32 | })
33 | 
34 | dat = data.table::rbindlist(datlist) %>%
35 |   as.data.frame() %>%
36 |   select(pair_barcode = barcode,
37 |          num_clones = numClust,
38 |          cellular_prevalence = cellPrev,
39 |          purity,
40 |          normal_contamination = norm,
41 |          ploidy,
42 |          loglik,
43 |          sdbw)
44 | 
45 | dbWriteTable(con, 
Id(schema="analysis",table="titan_params"), dat, append = FALSE)
46 | 
--------------------------------------------------------------------------------
/R/neoantigens/analysis/SuppTable6_writetotext.r:
--------------------------------------------------------------------------------
1 | #This script saves Supplementary Table 6 (generated using the neoantigen_peptide_counts.sql query) to a text file
2 | #-----------------------------------------------------
3 | 
4 | library(DBI)
5 | library(odbc)
6 | library(ggplot2)
7 | library(reshape)
8 | 
9 | rm(list=ls())
10 | 
11 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
12 | # Read the SQL text with base R: read_file() comes from readr, which this script never attaches, so the original call could not be resolved.
13 | res <- dbGetQuery(con, paste(readLines("sql/neoag/neoantigen_peptide_counts.sql", warn = FALSE), collapse = "\n"))
14 | # NOTE(review): the output file is named SuppTableS4.txt while the header above says Supplementary Table 6 - confirm which is current.
15 | write.table(res,"/projects/varnf/GLASS/Figures/resubmission/final/SuppTableS4.txt",sep="\t",quote=F,row.names=F)
--------------------------------------------------------------------------------
/R/neoantigens/analysis/neoag_depletion_hla_count.r:
--------------------------------------------------------------------------------
1 | #Code to correlate each sample's number of unique HLA alleles with their neoantigen depletion values
2 | #Query at the top counts each patient's total number of HLA loci from the analysis.neoantigens_by_aliquot table
3 | #First correlation: All samples (reported in manuscript)
4 | #Second correlation: initial only samples
5 | #Third correlation: recurrent only samples
6 | #-----------------------------------------------------
7 | 
8 | library(DBI)
9 | library(odbc)
10 | library(ggplot2)
11 | library(reshape)
12 | 
13 | rm(list=ls())
14 | 
15 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
16 | 
17 | q = "WITH hla_table AS
18 | (
19 | SELECT aliquot_barcode, hla_allele
20 | FROM analysis.neoantigens_by_aliquot
21 | GROUP BY aliquot_barcode, hla_allele
22 | ),
23 | hla_tot AS
24 | (
25 | SELECT aliquot_barcode, COUNT(*) AS hla_num
26 | FROM hla_table
27 | GROUP BY aliquot_barcode
28 | )
29 | 
30 | SELECT nd.*,hla.hla_num
31 | FROM 
analysis.neoantigen_depletion nd
32 | INNER JOIN hla_tot hla ON hla.aliquot_barcode = nd.aliquot_barcode
33 | ORDER BY rneo"
34 | 
35 | res <- dbGetQuery(con,q)
36 | 
37 | res[,"hla_num"] <- as.numeric(res[,"hla_num"])
38 | # method="s" selects the Spearman rank correlation.
39 | cor.test(res[,"rneo"],res[,"hla_num"],method="s") #R = 0.29 P = 2.1e-9
40 | # Split by the sample code found in the first column (presumably the aliquot barcode from nd.* - confirm): -TP- = initial, -R1- to -R4- = recurrences.
41 | pri <- res[grep("-TP-",res[,1]),]
42 | rec <- res[grep("-R1-|-R2-|-R3-|-R4-",res[,1]),]
43 | 
44 | cor.test(pri[,"rneo"],pri[,"hla_num"],method="s") #R = 0.23 P = 5e-4
45 | cor.test(rec[,"rneo"],rec[,"hla_num"],method="s") #R = 0.32 P = 5.6e-7 (the original note read "5.6-7"; presumably 5.6e-7 - confirm)
46 | 
--------------------------------------------------------------------------------
/R/neoantigens/analysis/neoantigen_depletion_subclonal_selection.r:
--------------------------------------------------------------------------------
1 | #Code that compares the observed-to-expected neoantigen ratios between samples marked as "selected" and "neutral" evolution using the subclonalSelection method
2 | #Query at the top joins analysis.neoantigen_depletion table to analysis.subclonalselection table (and others)
3 | #Comparisons are made for initial tumors and for recurrent tumors
4 | #No significant associations
5 | #This analysis is reported in the manuscript
6 | #------------------------------------------------------------------------------
7 | 
8 | library(odbc)
9 | library(DBI)
10 | library(ggplot2)
11 | 
12 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
13 | 
14 | q <- "SELECT gs.* , nd1.rneo AS nd_a, nd2.rneo AS nd_b, clin.idh_codel_subtype AS subtype,sc1.most_probable_classification AS neut1, sc2.most_probable_classification AS neut2, sc1.probability_neutral AS prob1, sc2.probability_neutral AS prob2
15 | FROM analysis.gold_set gs
16 | LEFT JOIN analysis.neoantigen_depletion nd1 ON nd1.aliquot_barcode = gs.tumor_barcode_a
17 | LEFT JOIN analysis.neoantigen_depletion nd2 ON nd2.aliquot_barcode = gs.tumor_barcode_b
18 | LEFT JOIN analysis.mut_freq mf1 ON mf1.aliquot_barcode = gs.tumor_barcode_a
19 | LEFT JOIN 
analysis.mut_freq mf2 ON mf2.aliquot_barcode = gs.tumor_barcode_b
20 | LEFT JOIN clinical.subtypes clin ON clin.case_barcode = gs.case_barcode
21 | LEFT JOIN analysis.subclonalselection sc1 ON sc1.aliquot_barcode = gs.tumor_barcode_a
22 | LEFT JOIN analysis.subclonalselection sc2 ON sc2.aliquot_barcode = gs.tumor_barcode_b
23 | WHERE nd1.rneo IS NOT NULL AND nd2.rneo IS NOT NULL AND (nd1.nobs >= 3 AND nd2.nobs >= 3) --AND
24 | --sc1.most_probable_classification IS NOT NULL AND sc2.most_probable_classification IS NOT NULL
25 | ORDER BY nd1.rneo"
26 | # For each subtype, compare rneo between "S" (selected) and "N" (neutral) tumours: initial, recurrent, then both pooled.
27 | res <- dbGetQuery(con, q)
28 | # Rows with a missing subtype (possible because of the LEFT JOIN) are skipped; they would yield empty groups and stop the loop.
29 | subtypes <- unique(res[!is.na(res[,"subtype"]),"subtype"])
30 | for(i in seq_along(subtypes)) # seq_along: no iteration at all when there are no subtypes
31 | {
32 |   sub_res <- res[which(res[,"subtype"]==subtypes[i]),]
33 |   pri_s <- sub_res[which(sub_res[,"neut1"]=="S"),"nd_a"]
34 |   pri_n <- sub_res[which(sub_res[,"neut1"]=="N"),"nd_a"]
35 | 
36 |   rec_s <- sub_res[which(sub_res[,"neut2"]=="S"),"nd_b"]
37 |   rec_n <- sub_res[which(sub_res[,"neut2"]=="N"),"nd_b"]
38 |   cat("\nSubtype:", subtypes[i], "\n") # label the three test results that follow
39 |   print(t.test(pri_s,pri_n)) # explicit print(): values computed inside a for loop are not auto-printed
40 |   print(t.test(rec_s,rec_n))
41 | 
42 |   s1 <- c(pri_s,rec_s)
43 |   n1 <- c(pri_n,rec_n)
44 |   print(t.test(s1,n1))
45 | }
46 | 
--------------------------------------------------------------------------------
/R/neoantigens/upload/cibersort_table.r:
--------------------------------------------------------------------------------
1 | #Code to upload the CIBERSORT data from the Wang et al Cancer Cell paper (PMID: 28697342)
2 | #Produces a table in long format that has a row for each aliquot/cell combination
3 | #This table is used to make Extended Data Figure 12C
4 | #Manually fixes the name of TCGA-14-1402 to match up with the db
5 | #-----------------------------------------------------
6 | 
7 | library(DBI)
8 | library(odbc)
9 | library(reshape)
10 | 
11 | rm(list=ls())
12 | 
13 | cibersort <- read.delim("/projects/varnf/GLASS/data/CIBERSORT/CIBERSORT_cancer_cell.txt",sep="\t",header=T,stringsAsFactor=F)
14 | mapping_table <- 
read.delim("/projects/varnf/GLASS/data/CIBERSORT/cancer_cell_RNAseq_mapping.txt",sep="\t",header=T,stringsAsFactor=F)
15 | myoutf <- "/projects/varnf/GLASS/data/CIBERSORT/CIBERSORT_GLASS_format.txt"
16 | 
17 | #Add information for TCGA-14-1402
18 | mapping_table[which(mapping_table[,"SampleId"]=="TCGA.14.1402.01"),"GLSS_barcodeTP"] <- "TCGA-14-1402-TP"
19 | mapping_table[which(mapping_table[,"SampleId2"]=="TCGA.14.1402.02A"),"GLSS_barcodeR1"] <- "TCGA-14-1402-R1"
20 | # Sample ids in column 2 use "-" while the mapping table uses "."; normalise before the lookup.
21 | cibersort[,2] <- gsub("-",".",cibersort[,2])
22 | # Named lookup vector: sample id -> GLASS barcode, for both the TP and the R1 columns.
23 | mapping <- c(mapping_table[,"GLSS_barcodeTP"],mapping_table[,"GLSS_barcodeR1"])
24 | names(mapping) <- c(mapping_table[,"SampleId"],mapping_table[,"SampleId2"])
25 | 
26 | ordered_names <- mapping[cibersort[,"SampleId"]]
27 | cibersort[,"sample_barcode"] <- ordered_names
28 | # Filter with logical masks: x[-which(cond), ] returns zero rows when nothing matches cond, which would empty the table.
29 | cibersort <- cibersort[!is.na(cibersort[,"sample_barcode"]),]
30 | cibersort <- cibersort[cibersort[,"sample_barcode"]!="not in data freeze",]
31 | 
32 | rownames(cibersort) <- cibersort[,"sample_barcode"]
33 | cibersort <- cibersort[,3:24] # keep columns 3 to 24 only (presumably the cell-type fractions - confirm)
34 | cibersort <- cbind(rownames(cibersort),cibersort)
35 | colnames(cibersort)[1] <- "sample_barcode"
36 | colnames(cibersort) <- gsub("\\.","",colnames(cibersort))
37 | # Long format: one row per sample_barcode / variable pair.
38 | cibersort <- melt(cibersort)
39 | 
40 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
41 | dbWriteTable(con, Id(schema="analysis",table="cibersort"), cibersort, overwrite=TRUE, row.names=FALSE)
42 | 
43 | write.table(cibersort, myoutf,sep="\t",quote=F,row.names=F)
44 | 
--------------------------------------------------------------------------------
/R/preprocess/README.md:
--------------------------------------------------------------------------------
1 | ### Basic preprocessing
2 | 
3 | These scripts are **not essential** for preprocessing, but rather they relate to quality assessment or data collection. There are likely to be many files specified that cannot be found in this repository. 
This code is being maintained here largely as a reference point for the Verhaak laboratory and for parties interested in how some of the sequencing metrics were generated.
4 | 
--------------------------------------------------------------------------------
/R/preprocess/add_aligned_bam_to_files.R:
--------------------------------------------------------------------------------
1 | ## Add aligned files: register the GLSS-MG BQSR BAMs (name, size, md5, format, path) in analysis.files
2 | 
3 | library(odbc)
4 | library(DBI)
5 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
6 | 
7 | bamfiles = list.files("/projects/verhaak-lab/GLASS-analysis/results/align/bqsr", pattern = "^GLSS-MG.*bam$", full.names = T)
8 | md5files = list.files("/projects/verhaak-lab/GLASS-analysis/results/align/bqsr", pattern = "^GLSS-MG.*md5$", full.names = T)
9 | filesizes = sapply(bamfiles, function(f) file.info(f)$size)
10 | filemd5s = sapply(md5files, function(f) readLines(f, warn=F))
11 | # NOTE(review): BAMs and md5 files are listed separately and combined by position below; this assumes one md5 file per BAM, one line per md5 file, and identical ordering - confirm.
12 | files_add = data.frame(aliquot_barcode = gsub(".realn.mdup.bqsr.bam", "", basename(bamfiles)), file_name = basename(bamfiles), file_size = unname(filesizes), file_md5sum= unname(filemd5s), file_format = "aligned BAM", file_path = bamfiles, stringsAsFactors = F)
13 | 
14 | dbWriteTable(con, Id(schema="analysis",table="files"), files_add, append=T)
15 | # Also dump biospecimen.aliquots to aliquots.csv in the working directory.
16 | tmp = dbReadTable(con, Id(schema="biospecimen",table="aliquots"))
17 | write.csv(tmp, file = "aliquots.csv")
18 | 
--------------------------------------------------------------------------------
/R/preprocess/aliquots-coverage-metrics.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | # Enumerate cumulative coverage per aliquot for WGS/WXS
3 | # Date: 2018.11.06
4 | # Author: Kevin J.
5 | #######################################################
6 | 
7 | # Directory for GLASS analysis. 
8 | mybasedir = '/Volumes/verhaak-lab/GLASS-analysis/'
9 | datadir = 'results/align/wgsmetrics/'
10 | pattern = '.WgsMetrics.txt$'
11 | # NOTE(review): mybasedir is not referenced in the code visible here; datadir is resolved against the current working directory - confirm.
12 | #######################################################
13 | 
14 | # Necessary packages:
15 | library(parallel)
16 | library(tidyverse)
17 | library(data.table)
18 | library(DBI)
19 | 
20 | #######################################################
21 | # Establish connection with the database.
22 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
23 | 
24 | ## List every "*.WgsMetrics.txt" file found under datadir (searched recursively).
25 | files = list.files(datadir, full.names = T, pattern = pattern, recursive=T)
26 | 
27 | # Optional table of sample names; library_type is taken from characters 21-23 of the file name.
28 | samples = data.frame(sample_id=gsub(".WgsMetrics.txt", "", basename(files)), library_type = substring(basename(files), 21, 23))
29 | 
30 | # The first 10 rows of each file represent a header of additional information.
31 | cov_dat = mclapply(files, function(f){
32 |   dat = tryCatch(read.delim(f,as.is=T, header=T, row.names = NULL, skip = 10), error=function(e) e)
33 |   if(inherits(dat,'error')) {
34 |     message(f, '\n', dat, '\n')
35 |     return() # unreadable file: log it and contribute NULL to the list
36 |   }
37 |   # Truncate the file name to just the sample_id.
38 |   dat = dat %>%
39 |     mutate(sample_id = gsub(".WgsMetrics.txt", "", basename(f))) # %>%
40 |   # filter(coverage!="0") # Filter out those bases with `0` coverage.
41 | 
42 |   return(dat)
43 | 
44 | }, mc.cores=20)
45 | 
46 | ## Combine all the samples from the GLASS cohort.
47 | glass_cov = data.table::rbindlist(cov_dat)
48 | 
49 | # Cumulatively add the number of bases at each level:
50 | glass_samples_cumulative_cov = glass_cov %>%
51 |   group_by(sample_id) %>%
52 |   mutate(cumulative_coverage = rev(cumsum(rev(high_quality_coverage_count)))) %>% # reverse cumulative sum within a sample: each row holds the count for itself plus all later rows (presumably bases at >= that depth if rows are in increasing coverage order - confirm)
53 |   # Make sure the column names are formatted correctly.
54 |   select(aliquot_barcode = sample_id, coverage, high_quality_coverage_count, cumulative_coverage)
55 | 
56 | 
57 | # Total number should be 1166 (2019.03.08). 
58 | n_distinct(glass_samples_cumulative_cov$aliquot_barcode)
59 | 
60 | # Write output as one table or a table for each file:
61 | # write.table(glass_samples_cumulative_cov, file = "/Users/johnsk/Documents/Life-History/GLASS-WG/data/ref/glass-cumulative-coverage.txt", sep="\t", row.names = F, col.names = T, quote = F)
62 | 
63 | # Write the cumulative coverage table to the database.
64 | dbWriteTable(con, Id(schema="analysis",table="coverage"), glass_samples_cumulative_cov, append=T)
65 | 
--------------------------------------------------------------------------------
/R/preprocess/crosscheckmetricscluster.R:
--------------------------------------------------------------------------------
1 | tmp = read.delim("/fastscratch/verhaak-lab/GLASS-WG/results/fingerprinting/GLASS-WG.crosscheck_metrics", skip = 6)
2 | # First pass: cluster rows on columns 3, 4 and 7. NOTE(review): dist() needs numeric columns - confirm these are the intended metrics.
3 | d = dist(tmp[,c(3,4,7)])
4 | fit = hclust(d) # hclust() requires a dist object; it was previously handed the raw data frame and failed
5 | plot(as.dendrogram(fit), horiz=T)
6 | # Second pass: LEFT x RIGHT matrix of LOD scores. spread() is called as tidyr::spread because no package is attached in this script.
7 | x = tidyr::spread(tmp[,c(1,2,5)], RIGHT_GROUP_VALUE, LOD_SCORE)
8 | rownames(x) = x$LEFT_GROUP_VALUE
9 | x$LEFT_GROUP_VALUE = NULL
10 | x = as.matrix(x)
11 | 
12 | table(is.na(x)) # how many pairs have no LOD score
13 | 
14 | fit = hclust(dist(x))
15 | 
16 | # (removed) fit = hclust(tmp[,c(1,2,5)]) -- not a dist object, so it failed; the fit from the LOD matrix above is the one plotted
17 | 
18 | plot(fit)
19 | 
--------------------------------------------------------------------------------
/R/preprocess/vcf_aliquot_qc.R:
--------------------------------------------------------------------------------
1 | library(VariantAnnotation)
2 | library(DBI)
3 | library(odbc)
4 | 
5 | rm(list=ls())
6 | con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
7 | q <- "SELECT * FROM biospecimen.aliquots"
8 | aliquots <- dbGetQuery(con,q)
9 | aliquots[,"case_barcode"] <- sapply(strsplit(aliquots[,"sample_barcode"],"-"),function(x)paste(x[1:3],collapse="-")) # case barcode = first three dash-separated fields
10 | 
11 | myDir1 <- "/projects/verhaak-lab/GLASS-analysis/results/mutect2/m2filter"
12 | 
13 | mytag <- dir(myDir1)
14 | mytag <- mytag[grep("filtered.vcf.gz$",mytag)]
15 | vcff <- paste(myDir1,mytag,sep="/")
16 | mytag <- gsub(".filtered.vcf.gz","",mytag) # file name without the suffix; compared to case barcodes below
17 | 
18 | 
check <- matrix(NA,nrow=length(vcff),ncol=15)
19 | rownames(check) <- mytag
20 | aliquot_match <- rep(0,length(vcff))
21 | for(i in seq_along(vcff)) # seq_along: the loop is skipped when no VCF was found (1:length() would run with i = 1 and 0)
22 | {
23 |   cat("\r",i)
24 |   vcf = readVcf(vcff[i], "hg19")
25 |   samp_names <- rownames(colData(vcf))
26 |   case_names <- sapply(strsplit(samp_names,"-"),function(x)paste(x[1:3],collapse="-"))
27 | 
28 |   samp_boo <- as.numeric(case_names == mytag[i]) # 1 where a VCF sample belongs to the case named by the file
29 |   nsamp <- length(samp_boo)
30 | 
31 |   check[i,1:nsamp] <- samp_boo # NOTE(review): the matrix has 14 sample columns; a VCF with more samples would not fit - confirm the maximum
32 |   check[i,ncol(check)] <- nsamp # last column stores the number of samples in the VCF
33 | 
34 |   sub_aliquots <- aliquots[which(aliquots[,"case_barcode"]==mytag[i]),]
35 |   aliquot_match[i] <- sum(samp_names %in% sub_aliquots[,"aliquot_barcode"])/nrow(sub_aliquots) # VCF samples found among the case's aliquots, divided by the number of those aliquots
36 | }
37 | 
38 | sums <- apply(check[,1:(ncol(check)-1)],1,function(x)sum(x,na.rm=T))
39 | sums == check[,ncol(check)]
40 | sum(sums == check[,ncol(check)]) == nrow(check) # TRUE when every sample of every VCF matches its case
41 | aliquot_match
42 | 
43 | 
--------------------------------------------------------------------------------
/R/snakemake/cov2db.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | # Enumerate cumulative coverage per aliquot for WGS/WXS
3 | # Date: 2018.11.06
4 | # Author: Kevin J., FP Barthel
5 | #######################################################
6 | 
7 | options(scipen=999)
8 | 
9 | ## Parse snakemake
10 | if(exists("snakemake")) {
11 |   files = snakemake@input[["metrics"]]
12 |   outfn = snakemake@output[["tsv"]]
13 | } else {
14 |   files = list.files("results/align/wgsmetrics", recursive = T, pattern = "WgsMetrics.txt", full.names = T) # list("results/align/wgsmetrics/GLSS-DK-0012-NB-01D-WXS-ABCB18.WgsMetrics.txt", "results/align/wgsmetrics/GLSS-DK-0003-TP-01D-WXS-E43D26.WgsMetrics.txt")
15 |   outfn = tempfile(fileext = ".tsv") # outfn was undefined outside snakemake; write to a temp file (path kept in outfn)
16 | }
17 | # Necessary packages:
18 | library(parallel)
19 | library(tidyverse)
20 | library(data.table)
21 | library(DBI)
22 | 
23 | # The first 10 rows of each file represent a header of additional information. 
24 | cov_dat = lapply(files, function(f){
25 |   dat = tryCatch(read.delim(f,as.is=T, header=T, row.names = NULL, skip = 10), error=function(e) e)
26 |   if(inherits(dat,'error')) {
27 |     message(f, '\n', dat, '\n')
28 |     return() # unreadable file: log it and contribute NULL to the list
29 |   }
30 |   # Truncate the file name to just the sample_id.
31 |   dat = dat %>%
32 |     mutate(sample_id = gsub(".WgsMetrics.txt", "", basename(f)),
33 |            high_quality_coverage_count = as.numeric(high_quality_coverage_count)) # %>% (as.numeric presumably keeps the later cumulative sum from overflowing integers - confirm)
34 |   # filter(coverage!="0") # Filter out those bases with `0` coverage.
35 | 
36 |   return(dat)
37 | })#, mc.cores=20)
38 | 
39 | ## Combine all the samples from the GLASS cohort.
40 | glass_cov = data.table::rbindlist(cov_dat)
41 | 
42 | # Cumulatively add the number of bases at each level:
43 | glass_samples_cumulative_cov = glass_cov %>%
44 |   group_by(sample_id) %>%
45 |   mutate(cumulative_coverage = rev(cumsum(rev(high_quality_coverage_count)))) %>% # reverse cumulative sum within a sample: each row holds the count for itself plus all later rows
46 |   select(aliquot_barcode = sample_id, coverage, high_quality_coverage_count, cumulative_coverage)
47 | 
48 | # Write the combined table to outfn, tab-separated, without row names or a header row.
49 | write.table(glass_samples_cumulative_cov, file = outfn, quote = F, sep = "\t", row.names = FALSE, col.names = FALSE)
--------------------------------------------------------------------------------
/R/snakemake/pyclone_create_tsv.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(DBI)
3 | # Add a user library path and load libodbc.so explicitly (presumably required by odbc on the cluster - confirm).
4 | .libPaths("/home/barthf/R/x86_64-pc-linux-gnu-library/3.3")
5 | dyn.load("/projects/verhaak-lab/verhaak_env/anaconda/v4.2.0/envs/rvenv2018/lib/libodbc.so")
6 | ## database connection
7 | con <- DBI::dbConnect(odbc::odbc(), "GLASSv2b")
8 | 
9 | ## input/output parameters
10 | barcode <- snakemake@wildcards[["aliquot_barcode"]]
11 | tsv <- snakemake@output[["tsv"]]
12 | 
13 | ## Logging
14 | message("Processing ", barcode)
15 | 
16 | ## process parameters
17 | case_barcode <- substring(barcode,1,12) # the first 12 characters of the aliquot barcode
18 | 
19 | ## Fetch data from DB
20 | rs <- 
dbSendQuery(con,read_file("sql/pyclone/pyclone_create_tsv.sql"))
21 | dbBind(rs, list(case_barcode)) # bind the case barcode to the query's placeholder
22 | qres <- dbFetch(rs)
23 | dbClearResult(rs) # release the result set opened by dbSendQuery; it was previously left open
24 | df <- qres %>%
25 |   filter(aliquot_barcode == barcode) %>% # the query returns the whole case; keep this aliquot only
26 |   select(mutation_id,ref_counts,var_counts,normal_cn,minor_cn,major_cn)
27 | dbDisconnect(con) # all database work is done; close the connection
28 | write.table(df, file = tsv, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE)
--------------------------------------------------------------------------------
/R/snakemake/seg2db.R:
--------------------------------------------------------------------------------
1 | #######################################################
2 | # Segments to database
3 | #######################################################
4 | 
5 | options(scipen=999)
6 | 
7 | ## Parse snakemake
8 | if(exists("snakemake")) {
9 |   files = snakemake@input[["seg"]]
10 |   outfn = snakemake@output[["tsv"]]
11 | } else {
12 |   files = head(list.files("results/cnv/callsegments/", recursive = T, pattern = "called.seg", full.names = T), 10) # head() instead of [1:10]: no NA entries when fewer than ten files exist
13 |   outfn = tempfile(fileext = ".tsv") # outfn was undefined outside snakemake; write to a temp file (path kept in outfn)
14 | }
15 | # Necessary packages:
16 | library(parallel)
17 | library(tidyverse)
18 | library(data.table)
19 | library(DBI)
20 | # Read each segment file (lines starting with "@" are skipped as comments) and rename columns to the database names.
21 | segs = lapply(files, function(f){
22 |   dat <- read.delim(f, comment.char = "@", as.is= TRUE)
23 |   dat <- dat %>%
24 |     mutate(aliquot_barcode = substr(basename(f),1,30), pos = sprintf("[%s,%s]", START, END)) %>% # barcode = first 30 characters of the file name
25 |     select(aliquot_barcode, chrom = CONTIG, pos, num_points = NUM_POINTS_COPY_RATIO, log2_copy_ratio = MEAN_LOG2_COPY_RATIO, call = CALL)
26 |   return(dat)
27 | })
28 | segs <- data.table::rbindlist(segs) %>% as.data.frame()
29 | 
30 | write.table(segs, file = outfn, quote = F, sep = "\t", row.names = FALSE, col.names = FALSE)
--------------------------------------------------------------------------------
/R/snakemake/snv2db.R:
--------------------------------------------------------------------------------
1 | library(VariantAnnotation)
2 | library(stringr)
3 | 
4 | vcff = snakemake@input[["vcf"]]
5 | tsvf = snakemake@output[["tsv"]]
6 | 
7 | vcf <- 
readVcf(vcff)
8 | # Field names: the "|"-separated list after ": " in the FUNCOTATION header description. Values: each record's FUNCOTATION string with the outer [ ] removed, split on "|".
9 | funcolumns <- unlist(strsplit(unlist(strsplit(info(header(vcf))['FUNCOTATION',3], '\\: '))[2],'\\|'))
10 | funcotation <- as.data.frame(do.call('rbind', str_split(gsub("^\\[|\\]$","",as.character(info(vcf)[,'FUNCOTATION'])), "\\|")))
11 | colnames(funcotation) <- funcolumns
12 | # NOTE(review): this assumes every record splits into exactly as many fields as the header lists - confirm for records carrying more than one annotation.
13 | df <- data.frame(chrom = gsub("^chr","",as.character(seqnames(vcf))),
14 |                  pos = sprintf("[%s,%s]", start(vcf), end(vcf)),
15 |                  ref = ref(vcf),
16 |                  alt = unstrsplit(CharacterList(alt(vcf)), sep=","), # multiple alternate alleles are joined with commas
17 |                  gene_symbol = funcotation$Gencode_19_hugoSymbol,
18 |                  variant_classification = funcotation$Gencode_19_variantClassification,
19 |                  secondary_variant_classification = funcotation$Gencode_19_secondaryVariantClassification,
20 |                  variant_type = funcotation$Gencode_19_variantType,
21 |                  genome_change = funcotation$Gencode_19_genomeChange,
22 |                  transcript = funcotation$Gencode_19_annotationTranscript,
23 |                  transcript_strand = funcotation$Gencode_19_transcriptStrand,
24 |                  transcript_exon = funcotation$Gencode_19_transcriptExon,
25 |                  transcript_position = funcotation$Gencode_19_transcriptPos,
26 |                  cdna_change = funcotation$Gencode_19_cDnaChange,
27 |                  cds_change = funcotation$Gencode_19_codonChange,
28 |                  protein_change = funcotation$Gencode_19_proteinChange,
29 |                  gc_content = funcotation$Gencode_19_gcContent,
30 |                  reference_context = funcotation$Gencode_19_referenceContext,
31 |                  stringsAsFactors = FALSE)
32 | # Tab-separated output without a header row or row names.
33 | write.table(df, file = tsvf, quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE)
34 | 
--------------------------------------------------------------------------------
/R/snakemake/vep_upload.r:
--------------------------------------------------------------------------------
1 | #This script takes the output of the annotate_vep rule in the Snakemake mutect2-post.smk module and reformats it for uploading to the db (variants.vep table)
2 | #Additionally generates a .tsv file for backup
3 | #-----------------------------------------------------
4 | 
5 | 
library(VariantAnnotation)

library(ensemblVEP)
library(tidyverse)
library(DBI)

# NOTE(review): hard-coded working directory; every relative path below resolves against it.
setwd('/projects/varnf/GLASS/GLASS/')

## Input/output paths (hard-coded; this script does not read the `snakemake` object)
maff = "results/mutect2/annoconsensusvcf/consensus.normalized.sorted.vep.maf"
vcff = "results/mutect2/consensusvcf/consensus.normalized.sorted.vcf.gz"
tsvf = "results/mutect2/maf2db/consensus.normalized.sorted.vep.tsv"

vcf = readVcf(vcff, "hg19")
maf = read.delim(maff, as.is = T, comment.char = '#')

message("Read file ", basename(vcff))
message("Read file ", basename(maff))

## The MAF annotation columns are attached to the VCF records purely by row position,
## so both inputs must contain the same number of records. Without this check
## data.frame() would silently recycle the shorter input whenever the lengths happen
## to be exact multiples of each other.
if (nrow(maf) != length(vcf)) {
  stop("Record count mismatch: ", length(vcf), " VCF records vs ", nrow(maf), " MAF rows")
}

## One output row per variant: coordinates/alleles from the VCF, annotation from the MAF
df = data.frame(chrom = as.character(seqnames(vcf)),
                pos = sprintf("[%s,%s]", start(vcf), end(vcf)),
                ref = ref(vcf),
                alt = unstrsplit(CharacterList(alt(vcf)), sep=","),
                gene_id = maf$Gene,
                gene_symbol = maf$Hugo_Symbol,
                variant_classification = maf$Variant_Classification,
                variant_type = maf$Variant_Type,
                cdna_position = maf$cDNA_position,
                cds_position = maf$CDS_position,
                protein_position = maf$Protein_position,
                amino_acids = maf$Amino_acids,
                codons = maf$Codons,
                hgvs_c = maf$HGVSc,
                hgvs_p = maf$HGVSp_Short,
                polyphen = maf$PolyPhen,
                sift = maf$SIFT,
                stringsAsFactors = F)

#Change chromosome X to chromosome 23
# NOTE(review): any other non-numeric contig name (e.g. Y, MT) becomes NA in the
# as.numeric() step below - confirm the consensus VCF cannot contain such contigs.
df[which(df[,"chrom"]=='X'),"chrom"] <- 23
df[,"chrom"] = as.numeric(df[,"chrom"])

#Manual edit to match GLASS variant_classifications table; this is now done in SQL
#df[which(df[,"variant_classification"]=="Splice_Region"),"variant_classification"] <- "Splice_Site"

## Backup copy of the table as a tab-separated file (with header)
write.table(df, file = tsvf, quote = F, sep = "\t", row.names = F, col.names = T)

message("Wrote output ", basename(tsvf))

## Replace the variants.vep table with the freshly built data frame
con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
dbWriteTable(con, Id(schema="variants",table="vep"), df, overwrite=TRUE)
dbDisconnect(con)

--------------------------------------------------------------------------------
/R/snv/README.md:
--------------------------------------------------------------------------------
### Mutation analysis scripts

These R scripts represent **ongoing** mutation data exploration and analyses. Note that not all files specified here are available for download.

Mutect2 calls generated from the GLASS dataset are supported on Synapse.
--------------------------------------------------------------------------------
/R/snv/archive/ensembl_genes_to_db.R:
--------------------------------------------------------------------------------
## Download the Ensembl gene id <-> HGNC symbol mapping via biomaRt and store it
## in the ref.ensembl_genes database table.
library(biomaRt)
library(DBI)

ensembl = useMart("ensembl")
ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
term = getBM(c('ensembl_gene_id','hgnc_symbol'),mart=ensembl)

con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")
dbWriteTable(con, Id(schema='ref',table='ensembl_genes'),term)
--------------------------------------------------------------------------------
/R/snv/archive/mf_longitudinal.R:
--------------------------------------------------------------------------------

library(tidyverse)
library(DBI)

con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
dat <- dbGetQuery(con, read_file("sql/mf_longitudinal_analysis.sql"))

## Reshape from wide to long: columns time_birth..mf_recurrence are split on their
## separator into a variable name and a descriptor, then the variables are spread
## back out into their own columns.
dat <- dat %>%
  gather(v, value, time_birth:mf_recurrence) %>%
  separate(v, c("var", "descriptor")) %>%
  spread(var, value)

## Linear fit of mf against time, one panel per hypermutator status
p <- ggplot(data = dat, aes(x = time, y = mf, group = tumor_pair_barcode, color = descriptor)) +
  #geom_line() +
  stat_smooth(aes(group = 1), method = "lm") +
  #stat_summary(aes(group = 1), fun.y = mean, geom = "point",
  #             shape = 17, size = 3) +
  facet_wrap(~hypermutator_status ) +
  coord_cartesian(ylim = c(0,10))

p
--------------------------------------------------------------------------------
/R/snv/archive/mf_private_shared_time.R:
--------------------------------------------------------------------------------
library(DBI)
library(tidyverse)
library(ggplot2)
library(RColorBrewer)

con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")

dat <- dbGetQuery(con, read_file("sql/mutation_freq_private_shared.sql"))
clindata <- dbGetQuery(con, "SELECT DISTINCT case_barcode, idh_codel_subtype FROM clinical.surgeries WHERE idh_codel_subtype IS NOT NULL")
## Attach subtype (joined on all shared column names) and keep pairs with mf_a and mf_b below 10
dat <- dat %>% left_join(clindata) %>% filter(mf_a < 10, mf_b < 10)

## mf_shared vs surgical interval, per subtype
ggplot(dat, aes(x = surgical_interval_mo, y = mf_shared)) +
  geom_point() +
  geom_smooth(method="lm") +
  facet_wrap(~idh_codel_subtype, scales = "free")

## mf_private_b vs surgical interval, per subtype
ggplot(dat, aes(x = surgical_interval_mo, y = mf_private_b)) +
  geom_point() +
  geom_smooth(method="lm") +
  facet_wrap(~idh_codel_subtype, scales = "free")

## mf_private_a vs surgical interval, per subtype
ggplot(dat, aes(x = surgical_interval_mo, y = mf_private_a)) +
  geom_point() +
  geom_smooth(method="lm") +
  facet_wrap(~idh_codel_subtype, scales = "free")
--------------------------------------------------------------------------------
/R/snv/archive/sample_variants.R:
--------------------------------------------------------------------------------

library(VariantAnnotation)
setwd("/fastscratch/verhaak-lab/GLASS-WG")

## Parse snakemake
fbf = snakemake@input[["freebayes"]]
csf = snakemake@input[["consensus"]]
mtf = snakemake@params[["mutect2"]]
trf = snakemake@output[["trigger"]]
spl = snakemake@wildcards[["aliquot_barcode"]]

## Read freebayes and consensus input as VRanges
fb = readVcfAsVRanges(fbf, "hg19")
cs = readVcfAsVRanges(csf, "hg19", param=ScanVcfParam(fixed = "ALT", info = NA, geno = "AD"))
message("Loaded ", basename(fbf))
message("Loaded ", basename(csf))

## If sample is a tumor sample, read mutect calls
## `&&` short-circuits, so file.exists() is never called on an NA path
## (the vectorised `&` evaluated both sides and file.exists(NA) is an error).
paired = FALSE
if(!is.na(mtf) && file.exists(mtf)) {
  paired = TRUE
  mt = readVcfAsVRanges(mtf, "hg19", param=ScanVcfParam(fixed = "ALT", info = NA, geno = "AD"))
  message("Loaded ", basename(mtf))
}

## Count overlap between freebayes calls and consensus callset
hitscs = fb %in% cs

## Print some numbers
prop_cs = round(length(fb)/length(cs)*100,1)
prop_fb = round(sum(hitscs)/length(hitscs)*100,1)
message("Found ", length(cs), " variants in consensus callset")
message("Found ", length(fb), " freebayes calls (", prop_cs, "% of callset), amongst which ",
        sum(hitscs), " (", prop_fb, "%) matched calls from the consensus callset.")

## Subset freebayes output by only variants present in db
fb = fb[hitscs]

## Clear some memory
rm(cs)

## If a mutect callset is available, quantify overlap between mutect and freebayes
if(paired) {
  hitsmt = fb %in% mt
  prop_mt = round(sum(hitsmt)/length(mt)*100,1)
  message("Found ", length(mt), " filtered Mutect2 calls, of which ", sum(hitsmt), " (",
          prop_mt, "%) exactly match calls from the consensus callset.")

  ## Annotate M2-called variants
  fb$called = hitsmt
} else {
  fb$called = FALSE
}

## Create output dataframe
df = data.frame(aliquot_barcode = spl, chrom = seqnames(fb), start = start(fb), end = end(fb), alt = alt(fb),
                genotype = fb$GT, read_depth = totalDepth(fb), ref_count = refDepth(fb), alt_count = altDepth(fb),
                called = fb$called,
                stringsAsFactors = F)

## Drop variants without read counts
df = df[which(!is.na(df$read_depth)),]

## Clear more memory
rm(fb)

## Write to database
.libPaths('/home/barthf/R/x86_64-pc-linux-gnu-library/3.3')

con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
DBI::dbWriteTable(con, DBI::Id(schema="analysis",table="snv_genotypes"), df, append=T)
## This script runs once per aliquot, so release the connection explicitly
DBI::dbDisconnect(con)

## Write a trigger with the number of rows added
cat(nrow(df), file = trf)
message("Printed number of rows (", nrow(df), ") to file: ", basename(trf))

## END ##
--------------------------------------------------------------------------------
/R/snv/archive/shared_private_to_vcf.R:
--------------------------------------------------------------------------------
## Export variants returned by sql/variant_status_leeds.sql as one VCF file per
## tumor pair / variant status combination.
library(VariantAnnotation)
library(DBI)
library(tidyverse)
library(BSgenome.Hsapiens.UCSC.hg19)

con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
time <- system.time(qres <- dbGetQuery(con, read_file(('sql/variant_status_leeds.sql'))))

ref_genome <- BSgenome.Hsapiens.UCSC.hg19
ref_organism <- GenomeInfoDb::organism(ref_genome)
ref_style <- seqlevelsStyle(ref_genome)

genome_name <- genome(ref_genome)[[1]]
## Switch the reference to NCBI-style sequence names before its seqinfo is used below
seqlevelsStyle(ref_genome) = "NCBI"

## Build a single-sample ("TEST") VCF holding every queried variant.
## The GT matrix has one row per variant and one column named after the sample;
## the dimnames belong to matrix(), not to rep() as a misplaced parenthesis
## previously had it, and row names are left NULL because rowRanges is unnamed.
vcf <- VCF(rowRanges = GRanges(seqnames = trimws(qres$chrom),
                               ranges = IRanges(start = as.integer(qres$start_pos),
                                                end = as.integer(qres$end_pos)),
                               seqinfo = seqinfo(ref_genome),
                               paramRangeID = rep(factor(NA),nrow(qres))),
           fixed = DataFrame(REF = DNAStringSet(qres$ref),
                             ALT = unname(split(DNAStringSet(qres$alt),seq_along(qres$alt))),
                             QUAL = as.numeric(NA_integer_),
                             FILTER = 'PASS'),
           geno = SimpleList(GT = matrix(rep(".",nrow(qres)), ncol = 1,
                                         dimnames = list(NULL, "TEST"))),
           colData = DataFrame(Samples = 1, row.names = c("TEST")))

## One VCF object per "<tumor_pair_barcode>-<variant_status>" group
vcfout <- split(vcf, sprintf("%s-%s", qres$tumor_pair_barcode, qres$variant_status))
#rm(vcf)

for (vcf_name in names(vcfout)) {
  vcf = vcfout[[vcf_name]]
  message("Writing ", vcf_name, " with ", nrow(vcf), " rows.")
  writeVcf(vcf, file = sprintf("results/mutect2/fractionated-vcf/%s.vcf", vcf_name), index = FALSE)
}
--------------------------------------------------------------------------------
/R/snv/archive/signature1_by_age_and_interval.R:
--------------------------------------------------------------------------------
## Relate the relative contribution of 'Signature.1' to age at diagnosis and to
## surgical interval.
library(DBI)
library(tidyverse)
library(ggplot2)

con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
q <- "SELECT ms.tumor_pair_barcode, tp.case_barcode, mutation_status, ms.relative_contribution, cs.case_age_diagnosis_years, tp.surgical_interval_mo, su.idh_codel_subtype
FROM analysis.mutsig_private_vs_shared ms
LEFT JOIN analysis.tumor_pairs tp ON tp.tumor_pair_barcode = ms.tumor_pair_barcode
LEFT JOIN clinical.cases cs ON cs.case_barcode = tp.case_barcode
LEFT JOIN clinical.surgeries su ON su.sample_barcode = substring(tp.tumor_pair_barcode from 1 for 15)
WHERE signature = 'Signature.1' AND mut_count >= 100"

qres <- dbGetQuery(con, q)

## Contribution vs age, by mutation status (and additionally by subtype)
ggplot(qres, aes(x=case_age_diagnosis_years, y = relative_contribution)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~mutation_status)
ggplot(qres, aes(x=case_age_diagnosis_years, y = relative_contribution)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~mutation_status + idh_codel_subtype)

## Correlation tests within each mutation status
cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="shared")
cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="primary")
cor.test(~ relative_contribution + case_age_diagnosis_years, data = qres, subset = qres$mutation_status=="recurrent")

## Contribution vs surgical interval, by mutation status and subtype
ggplot(qres, aes(x=surgical_interval_mo, y = relative_contribution)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~mutation_status + idh_codel_subtype, scales = "free_x")
--------------------------------------------------------------------------------
/R/snv/archive/sigproba2db.R:
--------------------------------------------------------------------------------
## Load the signature probability table and store it, in long format, in the
## ref.signature_proba database table.

## %>%, gather() and transmute() come from the tidyverse; Id() and dbWriteTable()
## from DBI. The script previously relied on them already being attached.
library(tidyverse)
library(DBI)

con <- DBI::dbConnect(odbc::odbc(), "GLASSv2")

tmp <- read.delim('Downloads/signatures_probabilities.txt', as.is = TRUE)

## Wide -> long: everything except the first three columns becomes
## (signature, proba); only columns whose name starts with "Signature" are kept
df <- tmp %>% gather(-(1:3),key="signature",value="proba") %>% filter(grepl("^Signature",signature))

df2 <- df %>% transmute(signature = as.numeric(gsub("Signature.","",signature)),
                        ref_context = Trinucleotide,
                        alt = substring(Substitution.Type,3,3),
                        substitution_type = Substitution.Type,
                        proba)

dbWriteTable(con, Id(schema="ref",table="signature_proba"), df2, overwrite = TRUE)
--------------------------------------------------------------------------------
/R/snv/archive/vaf-freq.R:
--------------------------------------------------------------------------------
## Scatter plot of vaf_a against vaf_b per gene, coloured by variant classification
library(tidyverse)
library(DBI)
library(ggthemes)
library(ggplot2)
library(RColorBrewer)

con <- DBI::dbConnect(odbc::odbc(), "VerhaakDB")
vaf_res <- dbGetQuery(con, read_file('sql/vaf_compare.sql'))

g <-
  ggplot(vaf_res, aes(vaf_a, vaf_b)) +
  geom_point(aes(color=variant_classification)) +
  geom_abline(slope=1, alpha=0.2, linetype=2) +
  labs(x="Primary VAF", y="Optimal recurrence VAF", color = "Variant Classification") +
  coord_cartesian(xlim = c(0,1), ylim = c(0,1)) +
  theme_bw(base_size = 18) +
  theme(axis.text=element_text(size=10)) +
  facet_wrap(~gene_symbol, ncol = 3) +
  scale_color_manual(values=c("5'Flank" = brewer.pal(9, "Paired")[9],
                              "Frame_Shift_Del" = brewer.pal(7, "Paired")[1],
                              "Frame_Shift_Ins" = brewer.pal(7, "Paired")[2],
                              "In_Frame_Del" = brewer.pal(7, "Paired")[3],
                              "In_Frame_Ins" = brewer.pal(7, "Paired")[4],
                              "Missense_Mutation" = brewer.pal(7, "Paired")[5],
                              "Nonsense_Mutation" = brewer.pal(7, "Paired")[6],
                              "Splice_Site" = brewer.pal(9, "Paired")[7],
                              "Translation_Start_Site" = brewer.pal(9, "Paired")[8]))

pdf(file = "~/The Jackson Laboratory/GLASS - Documents/Figure 1/suppl_vaf.pdf", width = 10, height = 9)
plot(g)
dev.off()
--------------------------------------------------------------------------------
/R/telseq/telseq.R:
--------------------------------------------------------------------------------
library(data.table)
library(dplyr)
library(tidyr)
library(parallel)

## Constants/parameters
## K: lowest TEL<k> column included in the telomeric read count (TEL7..TEL99)
## G, c: scaling constants used in the length formula below
## NOTE: `c` shadows base::c as a variable only; calls such as c(1, 2) still
## resolve to the function.
K = 7
G = 332720800
c = 46000

resultsdir = "results/telseq"
ptrn = "telseq.txt$"

##################################################################################################################################################################

message("Merging telseq output")

# list of telseq data files
tsfiles = list.files(resultsdir, pattern=ptrn, recursive=T, full.names=T)

## Summarise each telseq output file into a single row. The aliquot barcode is
## the part of the file name before the first dot. Returns NULL for files that
## cannot be read or contain no rows, so one bad file no longer aborts the merge
## (rbindlist() skips NULL entries).
tsdat = mclapply(tsfiles, function(fn) {
  aliquot_barcode = unlist(strsplit(basename(fn),"\\."))[1]

  f = tryCatch(data.table::fread(fn), error=function(e) e)

  ## Previously the file name was printed and execution continued with the
  ## error object, which then failed inside data.frame().
  if(inherits(f, "error")) {
    message("Skipping unreadable telseq file ", fn, ": ", conditionMessage(f))
    return(NULL)
  }
  if(nrow(f) == 0) {
    message("Skipping empty telseq file ", fn)
    return(NULL)
  }

  f = data.frame(aliquot_barcode,f)

  ## Read counts are summed over all rows of the file
  total_reads = as.numeric(as.character(f$Total))
  total_reads_wm = round(sum(total_reads,na.rm=T))

  mapped_reads = as.numeric(as.character(f$Mapped))
  mapped_reads_wm = round(sum(mapped_reads,na.rm=T))

  duplicate_reads = as.numeric(as.character(f$Duplicates))
  duplicate_reads_wm = round(sum(duplicate_reads,na.rm=T))

  ## Per-row sum of the TEL<K>..TEL99 columns that exist in the file,
  ## then averaged over rows weighted by each row's total read count
  tel = apply(f[,na.omit(match(paste('TEL',K:99,sep=''),colnames(f)))],1,sum,na.rm=T)
  tel_wm = weighted.mean(tel, total_reads, na.rm=T)

  ## Per-row sum of the GC4 and GC5 columns, weighted the same way
  gc = apply(f[,match(paste('GC',4:5,sep=''),colnames(f))],1,sum)
  gc_wm = weighted.mean(gc, total_reads, na.rm=T)

  ## Length estimate per row, then the read-weighted mean
  len = (tel/gc)*(G/c)
  len_wm = weighted.mean(len, total_reads, na.rm=T)

  out = data.frame(aliquot_barcode,
                   total_reads=total_reads_wm,
                   mapped_reads=mapped_reads_wm,
                   duplicate_reads=duplicate_reads_wm,
                   tel=tel_wm,
                   K,
                   G,
                   c,
                   gc=gc_wm,
                   length=len_wm)

  return(out)
})

tsdat = rbindlist(tsdat) %>% as.data.frame()

## Store the merged table in analysis.telseq
con <- DBI::dbConnect(odbc::odbc(), "GLASSv3")
DBI::dbWriteTable(con, DBI::Id(schema="analysis",table="telseq"), tsdat)
DBI::dbDisconnect(con)
--------------------------------------------------------------------------------
/bin/bam-rg-insert-size-calc.pl:
--------------------------------------------------------------------------------
#!/usr/bin/perl

# Script that retrieves the reads for a particular read group from a BAM file, and
# does a calculation of the average and median insert size.
#
# Usage: perl bam-rg-insert-size-calc.pl [infile] [read group]
#
# Only the first $MAX_READS lines returned by `samtools view -r <rg> -f 0x2` are
# examined. For each read whose mate is on the same reference (RNEXT is "="),
# the value recorded is |PNEXT - POS| + length(SEQ); values are kept only when
# |PNEXT - POS| is in (0, $MAX_DISTANCE].

use strict;
use warnings;

my $MAX_READS    = 1000000;   # number of SAM lines to examine
my $MAX_DISTANCE = 5000;      # largest accepted |PNEXT - POS|

if (scalar(@ARGV) != 2) {
	print "Usage: perl bam-rg-insert-size-calc.pl [infile] [read group]\n";
	exit(1);
}

my ($infile, $rg) = @ARGV;

my $n = 0;
my $sum = 0;
my @insert_sizes = ();

# List-form pipe open: the arguments go to samtools directly, without a shell,
# so file names or read-group ids containing spaces or shell metacharacters are
# safe. Lines are streamed instead of being loaded into memory all at once.
open(my $sam, '-|', 'samtools', 'view', '-r', $rg, '-f', '0x2', $infile)
	or die "Can't run samtools view on $infile: $!\n";

my $lines_read = 0;
while (my $line = <$sam>) {
	last if ++$lines_read > $MAX_READS;
	chomp($line);

	# SAM columns (0-based): 3 = POS, 6 = RNEXT, 7 = PNEXT, 9 = SEQ
	my @pieces = split(/\s+/, $line);
	next unless scalar(@pieces) >= 10;

	if ($pieces[6] eq "=") {
		my $insert_size = abs($pieces[7] - $pieces[3]);
		my $length = length($pieces[9]);

		if ($insert_size > 0 && $insert_size <= $MAX_DISTANCE) {
			push(@insert_sizes, ($insert_size+$length));
			$sum += ($insert_size + $length);
			$n++;
			if ($n >= $MAX_READS) {
				last;
			}
		}
	}
}

# Stopping early makes samtools exit on a broken pipe, so the status of close()
# is deliberately not checked.
close($sam);

if ($n == 0) {
	print "No insert sizes above zero\n";
} else {
	my $avg = $sum/$n;
	print "Average: ".$avg."\n";

	my $median;
	@insert_sizes = sort {$a <=> $b} @insert_sizes;
	if ($n % 2) { # Odd case
		$median = $insert_sizes[int($n/2)];
	} else { # Even case
		$median = ($insert_sizes[($n/2)-1]+$insert_sizes[($n/2)])/2;
	}
	print "Median: ".$median."\n";
}

exit();
--------------------------------------------------------------------------------
/bin/bamtofastq-rename.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## Script to quickly turn uBAM into FASTQ with appropriate filenames
## Tested on "_test2" per readgroup "RevertSam" output uBAM files
## SM, FC and LN are parsed from the SM: and PU: tags of the @RG header line;
## file names are quoted so that paths containing spaces work.
for i in *.bam; do
	ID="_test2"
	SM=`samtools view -H "$i" | grep '^@RG' | sed "s/.*SM:\([^\t]*\).*/\1/g"`
	FC=`samtools view -H "$i" | grep '^@RG' | sed "s/.*PU:[^_]*_[^_]*_[^_]*_[^_]*_\([^_]*\).*/\1/g"`
	LN=`samtools view -H "$i" | grep '^@RG' | sed "s/.*PU:[^_]*_[^_]*_[^_]*_[^_]*_[^_]*_\([^#]*\).*/\1/g"`
	bedtools bamtofastq -i "$i" -fq "${ID}_${SM}_${FC}_L${LN}_R1.fq" -fq2 "${ID}_${SM}_${FC}_L${LN}_R2.fq"
done
--------------------------------------------------------------------------------
/bin/bedtovcf.sh:
--------------------------------------------------------------------------------
## Print BED-like intervals for the records of consensus.norm.vcf.gz:
## records whose REF or ALT is longer than one base are padded on both sides by
## |len(REF) - len(ALT)| + 1, all other records become [POS-1, POS).
zcat consensus.norm.vcf.gz | awk '{OFS="\t"; \
	if (!/^#/ && (length($4) > 1 || length($5) > 1))\
	{ print $1,$2-sqrt((length($4)-length($5))^2)-1,$2+sqrt((length($4)-length($5))^2)+1,$4"/"$5,"+" } \
	else if (!/^#/) \
	{ print $1,$2-1,$2,$4"/"$5,"+" } \
}' | less -S
--------------------------------------------------------------------------------
/bin/get-readgroups.sh:
--------------------------------------------------------------------------------
#!/bin/sh
## Get readgroups
## Prints every @RG header line of each *bam* file under BAMDIR (depth <= 2),
## prefixed with the file path and a tab.
BAMDIR="/fastscratch/barthf/GLASS-WG/download"
find "${BAMDIR}" -maxdepth 2 -type f -name "*bam*" | xargs -I% sh -c "samtools view -H % | grep ^@RG | sed 's|^|%\t|' | grep -v '^\['"
--------------------------------------------------------------------------------
/bin/gistic_run.pbs:
--------------------------------------------------------------------------------
#!/bin/sh
#PBS -V
#PBS -N GISTIC
#PBS -j oe
#PBS -M floris.barthel@jax.org
#PBS -m a
#PBS -l nodes=1:ppn=8,walltime=72:00:00
#PBS -l mem=24gb

## Output directories (BD*) and segment files (SEG*) for primary and recurrence
BDP=/projects/verhaak-lab/GLASS-analysis/results/gistic2/primary
BDR=/projects/verhaak-lab/GLASS-analysis/results/gistic2/recurrence
SEGP=/projects/verhaak-lab/GLASS-analysis/results/gistic2/primary.seg
SEGR=/projects/verhaak-lab/GLASS-analysis/results/gistic2/recurrence.seg
MK=/projects/verhaak-lab/GLASS-analysis/results/gistic2/markers.txt
REFGENE=/projects/verhaak-lab/FRONTIER/data/ref/hg19_v19.mat
## NOTE(review): the CNV file lives under GISTIC_2_0_22 while the executable below
## is GISTIC_2_0_23 - confirm this is intentional.
CNV=/home/barthf/opt/GISTIC_2_0_22/ref/CNV.hg19.bypos.111213.txt

## Same parameters for both runs; only the output directory and segment file differ
/home/barthf/opt/GISTIC_2_0_23/gistic_run -b $BDP -seg $SEGP -mk $MK -cnv $CNV -refgene $REFGENE -genegistic 1 -smallmem 1 -broad 1 -brlen 0.5 -conf 0.99 -armpeel 1 -savegene 1 -gcm extreme -v 25 -rx 0 -maxspace 1 -js 1 -cap 0.5 -td 0.05 -ta 0.05
/home/barthf/opt/GISTIC_2_0_23/gistic_run -b $BDR -seg $SEGR -mk $MK -cnv $CNV -refgene $REFGENE -genegistic 1 -smallmem 1 -broad 1 -brlen 0.5 -conf 0.99 -armpeel 1 -savegene 1 -gcm extreme -v 25 -rx 0 -maxspace 1 -js 1 -cap 0.5 -td 0.05 -ta 0.05
--------------------------------------------------------------------------------
/bin/preprocess-intervals.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## GATK preprocess intervals for CNV calling
## See https://software.broadinstitute.org/gatk/documentation/article?id=11682

gatk PreprocessIntervals \
	-R /projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta \
	--bin-length 1000 \
	--padding 0 \
	--interval-merging-rule OVERLAPPING_ONLY \
	-O human_g1k_v37_decoy.preprocessed.interval_list \
	--exclude-intervals X \
	--exclude-intervals Y \
	--exclude-intervals MT \
	--exclude-intervals GL000207.1 \
	--exclude-intervals GL000226.1 \
	--exclude-intervals GL000229.1 \
	--exclude-intervals GL000231.1 \
	--exclude-intervals GL000210.1 \
	--exclude-intervals GL000239.1 \
	--exclude-intervals GL000235.1 \
	--exclude-intervals GL000201.1 \
	--exclude-intervals GL000247.1 \
	--exclude-intervals GL000245.1 \
	--exclude-intervals GL000197.1 \
	--exclude-intervals GL000203.1 \
	--exclude-intervals GL000246.1 \
	--exclude-intervals GL000249.1 \
	--exclude-intervals GL000196.1 \
	--exclude-intervals GL000248.1 \
	--exclude-intervals GL000244.1 \
	--exclude-intervals GL000238.1 \
	--exclude-intervals GL000202.1 \
	--exclude-intervals GL000234.1 \
	--exclude-intervals GL000232.1 \
	--exclude-intervals GL000206.1 \
	--exclude-intervals GL000240.1 \
	--exclude-intervals GL000236.1 \
	--exclude-intervals GL000241.1 \
	--exclude-intervals GL000243.1 \
	--exclude-intervals GL000242.1 \
	--exclude-intervals GL000230.1 \
	--exclude-intervals GL000237.1 \
	--exclude-intervals GL000233.1 \
	--exclude-intervals GL000204.1 \
	--exclude-intervals GL000198.1 \
	--exclude-intervals GL000208.1 \
	--exclude-intervals GL000191.1 \
	--exclude-intervals GL000227.1 \
	--exclude-intervals GL000228.1 \
	--exclude-intervals GL000214.1 \
	--exclude-intervals GL000221.1 \
	--exclude-intervals GL000209.1 \
	--exclude-intervals GL000218.1 \
	--exclude-intervals GL000220.1 \
	--exclude-intervals GL000213.1 \
	--exclude-intervals GL000211.1 \
	--exclude-intervals GL000199.1 \
	--exclude-intervals GL000217.1 \
	--exclude-intervals GL000216.1 \
	--exclude-intervals GL000215.1 \
	--exclude-intervals GL000205.1 \
	--exclude-intervals GL000219.1 \
	--exclude-intervals GL000224.1 \
	--exclude-intervals GL000223.1 \
	--exclude-intervals GL000195.1 \
	--exclude-intervals GL000212.1 \
	--exclude-intervals GL000222.1 \
	--exclude-intervals GL000200.1 \
	--exclude-intervals GL000193.1 \
	--exclude-intervals GL000194.1 \
	--exclude-intervals GL000225.1 \
	--exclude-intervals GL000192.1 \
	--exclude-intervals NC_007605 \
	--exclude-intervals hs37d5
--------------------------------------------------------------------------------
/bin/reset_directory_structure.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## Empties and removes the benchmarks/, logs/ and results/ trees (run from the
## project root), then recreates logs/drmaa.
rm benchmarks/*/*
rmdir benchmarks/*
rmdir benchmarks

rm logs/*/*
rmdir logs/*
rmdir logs

mkdir -p logs/drmaa

rm results/*/*/*/*
rm results/*/*/*
rm results/*/*
rm results/*

rmdir results/*/*/*
rmdir results/*/*
rmdir results/*
rmdir results
--------------------------------------------------------------------------------
/bin/scatter-interval-list-to-bed.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## Convert every scattered.interval_list below the scatter-5 directory into a
## BED file next to it: header lines (starting with @) are dropped and the start
## coordinate is shifted by -1.
## Abort if the directory is missing so that no .bed files are written relative
## to whatever directory the script happened to be started from.
cd /projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/scattered_wgs_intervals/scatter-5 || exit 1
for f in `find . -type f -name "scattered.interval_list"`;
do
	cat "$f" | grep -vE "^@" | awk 'OFS="\t" {print $1, $2-1, $3, $5, 0, $4}' > "${f%.*}.bed";
done
--------------------------------------------------------------------------------
/bin/select-germline-variants.sh:
--------------------------------------------------------------------------------
#!/bin/bash
## Select variants from af-only-gnomad.raw.sites.b37.vcf.gz

module load bcftools

## Need to split multi-allelic sites across multiple lines, see
## https://gatkforums.broadinstitute.org/gatk/discussion/10975/use-select-variants-on-a-gnomad-vcf-for-mutect2-contamination-filtering
bcftools norm \
	-f "/projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta" \
	-m- \
	-o "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.vcf.gz" \
	-O z \
	--threads 6 \
	"/projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/Mutect2/af-only-gnomad.raw.sites.b37.vcf.gz"

## Had to remove contigs from VCF file because bcftools does not add length attribute to contigs
## and SelectVariants complains if they are missing
bcftools view \
	-h "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.vcf.gz" | \
	sed '/^##contig/d' \
	> "/projects/verhaak-lab/verhaak_ref/gatk-cnv/newheader.txt"

bcftools reheader \
	-h "/projects/verhaak-lab/verhaak_ref/gatk-cnv/newheader.txt" \
	-o "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.reheader.vcf.gz" \
	"/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.vcf.gz"

## Need to have an index because SelectVariants complains w/o index
gatk IndexFeatureFile \
	-F "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.reheader.vcf.gz"

## Finally we can select variants
gatk SelectVariants \
	-V "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.norm.reheader.vcf.gz" \
	-O "/projects/verhaak-lab/verhaak_ref/gatk-cnv/af-only-gnomad.raw.sites.b37.selected.vcf.gz" \
	-R "/projects/verhaak-lab/verhaak_ref/gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta" \
	--select "AF>0.05"
--------------------------------------------------------------------------------
/bin/svaba-test-parameters.sh:
--------------------------------------------------------------------------------
#!/bin/bash

#PBS -N svaba-test
#PBS -l walltime=20:00:00
#PBS -l nodes=1:ppn=15
#PBS -r n
#PBS -M kevin.c.johnson@jax.org
#PBS -m a
#PBS -k oe
#PBS -q batch
#PBS -o /projects/verhaak-lab/sulman_GSCs/GSC-BAM/svaba-test/logs/svaba-${PBS_JOBID}.log
#PBS -e /projects/verhaak-lab/sulman_GSCs/GSC-BAM/svaba-test/logs/svaba-${PBS_JOBID}.err
#PBS -V

# Example bam file from the Sulman GSC project using a matched normal.
TUM_BAM="/projects/verhaak-lab/sulman_GSCs/GSC-BAM/BAM/GS6-27-sample1_S1.aln.dup.realn.recal.rp.bam"
NORM_BAM="/projects/verhaak-lab/sulman_GSCs/GSC-BAM/BAM/N6-27-sample2_S2.aln.dup.realn.recal.rp.bam"
DBSNP="/projects/verhaak-lab/sulman_GSCs/GSC-BAM/svaba-test/test-reference/dbsnp_indel.vcf"
CORES=10
REF="/projects/verhaak-lab/glassdir/dockscratch/bundle/human_g1k_v37_decoy.fasta"

# Samir downloaded svaba and saved it to Verhaak_env.
module load rvsvaba

# Change our working directory to the SVABA test.
cd $PBS_O_WORKDIR

# Set the date for which a sample was run.
STARTTIME=`date`
echo $STARTTIME
echo "Processing $TUM_BAM"

# Run on all chromosomes, including scaffolds.
34 | svaba run -t $TUM_BAM -n $NORM_BAM -D $DBSNP -a GS6-27 -G $REF --hp -p $CORES -------------------------------------------------------------------------------- /conf/optitype_config.ini: -------------------------------------------------------------------------------- 1 | [mapping] 2 | 3 | # Absolute path to RazerS3 binary, and number of threads to use for mapping 4 | 5 | razers3=razers3 6 | threads=1 7 | 8 | [ilp] 9 | 10 | # A Pyomo-supported ILP solver. The solver must be globally accessible in the 11 | # environment OptiType is run, so make sure to include it in PATH. 12 | # Note: this is NOT a path to the solver binary, but a keyword argument for 13 | # Pyomo. Examples: glpk, cplex, cbc. 14 | 15 | solver=glpk 16 | threads=1 17 | 18 | [behavior] 19 | 20 | # tempdir=/path/to/tempdir # we may enable this setting later. Not used now. 21 | 22 | # Delete intermediate bam files produced by RazerS3 after OptiType finished 23 | # loading them. If you plan to re-analyze your samples with different settings 24 | # disabling this option can be a time-saver, as you'll be able to pass the bam 25 | # files to OptiType directly as input and spare the expensive read mapping 26 | # step. 27 | 28 | deletebam=true 29 | 30 | # In paired-end mode one might want to use reads with just one mapped end (e.g., 31 | # the other end falls outside the reference region). This setting allows the 32 | # user to keep them with an optionally reduced weight. A value of 0 means they 33 | # are discarded for typing, 0.2 means single reads are "worth" 20% of paired 34 | # reads, and a value of 1 means they are treated as valuable as properly mapped 35 | # read pairs. Note: unpaired reads will be reported on the result coverage plots 36 | # for completeness, regardless of this setting. 37 | 38 | unpaired_weight=0 39 | 40 | # We call a read pair discordant if its two ends best-map to two disjoint sets 41 | # of alleles. 
Such reads can be either omitted or either of their ends treated 42 | # as unpaired hits. Note: discordant read pairs are reported on the coverage 43 | # plots as unpaired reads, regardless of this setting. 44 | 45 | use_discordant=false 46 | -------------------------------------------------------------------------------- /dag/align.rulegraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/align.rulegraph.png -------------------------------------------------------------------------------- /dag/fingerprint.rulegraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/fingerprint.rulegraph.png -------------------------------------------------------------------------------- /dag/gatk-cnv.rulegraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/gatk-cnv.rulegraph.png -------------------------------------------------------------------------------- /dag/mt2.rulegraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/mt2.rulegraph.png -------------------------------------------------------------------------------- /dag/svdetect.rulegraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/svdetect.rulegraph.png -------------------------------------------------------------------------------- /dag/vs2.rulegraph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dag/vs2.rulegraph.png -------------------------------------------------------------------------------- /dbm/glass-rearranged.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dbm/glass-rearranged.png -------------------------------------------------------------------------------- /dbm/glass.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/dbm/glass.png -------------------------------------------------------------------------------- /envs/absolute.yaml: -------------------------------------------------------------------------------- 1 | name: absolute 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - r-devtools 9 | - r-numderiv 10 | -------------------------------------------------------------------------------- /envs/align.yaml: -------------------------------------------------------------------------------- 1 | name: align 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - bwa=0.7.17=ha92aebf_3 9 | - fastqc=0.11.7=pl5.22.0_0 10 | - gatk4=4.0.9.0=0 11 | - multiqc=1.6a0=py36h24bf2e0_2 12 | # - simplejson=3.8.1=py36_0 13 | # - colormath=3.0.0=py_2 14 | # - lzstring=1.0.3=py36_0 15 | # - perl=5.22.0.1=0 16 | # - spectra=0.0.11=py_0 17 | # - asn1crypto=0.24.0=py36_0 18 | # - blas=1.0=mkl 19 | # - ca-certificates=2018.03.07=0 20 | # - certifi=2018.4.16=py36_0 21 | # - cffi=1.11.5=py36h9745a5d_0 22 | # - chardet=3.0.4=py36_1 23 | # - click=6.7=py36_0 24 | # - cryptography=2.2.2=py36h14c3975_0 25 | # - cycler=0.10.0=py36_0 26 | # - dbus=1.13.2=h714fa37_1 27 | # - decorator=4.3.0=py36_0 28 | # - expat=2.2.5=he0dffb1_0 29 | # - 
font-ttf-dejavu-sans-mono=2.37=h6964260_0 30 | # - fontconfig=2.13.0=h9420a91_0 31 | # - freetype=2.9.1=h8a8886c_0 32 | # - future=0.16.0=py36_0 33 | # - glib=2.56.1=h000015b_0 34 | # - gst-plugins-base=1.14.0=hbbd80ab_1 35 | # - gstreamer=1.14.0=hb453b48_1 36 | # - icu=58.2=h9c2bf20_1 37 | # - idna=2.7=py36_0 38 | # - intel-openmp=2018.0.3=0 39 | # - jinja2=2.10=py36_0 40 | # - jpeg=9b=h024ee3a_2 41 | # - kiwisolver=1.0.1=py36hf484d3e_0 42 | # - libedit=3.1.20170329=h6b74fdf_2 43 | # - libffi=3.2.1=hd88cf55_4 44 | # - libgcc-ng=7.2.0=hdf63c60_3 45 | # - libgfortran-ng=7.2.0=hdf63c60_3 46 | # - libpng=1.6.34=hb9fc6fc_0 47 | # - libstdcxx-ng=7.2.0=hdf63c60_3 48 | # - libuuid=1.0.3=h1bed415_2 49 | # - libxcb=1.13=h1bed415_1 50 | # - libxml2=2.9.8=h26e45fe_1 51 | # - markdown=2.6.11=py36_0 52 | # - markupsafe=1.0=py36h14c3975_1 53 | # - matplotlib=2.2.2=py36hb69df0a_2 54 | # - mkl=2018.0.3=1 55 | # - mkl_fft=1.0.4=py36h4414c95_1 56 | # - mkl_random=1.0.1=py36h4414c95_1 57 | # - ncurses=6.1=hf484d3e_0 58 | # - networkx=2.0=py36h7e96fb8_0 59 | # - numpy=1.15.0=py36h1b885b7_0 60 | # - numpy-base=1.15.0=py36h3dfced4_0 61 | # - openjdk=8.0.121=1 62 | # - openssl=1.0.2o=h20670df_0 63 | # - pcre=8.42=h439df22_0 64 | # - pip=10.0.1=py36_0 65 | # - pycparser=2.18=py36_1 66 | # - pyopenssl=18.0.0=py36_0 67 | # - pyparsing=2.2.0=py36_1 68 | # - pyqt=5.9.2=py36h22d08a2_0 69 | # - pysocks=1.6.8=py36_0 70 | # - python=3.6.6=hc3d631a_0 71 | # - python-dateutil=2.7.3=py36_0 72 | # - pytz=2018.5=py36_0 73 | # - pyyaml=3.13=py36h14c3975_0 74 | # - qt=5.9.6=h52aff34_0 75 | # - readline=7.0=ha6073c6_4 76 | # - requests=2.19.1=py36_0 77 | # - setuptools=39.2.0=py36_0 78 | # - sip=4.19.8=py36hf484d3e_0 79 | # - six=1.11.0=py36_1 80 | # - sqlite=3.24.0=h84994c4_0 81 | # - tk=8.6.7=hc745277_3 82 | # - tornado=5.0.2=py36h14c3975_0 83 | # - urllib3=1.23=py36_0 84 | # - wheel=0.31.1=py36_0 85 | # - xz=5.2.4=h14c3975_4 86 | # - yaml=0.1.7=had09818_2 87 | # - zlib=1.2.11=ha838bed_2 88 | 89 | 
-------------------------------------------------------------------------------- /envs/bcftools.yaml: -------------------------------------------------------------------------------- 1 | name: freebayes 2 | channels: 3 | - bioconda 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - htslib=1.8=1 8 | - libdeflate=1.0=h470a237_0 9 | - parallel=20160622=1 10 | - bcftools=1.9 11 | # - perl-threaded=5.22.0=13 12 | # - bzip2=1.0.6=h470a237_2 13 | # - ca-certificates=2018.8.24=ha4d7672_0 14 | # - certifi=2018.8.24=py36_1 15 | # - libgcc-ng=7.2.0=hdf63c60_3 16 | # - libstdcxx-ng=7.2.0=hdf63c60_3 17 | # - openssl=1.0.2o=h470a237_1 18 | # - perl=5.26.2=h470a237_0 19 | # - curl=7.54.1=0 20 | # - krb5=1.13.2=0 21 | # - libgcc=5.2.0=0 22 | # - libssh2=1.8.0=0 23 | # - pip=9.0.1=py36_1 24 | # - python=3.6.2=0 25 | # - readline=6.2=2 26 | # - setuptools=36.4.0=py36_1 27 | # - sqlite=3.13.0=0 28 | # - tk=8.5.18=0 29 | # - wheel=0.29.0=py36_0 30 | # - xz=5.2.3=0 31 | # - zlib=1.2.11=0 32 | # prefix: /projects/barthf/opt/miniconda3/envs/freebayes 33 | 34 | -------------------------------------------------------------------------------- /envs/delly.yaml: -------------------------------------------------------------------------------- 1 | name: delly2 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - bcftools=1.9=h4da6232_0 9 | - delly=0.7.8=hd37b1a0_2 10 | - htslib=1.7=0 11 | - libdeflate=1.0=h470a237_0 12 | - boost=1.67.0=py36h3e44d54_0 13 | - boost-cpp=1.67.0=h3a22d5f_0 14 | - bzip2=1.0.6=h470a237_2 15 | - curl=7.61.0=h93b3f91_1 16 | - icu=58.2=hfc679d8_0 17 | - krb5=1.14.6=0 18 | - libffi=3.2.1=3 19 | - libssh2=1.8.0=h5b517e9_2 20 | - ncurses=6.1=hfc679d8_1 21 | - pip=18.0=py36_0 22 | - python=3.6.6=h5001a0f_0 23 | - readline=7.0=haf1bffa_1 24 | - setuptools=40.0.0=py36_0 25 | - sqlite=3.24.0=h2f33b56_0 26 | - tk=8.6.8=0 27 | - wheel=0.31.1=py36_0 28 | - xz=5.2.4=h470a237_0 29 | - zlib=1.2.11=h470a237_3 30 | - 
ca-certificates=2018.03.07=0 31 | - certifi=2018.4.16=py36_0 32 | - libgcc=7.2.0=h69d50b8_2 33 | - libgcc-ng=7.2.0=hdf63c60_3 34 | - libgfortran-ng=7.2.0=hdf63c60_3 35 | - libopenblas=0.2.20=h9ac9557_7 36 | - libstdcxx-ng=7.2.0=hdf63c60_3 37 | - numpy=1.14.3=py36h28100ab_2 38 | - numpy-base=1.14.3=py36h0ea5e3f_1 39 | - openssl=1.0.2o=h20670df_0 40 | 41 | -------------------------------------------------------------------------------- /envs/freebayes.yaml: -------------------------------------------------------------------------------- 1 | name: freebayes 2 | channels: 3 | - bioconda 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - freebayes=1.2.0=py36h82df9c4_2 8 | - htslib=1.8=1 9 | - libdeflate=1.0=h470a237_0 10 | - parallel=20160622=1 11 | - bcftools 12 | - vt=2015.11.10=he941832_3 13 | # - perl-threaded=5.22.0=13 14 | # - bzip2=1.0.6=h470a237_2 15 | # - ca-certificates=2018.8.24=ha4d7672_0 16 | # - certifi=2018.8.24=py36_1 17 | # - libgcc-ng=7.2.0=hdf63c60_3 18 | # - libstdcxx-ng=7.2.0=hdf63c60_3 19 | # - openssl=1.0.2o=h470a237_1 20 | # - perl=5.26.2=h470a237_0 21 | # - curl=7.54.1=0 22 | # - krb5=1.13.2=0 23 | # - libgcc=5.2.0=0 24 | # - libssh2=1.8.0=0 25 | # - pip=9.0.1=py36_1 26 | # - python=3.6.2=0 27 | # - readline=6.2=2 28 | # - setuptools=36.4.0=py36_1 29 | # - sqlite=3.13.0=0 30 | # - tk=8.5.18=0 31 | # - wheel=0.29.0=py36_0 32 | # - xz=5.2.3=0 33 | # - zlib=1.2.11=0 34 | # prefix: /projects/barthf/opt/miniconda3/envs/freebayes 35 | 36 | -------------------------------------------------------------------------------- /envs/gatk4.yaml: -------------------------------------------------------------------------------- 1 | name: gatk4 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - gatk4=4.1.0.0 9 | # - samtools=1.7=1 10 | # - python=3.6.6 11 | # - r=3.3.2=r3.3.2_0 12 | # - r-base=3.3.2=0 13 | # - r-boot=1.3_18=r3.3.2_0 14 | # - r-class=7.3_14=r3.3.2_0 15 | # - r-cluster=2.0.5=r3.3.2_0 16 | # - 
r-codetools=0.2_15=r3.3.2_0 17 | # - r-foreign=0.8_67=r3.3.2_0 18 | # - r-getopt=1.20.0=r3.3.2_0 19 | # - r-kernsmooth=2.23_15=r3.3.2_0 20 | # - r-lattice=0.20_34=r3.3.2_0 21 | # - r-mass=7.3_45=r3.3.2_0 22 | # - r-matrix=1.2_7.1=r3.3.2_0 23 | # - r-mgcv=1.8_16=r3.3.2_0 24 | # - r-nlme=3.1_128=r3.3.2_0 25 | # - r-nnet=7.3_12=r3.3.2_0 26 | # - r-recommended=3.3.2=r3.3.2_0 27 | # - r-rpart=4.1_10=r3.3.2_0 28 | # - r-spatial=7.3_11=r3.3.2_0 29 | # - r-survival=2.40_1=r3.3.2_0 30 | # - r-optparse 31 | # - r-data.table 32 | # - bcftools 33 | # - bzip2=1.0.6=h14c3975_5 34 | # - ca-certificates=2018.03.07=0 35 | # - cairo=1.14.8=0 36 | # - certifi=2018.4.16=py36_0 37 | # - curl=7.61.0=h84994c4_0 38 | # - fontconfig=2.12.1=3 39 | # - freetype=2.5.5=2 40 | # - glib=2.50.2=1 41 | # - gsl=2.4=h14c3975_4 42 | # - harfbuzz=0.9.39=2 43 | # - icu=54.1=0 44 | # - jbig=2.1=hdba287a_0 45 | # - jpeg=8d=2 46 | # - libcurl=7.61.0=h1ad7b7a_0 47 | # - libedit=3.1.20170329=h6b74fdf_2 48 | # - libffi=3.2.1=hd88cf55_4 49 | # - libgcc=7.2.0=h69d50b8_2 50 | # - libgcc-ng=7.2.0=hdf63c60_3 51 | # - libiconv=1.14=0 52 | # - libpng=1.6.34=hb9fc6fc_0 53 | # - libssh2=1.8.0=h9cfc8f7_4 54 | # - libstdcxx-ng=7.2.0=hdf63c60_3 55 | # - libtiff=4.0.6=2 56 | # - libxml2=2.9.4=0 57 | # - ncurses=6.1=hf484d3e_0 58 | # - openjdk=8.0.121=1 59 | # - openssl=1.0.2o=h14c3975_1 60 | # - pango=1.40.3=1 61 | # - pcre=8.39=1 62 | # - pip=10.0.1=py36_0 63 | # - pixman=0.34.0=hceecf20_3 64 | # - readline=7.0=ha6073c6_4 65 | # - setuptools=39.2.0=py36_0 66 | # - sqlite=3.24.0=h84994c4_0 67 | # - tk=8.6.7=hc745277_3 68 | # - wheel=0.31.1=py36_0 69 | # - xz=5.2.4=h14c3975_4 70 | # - zlib=1.2.11=ha838bed_2 71 | # - _r-mutex=1.0.0=anacondar_1 72 | 73 | -------------------------------------------------------------------------------- /envs/gdc-client.yaml: -------------------------------------------------------------------------------- 1 | name: gdc 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | 
dependencies: 8 | - ca-certificates=2018.03.07=0 9 | - certifi=2018.4.16=py27_0 10 | - icu=58.2=h9c2bf20_1 11 | - libedit=3.1=heed3624_0 12 | - libffi=3.2.1=hd88cf55_4 13 | - libgcc-ng=7.2.0=hdf63c60_3 14 | - libstdcxx-ng=7.2.0=hdf63c60_3 15 | - libxml2=2.9.8=hf84eae3_0 16 | - libxslt=1.1.32=h1312cb7_0 17 | - lxml=4.2.1=py27h23eabaa_0 18 | - ncurses=6.0=h9df7e31_2 19 | - openssl=1.0.2o=h20670df_0 20 | - pip=9.0.3=py27_0 21 | - python=2.7.14=h1571d57_31 22 | - readline=7.0=ha6073c6_4 23 | - setuptools=39.0.1=py27_0 24 | - sqlite=3.23.1=he433501_0 25 | - tk=8.6.7=hc745277_3 26 | - wheel=0.31.0=py27_0 27 | - xz=5.2.3=h5e939de_4 28 | - zlib=1.2.11=ha838bed_2 29 | - pip: 30 | - asn1crypto==0.24.0 31 | - cffi==1.11.5 32 | - cmd2==0.6.8 33 | - cryptography==2.2.2 34 | - enum34==1.1.6 35 | - flask==0.10.1 36 | - functools32==3.2.3.post2 37 | - gdc-client==1.3.0 38 | - idna==2.6 39 | - intervaltree==2.0.4 40 | - ipaddress==1.0.22 41 | - itsdangerous==0.24 42 | - jinja2==2.10 43 | - jsonschema==2.5.1 44 | - markupsafe==1.0 45 | - ndg-httpsclient==0.4.2 46 | - parcel==0.1.13 47 | - progressbar==2.3 48 | - pyasn1==0.2.3 49 | - pycparser==2.18 50 | - pyopenssl==17.1.0 51 | - pyparsing==2.2.0 52 | - pyyaml==3.11 53 | - requests==2.5.1 54 | - six==1.11.0 55 | - sortedcontainers==1.5.10 56 | - termcolor==1.1.0 57 | - werkzeug==0.14.1 58 | 59 | -------------------------------------------------------------------------------- /envs/haplotype.yaml: -------------------------------------------------------------------------------- 1 | name: haplotype 2 | channels: 3 | - bioconda 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - bcftools=1.7=0 8 | - bedtools=2.27.1=he941832_2 9 | - htslib=1.7=0 10 | - plink=1.90b4=h0a6d026_2 11 | - plink2=1.90b3.35=0 12 | - pybedtools=0.7.10=py27_2 13 | - pysam=0.14.1=py27_htslib1.7_0 14 | - samtools=1.7=2 15 | - vcftools=0.1.16=he941832_2 16 | - libstdcxx-ng=7.2.0=hdf63c60_3 17 | - openblas=0.2.20=8 18 | - bitarray=0.8.1=py27_0 19 | - blas=1.0=mkl 
20 | - bzip2=1.0.6=3 21 | - certifi=2016.2.28=py27_0 22 | - curl=7.54.1=0 23 | - krb5=1.13.2=0 24 | - libgcc=5.2.0=0 25 | - libgfortran=3.0.0=1 26 | - libssh2=1.8.0=0 27 | - mkl=2017.0.3=0 28 | - ncurses=5.9=10 29 | - nose=1.3.7=py27_1 30 | - numpy=1.12.1=py27_0 31 | - openssl=1.0.2l=0 32 | - pandas=0.20.3=py27_0 33 | - pip=9.0.1=py27_1 34 | - python=2.7.13=0 35 | - python-dateutil=2.6.1=py27_0 36 | - pytz=2017.2=py27_0 37 | - readline=6.2=2 38 | - scipy=0.18.1=np112py27_1 39 | - setuptools=36.4.0=py27_1 40 | - six=1.10.0=py27_0 41 | - sqlite=3.13.0=0 42 | - tk=8.5.18=0 43 | - wheel=0.29.0=py27_0 44 | - xz=5.2.3=0 45 | - zlib=1.2.11=0 46 | 47 | -------------------------------------------------------------------------------- /envs/lumpy-sv.yaml: -------------------------------------------------------------------------------- 1 | name: lumpy-sv 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - bcftools 9 | - htslib 10 | - libdeflate 11 | - lumpy-sv 12 | - pysam 13 | - sambamba 14 | - samblaster 15 | - samtools 16 | - svtyper 17 | - cytoolz 18 | - r-jsonlite 19 | -------------------------------------------------------------------------------- /envs/manta.yaml: -------------------------------------------------------------------------------- 1 | name: manta 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - manta=1.4.0=py27_1 9 | - ca-certificates=2018.03.07=0 10 | - certifi=2018.4.16=py27_0 11 | - libedit=3.1.20170329=h6b74fdf_2 12 | - libffi=3.2.1=hd88cf55_4 13 | - libgcc-ng=7.2.0=hdf63c60_3 14 | - libstdcxx-ng=7.2.0=hdf63c60_3 15 | - ncurses=6.1=hf484d3e_0 16 | - openssl=1.0.2o=h20670df_0 17 | - pip=10.0.1=py27_0 18 | - python=2.7.15=h1571d57_0 19 | - readline=7.0=ha6073c6_4 20 | - setuptools=39.2.0=py27_0 21 | - sqlite=3.24.0=h84994c4_0 22 | - tk=8.6.7=hc745277_3 23 | - wheel=0.31.1=py27_0 24 | - zlib=1.2.11=ha838bed_2 25 | 26 | 
-------------------------------------------------------------------------------- /envs/pvacseq.yaml: -------------------------------------------------------------------------------- 1 | name: pvacseq 2 | channels: 3 | - conda-forge 4 | - bioconda 5 | - defaults 6 | - r 7 | - vacation 8 | dependencies: 9 | - asn1crypto=0.24.0=py35_3 10 | - blas=2.4=openblas 11 | - bzip2=1.0.6=h14c3975_1002 12 | - ca-certificates=2019.3.9=hecc5488_0 13 | - certifi=2018.8.24=py35_1001 14 | - cffi=1.11.5=py35h5e8e0c9_1 15 | - chardet=3.0.4=py35_3 16 | - click=7.0=py_0 17 | - clickclick=1.2.2=py_1 18 | - connexion=1.5.3=py35_0 19 | - cryptography=2.3.1=py35hdffb7b8_0 20 | - cryptography-vectors=2.3.1=py35_0 21 | - flask=1.0.2=py_2 22 | - idna=2.7=py35_2 23 | - inflection=0.3.1=py35_0 24 | - itsdangerous=1.1.0=py_0 25 | - jinja2=2.10=py_1 26 | - jsonschema=2.6.0=py35_2 27 | - libblas=3.8.0=4_openblas 28 | - libcblas=3.8.0=4_openblas 29 | - libffi=3.2.1=he1b5a44_1006 30 | - libgcc-ng=8.2.0=hdf63c60_1 31 | - libgfortran-ng=7.3.0=hdf63c60_0 32 | - liblapack=3.8.0=4_openblas 33 | - liblapacke=3.8.0=4_openblas 34 | - libopenblas=0.3.3=h5a2b251_3 35 | - libstdcxx-ng=8.2.0=hdf63c60_1 36 | - markupsafe=1.0=py35h470a237_1 37 | - ncurses=6.1=hf484d3e_1002 38 | - numpy=1.15.2=py35h99e49ec_0 39 | - numpy-base=1.15.2=py35h2f8d375_0 40 | - openblas=0.3.5=h9ac9557_1001 41 | - openssl=1.0.2r=h14c3975_0 42 | - pandas=0.23.4=py35hf8a1672_0 43 | - pip=18.0=py35_1001 44 | - pvacseq=4.0.10=py35_2 45 | - pycparser=2.19=py_0 46 | - pyopenssl=18.0.0=py35_0 47 | - pysocks=1.6.8=py35_2 48 | - python=3.5.5=h5001a0f_2 49 | - python-dateutil=2.8.0=py_0 50 | - pytz=2018.9=py_0 51 | - pyvcf=0.6.8=py35_0 52 | - pyyaml=3.13=py35h470a237_1 53 | - readline=7.0=hf8c457e_1001 54 | - requests=2.19.1=py35_1 55 | - setuptools=40.4.3=py35_0 56 | - six=1.11.0=py35_1 57 | - sqlite=3.26.0=h67949de_1001 58 | - swagger-spec-validator=2.4.3=py_0 59 | - tk=8.6.9=h84994c4_1001 60 | - typing=3.6.6=py35_0 61 | - urllib3=1.23=py35_1 62 | 
- werkzeug=0.15.1=py_0 63 | - wheel=0.32.0=py35_1000 64 | - xz=5.2.4=h14c3975_1001 65 | - yaml=0.1.7=h14c3975_1001 66 | - zlib=1.2.11=h14c3975_1004 67 | 68 | -------------------------------------------------------------------------------- /envs/pyclone.yaml: -------------------------------------------------------------------------------- 1 | name: pyclone 2 | channels: 3 | - aroth85 4 | - bioconda 5 | - conda-forge 6 | - defaults 7 | dependencies: 8 | - pyclone=0.13.1=py27_0 9 | - pydp=0.2.4=py27_0 10 | - backports=1.0=py_2 11 | - backports.functools_lru_cache=1.5=py_1 12 | - backports_abc=0.5=py_1 13 | - ca-certificates=2018.11.29=ha4d7672_0 14 | - certifi=2018.11.29=py27_1000 15 | - cycler=0.10.0=py_1 16 | - dbus=1.13.0=h3a4f0e9_0 17 | - enum34=1.1.6=py27_1001 18 | - expat=2.2.5=hfc679d8_2 19 | - fontconfig=2.13.1=h65d0f4c_0 20 | - freetype=2.9.1=h6debe1e_4 21 | - funcsigs=1.0.2=py_3 22 | - functools32=3.2.3.2=py_3 23 | - futures=3.2.0=py27_1000 24 | - gettext=0.19.8.1=h5e8e0c9_1 25 | - glib=2.56.2=h464dc38_1 26 | - gst-plugins-base=1.12.5=hde13a9d_0 27 | - gstreamer=1.12.5=h5856ed1_0 28 | - icu=58.2=hfc679d8_0 29 | - jpeg=9c=h470a237_1 30 | - kiwisolver=1.0.1=py27h2d50403_2 31 | - libffi=3.2.1=hfc679d8_5 32 | - libgcc-ng=7.2.0=hdf63c60_3 33 | - libgfortran=3.0.0=1 34 | - libiconv=1.15=h470a237_3 35 | - libpng=1.6.36=ha92aebf_0 36 | - libstdcxx-ng=7.2.0=hdf63c60_3 37 | - libuuid=2.32.1=h470a237_2 38 | - libxcb=1.13=h470a237_2 39 | - libxml2=2.9.8=h422b904_5 40 | - llvmlite=0.26.0=py27hd28b015_0 41 | - matplotlib=2.2.3=py27h8e2386c_0 42 | - ncurses=6.1=hfc679d8_2 43 | - numba=0.41.0=py27hf8a1672_0 44 | - openssl=1.0.2p=h470a237_1 45 | - pandas=0.23.4=py27hf8a1672_0 46 | - patsy=0.5.1=py_0 47 | - pcre=8.41=hfc679d8_3 48 | - pip=18.1=py27_1000 49 | - pthread-stubs=0.4=h470a237_1 50 | - pyparsing=2.3.0=py_0 51 | - pyqt=5.6.0=py27h8210e8a_7 52 | - python=2.7.15=h33da82c_6 53 | - python-dateutil=2.7.5=py_0 54 | - pytz=2018.7=py_0 55 | - pyyaml=3.13=py27h470a237_1 56 | 
- qt=5.6.2=hf70d934_9 57 | - readline=7.0=haf1bffa_1 58 | - seaborn=0.9.0=py_0 59 | - setuptools=40.6.3=py27_0 60 | - singledispatch=3.4.0.3=py27_1000 61 | - sip=4.18.1=py27hfc679d8_0 62 | - six=1.12.0=py27_1000 63 | - sqlite=3.26.0=hb1c47c0_0 64 | - statsmodels=0.9.0=py27h7eb728f_0 65 | - subprocess32=3.5.3=py27h470a237_0 66 | - tk=8.6.9=ha92aebf_0 67 | - tornado=5.1.1=py27h470a237_0 68 | - wheel=0.32.3=py27_0 69 | - xorg-libxau=1.0.8=h470a237_6 70 | - xorg-libxdmcp=1.1.2=h470a237_7 71 | - xz=5.2.4=h470a237_1 72 | - yaml=0.1.7=h470a237_1 73 | - zlib=1.2.11=h470a237_3 74 | - blas=1.0=mkl 75 | - mkl=2017.0.3=0 76 | - numpy=1.13.1=py27_0 77 | - scipy=0.19.1=np113py27_0 78 | 79 | -------------------------------------------------------------------------------- /envs/sequenza.yaml: -------------------------------------------------------------------------------- 1 | name: sequenza 2 | channels: 3 | - dranew 4 | - biobuilds 5 | - http://conda.anaconda.org/dranew 6 | - bioconda 7 | - conda-forge 8 | - defaults 9 | dependencies: 10 | - bioconductor-biocgenerics=0.24.0=r342h10e8652_0 11 | - bioconductor-copynumber=1.18.0=r342h84c3342_0 12 | - bioconductor-genomeinfodb=1.14.0=r342h317c8a6_0 13 | - bioconductor-genomeinfodbdata=0.99.1=r342h4c5fc93_0 14 | - bioconductor-genomicranges=1.30.0=r342hbf6d5b2_0 15 | - bioconductor-iranges=2.12.0=r342hb627adb_0 16 | - bioconductor-s4vectors=0.16.0=r342ha375a43_0 17 | - bioconductor-xvector=0.18.0=r342h80a1e3f_0 18 | - bioconductor-zlibbioc=1.24.0=r342h5ff288e_0 19 | - bzip2=1.0.6=h966e7de_0 20 | - ca-certificates=2018.11.29=ha4d7672_0 21 | - cairo=1.14.10=h021c1ba_0 22 | - certifi=2018.11.29=py36_1000 23 | - curl=7.56.1=h15b681c_0 24 | - fontconfig=2.12.4=h3f6a2db_0 25 | - freetype=2.8=h48caf01_1 26 | - glib=2.53.6=py36h5cf23cf_0 27 | - graphite2=1.3.10=hd5afa3c_0 28 | - gsl=2.4=h9aeeda3_0 29 | - harfbuzz=1.5.0=h7cf9945_0 30 | - icu=60.1=h58d5639_1 31 | - jpeg=9b=h67a1377_0 32 | - libffi=3.2.1=hfc679d8_5 33 | - libgcc=7.2.0=h69d50b8_2 
34 | - libgcc-ng=7.2.0=hdf63c60_3 35 | - libgfortran-ng=7.2.0=hdf63c60_3 36 | - libidn2=2.0.4=hb0ec843_0 37 | - libpng=1.6.34=ha6fa132_1 38 | - libssh2=1.8.0=hb91037a_2 39 | - libstdcxx-ng=7.2.0=hdf63c60_3 40 | - libtiff=4.0.8=h04300b7_0 41 | - libxcb=1.13=h470a237_2 42 | - libxml2=2.9.4=hc2fdcf8_0 43 | - ncurses=5.9=701 44 | - openssl=1.0.2p=h470a237_2 45 | - pango=1.40.11=h6a13506_1 46 | - pcre=8.41=h39f570f_0 47 | - pip=19.0.3=py36_0 48 | - pixman=0.34.0=h0ca3aba_701 49 | - pthread-stubs=0.4=h470a237_1 50 | - python=3.6.5=1 51 | - r-base=3.4.2=h3655213_0 52 | - r-bitops=1.0_6=r342h6d3b7a6_1 53 | - r-rcurl=1.95_4.8=r342hfdac255_0 54 | - r-sequenza=2.1.2=r342h39d70e4_0 55 | - r-squash=1.0.8=r342ha8977b7_0 56 | - readline=7.0=h5a58b2a_0 57 | - samtools=1.5=0 58 | - setuptools=40.8.0=py36_0 59 | - sqlite=3.20.1=2 60 | - tabix=0.2.6=ha92aebf_0 61 | - tk=8.6.7=he069c39_0 62 | - wheel=0.33.1=py36_0 63 | - xorg-libxau=1.0.8=h470a237_6 64 | - xorg-libxdmcp=1.1.2=h470a237_7 65 | - xz=5.2.3=h5714765_0 66 | - zlib=1.2.11=h3b3956b_0 67 | - pip: 68 | - sequenza-utils==2.1.9999b0 69 | 70 | -------------------------------------------------------------------------------- /envs/somaticseq.yaml: -------------------------------------------------------------------------------- 1 | name: somaticseq 2 | channels: 3 | - bioconda 4 | - defaults 5 | - conda-forge 6 | dependencies: 7 | - bamtools=2.4.1=0 8 | - bcftools=1.6=0 9 | - bedtools=2.27.1=0 10 | - gatk4=4.0.9.0=0 11 | - htslib=1.7=0 12 | - lofreq=2.1.3.1=py36_0 13 | - muse=1.0.rc=0 14 | - pysam=0.14.0=py36_htslib1.7_0 15 | - r-ada=2.0_5=r3.3.2_0 16 | - regex=2016.06.24=py36_1 17 | - samtools=1.7=0 18 | - scalpel=0.5.3=h2407274_2 19 | - somaticseq=2.8.1=py36_0 20 | - vardict=2018.09.21=0 21 | - varscan=2.4.3=1 22 | - icu=58.2=hfc679d8_0 23 | - libgcc-ng=7.2.0=hdf63c60_3 24 | - libstdcxx-ng=7.2.0=hdf63c60_3 25 | - perl=5.26.2=h470a237_0 26 | - r-base=3.3.2=5 27 | - r-rpart=4.1_13=r3.3.2_0 28 | - blas=1.0=mkl 29 | - bzip2=1.0.6=3 30 
| - cairo=1.14.8=0 31 | - certifi=2016.2.28=py36_0 32 | - curl=7.54.1=0 33 | - fontconfig=2.12.1=3 34 | - freetype=2.5.5=2 35 | - glib=2.50.2=1 36 | - gsl=2.2.1=0 37 | - harfbuzz=0.9.39=2 38 | - jbig=2.1=0 39 | - jpeg=9b=0 40 | - krb5=1.13.2=0 41 | - libffi=3.2.1=1 42 | - libgcc=5.2.0=0 43 | - libgfortran=3.0.0=1 44 | - libiconv=1.14=0 45 | - libpng=1.6.30=1 46 | - libssh2=1.8.0=0 47 | - libtiff=4.0.6=3 48 | - libxml2=2.9.4=0 49 | - mkl=2017.0.3=0 50 | - ncurses=5.9=10 51 | - numpy=1.13.1=py36_0 52 | - openjdk=8.0.121=1 53 | - openssl=1.0.2l=0 54 | - pango=1.40.3=1 55 | - pcre=8.39=1 56 | - pip=9.0.1=py36_1 57 | - pixman=0.34.0=0 58 | - python=3.6.2=0 59 | - readline=6.2=2 60 | - scipy=0.19.1=np113py36_0 61 | - setuptools=36.4.0=py36_1 62 | - sqlite=3.13.0=0 63 | - tk=8.5.18=0 64 | - wheel=0.29.0=py36_0 65 | - xz=5.2.3=0 66 | - zlib=1.2.8=3 67 | 68 | -------------------------------------------------------------------------------- /envs/telseq.yaml: -------------------------------------------------------------------------------- 1 | name: telseq 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - bamtools=2.4.1=1 9 | - telseq=0.0.1=hbed2392_1 10 | # - libgcc=7.2.0=h69d50b8_2 11 | # - libgcc-ng=7.2.0=hdf63c60_3 12 | # - libstdcxx-ng=7.2.0=hdf63c60_3 13 | # - zlib=1.2.11=ha838bed_2 14 | 15 | -------------------------------------------------------------------------------- /envs/varscan2.yaml: -------------------------------------------------------------------------------- 1 | name: varscan2 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - bam-readcount=0.8=py36pl5.22.0_3 9 | - samtools=1.7=1 10 | - varscan=2.4.3=1 11 | - libedit=3.1.20170329=0 12 | - perl=5.22.0.1=0 13 | - bzip2=1.0.6=h14c3975_5 14 | - ca-certificates=2018.03.07=0 15 | - certifi=2018.4.16=py36_0 16 | - curl=7.61.0=h84994c4_0 17 | - libcurl=7.61.0=h1ad7b7a_0 18 | - libffi=3.2.1=hd88cf55_4 19 | - 
libgcc=7.2.0=h69d50b8_2 20 | - libgcc-ng=7.2.0=hdf63c60_3 21 | - libssh2=1.8.0=h9cfc8f7_4 22 | - libstdcxx-ng=7.2.0=hdf63c60_3 23 | - ncurses=5.9=10 24 | - openjdk=8.0.121=1 25 | - openssl=1.0.2o=h14c3975_1 26 | - pip=10.0.1=py36_0 27 | - python=3.6.3=h1284df2_4 28 | - readline=7.0=hb321a52_4 29 | - setuptools=39.2.0=py36_0 30 | - sqlite=3.24.0=h84994c4_0 31 | - tk=8.6.7=hc745277_3 32 | - wheel=0.31.1=py36_0 33 | - xz=5.2.4=h14c3975_4 34 | - zlib=1.2.11=ha838bed_2 35 | 36 | -------------------------------------------------------------------------------- /envs/vcf2maf.yaml: -------------------------------------------------------------------------------- 1 | name: vcf2maf 2 | channels: 3 | - bioconda 4 | - r 5 | - defaults 6 | - conda-forge 7 | dependencies: 8 | - ensembl-vep 9 | - vcf2maf 10 | - samtools 11 | - bcftools 12 | - tabix 13 | -------------------------------------------------------------------------------- /jar/VarScan.v2.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/jar/VarScan.v2.4.2.jar -------------------------------------------------------------------------------- /jar/VarScan.v2.4.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpbarthel/GLASS/333d5d01477e49bb2cf87be459d4161d4cde4483/jar/VarScan.v2.4.3.jar -------------------------------------------------------------------------------- /julia/README.md: -------------------------------------------------------------------------------- 1 | # SubClonalSelection Pipeline 2 | 3 | ## Files used 4 | |File|Description| 5 | |---|---| 6 | |GLASS_genotypes.csv| Contains information for every mutation called in a sample.| 7 | |GLASS\_genotype\_comparison| Contains shared/private status for every mutation called in a tumour pair, including whether it is shared or private and also the TITAN copy number 
estimate for the position.| 8 | |GLASS\_genotype\_comparison\_extracted.tsv| The extracted GLASS\_genotype\_comparison file.| 9 | |silver_set.csv| Contains a list of paired samples to include in the analysis.| 10 | |titanparams_synapse.tsv| Contains TITAN ploidy and purity estimates for each sample.| 11 | 12 | ## Pipeline 13 | 14 | ### 1. Extract necessary columns from the genotypes comparisons file 15 | 16 | (The position column counts as 2 due to the comma). 17 | 18 | ``` 19 | cat GLASS_genotype_comparison.tsv | cut -d, -f 1,9,10,12,21,22 | tr "," "\\t" > GLASS_genotype_comparison_extracted.tsv 20 | ``` 21 | 22 | ### 2. Run extrac_vafs.py 23 | 24 | ``` 25 | python extrac_vafs.py -c GLASS_genotype_comparison_extracted.tsv -g GLASS_genotypes.csv -s silver_set.csv -t titanparams_synapse.tsv -o ./ 26 | ``` 27 | 28 | ### 3. Add minimum VAF column 29 | 30 | Manually add a column for minimum VAF to the metadata file output from extrac_vafs.py by inspecting the histogram outputs for each sample and choosing the VAF for the highest point of the leftmost peak. 31 | 32 | ### 4. Run the analysis through qsubsec 33 | See https://www.ncbi.nlm.nih.gov/pubmed/26635140 for details on qsubsec. 34 | 35 | ``` 36 | qsubsec subclonalselection.qsubsec subclonalselection.tff -s 37 | ``` 38 | 39 | ### 5. Subsample inputs 40 | 41 | For runs that don't finish within 48h due to large numbers of mutations, subsample their VAF inputs and rerun. 42 | 43 | ``` 44 | shuf -n 20000 -o XXX.txt XXX.txt 45 | ``` 46 | 47 | ### 6. Remove runs with high error in the model results 48 | 49 | Remove any runs with "New ϵ is within 7.0% of previous population, stop ABC SMC" warning in logs. 
-------------------------------------------------------------------------------- /julia/runsubclonalselection.jl.txt: -------------------------------------------------------------------------------- 1 | using ArgParse 2 | using SubClonalSelection 3 | 4 | 5 | s = ArgParseSettings() 6 | @add_arg_table s begin 7 | "--readdepth", "-d" 8 | help = "Mean read depth" 9 | arg_type=Float64 10 | "--minvaf", "-v" 11 | help = "Minimum VAF" 12 | arg_type=Float64 13 | "--fmin", "-f" 14 | help = "Minimum VAFs to model" 15 | arg_type=Float64 16 | "--mincellularity", "-m" 17 | help = "Min cellularity" 18 | arg_type=Float64 19 | "--maxcellularity", "-x" 20 | help = "Max cellularity" 21 | arg_type=Float64 22 | "--ploidy", "-l" 23 | help = "Ploidy" 24 | arg_type=Float64 25 | "--maxiterations", "-i" 26 | help = "Max iterations" 27 | arg_type=Int 28 | "--nparticles", "-p" 29 | help = "Number of particles" 30 | arg_type=Int 31 | "--resultsdirectory", "-r" 32 | help = "Results directory" 33 | "--name", "-n" 34 | help = "Name for run" 35 | "file" 36 | help = "Directory and file name conatining VAFs" 37 | required = true 38 | 39 | end 40 | 41 | args = parse_args(s) 42 | 43 | out = fitABCmodels(args["file"], args["name"], read_depth = args["readdepth"], resultsdirectory = args["resultsdirectory"], minvaf = args["minvaf"], fmin = args["fmin"], ploidy = args["ploidy"], maxiterations = args["maxiterations"], nparticles = args["nparticles"], mincellularity = args["mincellularity"], maxcellularity = args["maxcellularity"], Nmaxinf = 10^6, save = true, adaptpriors = true) 44 | -------------------------------------------------------------------------------- /julia/subclonalselection.qsubsec.txt: -------------------------------------------------------------------------------- 1 | # This script runs a single sample through subclonalselection 2 | # Georgette Tanner & ALastair Droop, 2019-01-11 3 | 4 | 5 | section('subclonalselection-{SAMPLE}-{FRACTION}-{RUN}', description='Run sample {SAMPLE} through 
subclonalselection.') 6 | limits(h_rt='48:00:00', h_vmem='16G') 7 | options('V', 'cwd', 'notify') 8 | outputs('{LOG_DIR}') 9 | 10 | # Load the metadata file: 11 | metadata = dict() 12 | try: 13 | metadata_file = open('{METADATA_DIR}/{METADATA_FILE}', 'rt') 14 | headers = metadata_file.readline() 15 | for metadata_line in metadata_file.readlines(): 16 | metadata_data = metadata_line.strip().split('\t') 17 | metadata[metadata_data[1]] = metadata_data 18 | except FileNotFoundError as err: 19 | message('ERROR: metadata file "{METADATA_FILE}" does not exist') 20 | raise 21 | except Exception as err: raise 22 | 23 | # Check that the given SAMPLE is present in the metadata dictionary: 24 | if '{SAMPLE}' not in metadata.keys(): raise Exception('Sample {SAMPLE} not in metadata') 25 | 26 | # Check number of subclonal VAFs 27 | if '{FRACTION}' == 'shared': NUM = int(metadata['{SAMPLE}'][3]) 28 | elif '{FRACTION}' == 'private': NUM = int(metadata['{SAMPLE}'][5]) 29 | elif '{FRACTION}' == 'all': NUM = int(metadata['{SAMPLE}'][7]) 30 | 31 | if NUM<25 : 32 | command('echo "Sample {SAMPLE}_{FRACTION} has less than 25 subclonal VAFs"', name = 'quit') 33 | elif metadata['{SAMPLE}'][14]=='-': 34 | command('echo "Sample {SAMPLE}_{FRACTION} has no minimum VAF given - assuming non suitable sample"', name = 'quit') 35 | else: 36 | 37 | # Extract values: 38 | read_depth = float(metadata['{SAMPLE}'][8]) 39 | min_vaf = float(metadata['{SAMPLE}'][14]) 40 | f_min = float(metadata['{SAMPLE}'][14]) 41 | ploidy = int(metadata['{SAMPLE}'][13]) 42 | min_cellularity = float(metadata['{SAMPLE}'][11]) 43 | max_cellularity = float(metadata['{SAMPLE}'][12]) 44 | 45 | # Build the command to submit: 46 | command('{JULIA_EXEC} {JULIA_SCRIPT} {VAF_FILE} --name {SAMPLE}_{FRACTION}_{RUN} --resultsdirectory {OUTPUT_DIR} --readdepth %s --maxiterations {ITERATIONS} --nparticles 500 --minvaf %s --fmin %s --ploidy %s --mincellularity %s --maxcellularity %s' % (read_depth, min_vaf, f_min, ploidy, min_cellularity, 
max_cellularity), name='run_julia') -------------------------------------------------------------------------------- /julia/subclonalselection.tff.txt: -------------------------------------------------------------------------------- 1 | # Define run: 2 | RUN="run1" 3 | 4 | 5 | # Define the basic project structure: 6 | BASE_DIR = "./" 7 | LOG_DIR = "{BASE_DIR}/logs" 8 | INPUT_DIR = "{BASE_DIR}/input" 9 | OUTPUT_DIR = "{BASE_DIR}/output" 10 | 11 | # Define the metadata file: 12 | METADATA_DIR = "{BASE_DIR}/metadata" 13 | METADATA_FILE = "metadata.tsv" 14 | SAMPLE = FILE("{METADATA_DIR}/samples_to_run.txt") 15 | 16 | # Define the input VAF filename structure: 17 | VAF_FILE = "{INPUT_DIR}/{SAMPLE}_{FRACTION}.txt" 18 | 19 | # Define the mutation fraction to use: 20 | FRACTION = "shared", "private", "all" 21 | 22 | # Define the executables: 23 | JULIA_EXEC = "~/julia-1.0.3/bin/julia" 24 | JULIA_SCRIPT = "{BASE_DIR}/scripts/runsubclonalselection.jl" 25 | 26 | # Iterations: 27 | ITERATIONS = "100000" -------------------------------------------------------------------------------- /python/.ipynb_checkpoints/LearnRegexp-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /python/.ipynb_checkpoints/LearningJSON-1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "hello world\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "print(\"hello world\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [] 26 | }, 27 | { 28 | "cell_type": 
"code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.6.4" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 2 56 | } 57 | -------------------------------------------------------------------------------- /python/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "CLUSTER_META = json.load(\"/fastscratch/verhaak-lab/GLASS-WG/conf/cluster.json\")" 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.6.4" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /python/.ipynb_checkpoints/Untitled1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /python/.ipynb_checkpoints/Untitled2-checkpoint.ipynb: 
# requires python3
# requires pysam-0.11.2.1
"""
Count reference and non-reference read support in a BAM file at the
positions listed in a (normal-sample) VCF.

Usage:
    countPysam.py <chrom|0> <positions.vcf> <bam> <min_base_qual> <min_map_qual> <min_vcf_qual>

Pass "0" as the chromosome to process every chromosome found in the VCF.
One tab-separated line is printed per position whose VCF QUAL is at least
<min_vcf_qual>.
"""
import sys


def read_het_positions(vcf_path, chrom_to_use):
    """
    Parse a VCF into a dict mapping chromosome -> list of
    (position, depth, ref_base, nref_base, qual) tuples.

    position is the 1-based VCF POS as int; depth is the first INFO entry
    with any 'DP=' prefix removed; qual is kept as the raw string.
    chrom_to_use selects a single chromosome; "0" (or 0) selects all.
    """
    # Command-line arguments are strings, so compare against "0", not 0.
    use_all = str(chrom_to_use) == "0"
    positions = {}
    with open(vcf_path) as handle:
        for line in handle:
            if line.strip().startswith("#"):
                continue
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines
            chrom = fields[0]
            if not use_all and chrom != chrom_to_use:
                continue
            depth = fields[7].split(';')[0].replace('DP=', '')
            record = (int(fields[1]), depth, fields[3], fields[4], fields[5])
            positions.setdefault(chrom, []).append(record)
    return positions


def tally_bases(bases, ref_base):
    """
    Return (ref_count, alt_count) for a list of observed bases.
    alt_count is every base that is not ref_base, not only the listed ALT.
    """
    ref_count = sum(1 for base in bases if base == ref_base)
    return ref_count, len(bases) - ref_count


def collect_bases(sample, chrom, position, base_quality, map_quality):
    """
    Return the read bases covering 1-based `position` on `chrom` that pass
    the mapping-quality and base-quality thresholds.
    """
    # pysam pileup coordinates and PileupColumn.reference_pos are 0-based,
    # VCF POS is 1-based: convert once and index the read directly.
    zero_based = position - 1
    bases = []
    for column in sample.pileup(reference=chrom, start=zero_based, end=zero_based + 1):
        # pileup() also yields neighbouring columns of overlapping reads.
        if column.reference_pos != zero_based:
            continue
        for read in column.pileups:
            if read.is_del or read.is_refskip:
                continue
            alignment = read.alignment
            index = read.query_position
            if (alignment.mapping_quality >= map_quality
                    and alignment.query_qualities[index] >= base_quality):
                bases.append(alignment.query_sequence[index])
    return bases


def main(argv):
    """Command-line entry point; see the module docstring for arguments."""
    # Imported here so the parsing helpers can be used without pysam installed.
    import pysam

    chrom_to_use = argv[1]  # "0" for all chromosomes
    norm_hetpsns = argv[2]
    bam_file = argv[3]
    base_quality = int(argv[4])
    map_quality = int(argv[5])
    vcf_quality = int(argv[6])

    positions = read_het_positions(norm_hetpsns, chrom_to_use)

    ## print header ##
    print("Chr\tPosition\tRef\tRefCount\tNref\tNrefCount\tNormQuality")

    with pysam.AlignmentFile(bam_file) as sample:
        for chrom in positions:
            for position, _depth, ref_base, nref_base, raw_qual in positions[chrom]:
                try:
                    qual = float(raw_qual)
                except ValueError:
                    continue  # QUAL is missing ("."), cannot be thresholded
                if qual < vcf_quality:
                    continue
                bases = collect_bases(sample, chrom, position, base_quality, map_quality)
                ref_count, alt_count = tally_bases(bases, ref_base)
                print("\t".join([str(chrom), str(position), ref_base, str(ref_count),
                                 nref_base, str(alt_count), str(qual)]))


if __name__ == "__main__":
    main(sys.argv)
def build_dict(seq, key):
    """
    Index a list of dicts by the value each one holds under `key`.

    Every value in the result is a copy of the source dict with an extra
    "index" entry giving its position in `seq`. When two dicts share the
    same key value, the later one wins.
    Adapted from
    https://stackoverflow.com/questions/4391697/find-the-index-of-a-dict-within-a-list-by-matching-the-dicts-value
    """
    indexed = {}
    for position, record in enumerate(seq):
        entry = dict(record)
        entry["index"] = position
        indexed[record[key]] = entry
    return indexed

def dbconfig(filename, section):
    """
    Read one section of an INI-style file and return it as a plain dict
    of option name -> string value.

    Raises Exception when `section` is not present in `filename`.
    From http://www.postgresqltutorial.com/postgresql-python/connect/
    """
    parser = ConfigParser()
    parser.read(filename)

    # Fail early when the requested section is absent.
    if not parser.has_section(section):
        raise Exception('Section {0} not found in the {1} file'.format(section, filename))

    return {name: value for name, value in parser.items(section)}
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##
## Manifest tester
## Authors: Floris Barthel
##
## Loads conf/config.yaml, reads the database connection settings it points
## to, builds a PostgreSQLManifestHandler and prints it.
## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ## ##

import os
import pandas as pd
import itertools
import yaml

## Import manifest processing functions
from python.glassfunc import dbconfig, locate
from python.PostgreSQLManifestHandler import PostgreSQLManifestHandler
from python.JSONManifestHandler import JSONManifestHandler

## Load the pipeline configuration.
## safe_load is used because yaml.load() without an explicit Loader can build
## arbitrary Python objects and is rejected by current PyYAML releases; the
## context manager closes the file handle.
with open('conf/config.yaml') as config_file:
    config = yaml.safe_load(config_file)

## Connect to database
dbconf = dbconfig(config["db"]["configfile"], config["db"]["configsection"])

## Instantiate manifest
manifest = PostgreSQLManifestHandler(
    host = dbconf["servername"],
    port = dbconf["port"],
    user = dbconf["username"],
    password = dbconf["password"],
    database = dbconf["database"],
    source_file_basepath = config["data"]["source_path"],
    aligned_file_basepath = config["data"]["realn_path"],
    from_source = config["from_source"])
print(manifest)
# Download one BAM file from the GDC with gdc-client.
# Wildcards: {uuid} is passed to gdc-client as the file identifier;
# {filename} only appears in the output, log and benchmark paths.
# The access token path is read from config["gdc_token"].
rule download:
    output:
        "data/download/{uuid}/{filename}.bam"
    threads:
        CLUSTER_META["download"]["ppn"]
    message:
        "Downloading from GDC\n"
        "UUID {wildcards.uuid}\n"
        "File {wildcards.filename}"
    conda:
        "../envs/gdc-client.yaml"
    log:
        "logs/download/{uuid}.{filename}.log"
    benchmark:
        "benchmarks/download/{uuid}.{filename}.txt"
    # NOTE(review): the command passes `-d download`, while the declared output
    # lives under `data/download/`. Presumably a symlink or the working
    # directory makes these agree -- confirm, otherwise Snakemake will not
    # find the output file after the job finishes.
    shell:
        "gdc-client download \
            -d download \
            -n {threads} \
            -t {config[gdc_token]} \
            {wildcards.uuid} \
            > {log} 2>&1"
# Generate the Manta workflow script (runWorkflow.py) for one tumor/normal
# pair. The tumor and normal aliquot barcodes are looked up from the manifest
# using the pair barcode; the BQSR BAMs of both aliquots are the inputs.
rule manta_config:
    message:
        "Configuring Manta for tumor/normal pair\n"
        "Pair: {wildcards.pair_barcode}"
    log:
        "logs/manta/config/{pair_barcode}.log"
    benchmark:
        "benchmarks/manta/config/{pair_barcode}.txt"
    conda:
        "../envs/manta.yaml"
    threads:
        CLUSTER_META["manta_config"]["ppn"]
    params:
        mem = CLUSTER_META["manta_config"]["mem"],
        rundir = "results/manta/{pair_barcode}"
    input:
        normal = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(
            aliquot_barcode = manifest.getNormal(wildcards.pair_barcode)),
        tumor = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(
            aliquot_barcode = manifest.getTumor(wildcards.pair_barcode))
    output:
        script = "results/manta/{pair_barcode}/runWorkflow.py"
    shell:
        "configManta.py \
            --normalBam {input.normal} \
            --tumorBam {input.tumor} \
            --callRegions {config[svinclude_manta]} \
            --referenceFasta {config[reference_fasta]} \
            --runDir {params.rundir} \
            > {log} 2>&1; "
# Run SomaticSeq in paired mode to build consensus SNV/indel calls from the
# Mutect2 and VarScan2 results of one tumor/normal pair.
# The rule name is spelled `somatiseq` in this file; it is kept as-is so that
# any existing reference to the rule keeps working.
rule somatiseq:
    input:
        vs2snp = "results/varscan2/fpfilter/{pair_barcode}.snp.Somatic.hc.final.vcf",
        vs2indel = "results/varscan2/vs2-filter/{pair_barcode}.indel.Somatic.hc.filter.vcf",
        mutect2 = "results/mutect2/final/{pair_barcode}.final.vcf",
        tumorbam = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getTumor(wildcards.pair_barcode)),
        normalbam = lambda wildcards: "results/align/bqsr/{aliquot_barcode}.realn.mdup.bqsr.bam".format(aliquot_barcode = manifest.getNormal(wildcards.pair_barcode))
    output:
        # Each path needs its own trailing comma: adjacent string literals are
        # concatenated by Python, which would merge two outputs into one bogus
        # filename.
        "results/somaticseq/{pair_barcode}/Consensus.sSNV.vcf",
        "results/somaticseq/{pair_barcode}/Consensus.sINDEL.vcf",
        "results/somaticseq/{pair_barcode}/Ensemble.sSNV.tsv",
        "results/somaticseq/{pair_barcode}/Ensemble.sINDEL.tsv"
    params:
        outdir = "results/somaticseq/{pair_barcode}",
        mem = CLUSTER_META["somaticseq"]["mem"]
    threads:
        CLUSTER_META["somaticseq"]["ppn"]
    conda:
        "../envs/somaticseq.yaml"
    log:
        "logs/somaticseq/{pair_barcode}.log"
    benchmark:
        "benchmarks/somaticseq/{pair_barcode}.txt"
    message:
        "Running SomaticSeq consensus calling\n"
        "Pair: {wildcards.pair_barcode}"
    shell:
        "SomaticSeq.Wrapper.sh \
            --output-directory {params.outdir} \
            --genome-reference {config[reference_fasta]} \
            paired \
            --tumor-bam-file {input.tumorbam} \
            --normal-bam-file {input.normalbam} \
            --mutect2-vcf {input.mutect2} \
            --varscan-snv {input.vs2snp} \
            --varscan-indel {input.vs2indel} \
            > {log} 2>&1; "
/*
Combined chromosome 7 amplification / chromosome 10 deletion status per aliquot,
derived from the GATK-based arm level calls (Taylor method) in
analysis.gatk_cnv_by_arm.

Result: one row per aliquot_barcode with a boolean c710 that is
  - NULL  when either chromosome has two arms with an unknown call,
  - TRUE  when both chromosomes have at least one arm with the event and
          no arm explicitly lacking it,
  - FALSE otherwise.
*/
WITH
arm_events AS
(
	-- Per arm: does it carry the expected event (gain on 7, loss on 10)?
	SELECT
		aliquot_barcode,
		chrom,
		arm,
		arm_call,
		arm_call = (CASE chrom WHEN 7 THEN 1 WHEN 10 THEN -1 END) AS bool_call
	FROM analysis.gatk_cnv_by_arm
	WHERE chrom IN (7,10)
),
chrom_tally AS
(
	-- Per aliquot and chromosome: how many arms have / lack the event or are unknown
	SELECT
		aliquot_barcode,
		chrom,
		COUNT(*) FILTER (WHERE bool_call IS TRUE)  AS count_true,
		COUNT(*) FILTER (WHERE bool_call IS FALSE) AS count_false,
		COUNT(*) FILTER (WHERE bool_call IS NULL)  AS count_null
	FROM arm_events
	GROUP BY aliquot_barcode, chrom
),
aliquot_call AS
(
	SELECT
		aliquot_barcode,
		(CASE
			WHEN bool_or(count_null = 2) THEN NULL
			ELSE bool_and(count_true > 0) AND bool_and(count_false = 0)
		 END) AS c710
	FROM chrom_tally
	GROUP BY aliquot_barcode
)
SELECT * FROM aliquot_call
SELECT '10q25-26' AS region, * FROM ref.cytobands WHERE chrom = 10 AND substring(cytoband from 1 for 3) IN ('q25','q26') 8 | ), 9 | gene_seg_intersect AS 10 | ( 11 | SELECT aliquot_barcode, region, gs.chrom, (upper(t0.pos * gs.pos) - lower(t0.pos * gs.pos) -1) AS w, 2^log2_copy_ratio::decimal As cr 12 | FROM variants.gatk_seg gs 13 | INNER JOIN selected_regions t0 ON t0.chrom = gs.chrom AND t0.pos && gs.pos 14 | ), 15 | gene_sample_call AS 16 | ( 17 | SELECT aliquot_barcode, region, 18 | sum(w * cr) / sum(w) AS wcr 19 | FROM gene_seg_intersect 20 | GROUP BY aliquot_barcode, region 21 | ), 22 | seg_stats_optimized AS 23 | ( 24 | SELECT 25 | gs.aliquot_barcode, 26 | LEAST(0.9, neu_fwmean - 2 * neu_fwsd) AS del_thres, 27 | GREATEST(1.1, neu_fwmean + 2 * neu_fwsd) AS amp_thres, 28 | (CASE 29 | WHEN max_loss_arm_wmean < 0.9 AND max_loss_arm_n >= 3 THEN GREATEST(0,max_loss_arm_wmean - 2 * max_loss_arm_wsd) 30 | WHEN del_fwmean < 0.9 AND del_n >= 3 THEN GREATEST(0,del_fwmean - 2 * del_fwsd) 31 | ELSE NULL 32 | END) AS hldel_thres, 33 | (CASE 34 | WHEN max_gain_arm_wmean > 1.1 AND max_gain_arm_n >= 3 THEN max_gain_arm_wmean + 2 * max_gain_arm_wsd 35 | WHEN amp_fwmean > 1.1 AND amp_n >= 3 THEN amp_fwmean + 2 * amp_fwsd 36 | ELSE NULL 37 | END) AS hlamp_thres 38 | FROM analysis.gatk_seg_stats gs 39 | LEFT JOIN analysis.gatk_aneuploidy gsa ON gsa.aliquot_barcode = gs.aliquot_barcode 40 | ), 41 | gene_cp AS 42 | ( 43 | SELECT ts.aliquot_barcode, region, ts.chrom, (upper(t0.pos * ts.pos) - lower(t0.pos * ts.pos) -1) AS w, cellular_prevalence As cp 44 | FROM variants.titan_seg ts 45 | INNER JOIN selected_regions t0 ON t0.chrom = ts.chrom AND t0.pos && ts.pos 46 | ), 47 | gene_cp_agg AS 48 | ( 49 | SELECT aliquot_barcode, region, 50 | COALESCE(sum(w * cp) / NULLIF(sum(w),0),NULL) AS wcp 51 | FROM gene_cp 52 | GROUP BY 1, 2 53 | ) 54 | SELECT 55 | gc.aliquot_barcode, 56 | gc.region, 57 | (CASE 58 | WHEN gc.wcr >= del_thres AND gc.wcr <= amp_thres THEN 0 59 | WHEN gc.wcr < 
/*
GISTIC input: GATK segments for the primary (P) and recurrent (R) aliquot of
every pair in analysis.gold_set.

Columns: aliquot_barcode, chrom (23 is written as 'X'), start, end,
num_snps, log2_copy_ratio, sample_type.
*/
WITH seg AS
(
	SELECT
		aliquot_barcode,
		(CASE WHEN chrom = 23 THEN 'X' ELSE chrom::varchar(2) END) AS chrom,
		lower(pos) AS "start",
		upper(pos)-1 AS "end",
		num_points AS num_snps,
		log2_copy_ratio
	FROM variants.gatk_seg
)
-- segments of the first tumor of each gold-set pair
SELECT seg.*, 'P' AS sample_type
FROM analysis.gold_set gold
INNER JOIN seg ON seg.aliquot_barcode = gold.tumor_barcode_a

UNION

-- segments of the second tumor of each gold-set pair
SELECT seg.*, 'R' AS sample_type
FROM analysis.gold_set gold
INNER JOIN seg ON seg.aliquot_barcode = gold.tumor_barcode_b
/*
Calculate chromosome 7/10 status

For one longitudinal tumor pair per case, compares the chromosome-level log2
copy ratios of chr7 and chr10 in tumor (a) and tumor (b) and labels the
combined chr7-gain / chr10-loss event as 'shared', 'shed', 'acquired' or 'no'.
*/
WITH
selected_tumor_pairs AS
(
	-- Rank the eligible pairs of each case; priority = 1 is the pair with the
	-- longest surgical interval, ties broken by portion and barcode suffix.
	SELECT
		tumor_pair_barcode,
		case_barcode,
		tumor_barcode_a,
		tumor_barcode_b,
		row_number() OVER (PARTITION BY case_barcode ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC) AS priority
	FROM analysis.tumor_pairs ps
	LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
	LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
	WHERE
		comparison_type = 'longitudinal' AND
		sample_type_b <> 'M1' AND -- exclude metastatic samples here because this is outside the scope of our study
		-- NOTE(review): tumor (a) is filtered on coverage_exclusion but tumor (b)
		-- on cnv_exclusion -- confirm this asymmetry is intended.
		-- NOTE(review): a pair whose aliquot has no blocklist row yields NULL here
		-- and is dropped despite the LEFT JOIN -- confirm every aliquot is listed.
		b1.coverage_exclusion <> 'block' AND b2.cnv_exclusion <> 'block'
),
t2 AS
(
	-- Attach chr7 / chr10 log2 copy ratios for both tumors of the top-priority pair.
	-- Each scalar subquery must return at most one row per (chrom, aliquot).
	SELECT
		*,
		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '7' AND aliquot_barcode = stp.tumor_barcode_a) AS chr7_logr_a,
		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '7' AND aliquot_barcode = stp.tumor_barcode_b) AS chr7_logr_b,
		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '10' AND aliquot_barcode = stp.tumor_barcode_a) AS chr10_logr_a,
		(SELECT log2_copy_ratio FROM analysis.cnv_by_chr_gatk WHERE chrom = '10' AND aliquot_barcode = stp.tumor_barcode_b) AS chr10_logr_b
	FROM selected_tumor_pairs stp
	WHERE priority = 1
),
t3 AS
(
	-- Event present when chr7 log2 ratio > 0.1 and chr10 log2 ratio < -0.1;
	-- a missing ratio falls through to ELSE and counts as "not present".
	SELECT
		*,
		(CASE WHEN chr7_logr_a > 0.1 AND chr10_logr_a < -0.1 THEN 1 ELSE 0 END)::boolean AS chr7_10_a,
		(CASE WHEN chr7_logr_b > 0.1 AND chr10_logr_b < -0.1 THEN 1 ELSE 0 END)::boolean AS chr7_10_b
	FROM t2
)
SELECT
	tumor_pair_barcode,
	case_barcode,
	tumor_barcode_a,
	tumor_barcode_b,
	-- shared: in both tumors; shed: only in (a); acquired: only in (b); no: in neither
	(CASE WHEN chr7_10_a AND chr7_10_b THEN 'shared'
	 WHEN chr7_10_a AND NOT chr7_10_b THEN 'shed'
	 WHEN chr7_10_b AND NOT chr7_10_a THEN 'acquired'
	 WHEN NOT chr7_10_a AND NOT chr7_10_b THEN 'no' END) AS chr7_10_status
FROM t3
/*
dNdScv input, one row per variant per gold-set tumor pair, restricted to
pairs flagged as hypermutators.

fraction: 'S' = called by Mutect2 in both tumors,
          'P' = called in tumor (a) only,
          'R' = called in tumor (b) only.
Chromosome 23 is written as 'X'.
*/
SELECT
	gold.case_barcode,
	(CASE WHEN pg.chrom = 23 THEN 'X' ELSE pg.chrom::varchar(2) END) AS chrom,
	lower(pg.pos) AS pos,
	pg.ref,
	pg.alt AS mut,
	sub.idh_codel_subtype AS subtype,
	(CASE
		WHEN mutect2_call_a AND mutect2_call_b THEN 'S'
		WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P'
		WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R'
	 END) AS fraction
FROM variants.pgeno pg
INNER JOIN analysis.gold_set gold ON gold.tumor_pair_barcode = pg.tumor_pair_barcode
INNER JOIN analysis.tumor_clinical_comparison cmp ON cmp.tumor_pair_barcode = gold.tumor_pair_barcode
LEFT JOIN clinical.subtypes sub ON sub.case_barcode = pg.case_barcode
WHERE
	hypermutator_status IS TRUE
	AND (mutect2_call_a OR mutect2_call_b)

-- END --
/*
dNdScv input per sample, restricted to gold-set pairs flagged as
hypermutators.

selected_aliquots lists each pair's tumor (a) as sample_type 'P' and
tumor (b) as 'R'; the main query returns the distinct passing variants
(ssm2_pass_call) of those aliquots. Chromosome 23 is written as 'X'.
*/
WITH
selected_aliquots AS
(
	-- Unpivot (tumor_barcode_a, tumor_barcode_b) into rows; DISTINCT removes
	-- duplicate (aliquot, sample_type) combinations.
	SELECT DISTINCT
		v.aliquot_barcode,
		v.sample_type
	FROM analysis.gold_set gold
	INNER JOIN analysis.tumor_clinical_comparison cmp ON cmp.tumor_pair_barcode = gold.tumor_pair_barcode
	CROSS JOIN LATERAL
	(
		VALUES
			(gold.tumor_barcode_a, 'P'),
			(gold.tumor_barcode_b, 'R')
	) AS v (aliquot_barcode, sample_type)
	WHERE hypermutator_status IS TRUE
)

SELECT DISTINCT -- remove duplicate entries
	geno.aliquot_barcode,
	(CASE WHEN geno.chrom = 23 THEN 'X' ELSE geno.chrom::varchar(2) END) AS chrom,
	lower(geno.pos) AS pos,
	anno.ref,
	geno.alt AS mut,
	sample_type,
	idh_codel_subtype AS subtype
FROM variants.passgeno geno
INNER JOIN selected_aliquots sel ON sel.aliquot_barcode = geno.aliquot_barcode
LEFT JOIN variants.passanno anno ON anno.variant_id = geno.variant_id
-- the case barcode is the first 12 characters of the aliquot barcode
LEFT JOIN clinical.subtypes sub ON sub.case_barcode = substring(geno.aliquot_barcode from 1 for 12)
WHERE
	ssm2_pass_call

-- END --
dss.tumor_pair_barcode 38 | LEFT JOIN analysis.driver_status_cnv dsc ON ss.tumor_pair_barcode = dsc.tumor_pair_barcode 39 | LEFT JOIN analysis.driver_status_arm dsa ON ss.tumor_pair_barcode = dsa.tumor_pair_barcode 40 | LEFT JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode -------------------------------------------------------------------------------- /sql/figures/mutsig_boxplot_fig1.sql: -------------------------------------------------------------------------------- 1 | WITH t1 AS 2 | ( 3 | SELECT gs.case_barcode, idh_codel_subtype, hypermutator_status::integer, fraction, signature, mut_n, abs_score, rel_score, RANK() OVER (PARTITION BY gs.case_barcode,fraction ORDER BY rel_score DESC) AS rnk, COUNT(*) OVER (PARTITION BY gs.case_barcode, signature) AS all_fractions_counts 4 | FROM analysis.mut_sig_fraction_limited ms 5 | INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode 6 | INNER JOIN analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = ms.tumor_pair_barcode 7 | INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode 8 | ) 9 | SELECT * --signature,fraction,hypermutator_status,sum(rel_score) / count(rel_score) AS avg_rel_score, stddev(rel_score) AS sd_rel_score 10 | FROM t1 11 | --GROUP BY 1,2,3 12 | --ORDER BY 4 DESC -------------------------------------------------------------------------------- /sql/figures/mutsig_corr.sql: -------------------------------------------------------------------------------- 1 | SELECT gs.case_barcode, case_age_diagnosis_years AS age, surgical_interval, idh_codel_subtype, hypermutator_status::integer, fraction, signature, mut_n, abs_score, rel_score, RANK() OVER (PARTITION BY gs.case_barcode,fraction ORDER BY rel_score DESC) AS rnk, COUNT(*) OVER (PARTITION BY gs.case_barcode, signature) AS all_fractions_counts 2 | FROM analysis.mut_sig_fraction_limited ms 3 | INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode 4 | INNER JOIN 
analysis.tumor_clinical_comparison tcc ON tcc.tumor_pair_barcode = ms.tumor_pair_barcode 5 | INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode 6 | INNER JOIN clinical.cases ca ON ca.case_barcode = gs.case_barcode -------------------------------------------------------------------------------- /sql/heatmap/heatmap_aneuploidy.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tumor_pair_barcode, 3 | tumor_barcode_a, 4 | tumor_barcode_b, 5 | ss.case_barcode, 6 | idh_codel_subtype, 7 | a1.prop_aneuploidy AS aneuploidy_a, 8 | a2.prop_aneuploidy AS aneuploidy_b, 9 | a1.aneuploidy_amp_score::integer AS aneuploidy_amp_score_a, 10 | a2.aneuploidy_amp_score::integer AS aneuploidy_amp_score_b, 11 | a1.aneuploidy_del_score::integer AS aneuploidy_del_score_a, 12 | a2.aneuploidy_del_score::integer AS aneuploidy_del_score_b, 13 | a1.aneuploidy_score::integer AS aneuploidy_score_a, 14 | a2.aneuploidy_score::integer AS aneuploidy_score_b, 15 | (CASE WHEN b1.cnv_exclusion <> 'allow' OR b2.cnv_exclusion <> 'allow' THEN 1 ELSE 0 END) qc_fail 16 | FROM analysis.gold_set ss 17 | LEFT JOIN analysis.gatk_aneuploidy a1 ON a1.aliquot_barcode = ss.tumor_barcode_a 18 | LEFT JOIN analysis.gatk_aneuploidy a2 ON a2.aliquot_barcode = ss.tumor_barcode_b 19 | --LEFT JOIN analysis.taylor_aneuploidy t1 ON t1.aliquot_barcode = ss.tumor_barcode_a 20 | --LEFT JOIN analysis.taylor_aneuploidy t2 ON t2.aliquot_barcode = ss.tumor_barcode_b 21 | LEFT JOIN clinical.subtypes su ON su.case_barcode = ss.case_barcode 22 | LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ss.tumor_barcode_a 23 | LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ss.tumor_barcode_b -------------------------------------------------------------------------------- /sql/heatmap/heatmap_arm.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT ss.tumor_pair_barcode, 
ss.tumor_barcode_a, ss.tumor_barcode_b, ss.case_barcode, idh_codel_subtype, (CASE WHEN gs.tumor_pair_barcode IS NULL THEN 'Silver set' ELSE 'Gold set' END) AS gold_set 5 | FROM analysis.gold_set ss 6 | LEFT JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ss.tumor_pair_barcode 7 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 8 | ), 9 | selected_arms AS 10 | ( 11 | SELECT chrom,arm,direction FROM ref.arm_drivers_subtype 12 | ), 13 | cnv_by_pair_arm AS 14 | ( 15 | SELECT 16 | stp.tumor_pair_barcode, 17 | stp.case_barcode, 18 | stp.idh_codel_subtype, 19 | stp.tumor_barcode_a, 20 | stp.tumor_barcode_b, 21 | sa.chrom, 22 | sa.arm, 23 | c1.arm_call AS arm_a, 24 | c2.arm_call AS arm_b, 25 | (CASE 26 | WHEN sa.direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) THEN 'del' 27 | WHEN sa.direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) THEN 'amp' 28 | WHEN sa.direction = -1 AND (c1.arm_call = 1 OR c2.arm_call = 1) THEN 'neut' 29 | WHEN sa.direction = 1 AND (c1.arm_call = -1 OR c2.arm_call = -1) THEN 'neut' 30 | WHEN (c1.arm_call = 0 OR c2.arm_call = 0) THEN 'neut' 31 | ELSE NULL 32 | END) cnv_state, 33 | (CASE 34 | WHEN direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND c1.arm_call < c2.arm_call THEN 'P' 35 | WHEN direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S' 36 | WHEN direction = -1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND c1.arm_call > c2.arm_call THEN 'R' 37 | WHEN direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND c1.arm_call > c2.arm_call THEN 'P' 38 | WHEN direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S' 39 | WHEN direction = 1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND c1.arm_call < c2.arm_call THEN 'R' 40 | WHEN (c1.arm_call = 0 OR c2.arm_call = 0) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR 
c2.arm_call IS NULL)) THEN 'S' 41 | WHEN direction = -1 AND (c1.arm_call = 1 OR c2.arm_call = 1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S' 42 | WHEN direction = 1 AND (c1.arm_call = -1 OR c2.arm_call = -1) AND (c1.arm_call = c2.arm_call OR (c1.arm_call IS NULL OR c2.arm_call IS NULL)) THEN 'S' 43 | ELSE NULL 44 | END) cnv_change 45 | FROM selected_tumor_pairs stp 46 | CROSS JOIN selected_arms sa 47 | LEFT JOIN analysis.gatk_cnv_by_arm c1 ON c1.aliquot_barcode = stp.tumor_barcode_a AND c1.arm = sa.arm 48 | LEFT JOIN analysis.gatk_cnv_by_arm c2 ON c2.aliquot_barcode = stp.tumor_barcode_b AND c2.arm = sa.arm 49 | ) 50 | SELECT * FROM cnv_by_pair_arm ORDER BY 1 -------------------------------------------------------------------------------- /sql/heatmap/heatmap_c710.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tumor_pair_barcode, 3 | ss.case_barcode, 4 | idh_codel_subtype, 5 | (CASE 6 | WHEN c1.c710 IS FALSE AND c2.c710 IS FALSE THEN 'WT' 7 | WHEN c1.c710 IS TRUE AND c2.c710 IS TRUE THEN 'S' 8 | WHEN c1.c710 IS TRUE AND c2.c710 IS FALSE THEN 'P' 9 | WHEN c1.c710 IS FALSE AND c2.c710 IS TRUE THEN 'R' 10 | ELSE NULL 11 | END) c710_status 12 | FROM analysis.gold_set ss 13 | LEFT JOIN analysis.gatk_c710_status c1 ON c1.aliquot_barcode = ss.tumor_barcode_a 14 | LEFT JOIN analysis.gatk_c710_status c2 ON c2.aliquot_barcode = ss.tumor_barcode_b 15 | LEFT JOIN clinical.subtypes su ON su.case_barcode = ss.case_barcode -------------------------------------------------------------------------------- /sql/heatmap/heatmap_clinical.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | cp.tumor_pair_barcode, 3 | cp.case_barcode, 4 | (CASE WHEN recurrence_location = 'Distal' THEN 1 WHEN recurrence_location = 'Local' THEN 0 ELSE NULL END) location_distal, 5 | (CASE WHEN grade_change = 'Grade up' THEN 1 WHEN grade_change IN ('Grade up', 
'Grade stable') THEN 0 ELSE NULL END) grade_change, 6 | (CASE WHEN received_alk = '1' THEN 1 WHEN received_alk = '0' THEN 0 ELSE NULL END) received_alk, 7 | (CASE WHEN received_rt = '1' THEN 1 WHEN received_rt = '0' THEN 0 ELSE NULL END) received_rt, 8 | (CASE WHEN hypermutator_status = '1' THEN 1 WHEN hypermutator_status = '0' THEN 0 ELSE NULL END) is_hypermutator, 9 | idh_codel_subtype 10 | FROM analysis.tumor_clinical_comparison cp 11 | INNER JOIN analysis.gold_set ss ON ss.tumor_pair_barcode = cp.tumor_pair_barcode 12 | LEFT JOIN clinical.subtypes st ON st.case_barcode = cp.case_barcode -------------------------------------------------------------------------------- /sql/heatmap/heatmap_drivers.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | ss.tumor_pair_barcode, 3 | ss.case_barcode, 4 | ss.tumor_barcode_a, 5 | ss.tumor_barcode_b, 6 | st.idh_codel_subtype, 7 | snv_driver_count::integer, 8 | cnv_driver_count::integer, 9 | snv_driver_count_shared::integer, 10 | cnv_driver_count_shared::integer, 11 | snv_driver_count_private_a::integer, 12 | cnv_driver_count_private_a::integer, 13 | snv_driver_count_private_b::integer, 14 | cnv_driver_count_private_b::integer, 15 | snv_driver_shared, 16 | cnv_driver_shared, 17 | snv_driver_stability, 18 | cnv_driver_stability, 19 | snv_driver_change_a, 20 | snv_driver_change_b, 21 | cnv_driver_change_a, 22 | cnv_driver_change_b, 23 | snv_driver_context_shared, 24 | cnv_driver_context_shared, 25 | cnv_driver_context_change, 26 | snv_driver_context_change, 27 | snv_driver_evolution 28 | FROM analysis.gold_set ss 29 | LEFT JOIN analysis.driver_status_snv dss ON ss.tumor_pair_barcode = dss.tumor_pair_barcode 30 | LEFT JOIN analysis.driver_status_cnv dsc ON ss.tumor_pair_barcode = dsc.tumor_pair_barcode 31 | LEFT JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode -------------------------------------------------------------------------------- 
/sql/heatmap/heatmap_evolution.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tumor_pair_barcode, ss.case_barcode, idh_codel_subtype, tumor_barcode_a, tumor_barcode_b, 3 | s1.most_probable_classification AS evolution_a, 4 | s2.most_probable_classification AS evolution_b, 5 | (CASE WHEN s1.most_probable_classification IS NOT NULL AND s2.most_probable_classification IS NOT NULL THEN s1.most_probable_classification || '-' || s2.most_probable_classification END) as evolution_ab 6 | FROM analysis.gold_set ss 7 | LEFT JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 8 | LEFT JOIN analysis.subclonalselection s1 ON s1.aliquot_barcode = ss.tumor_barcode_a 9 | LEFT JOIN analysis.subclonalselection s2 ON s2.aliquot_barcode = ss.tumor_barcode_b -------------------------------------------------------------------------------- /sql/heatmap/heatmap_mf.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tmc.tumor_pair_barcode, 3 | tmc.case_barcode, 4 | idh_codel_subtype, 5 | tmc.surgical_interval_mo, 6 | tmc.count_a, 7 | tmc.count_b, 8 | tmc.union_ab, 9 | tmc.intersection_ab, 10 | tmc.setdiff_a, 11 | tmc.setdiff_b, 12 | mf1.cumulative_coverage AS cov_a, 13 | mf2.cumulative_coverage AS cov_b, 14 | LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) AS min_cov, 15 | ROUND(setdiff_a::decimal / mf1.cumulative_coverage * 1e6, 4) AS mf_private_a, 16 | ROUND(setdiff_b::decimal / mf2.cumulative_coverage * 1e6, 4) AS mf_private_b, 17 | ROUND(intersection_ab::decimal / LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) * 1e6, 4) AS mf_shared, 18 | mf1.coverage_adj_mut_freq AS mf_a, 19 | mf2.coverage_adj_mut_freq AS mf_b 20 | FROM analysis.tumor_mut_comparison tmc 21 | INNER JOIN analysis.gold_set stp ON tmc.tumor_pair_barcode = stp.tumor_pair_barcode 22 | LEFT JOIN analysis.mut_freq mf1 ON mf1.aliquot_barcode = tmc.tumor_barcode_a 23 | LEFT JOIN analysis.mut_freq mf2 
ON mf2.aliquot_barcode = tmc.tumor_barcode_b 24 | LEFT JOIN clinical.subtypes su ON su.case_barcode = stp.case_barcode -------------------------------------------------------------------------------- /sql/heatmap/heatmap_purity.sql: -------------------------------------------------------------------------------- 1 | SELECT -- titan_params purity and seqz_params cellularity for tumor_barcode_a (*_a) and tumor_barcode_b (*_b) of each gold_set entry 2 | ss.case_barcode, 3 | tp1.purity AS purity_a, 4 | tp2.purity AS purity_b, 5 | sp1.cellularity AS seqz_purity_a, 6 | sp2.cellularity AS seqz_purity_b, -- b-side value must come from sp2 (joined via p2 / tumor_barcode_b), not sp1 7 | idh_codel_subtype 8 | FROM analysis.gold_set ss 9 | LEFT JOIN analysis.pairs p1 ON p1.tumor_barcode = ss.tumor_barcode_a 10 | LEFT JOIN variants.titan_params tp1 ON tp1.pair_barcode = p1.pair_barcode 11 | LEFT JOIN analysis.pairs p2 ON p2.tumor_barcode = ss.tumor_barcode_b 12 | LEFT JOIN variants.titan_params tp2 ON tp2.pair_barcode = p2.pair_barcode 13 | LEFT JOIN clinical.subtypes su ON su.case_barcode = ss.case_barcode 14 | LEFT JOIN variants.seqz_params sp1 ON sp1.pair_barcode = p1.pair_barcode 15 | LEFT JOIN variants.seqz_params sp2 ON sp2.pair_barcode = p2.pair_barcode -------------------------------------------------------------------------------- /sql/heatmap/heatmap_pyclone_clusters.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype 5 | FROM analysis.gold_set ss 6 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 7 | ), 8 | selected_aliquots AS 9 | ( 10 | SELECT tumor_barcode_a AS aliquot_barcode, idh_codel_subtype, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs 11 | UNION 12 | SELECT tumor_barcode_b AS aliquot_barcode, idh_codel_subtype, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs 13 | ), 14 | pyclone_clusters AS 15 | ( 16 | SELECT sa.case_barcode, sa.idh_codel_subtype, pc.cluster_id, COUNT(*) AS num_samples, min(size) as cluster_size, MIN(mean) as min_ccf, 
MAX(mean) AS max_ccf, sum(mean)/COUNT(mean) AS mean_ccf 17 | FROM variants.pyclone_cluster pc 18 | RIGHT JOIN selected_aliquots sa ON sa.aliquot_barcode = pc.aliquot_barcode 19 | GROUP BY 1,2,3 20 | HAVING MIN(mean) > 0.1 OR MAX(mean) > 0.1 OR bool_and(mean IS NULL) 21 | ORDER BY 5 DESC 22 | ) 23 | --SELECT DISTINCT case_barcode FROM pyclone_clusters 24 | SELECT * FROM pyclone_clusters ORDER BY 1,2 -------------------------------------------------------------------------------- /sql/heatmap/heatmap_signatures.sql: -------------------------------------------------------------------------------- 1 | /*SELECT gs.case_barcode, idh_codel_subtype, fraction, signature, mut_n, abs_score, rel_score 2 | FROM analysis.mut_sig_fraction ms 3 | INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode 4 | INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode 5 | WHERE signature IN (1,3,11,15,26)*/ 6 | WITH t1 AS ( 7 | SELECT gs.case_barcode, idh_codel_subtype, fraction, signature, mut_n, abs_score, rel_score, RANK() OVER (PARTITION BY gs.case_barcode,fraction ORDER BY rel_score DESC) AS rnk 8 | FROM analysis.mut_sig_fraction_limited ms 9 | INNER JOIN analysis.gold_set gs ON gs.tumor_pair_barcode = ms.tumor_pair_barcode 10 | INNER JOIN clinical.subtypes st ON st.case_barcode = gs.case_barcode 11 | ) 12 | SELECT * FROM t1 --WHERE rnk = 1 -------------------------------------------------------------------------------- /sql/heatmap/heatmap_time.sql: -------------------------------------------------------------------------------- 1 | WITH t1 AS 2 | ( 3 | SELECT 4 | case_barcode, 5 | surgery_number::varchar(255), 6 | surgical_interval_mo AS time_mo, 7 | idh_codel_subtype 8 | FROM clinical.surgeries 9 | WHERE idh_codel_subtype IS NOT NULL 10 | 11 | UNION 12 | 13 | SELECT 14 | cc.case_barcode, 15 | case_vital_status::varchar(255) AS surgery_number, 16 | case_overall_survival_mo AS time_mo, 17 | idh_codel_subtype 18 | FROM clinical.cases cc 19 | LEFT 
JOIN clinical.subtypes cs ON cc.case_barcode = cs.case_barcode 20 | ) 21 | SELECT t1.*, case_source 22 | FROM t1 23 | INNER JOIN analysis.gold_set ss ON t1.case_barcode = ss.case_barcode 24 | LEFT JOIN clinical.cases cc ON t1.case_barcode = cc.case_barcode -------------------------------------------------------------------------------- /sql/id_multiple_aliquot_driver_change.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Using driver stable/change annotation, identify patients with: 3 | - Many available samples 4 | - Driver change 5 | */ 6 | WITH 7 | t1 AS 8 | ( 9 | SELECT case_barcode,aliquot_analysis_type,COUNT(*) AS num_aliquots 10 | FROM biospecimen.aliquots al 11 | LEFT JOIN biospecimen.samples sa ON al.sample_barcode = sa.sample_barcode 12 | GROUP BY case_barcode,aliquot_analysis_type 13 | ORDER BY 3 DESC 14 | ) 15 | SELECT ds.case_barcode,driver_count,driver_status,target,num_aliquots 16 | FROM analysis.driver_status ds 17 | LEFT JOIN t1 ON ds.case_barcode = t1.case_barcode 18 | ORDER BY 2 DESC -------------------------------------------------------------------------------- /sql/mf_longitudinal_analysis.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tmc.tumor_pair_barcode, 3 | tmc.case_barcode, 4 | idh_codel_subtype, 5 | received_alkylating_agent, 6 | hypermutator_status, 7 | 0 AS time_birth, 8 | ca.case_age_diagnosis_years AS time_initial, 9 | ROUND(ca.case_age_diagnosis_years + (tmc.surgical_interval_mo / 12.0),2) AS time_recurrence, 10 | 0 AS mf_birth, 11 | mf1.coverage_adj_mut_freq AS mf_initial, 12 | mf2.coverage_adj_mut_freq AS mf_recurrence 13 | /*tmc.count_a, 14 | tmc.count_b, 15 | tmc.union_ab, 16 | tmc.intersection_ab, 17 | tmc.setdiff_a, 18 | tmc.setdiff_b, 19 | mf1.cumulative_coverage AS cov_a, 20 | mf2.cumulative_coverage AS cov_b, 21 | LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) AS min_cov, 22 | ROUND(setdiff_a::decimal / 
mf1.cumulative_coverage * 1e6, 4) AS mf_private_a, 23 | ROUND(setdiff_b::decimal / mf2.cumulative_coverage * 1e6, 4) AS mf_private_b, 24 | ROUND(intersection_ab::decimal / LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) * 1e6, 4) AS mf_shared,*/ 25 | FROM analysis.tumor_mut_comparison tmc 26 | INNER JOIN analysis.silver_set stp ON tmc.tumor_pair_barcode = stp.tumor_pair_barcode 27 | LEFT JOIN clinical.clinical_by_tumor_pair ctp ON ctp.tumor_pair_barcode = stp.tumor_pair_barcode 28 | LEFT JOIN analysis.mutation_freq mf1 ON mf1.aliquot_barcode = tmc.tumor_barcode_a 29 | LEFT JOIN analysis.mutation_freq mf2 ON mf2.aliquot_barcode = tmc.tumor_barcode_b 30 | LEFT JOIN clinical.subtypes su ON su.case_barcode = stp.case_barcode 31 | LEFT JOIN clinical.cases ca ON ca.case_barcode = stp.case_barcode -------------------------------------------------------------------------------- /sql/mut_freq/mut_freq.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Compute mutation frequencies for each aliquot_barcode 3 | - Mutation frequency is output in mutations per megabase (1e6 basepairs) 4 | - Only mutations with >= 15x are counted 5 | - Mutation counts are divided by the number of basepairs with at least 15x coverage 6 | - NULLIF turns a zero cumulative_coverage into NULL so the division cannot raise "division by zero"; COALESCE then reports 0 for that aliquot 7 | - JOIN to blocklist so we don't report any aliquots that were excluded based on fingerprinting or coverage (NOTE(review): this query contains no blocklist JOIN - confirm exclusions are applied elsewhere) 8 | 9 | Note: 10 | - for the ssm2_count table I counted mutation using greater than (>) --> 14 threshold 11 | - for the coverage table I counted coverage using greater than or equal to (>=) --> 15 threshold 12 | */ 13 | 14 | SELECT 15 | m2.aliquot_barcode, 16 | cumulative_coverage, 17 | ssm2_call_count AS mutation_count, 18 | COALESCE(ROUND(ssm2_call_count::numeric / NULLIF(cumulative_coverage::numeric, 0) * 1e6, 4), 0::numeric) AS coverage_adj_mut_freq 19 | FROM variants.ssm2_count m2 20 | INNER JOIN analysis.coverage cov ON cov.aliquot_barcode = 
m2.aliquot_barcode 21 | WHERE m2.ad_depth = 14 AND cov.coverage = 15 22 | 23 | -------------------------------------------------------------------------------- /sql/mut_sig/archive/mut_sig_aliquot.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Calculate mutational signatures 3 | By Aliquot 4 | */ 5 | WITH selected_aliquots AS 6 | ( 7 | SELECT ba.aliquot_barcode 8 | FROM biospecimen.aliquots ba 9 | LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = ba.aliquot_barcode 10 | LEFT JOIN biospecimen.samples bs ON bs.sample_barcode = ba.sample_barcode 11 | WHERE fingerprint_exclusion = 'allow' AND coverage_exclusion = 'allow' AND sample_type NOT IN ('NM','NB') --AND ba.aliquot_barcode IN ('GLSS-HK-0002-R1-01D-WGS-S3QETN') --,'GLSS-DK-0008-R1-01D-WXS-DDD4B8','TCGA-06-0190-R1-01D-WGS-P20F5P','TCGA-14-1402-R1-01D-WGS-2EHMQ2')--('GLSS-CU-R008-TP-01D-WXS-0238UJ','GLSS-HK-0004-R1-01D-WGS-RYFPEB') 12 | ), 13 | variant_contexts AS 14 | ( 15 | SELECT DISTINCT ref_context AS trinucleotide_context, alt 16 | FROM ref.signature_proba sp 17 | ), 18 | variant_contexts_aliquots AS 19 | ( 20 | SELECT * 21 | FROM selected_aliquots, variant_contexts 22 | ), 23 | variant_context_counts AS 24 | ( 25 | SELECT aliquot_barcode, trinucleotide_context, pa.alt, COUNT(*) AS mut_n 26 | FROM variants.passgeno pg 27 | INNER JOIN variants.passanno pa ON pa.variant_id = pg.variant_id 28 | WHERE ssm2_pass_call IS TRUE AND variant_type = 'SNP' AND ad_alt + ad_ref >= 15 29 | GROUP BY 1,2,3 30 | ), 31 | variant_context_counts_aliquots AS 32 | ( 33 | SELECT vca.*, COALESCE(mut_n,0) AS mut_n, SUM(COALESCE(mut_n,0)) OVER (PARTITION BY vca.aliquot_barcode) AS mut_n_total 34 | FROM variant_contexts_aliquots vca 35 | LEFT JOIN variant_context_counts vcc ON vcc.aliquot_barcode = vca.aliquot_barcode AND vcc.trinucleotide_context = vca.trinucleotide_context AND vcc.alt = vca.alt 36 | ), 37 | ref_context_array AS 38 | ( 39 | SELECT array_agg(a ORDER BY signature) 
AS ref_context_arr 40 | FROM (SELECT signature, array_agg(proba ORDER BY ref_context,alt) a FROM ref.signature_proba sp GROUP BY 1) t 41 | ), 42 | context_reconstruction AS 43 | ( 44 | SELECT aliquot_barcode,ref_context_arr,sum(mut_n) AS mut_n, array_agg(mut_n ORDER BY trinucleotide_context,alt), lsqnonneg(ref_context_arr, array_agg(mut_n ORDER BY trinucleotide_context,alt)) AS mut_sigs 45 | FROM variant_context_counts_aliquots, ref_context_array 46 | WHERE mut_n_total > 1 47 | GROUP BY 1,2 48 | ) 49 | SELECT aliquot_barcode, generate_series(1,30) AS signature, mut_n, unnest(mut_sigs) AS abs_score, UNNEST(mut_sigs) / (SELECT SUM(s) FROM UNNEST(mut_sigs) s) AS rel_score 50 | FROM context_reconstruction -------------------------------------------------------------------------------- /sql/mut_sig/archive/mut_sig_gene.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Calculate mutational signatures 3 | By Gene 4 | */ 5 | WITH selected_genes AS 6 | ( 7 | SELECT DISTINCT gene_symbol 8 | FROM variants.anno 9 | ORDER BY 1 10 | ), 11 | variant_contexts AS 12 | ( 13 | SELECT DISTINCT ref_context AS trinucleotide_context, alt 14 | FROM ref.signature_proba sp 15 | ), 16 | variant_contexts_genes AS 17 | ( 18 | SELECT * 19 | FROM selected_genes, variant_contexts 20 | ), 21 | variant_context_counts AS 22 | ( 23 | SELECT gene_symbol, trinucleotide_context, pa.alt, COUNT(*) AS mut_n 24 | FROM variants.passgeno pg 25 | INNER JOIN variants.passanno pa ON pa.variant_id = pg.variant_id 26 | LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = pg.aliquot_barcode 27 | WHERE ssm2_pass_call IS TRUE AND variant_type = 'SNP' AND ad_alt + ad_ref >= 15 AND fingerprint_exclusion = 'allow' AND coverage_exclusion = 'allow' AND variant_classification = 'MISSENSE' 28 | GROUP BY 1,2,3 29 | ), 30 | variant_context_counts_genes AS 31 | ( 32 | SELECT vca.*, COALESCE(mut_n,0) AS mut_n, SUM(COALESCE(mut_n,0)) OVER (PARTITION BY vca.gene_symbol) AS 
mut_n_total 33 | FROM variant_contexts_genes vca 34 | LEFT JOIN variant_context_counts vcc ON vcc.gene_symbol = vca.gene_symbol AND vcc.trinucleotide_context = vca.trinucleotide_context AND vcc.alt = vca.alt 35 | ), 36 | ref_context_array AS 37 | ( 38 | SELECT array_agg(a ORDER BY signature) AS ref_context_arr 39 | FROM (SELECT signature, array_agg(proba ORDER BY ref_context,alt) a FROM ref.signature_proba sp GROUP BY 1) t 40 | ), 41 | context_reconstruction AS 42 | ( 43 | SELECT gene_symbol,ref_context_arr,sum(mut_n) AS mut_n, array_agg(mut_n ORDER BY trinucleotide_context,alt), lsqnonneg(ref_context_arr, array_agg(mut_n ORDER BY trinucleotide_context,alt)) AS mut_sigs 44 | FROM variant_context_counts_genes, ref_context_array 45 | WHERE mut_n_total > 9 46 | GROUP BY 1,2 47 | ) 48 | SELECT gene_symbol, generate_series(1,30) AS signature, mut_n, unnest(mut_sigs) AS abs_score, UNNEST(mut_sigs) / (SELECT SUM(s) FROM UNNEST(mut_sigs) s) AS rel_score 49 | FROM context_reconstruction -------------------------------------------------------------------------------- /sql/mut_sig/archive/mut_sig_variant_classification.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Calculate mutational signatures 3 | By Variant Classification 4 | */ 5 | WITH variant_classifications AS 6 | ( 7 | SELECT DISTINCT variant_classification 8 | FROM variants.variant_classifications 9 | ORDER BY 1 10 | ), 11 | variant_contexts AS 12 | ( 13 | SELECT DISTINCT ref_context AS trinucleotide_context, alt 14 | FROM ref.signature_proba sp 15 | ), 16 | variant_contexts_classifications AS 17 | ( 18 | SELECT * 19 | FROM variant_classifications, variant_contexts 20 | ), 21 | variant_context_counts AS 22 | ( 23 | SELECT variant_classification, trinucleotide_context, pa.alt, COUNT(*) AS mut_n 24 | FROM variants.passgeno pg 25 | INNER JOIN variants.passanno pa ON pa.variant_id = pg.variant_id 26 | LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = 
pg.aliquot_barcode 27 | WHERE ssm2_pass_call IS TRUE AND variant_type = 'SNP' AND ad_alt + ad_ref >= 15 AND fingerprint_exclusion = 'allow' AND coverage_exclusion = 'allow' 28 | GROUP BY 1,2,3 29 | ), 30 | variant_context_counts_aliquots AS 31 | ( 32 | SELECT vca.*, COALESCE(mut_n,0) AS mut_n, SUM(COALESCE(mut_n,0)) OVER (PARTITION BY vca.variant_classification) AS mut_n_total 33 | FROM variant_contexts_classifications vca 34 | LEFT JOIN variant_context_counts vcc ON vcc.variant_classification = vca.variant_classification AND vcc.trinucleotide_context = vca.trinucleotide_context AND vcc.alt = vca.alt 35 | ), 36 | ref_context_array AS 37 | ( 38 | SELECT array_agg(a ORDER BY signature) AS ref_context_arr 39 | FROM (SELECT signature, array_agg(proba ORDER BY ref_context,alt) a FROM ref.signature_proba sp GROUP BY 1) t 40 | ), 41 | context_reconstruction AS 42 | ( 43 | SELECT variant_classification,ref_context_arr,sum(mut_n) AS mut_n, array_agg(mut_n ORDER BY trinucleotide_context,alt), lsqnonneg(ref_context_arr, array_agg(mut_n ORDER BY trinucleotide_context,alt)) AS mut_sigs 44 | FROM variant_context_counts_aliquots, ref_context_array 45 | WHERE mut_n_total > 1 46 | GROUP BY 1,2 47 | ) 48 | SELECT variant_classification, generate_series(1,30) AS signature, mut_n, unnest(mut_sigs) AS abs_score, UNNEST(mut_sigs) / (SELECT SUM(s) FROM UNNEST(mut_sigs) s) AS rel_score 49 | FROM context_reconstruction -------------------------------------------------------------------------------- /sql/neoag/neoantigen_peptide_counts.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Creates Supplementary Table 6 (?): List of all unique neoantigens in the GLASS cohort and the number of initial/recurrent tumors harboring each one 3 | A separate R script is used to save the table for publication: R/neoantigens/analysis/SuppTable6_writetotext.r 4 | */ 5 | 6 | WITH neoag_by_ali AS 7 | ( 8 | SELECT aliquot_barcode, variant_id, gene_name, mutation, 
pvacseq_protein_position, peptide_length, sub_peptide_position, mt_epitope_seq 9 | FROM analysis.neoantigens_by_aliquot neo 10 | WHERE ssm2_pass_call = TRUE 11 | GROUP BY aliquot_barcode, variant_id, gene_name, mutation, pvacseq_protein_position, peptide_length, sub_peptide_position, mt_epitope_seq 12 | ), 13 | ini_counts AS 14 | ( 15 | SELECT neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq, COUNT(*) AS total 16 | FROM analysis.gold_set gs 17 | INNER JOIN neoag_by_ali neo ON neo.aliquot_barcode = gs.tumor_barcode_a -- INNER: a tumor without neoantigens must not form a NULL-keyed group counted by COUNT(*) 18 | GROUP BY neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq 19 | ORDER BY total DESC 20 | ), 21 | rec_counts AS 22 | ( 23 | SELECT neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq, COUNT(*) AS total 24 | FROM analysis.gold_set gs 25 | INNER JOIN neoag_by_ali neo ON neo.aliquot_barcode = gs.tumor_barcode_b -- INNER: same reason as in ini_counts 26 | GROUP BY neo.gene_name, neo.pvacseq_protein_position, neo.mutation, neo.mt_epitope_seq 27 | ORDER BY total DESC 28 | ) 29 | SELECT COALESCE(ini.gene_name, rec.gene_name) AS gene_name, COALESCE(ini.pvacseq_protein_position, rec.pvacseq_protein_position) AS pvacseq_protein_position, COALESCE(ini.mutation, rec.mutation) AS mutation, COALESCE(ini.mt_epitope_seq, rec.mt_epitope_seq) AS mt_epitope_seq, 30 | COALESCE(ini.total,0) AS initial_total, 31 | COALESCE(rec.total,0) AS recurrent_total, 32 | COALESCE(ini.total,0) + COALESCE(rec.total,0) AS total_tumors 33 | FROM ini_counts ini 34 | FULL OUTER JOIN rec_counts rec ON rec.gene_name = ini.gene_name AND 35 | rec.pvacseq_protein_position = ini.pvacseq_protein_position AND 36 | rec.mutation = ini.mutation AND 37 | rec.mt_epitope_seq = ini.mt_epitope_seq -- FULL: keep neoantigens found only in recurrent tumors (key columns are COALESCEd above) 38 | ORDER BY total_tumors DESC 39 | -------------------------------------------------------------------------------- /sql/neutrality/neutralitytestr-subtype.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Prepare input for use with neutralitytestr using SNVs in non-altered regions. 3 | Also, retain information about clonality AND variant_classification and variant_type. 
4 | */ 5 | 6 | WITH t1 AS (SELECT 7 | pg.tumor_pair_barcode, 8 | pg.case_barcode, 9 | pg.tumor_barcode_a, 10 | pg.tumor_barcode_b, 11 | pg.chrom, 12 | pg.pos, 13 | pg.variant_id, 14 | pg.variant_type, 15 | pg.variant_classification, 16 | pg.mutect2_call_a, 17 | pg.mutect2_call_b, 18 | pl1.cellular_prevalence AS cellular_prevalence_a, 19 | pl1.variant_allele_frequency AS variant_allele_frequency_a, 20 | (CASE WHEN pl1.cellular_prevalence >= 0.5 THEN 'C' WHEN pl1.cellular_prevalence < 0.5 THEN 'S' END) AS clonality_a, 21 | pl2.cellular_prevalence AS cellular_prevalence_b, 22 | pl2.variant_allele_frequency AS variant_allele_frequency_b, 23 | (CASE WHEN pl2.cellular_prevalence >= 0.5 THEN 'C' WHEN pl2.cellular_prevalence < 0.5 THEN 'S' END) AS clonality_b, 24 | (CASE WHEN mutect2_call_a AND mutect2_call_b THEN 'S' 25 | WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P' 26 | WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R' END) AS fraction 27 | FROM variants.pgeno pg 28 | LEFT JOIN variants.pyclone_loci pl1 ON pl1.variant_id = pg.variant_id AND pl1.aliquot_barcode = pg.tumor_barcode_a 29 | LEFT JOIN variants.pyclone_loci pl2 ON pl2.variant_id = pg.variant_id AND pl2.aliquot_barcode= pg.tumor_barcode_b 30 | INNER JOIN analysis.gold_set gs ON pg.tumor_pair_barcode = gs.tumor_pair_barcode 31 | WHERE pl1.cellular_prevalence IS NOT NULL) 32 | 33 | SELECT 34 | t1.tumor_pair_barcode, 35 | t1.case_barcode, 36 | t1.tumor_barcode_a, 37 | t1.tumor_barcode_b, 38 | t1.chrom, 39 | t1.pos, 40 | t1.variant_id, 41 | t1.variant_type, 42 | t1.variant_classification, 43 | t1.mutect2_call_a, 44 | t1.mutect2_call_b, 45 | t1.cellular_prevalence_a, 46 | t1.cellular_prevalence_b, 47 | t1.variant_allele_frequency_a, 48 | t1.variant_allele_frequency_b, 49 | t1.clonality_a, 50 | t1.clonality_b, 51 | t1.fraction, 52 | gs1.cnv_call AS cnv_call_a, 53 | gs2.cnv_call AS cnv_call_b 54 | FROM t1 55 | LEFT JOIN variants.gatk_seg gs1 ON gs1.aliquot_barcode = t1.tumor_barcode_a AND gs1.chrom = t1.chrom 
AND gs1.pos && t1.pos 56 | LEFT JOIN variants.gatk_seg gs2 ON gs2.aliquot_barcode = t1.tumor_barcode_b AND gs2.chrom = t1.chrom AND gs2.pos && t1.pos 57 | WHERE t1.fraction IS NOT NULL 58 | 59 | -- END -- -------------------------------------------------------------------------------- /sql/neutrality/original_submission/neutrality-testr-input-mutect2.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Prepare input for use with Neutrality Testr 3 | */ 4 | SELECT 5 | gtc.case_barcode, 6 | gtc.tumor_barcode_a, 7 | gtc.tumor_barcode_b, 8 | gtc.chrom, 9 | gtc.pos, 10 | gtc.alt, 11 | gtc.ref_count_a, 12 | gtc.ref_count_b, 13 | gtc.alt_count_a, 14 | gtc.alt_count_b, 15 | gtc.ref_count_a + gtc.ref_count_b AS ref_count_ab, 16 | gtc.alt_count_a + gtc.alt_count_b AS alt_count_ab, 17 | ROUND(gtc.alt_count_a::decimal / (gtc.alt_count_a + gtc.ref_count_a),4) AS vaf_a, 18 | ROUND(gtc.alt_count_b::decimal / (gtc.alt_count_b + gtc.ref_count_b),4) AS vaf_b, 19 | ROUND((gtc.alt_count_a::decimal + gtc.alt_count_b::decimal) / (gtc.alt_count_a + gtc.alt_count_b + gtc.ref_count_a + gtc.ref_count_b),4) AS vaf_ab, 20 | (CASE WHEN mutect2_call_a AND mutect2_call_b THEN 'S' WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P' WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R' END) AS status 21 | FROM analysis.master_genotype_comparison gtc 22 | LEFT JOIN analysis.snvs snvs ON snvs.chrom = gtc.chrom AND snvs.pos = gtc.pos AND snvs.alt = gtc.alt 23 | WHERE 24 | (mutect2_call_a OR mutect2_call_b) AND 25 | (gtc.alt_count_a + gtc.ref_count_a) >= 30 AND 26 | (gtc.alt_count_b + gtc.ref_count_b) >= 30 AND 27 | (gtc.alt_count_a > 0 OR gtc.alt_count_b > 0) 28 | 29 | -- END -- -------------------------------------------------------------------------------- /sql/neutrality/original_submission/neutrality_testr_input.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Prepare input for use with Neutrality Testr 3 | 
*/ 4 | SELECT 5 | gtc.case_barcode, 6 | gtc.tumor_barcode_a, 7 | gtc.tumor_barcode_b, 8 | gtc.chrom, 9 | gtc.pos, 10 | gtc.alt, 11 | gtc.ref_count_a, 12 | gtc.ref_count_b, 13 | gtc.alt_count_a, 14 | gtc.alt_count_b, 15 | gtc.ref_count_a + gtc.ref_count_b AS ref_count_ab, 16 | gtc.alt_count_a + gtc.alt_count_b AS alt_count_ab, 17 | ROUND(gtc.alt_count_a::decimal / (gtc.alt_count_a + gtc.ref_count_a),4) AS vaf_a, 18 | ROUND(gtc.alt_count_b::decimal / (gtc.alt_count_b + gtc.ref_count_b),4) AS vaf_b, 19 | ROUND((gtc.alt_count_a::decimal + gtc.alt_count_b::decimal) / (gtc.alt_count_a + gtc.alt_count_b + gtc.ref_count_a + gtc.ref_count_b),4) AS vaf_ab, 20 | (CASE WHEN gtc.alt_count_a > 0 AND gtc.alt_count_b > 0 THEN 'S' WHEN gtc.alt_count_a > 0 AND NOT gtc.alt_count_b > 0 THEN 'P' WHEN gtc.alt_count_b > 0 AND NOT gtc.alt_count_a > 0 THEN 'R' END) AS status 21 | FROM analysis.master_genotype_comparison gtc 22 | LEFT JOIN analysis.snvs snvs ON snvs.chrom = gtc.chrom AND snvs.pos = gtc.pos AND snvs.alt = gtc.alt 23 | WHERE 24 | (mutect2_call_a OR mutect2_call_b) AND 25 | (gtc.alt_count_a + gtc.ref_count_a) >= 30 AND 26 | (gtc.alt_count_b + gtc.ref_count_b) >= 30 AND 27 | (gtc.alt_count_a > 0 OR gtc.alt_count_b > 0) 28 | 29 | -- END -- -------------------------------------------------------------------------------- /sql/neutrality/original_submission/neutralitytestr-input-aliquot-level.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Prepare aliquot-level input for use with Neutrality Testr 3 | */ 4 | SELECT 5 | gtc.aliquot_barcode, 6 | gtc.case_barcode, 7 | gtc.chrom::character varying(2), 8 | gtc.pos, 9 | gtc.alt, 10 | gtc.ref_count, 11 | gtc.alt_count, 12 | ROUND(gtc.alt_count::decimal / (gtc.alt_count + gtc.ref_count),4) AS vaf 13 | FROM analysis.genotypes gtc 14 | WHERE 15 | (mutect2_call) AND 16 | (gtc.alt_count + gtc.ref_count) >= 30 17 | -- END -- 18 | 
-------------------------------------------------------------------------------- /sql/pyclone/pyclone_aliquots.sql: -------------------------------------------------------------------------------- 1 | WITH selected_aliquots 2 | AS ( 3 | SELECT 4 | sa.case_barcode, 5 | al.aliquot_barcode, 6 | round(tp.purity::numeric, 2) AS purity, 7 | COUNT(*) OVER (PARTITION BY sa.case_barcode) AS num_samples /* partition on the selected case key; su.case_barcode is NULL when the LEFT JOIN to surgeries finds no row */ 8 | FROM biospecimen.aliquots al 9 | LEFT JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode 10 | LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = al.aliquot_barcode 11 | LEFT JOIN clinical.surgeries su ON al.sample_barcode = su.sample_barcode 12 | LEFT JOIN analysis.pairs pa ON al.aliquot_barcode = pa.tumor_barcode 13 | LEFT JOIN variants.titan_params tp ON tp.pair_barcode = pa.pair_barcode 14 | WHERE 15 | bl.fingerprint_exclusion = 'allow' AND 16 | bl.coverage_exclusion = 'allow' AND 17 | sa.sample_type NOT IN ('NB','NM') 18 | ORDER BY su.case_barcode, su.surgery_number, al.aliquot_portion 19 | ) 20 | SELECT 21 | case_barcode, 22 | aliquot_barcode, 23 | purity 24 | FROM selected_aliquots 25 | WHERE num_samples > 1 -------------------------------------------------------------------------------- /sql/pyclone/pyclone_cluster_pairs.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype 5 | FROM analysis.gold_set ss 6 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 7 | ), 8 | pyclone_clusters AS 9 | ( 10 | SELECT stp.case_barcode, stp.idh_codel_subtype, pc1.cluster_id, pc1.size AS size, pc1.mean AS ccf_a, pc2.mean AS ccf_b, 11 | (RANK() OVER (PARTITION BY stp.case_barcode ORDER BY pc1.mean DESC))::integer AS rank_a, 12 | (RANK() OVER (PARTITION BY stp.case_barcode ORDER BY pc2.mean DESC))::integer AS rank_b 13 | FROM selected_tumor_pairs stp 14 | INNER JOIN 
variants.pyclone_cluster pc1 ON pc1.aliquot_barcode = stp.tumor_barcode_a 15 | INNER JOIN variants.pyclone_cluster pc2 ON pc2.aliquot_barcode = stp.tumor_barcode_b AND pc2.cluster_id = pc1.cluster_id 16 | WHERE pc1.size > 1 AND (pc1.mean > 0.1 OR pc2.mean > 0.1) 17 | ) 18 | SELECT * FROM pyclone_clusters -------------------------------------------------------------------------------- /sql/pyclone/pyclone_cluster_pairs_anno_drivers.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype 5 | FROM analysis.gold_set ss 6 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 7 | ), 8 | pyclone_clusters AS 9 | ( 10 | SELECT stp.case_barcode, stp.idh_codel_subtype, pc1.cluster_id, pc1.size AS size, pc1.mean AS ccf_a, pc2.mean AS ccf_b, 11 | (RANK() OVER (PARTITION BY stp.case_barcode ORDER BY pc1.mean DESC))::integer AS rank_a, 12 | (RANK() OVER (PARTITION BY stp.case_barcode ORDER BY pc2.mean DESC))::integer AS rank_b 13 | FROM selected_tumor_pairs stp 14 | INNER JOIN variants.pyclone_cluster pc1 ON pc1.aliquot_barcode = stp.tumor_barcode_a 15 | INNER JOIN variants.pyclone_cluster pc2 ON pc2.aliquot_barcode = stp.tumor_barcode_b AND pc2.cluster_id = pc1.cluster_id 16 | WHERE pc1.size > 1 AND (pc1.mean > 0.1 OR pc2.mean > 0.1) 17 | ), 18 | selected_genes AS 19 | ( 20 | SELECT DISTINCT sn.gene_symbol, variant_id, chrom, pos, alt, sn.variant_classification, variant_classification_priority, protein_change 21 | FROM variants.passanno sn 22 | INNER JOIN ref.driver_genes ds ON ds.gene_symbol = sn.gene_symbol 23 | LEFT JOIN variants.variant_classifications vc ON sn.variant_classification = vc.variant_classification 24 | WHERE 25 | has_mut IS TRUE AND 26 | ((sn.gene_symbol NOT IN ('TERT','IDH1') AND variant_classification_priority IS NOT NULL) OR 27 | (sn.gene_symbol = 'TERT' AND 
sn.variant_classification = '5''Flank' AND lower(sn.pos) IN (1295228,1295250)) OR 28 | (sn.gene_symbol = 'IDH1' AND sn.protein_change IN ('p.R132C','p.R132G','p.R132H','p.R132S'))) 29 | ), 30 | selected_genes_geno AS 31 | ( 32 | SELECT DISTINCT case_barcode, cluster_id, string_agg(DISTINCT gene_symbol, ', ') AS drivers 33 | FROM selected_genes sg 34 | INNER JOIN variants.passgeno pg ON pg.variant_id = sg.variant_id 35 | INNER JOIN variants.pyclone_loci pl ON pl.variant_id = sg.variant_id AND pl.aliquot_barcode = pg.aliquot_barcode 36 | WHERE ssm2_pass_call IS TRUE 37 | GROUP BY 1,2 38 | ) 39 | --SELECT * FROM selected_genes_geno ORDER BY 3 DESC 40 | SELECT pc.case_barcode, idh_codel_subtype, pc.cluster_id, size, ccf_a, ccf_b, rank_a, rank_b, drivers 41 | FROM pyclone_clusters pc 42 | LEFT JOIN selected_genes_geno sgg ON sgg.case_barcode = pc.case_barcode AND sgg.cluster_id = pc.cluster_id -------------------------------------------------------------------------------- /sql/pyclone/pyclone_cluster_stats.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype 5 | FROM analysis.gold_set ss 6 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 7 | ), 8 | selected_aliquots AS 9 | ( 10 | SELECT tumor_barcode_a AS aliquot_barcode, idh_codel_subtype AS subtype, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs 11 | UNION 12 | SELECT tumor_barcode_b AS aliquot_barcode, idh_codel_subtype AS subtype, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs 13 | ), 14 | driver_counts AS 15 | ( 16 | SELECT case_barcode, cluster_id, COUNT(*) AS num_drivers 17 | FROM variants.passanno pa 18 | INNER JOIN ref.driver_genes dg ON dg.gene_symbol = pa.gene_symbol 19 | INNER JOIN variants.pyclone_loci pl ON pl.variant_id = pa.variant_id 20 | INNER JOIN biospecimen.aliquots al ON 
al.aliquot_barcode = pl.aliquot_barcode 21 | INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode 22 | WHERE has_mut IS TRUE AND variant_allele_frequency > 0 23 | GROUP BY 1,2 24 | ), 25 | pyclone_clusters AS 26 | ( 27 | SELECT sa.case_barcode, pc.cluster_id, COUNT(*) AS num_samples, min(size) as cluster_size, MIN(mean) as min_ccf, MAX(mean) AS max_ccf, sum(mean)/COUNT(mean) AS mean_ccf 28 | FROM variants.pyclone_cluster pc 29 | --INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = pc.aliquot_barcode 30 | INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = pc.aliquot_barcode 31 | --INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode 32 | GROUP BY 1,2 33 | HAVING MIN(mean) > 0.1 OR MAX(mean) > 0.1 /* group filter; HAVING must precede ORDER BY */ 34 | ORDER BY 5 DESC 35 | ), 36 | t2 AS 37 | ( 38 | SELECT t1.case_barcode, 39 | COUNT(t1.cluster_id) AS num_clust, 40 | COUNT(CASE WHEN min_ccf > 0.25 THEN 1 END) AS num_clonal, 41 | bool_or(min_ccf > 0.25 AND num_drivers > 0)::integer AS any_clonal_drivers, 42 | bool_or(min_ccf <= 0.25 AND num_drivers > 0)::integer AS any_subclonal_drivers 43 | --COUNT(CASE WHEN min_ccf > 0.25 THEN num_drivers END) AS num_drivers_clonal, 44 | --COUNT(CASE WHEN min_ccf <= 0.25 THEN num_drivers END) AS num_drivers_subclonal 45 | FROM pyclone_clusters t1 46 | LEFT JOIN driver_counts t0 ON t0.case_barcode = t1.case_barcode AND t0.cluster_id = t1.cluster_id 47 | GROUP BY 1 48 | ORDER BY 3 DESC 49 | ) 50 | SELECT * FROM t2 -------------------------------------------------------------------------------- /sql/pyclone/pyclone_cluster_stats2.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT tumor_pair_barcode, tumor_barcode_a, tumor_barcode_b, ss.case_barcode, idh_codel_subtype 5 | FROM analysis.gold_set ss 6 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 7 | ), 8 | selected_aliquots AS 9 | ( 10 | SELECT tumor_barcode_a AS 
aliquot_barcode, idh_codel_subtype, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs 11 | UNION 12 | SELECT tumor_barcode_b AS aliquot_barcode, idh_codel_subtype, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs 13 | ), 14 | driver_counts AS 15 | ( 16 | SELECT case_barcode, cluster_id, COUNT(*) AS num_drivers 17 | FROM variants.passanno pa 18 | INNER JOIN ref.driver_genes dg ON dg.gene_symbol = pa.gene_symbol 19 | INNER JOIN variants.pyclone_loci pl ON pl.variant_id = pa.variant_id 20 | INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = pl.aliquot_barcode 21 | INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode 22 | WHERE has_mut IS TRUE AND variant_allele_frequency > 0 23 | GROUP BY 1,2 24 | ), 25 | pyclone_clusters AS 26 | ( 27 | SELECT pc.aliquot_barcode, sa.case_barcode, pc.cluster_id, sample_type, idh_codel_subtype, pc.size, pc.mean, pc.std, num_drivers 28 | FROM variants.pyclone_cluster pc 29 | INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = pc.aliquot_barcode 30 | LEFT JOIN driver_counts t0 ON t0.case_barcode = sa.case_barcode AND t0.cluster_id = pc.cluster_id 31 | WHERE pc.size > 1 32 | )/*, 33 | t2 AS 34 | ( 35 | SELECT t1.case_barcode, 36 | COUNT(t1.cluster_id) AS num_clust, 37 | COUNT(CASE WHEN min_ccf > 0.25 THEN 1 END) AS num_clonal, 38 | bool_or(min_ccf > 0.25 AND num_drivers > 0)::integer AS any_clonal_drivers, 39 | bool_or(min_ccf <= 0.25 AND num_drivers > 0)::integer AS any_subclonal_drivers 40 | --COUNT(CASE WHEN min_ccf > 0.25 THEN num_drivers END) AS num_drivers_clonal, 41 | --COUNT(CASE WHEN min_ccf <= 0.25 THEN num_drivers END) AS num_drivers_subclonal 42 | FROM pyclone_clusters t1 43 | LEFT JOIN driver_counts t0 ON t0.case_barcode = t1.case_barcode AND t0.cluster_id = t1.cluster_id 44 | GROUP BY 1 45 | ORDER BY 3 DESC 46 | )*/ 47 | SELECT * FROM pyclone_clusters -------------------------------------------------------------------------------- /sql/pyclone/pyclone_create_tsv.sql: 
-------------------------------------------------------------------------------- 1 | WITH 2 | t1 AS ( 3 | SELECT 4 | gt.case_barcode, 5 | gt.aliquot_barcode, 6 | gt.variant_id::integer AS mutation_id, 7 | ad_ref AS ref_counts, 8 | ad_alt AS var_counts, 9 | (CASE WHEN case_sex = 'male' AND gt.chrom = 23 THEN 1 ELSE 2 END) AS normal_cn, 10 | minor_cn, 11 | major_cn, 12 | (COUNT(*) OVER (PARTITION BY gt.case_barcode, gt.variant_id)) AS num_aliquots_variants 13 | FROM variants.passgeno gt 14 | INNER JOIN biospecimen.aliquots al ON al.aliquot_barcode = gt.aliquot_barcode 15 | INNER JOIN biospecimen.samples sa ON sa.sample_barcode = al.sample_barcode 16 | INNER JOIN analysis.blocklist bl ON bl.aliquot_barcode = al.aliquot_barcode 17 | INNER JOIN analysis.pairs ps ON ps.tumor_barcode = gt.aliquot_barcode 18 | INNER JOIN clinical.cases cs ON cs.case_barcode = gt.case_barcode 19 | INNER JOIN variants.titan_seg ts ON ts.pair_barcode = ps.pair_barcode AND ts.chrom = gt.chrom AND ts.pos && gt.pos 20 | WHERE 21 | gt.case_barcode = ? 
AND 22 | (case_sex IS NOT NULL OR gt.chrom <> 23) AND 23 | major_cn > 0 AND 24 | ad_ref + ad_alt >= 30 AND 25 | minor_cn IS NOT NULL AND 26 | major_cn IS NOT NULL AND 27 | bl.fingerprint_exclusion = 'allow' AND 28 | bl.coverage_exclusion = 'allow' AND 29 | sa.sample_type NOT IN ('NB','NM') 30 | ), 31 | t2 AS ( 32 | SELECT case_barcode,(COUNT(DISTINCT aliquot_barcode)) AS num_aliquots FROM t1 GROUP BY 1 33 | ), 34 | t3 AS ( 35 | SELECT t1.case_barcode,aliquot_barcode,mutation_id,ref_counts,var_counts,normal_cn,minor_cn,major_cn 36 | FROM t1 37 | LEFT JOIN t2 ON t1.case_barcode = t2.case_barcode 38 | WHERE num_aliquots_variants = num_aliquots AND num_aliquots > 1 39 | ), 40 | t4 AS ( 41 | SELECT aliquot_barcode,COUNT(*) 42 | FROM t3 43 | GROUP BY 1 44 | ) 45 | SELECT * FROM t3-- ORDER BY 1,2 DESC -------------------------------------------------------------------------------- /sql/roeltable.sql: -------------------------------------------------------------------------------- 1 | SELECT crosstab.case_source_description, 2 | crosstab.case_project, 3 | crosstab.aliquot_analysis_type, 4 | crosstab."Primary", 5 | crosstab."1st Recurrence", 6 | crosstab."2nd Recurrence", 7 | crosstab."3rd Recurrence", 8 | crosstab."4th Recurrence" 9 | FROM crosstab(' 10 | SELECT case_source_description, case_project, aliquot_analysis_type, sample_type, COUNT( DISTINCT ca.case_barcode ) 11 | FROM biospecimen.aliquots AS al 12 | INNER JOIN biospecimen.samples AS sa ON sa.sample_barcode = al.sample_barcode 13 | INNER JOIN clinical.cases AS ca ON ca.case_barcode = sa.case_barcode 14 | INNER JOIN clinical.case_sources AS cs ON ca.case_source = cs.case_source 15 | WHERE sa.sample_type IN (''TP'', ''R1'', ''R2'', ''R3'', ''R4'') 16 | GROUP BY case_source_description, case_project, aliquot_analysis_type, sample_type 17 | ORDER BY 2,3,1 18 | '::text, ' 19 | SELECT sample_type FROM biospecimen.sample_types WHERE sample_type IN (''TP'', ''R1'', ''R2'', ''R3'', ''R4'') 20 | '::text) 
crosstab(case_source_description character varying, case_project character(4), aliquot_analysis_type character(3), "Primary" integer, "1st Recurrence" integer, "2nd Recurrence" integer, "3rd Recurrence" integer, "4th Recurrence" integer); -------------------------------------------------------------------------------- /sql/selected_aliquots.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Select aliquots 3 | - Stringent blocklist filtering (diamond set) 4 | * Fingerprinting 5 | * Coverage 6 | * CNV 7 | - Create sample short names using surgery number and portion 8 | - Drop cases with less than 2 aliquots 9 | */ 10 | WITH 11 | selected_aliquots AS 12 | ( 13 | SELECT case_barcode, aliquot_analysis_type, al.aliquot_barcode, ROUND(purity::decimal,2) AS purity, case_barcode || '-' || aliquot_analysis_type AS short_name, COUNT(*) OVER (PARTITION BY case_barcode, aliquot_analysis_type) AS num_samples 14 | FROM biospecimen.aliquots al 15 | LEFT JOIN analysis.blocklist bl ON bl.aliquot_barcode = al.aliquot_barcode 16 | LEFT JOIN clinical.surgeries su ON al.sample_barcode = su.sample_barcode 17 | LEFT JOIN analysis.pairs pa ON al.aliquot_barcode = pa.tumor_barcode 18 | LEFT JOIN analysis.titan_params tp ON tp.pair_barcode = pa.pair_barcode 19 | WHERE 20 | bl.fingerprint_exclusion = 'allow' AND 21 | bl.coverage_exclusion = 'allow' AND 22 | bl.cnv_exclusion = 'allow' 23 | ORDER BY 1, su.surgery_number, al.aliquot_portion 24 | ) 25 | SELECT * FROM selected_aliquots WHERE num_samples > 1 -------------------------------------------------------------------------------- /sql/selected_tumor_pairs_silver.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | silver_set AS 3 | ( 4 | SELECT 5 | tumor_pair_barcode, 6 | case_barcode, 7 | tumor_barcode_a, 8 | tumor_barcode_b, 9 | row_number() OVER (PARTITION BY case_barcode ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, 
substring(tumor_pair_barcode from 27 for 3) ASC) AS priority 10 | FROM analysis.tumor_pairs ps 11 | LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a 12 | LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b 13 | WHERE 14 | comparison_type = 'longitudinal' AND 15 | sample_type_b <> 'M1' AND 16 | b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 'allow' 17 | ), 18 | gold_set AS 19 | ( 20 | SELECT 21 | tumor_pair_barcode, 22 | case_barcode, 23 | tumor_barcode_a, 24 | tumor_barcode_b, 25 | row_number() OVER (PARTITION BY case_barcode ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC) AS priority 26 | FROM analysis.tumor_pairs ps 27 | LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a 28 | LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b 29 | WHERE 30 | comparison_type = 'longitudinal' AND 31 | sample_type_b <> 'M1' AND 32 | b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 'allow' AND 33 | b1.cnv_exclusion IN ('allow','review') AND b2.cnv_exclusion IN ('allow','review') 34 | ) 35 | SELECT * 36 | FROM gold_set WHERE priority = 1 37 | 38 | EXCEPT 39 | 40 | SELECT * 41 | FROM silver_set WHERE priority = 1 42 | -------------------------------------------------------------------------------- /sql/set/gold_set.sql: -------------------------------------------------------------------------------- 1 | /* 2 | This is the initial definition but was deprecated out of confusion 3 | This definition could include cases (patients) that are also in the silver set, but using a different combination of primary and recurrence 4 | The new definition (not commented, below) instead takes the subset of the silver set 5 | Note that for GISTIC we instead took the deprecated gold set to define a set of unique primaries and unique recurrences 6 | === 7 | WITH 8 | selected_tumor_pairs AS 9 | ( 10 | SELECT 11 | 
ps.tumor_pair_barcode, 12 | ps.case_barcode, 13 | ps.tumor_barcode_a, 14 | ps.tumor_barcode_b, 15 | row_number() OVER (PARTITION BY ps.case_barcode ORDER BY ps.surgical_interval_mo DESC, ps.portion_a, ps.portion_b, ("substring"(ps.tumor_pair_barcode, 27, 3))) AS priority 16 | FROM analysis.tumor_pairs ps 17 | LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a 18 | LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b 19 | WHERE ps.comparison_type = 'longitudinal' AND ps.sample_type_b <> 'M1' AND b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 'allow' AND b1.cnv_exclusion != 'block' AND b2.cnv_exclusion != 'block' 20 | ) 21 | SELECT 22 | tumor_pair_barcode, 23 | case_barcode, 24 | tumor_barcode_a, 25 | tumor_barcode_b 26 | FROM selected_tumor_pairs 27 | WHERE selected_tumor_pairs.priority = 1*/ 28 | 29 | SELECT * 30 | FROM analysis.silver_set ss 31 | INNER JOIN analysis.blocklist bl1 ON bl1.aliquot_barcode = ss.tumor_barcode_a 32 | INNER JOIN analysis.blocklist bl2 ON bl2.aliquot_barcode = ss.tumor_barcode_b 33 | WHERE bl1.cnv_exclusion != 'block' AND bl2.cnv_exclusion != 'block' -------------------------------------------------------------------------------- /sql/set/silver_set.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT 5 | ps.tumor_pair_barcode, 6 | ps.case_barcode, 7 | ps.tumor_barcode_a, 8 | ps.tumor_barcode_b, 9 | row_number() OVER (PARTITION BY ps.case_barcode ORDER BY ps.surgical_interval_mo DESC, ps.portion_a, ps.portion_b, ("substring"(ps.tumor_pair_barcode, 27, 3))) AS priority 10 | FROM analysis.tumor_pairs ps 11 | LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a 12 | LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b 13 | WHERE ps.comparison_type = 'longitudinal' AND ps.sample_type_b <> 'M1' AND b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 
'allow' 14 | ) 15 | SELECT 16 | tumor_pair_barcode, 17 | case_barcode, 18 | tumor_barcode_a, 19 | tumor_barcode_b 20 | FROM selected_tumor_pairs 21 | WHERE selected_tumor_pairs.priority = 1 -------------------------------------------------------------------------------- /sql/snv/tumor_mut_comparison.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tp.tumor_pair_barcode, 3 | tp.case_barcode, 4 | tp.tumor_barcode_a, 5 | tp.tumor_barcode_b, 6 | tp.sample_type_a, 7 | tp.sample_type_b, 8 | tp.portion_a, 9 | tp.portion_b, 10 | tp.comparison_type, 11 | tp.surgical_interval_mo, 12 | 13 | ( SELECT count(*) AS count 14 | FROM variants.passgeno gt 15 | WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE) AS count_a, 16 | 17 | ( SELECT count(*) AS count 18 | FROM variants.passgeno gt 19 | WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE) AS count_b, 20 | 21 | ( SELECT count(*) AS count 22 | FROM ( SELECT variant_id 23 | FROM variants.passgeno gt 24 | WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE 25 | UNION 26 | SELECT variant_id 27 | FROM variants.passgeno gt 28 | WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE) t) AS union_ab, 29 | 30 | ( SELECT count(*) AS count 31 | FROM ( SELECT variant_id 32 | FROM variants.passgeno gt 33 | WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE 34 | INTERSECT 35 | SELECT variant_id 36 | FROM variants.passgeno gt 37 | WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE) t) AS intersection_ab, 38 | 39 | ( SELECT count(*) AS count 40 | FROM ( SELECT variant_id 41 | FROM variants.passgeno gt 42 | WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.ad_ref + gt.ad_alt > 14 AND 
ssm2_pass_call IS TRUE 43 | EXCEPT 44 | SELECT variant_id 45 | FROM variants.passgeno gt 46 | WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE) t) AS setdiff_a, 47 | 48 | ( SELECT count(*) AS count 49 | FROM ( SELECT variant_id 50 | FROM variants.passgeno gt 51 | WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE 52 | EXCEPT 53 | SELECT variant_id 54 | FROM variants.passgeno gt 55 | WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.ad_ref + gt.ad_alt > 14 AND ssm2_pass_call IS TRUE) t) AS setdiff_b 56 | 57 | FROM analysis.tumor_pairs tp -------------------------------------------------------------------------------- /sql/snv/tumor_mut_comparison_anno.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | tmc.tumor_pair_barcode, 3 | tmc.case_barcode, 4 | tmc.tumor_barcode_a, 5 | tmc.tumor_barcode_b, 6 | idh_codel_subtype, 7 | received_alk, 8 | hypermutator_status, 9 | 0 AS time_birth, 10 | ca.case_age_diagnosis_years AS time_initial, 11 | ROUND(ca.case_age_diagnosis_years + (tmc.surgical_interval_mo / 12.0),2) AS time_recurrence, 12 | 0 AS mf_birth, 13 | mf1.coverage_adj_mut_freq AS mf_initial, 14 | mf2.coverage_adj_mut_freq AS mf_recurrence, 15 | tmc.count_a, 16 | tmc.count_b, 17 | tmc.union_ab, 18 | tmc.intersection_ab, 19 | tmc.setdiff_a, 20 | tmc.setdiff_b, 21 | mf1.cumulative_coverage AS cov_a, 22 | mf2.cumulative_coverage AS cov_b, 23 | LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) AS min_cov, 24 | ROUND(setdiff_a::decimal / mf1.cumulative_coverage * 1e6, 4) AS mf_private_a, 25 | ROUND(setdiff_b::decimal / mf2.cumulative_coverage * 1e6, 4) AS mf_private_b, 26 | ROUND(intersection_ab::decimal / LEAST(mf1.cumulative_coverage, mf2.cumulative_coverage) * 1e6, 4) AS mf_shared 27 | FROM analysis.tumor_mut_comparison tmc 28 | INNER JOIN analysis.silver_set stp ON tmc.tumor_pair_barcode = 
stp.tumor_pair_barcode 29 | LEFT JOIN analysis.tumor_clinical_comparison ctp ON ctp.tumor_pair_barcode = stp.tumor_pair_barcode 30 | LEFT JOIN analysis.mut_freq mf1 ON mf1.aliquot_barcode = tmc.tumor_barcode_a 31 | LEFT JOIN analysis.mut_freq mf2 ON mf2.aliquot_barcode = tmc.tumor_barcode_b 32 | LEFT JOIN clinical.subtypes su ON su.case_barcode = stp.case_barcode 33 | LEFT JOIN clinical.cases ca ON ca.case_barcode = stp.case_barcode -------------------------------------------------------------------------------- /sql/tel.sql: -------------------------------------------------------------------------------- 1 | SELECT tmc.case_barcode, tumor_barcode_a, tumor_barcode_b, aliquot_analysis_type, case_source, idh_codel_subtype, received_alk, hypermutator_status, tq1.length AS p_len, tq2.length AS r_len, tqn.length AS n_len, tq1.length / NULLIF(tq2.length, 0) AS pr_len_ratio, /* NULL ratio instead of a division-by-zero error when r_len is 0 */ 2 | a1.prop_aneuploidy AS aneuploidy_a, 3 | a2.prop_aneuploidy AS aneuploidy_b, 4 | a1.aneuploidy_amp_score::integer AS aneuploidy_amp_score_a, 5 | a2.aneuploidy_amp_score::integer AS aneuploidy_amp_score_b, 6 | a1.aneuploidy_del_score::integer AS aneuploidy_del_score_a, 7 | a2.aneuploidy_del_score::integer AS aneuploidy_del_score_b, 8 | a1.aneuploidy_score::integer AS aneuploidy_score_a, 9 | a2.aneuploidy_score::integer AS aneuploidy_score_b 10 | FROM analysis.tumor_mut_comparison_anno tmc 11 | LEFT JOIN analysis.gatk_aneuploidy a1 ON a1.aliquot_barcode = tmc.tumor_barcode_a 12 | LEFT JOIN analysis.gatk_aneuploidy a2 ON a2.aliquot_barcode = tmc.tumor_barcode_b 13 | LEFT JOIN analysis.pairs pa1 ON pa1.tumor_barcode = tmc.tumor_barcode_a 14 | LEFT JOIN biospecimen.aliquots al1 ON al1.aliquot_barcode = tmc.tumor_barcode_a 15 | LEFT JOIN biospecimen.samples sa1 ON sa1.sample_barcode = al1.sample_barcode 16 | LEFT JOIN clinical.cases ca1 ON ca1.case_barcode = sa1.case_barcode 17 | LEFT JOIN analysis.telseq tqn ON tqn.aliquot_barcode = pa1.normal_barcode 18 | LEFT JOIN analysis.telseq tq1 ON tq1.aliquot_barcode = 
tmc.tumor_barcode_a 19 | LEFT JOIN analysis.telseq tq2 ON tq2.aliquot_barcode = tmc.tumor_barcode_b -------------------------------------------------------------------------------- /sql/test.sql: -------------------------------------------------------------------------------- 1 | hey 2 | -------------------------------------------------------------------------------- /sql/timing/ccf_shared.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | pg.tumor_pair_barcode, 3 | pg.case_barcode, 4 | st.idh_codel_subtype, 5 | pg.tumor_barcode_a, 6 | pg.tumor_barcode_b, 7 | hypermutator_status, 8 | pg.chrom, 9 | pg.pos, 10 | pg.variant_id, 11 | pg.gene_symbol, 12 | pg.variant_classification, 13 | vc.variant_effect, 14 | vc.variant_classification_vep, 15 | pl1.cellular_prevalence AS cellular_prevalence_a, 16 | pl1.variant_allele_frequency AS variant_allele_frequency_a, 17 | (CASE WHEN pl1.cellular_prevalence >= 0.5 THEN 'C' WHEN pl1.cellular_prevalence >= 0.1 AND pl1.cellular_prevalence < 0.5 THEN 'S' ELSE 'ND' END) AS clonality_a, 18 | pl2.cellular_prevalence AS cellular_prevalence_b, 19 | pl2.variant_allele_frequency AS variant_allele_frequency_b, 20 | (CASE WHEN pl2.cellular_prevalence >= 0.5 THEN 'C' WHEN pl2.cellular_prevalence >= 0.1 AND pl2.cellular_prevalence < 0.5 THEN 'S' ELSE 'ND' END) AS clonality_b, 21 | rank() OVER (PARTITION BY pg.tumor_pair_barcode, pg.gene_symbol ORDER BY variant_classification_priority, pl1.cellular_prevalence + pl2.cellular_prevalence DESC) 22 | FROM variants.pgeno pg 23 | LEFT JOIN variants.pyclone_loci pl1 ON pl1.variant_id = pg.variant_id AND pl1.aliquot_barcode = pg.tumor_barcode_a 24 | LEFT JOIN variants.pyclone_loci pl2 ON pl2.variant_id = pg.variant_id AND pl2.aliquot_barcode= pg.tumor_barcode_b 25 | LEFT JOIN variants.variant_classifications vc ON vc.variant_classification = pg.variant_classification 26 | INNER JOIN analysis.gold_set ss ON pg.tumor_pair_barcode = ss.tumor_pair_barcode 27 
| INNER JOIN analysis.tumor_mut_comparison_anno tmc ON tmc.tumor_pair_barcode = ss.tumor_pair_barcode 28 | INNER JOIN clinical.subtypes st ON st.case_barcode = ss.case_barcode 29 | WHERE pl1.cellular_prevalence IS NOT NULL AND mutect2_call_a AND mutect2_call_b AND variant_classification_priority IS NOT NULL -------------------------------------------------------------------------------- /sql/timing/timing_cnv.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | selected_tumor_pairs AS 3 | ( 4 | SELECT * FROM analysis.silver_set 5 | ), 6 | selected_aliquots AS 7 | ( 8 | SELECT tumor_barcode_a AS aliquot_barcode, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs 9 | UNION 10 | SELECT tumor_barcode_b AS aliquot_barcode, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs 11 | ), 12 | cnv_timing AS 13 | ( 14 | SELECT gc.gene_symbol,idh_codel_subtype,sample_type,hlvl_call, COUNT(cellular_prevalence) AS num_cp, SUM(cellular_prevalence)/COUNT(cellular_prevalence) AS mean_cp 15 | FROM analysis.gatk_cnv_by_gene gc 16 | INNER JOIN ref.driver_genes dg ON dg.gene_symbol = gc.gene_symbol 17 | INNER JOIN selected_aliquots sa ON sa.aliquot_barcode = gc.aliquot_barcode 18 | INNER JOIN clinical.subtypes st ON st.case_barcode = sa.case_barcode 19 | GROUP BY 1,2,3,4 20 | ) 21 | SELECT * FROM cnv_timing -------------------------------------------------------------------------------- /sql/timing/timing_pairs.sql: -------------------------------------------------------------------------------- 1 | WITH 2 | snv_pairs AS 3 | ( 4 | SELECT t1.gene_symbol AS gene_symbol_a, t2.gene_symbol AS gene_symbol_b, t1.idh_codel_subtype AS idh_codel_subtype 5 | FROM ref.snv_drivers_subtype t1 6 | INNER JOIN ref.snv_drivers_subtype t2 ON t1.idh_codel_subtype = t2.idh_codel_subtype 7 | WHERE t1.gene_symbol < t2.gene_symbol 8 | ORDER BY 3,1,2 9 | ), 10 | cnv_pairs AS 11 | ( 12 | SELECT t1.gene_symbol AS gene_symbol_a, t1.direction AS 
direction_a, t2.gene_symbol AS gene_symbol_b, t2.direction AS direction_b, t1.idh_codel_subtype AS idh_codel_subtype 13 | FROM ref.cnv_drivers_subtype t1 14 | INNER JOIN ref.cnv_drivers_subtype t2 ON t1.idh_codel_subtype = t2.idh_codel_subtype 15 | WHERE t1.gene_symbol < t2.gene_symbol 16 | ORDER BY 5,1,2 17 | ), 18 | arm_pairs AS 19 | ( 20 | SELECT t1.arm AS arm_a, t1.direction AS direction_a, t2.arm AS arm_b, t2.direction AS direction_b, t1.idh_codel_subtype AS idh_codel_subtype 21 | FROM ref.arm_drivers_subtype t1 22 | INNER JOIN ref.arm_drivers_subtype t2 ON t1.idh_codel_subtype = t2.idh_codel_subtype 23 | WHERE t1.arm < t2.arm 24 | ORDER BY 5,1,2 25 | ), 26 | arm_cnv_pairs AS 27 | ( 28 | SELECT t1.arm || (CASE t1.direction WHEN -1 THEN ' del' WHEN 1 THEN ' amp' ELSE NULL END) AS evnt_a, t2.gene_symbol || (CASE t2.direction WHEN -2 THEN ' del' WHEN 2 THEN ' amp' ELSE NULL END) AS evnt_b, t1.idh_codel_subtype AS idh_codel_subtype 29 | FROM ref.arm_drivers_subtype t1 30 | INNER JOIN ref.cnv_drivers_subtype t2 ON t1.idh_codel_subtype = t2.idh_codel_subtype 31 | --WHERE t1.arm < t2.arm 32 | ORDER BY 3,1,2 33 | ), 34 | pairs AS 35 | ( 36 | SELECT gene_symbol_a || ' mut' AS evnt_a, gene_symbol_b || ' mut' AS evnt_b, idh_codel_subtype FROM snv_pairs 37 | UNION 38 | SELECT gene_symbol_a || (CASE direction_a WHEN -2 THEN ' del' WHEN 2 THEN ' amp' ELSE NULL END) AS evnt_a, gene_symbol_b || (CASE direction_b WHEN -2 THEN ' del' WHEN 2 THEN ' amp' ELSE NULL END) AS evnt_b, idh_codel_subtype FROM cnv_pairs 39 | UNION 40 | SELECT arm_a || (CASE direction_a WHEN -1 THEN ' del' WHEN 1 THEN ' amp' ELSE NULL END) AS evnt_a, arm_b || (CASE direction_b WHEN -1 THEN ' del' WHEN 1 THEN ' amp' ELSE NULL END) AS evnt_b, idh_codel_subtype FROM arm_pairs 41 | UNION 42 | SELECT evnt_a, evnt_b, idh_codel_subtype FROM arm_cnv_pairs 43 | ) 44 | SELECT * FROM pairs 45 | ORDER BY 3,1,2 -------------------------------------------------------------------------------- /sql/timing/timing_snv.sql: 
--------------------------------------------------------------------------------
1 | -- Ranks driver-gene SNVs within each aliquot by PyClone cellular prevalence
2 | -- (descending) and reports, per gene / IDH-codel subtype / sample type,
3 | -- the mean rank and the number of ranked mutations.
4 | WITH
5 | selected_tumor_pairs AS
6 | (
7 |     SELECT * FROM analysis.silver_set
8 | ),
9 | -- One row per aliquot: tumor_barcode_a is labelled 'P', tumor_barcode_b is labelled 'R'.
10 | selected_aliquots AS
11 | (
12 |     SELECT tumor_barcode_a AS aliquot_barcode, case_barcode, 'P' AS sample_type FROM selected_tumor_pairs
13 |     UNION
14 |     SELECT tumor_barcode_b AS aliquot_barcode, case_barcode, 'R' AS sample_type FROM selected_tumor_pairs
15 | ),
16 | -- Annotated variants in genes listed in ref.driver_genes. TERT and IDH1 are
17 | -- restricted to specific positions / protein changes; every other gene only
18 | -- needs a variant classification that has a priority.
19 | selected_genes AS
20 | (
21 |     SELECT DISTINCT sn.gene_symbol, ensembl_gene_id, variant_id, chrom, pos, alt, sn.variant_classification, variant_classification_priority, protein_change
22 |     FROM variants.passanno sn
23 |     INNER JOIN ref.driver_genes ds ON ds.gene_symbol = sn.gene_symbol
24 |     INNER JOIN ref.ensembl_gene_mapping gm ON gm.gene_symbol = sn.gene_symbol
25 |     LEFT JOIN variants.variant_classifications vc ON sn.variant_classification = vc.variant_classification
26 |     WHERE
27 |         has_mut IS TRUE AND
28 |         ((sn.gene_symbol NOT IN ('TERT','IDH1') AND variant_classification_priority IS NOT NULL) OR
29 |         (sn.gene_symbol = 'TERT' AND sn.variant_classification = 'FIVE_PRIME_FLANK' AND lower(sn.pos) IN (1295228,1295250)) OR
30 |         (sn.gene_symbol = 'IDH1' AND sn.protein_change IN ('p.R132C','p.R132G','p.R132H','p.R132S')))
31 | ),
32 | -- mut_order is the rank of each selected variant inside its aliquot,
33 | -- ordered by cellular_prevalence descending.
34 | timing_snv AS
35 | (
36 |     SELECT pl.aliquot_barcode,idh_codel_subtype,sample_type,gene_symbol,variant_classification,protein_change,cellular_prevalence,titan_ccf,pyclone_ccf, rank() OVER (PARTITION BY pl.aliquot_barcode ORDER BY cellular_prevalence DESC) AS mut_order
37 |     FROM variants.pyclone_loci pl
38 |     INNER JOIN selected_genes sg ON sg.variant_id = pl.variant_id
39 |     INNER JOIN selected_aliquots sq ON sq.aliquot_barcode = pl.aliquot_barcode
40 |     INNER JOIN variants.passgeno pg ON pg.variant_id = pl.variant_id AND pg.aliquot_barcode = pl.aliquot_barcode
41 |     INNER JOIN clinical.subtypes st ON st.case_barcode = pg.case_barcode
42 |     WHERE ssm2_pass_call
43 | )
44 | SELECT gene_symbol,idh_codel_subtype,sample_type,SUM(mut_order)/COUNT(mut_order),COUNT(mut_order)
45 | FROM timing_snv GROUP BY 1,2,3 ORDER BY 2,3,4
46 | --SELECT gene_symbol, idh_codel_subtype, sample_type, COUNT(cellular_prevalence) AS num_mut, SUM(cellular_prevalence)/COUNT(cellular_prevalence) AS mean_cp, SUM(cellular_prevalence_sd)/COUNT(cellular_prevalence_sd) AS mean_cp_sd FROM timing_snv
47 | --GROUP BY 1,2,3
48 | --ORDER BY 1,2,3
--------------------------------------------------------------------------------
/sql/titan_vs_seqz.sql:
--------------------------------------------------------------------------------
1 | -- Lists, for every tumor aliquot in analysis.diamond_set, the cellularity and
2 | -- purity values obtained by joining variants.titan_params and
3 | -- variants.seqz_params on pair_barcode.
4 | -- NOTE(review): cellularity and purity are unqualified; presumably one comes
5 | -- from each params table. Confirm against the schema.
6 | WITH selected_samples AS
7 | (
8 |     SELECT tumor_barcode_a AS aliquot_barcode, pair_barcode FROM analysis.diamond_set ds
9 |     INNER JOIN analysis.pairs pa ON pa.tumor_barcode = ds.tumor_barcode_a
10 |     UNION
11 |     SELECT tumor_barcode_b AS aliquot_barcode, pair_barcode FROM analysis.diamond_set ds
12 |     INNER JOIN analysis.pairs pa ON pa.tumor_barcode = ds.tumor_barcode_b
13 | )
14 | SELECT ss.aliquot_barcode, cellularity, purity
15 | FROM selected_samples ss
16 | INNER JOIN variants.titan_params tp ON tp.pair_barcode = ss.pair_barcode
17 | INNER JOIN variants.seqz_params sp ON sp.pair_barcode = ss.pair_barcode
--------------------------------------------------------------------------------
/sql/tumor_mut_comparison.sql:
--------------------------------------------------------------------------------
1 | -- For every tumor pair whose two aliquots both have coverage_exclusion = 'allow',
2 | -- counts the called genotypes with read_depth > 14 in each tumor and the sizes of
3 | -- the union, intersection and both set differences of their
4 | -- (chrom, start, end, alt) tuples.
5 | -- NOTE: the WHERE clause on b1/b2 discards pairs without a blocklist row, so the
6 | -- two LEFT JOINs effectively behave like INNER JOINs.
7 | SELECT
8 |     tp.tumor_pair_barcode,
9 |     tp.case_barcode,
10 |     tp.tumor_barcode_a,
11 |     tp.tumor_barcode_b,
12 |     tp.sample_type_a,
13 |     tp.sample_type_b,
14 |     tp.portion_a,
15 |     tp.portion_b,
16 |     tp.comparison_type,
17 |     tp.surgical_interval_mo,
18 |
19 |     -- genotype rows in tumor a
20 |     ( SELECT count(*) AS count
21 |       FROM analysis.called_genotypes gt
22 |       WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.read_depth > 14) AS count_a,
23 |
24 |     -- genotype rows in tumor b
25 |     ( SELECT count(*) AS count
26 |       FROM analysis.called_genotypes gt
27 |       WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.read_depth > 14) AS count_b,
28 |
29 |     -- distinct variants present in a or b
30 |     ( SELECT count(*) AS count
31 |       FROM ( SELECT
32 |                 gt.chrom,
33 |                 gt.start,
34 |                 gt."end",
35 |                 gt.alt
36 |              FROM analysis.called_genotypes gt
37 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.read_depth > 14
38 |              UNION
39 |              SELECT
40 |                 gt.chrom,
41 |                 gt.start,
42 |                 gt."end",
43 |                 gt.alt
44 |              FROM analysis.called_genotypes gt
45 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.read_depth > 14) t) AS union_ab,
46 |
47 |     -- variants present in both a and b
48 |     ( SELECT count(*) AS count
49 |       FROM ( SELECT
50 |                 gt.chrom,
51 |                 gt.start,
52 |                 gt."end",
53 |                 gt.alt
54 |              FROM analysis.called_genotypes gt
55 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.read_depth > 14
56 |              INTERSECT
57 |              SELECT
58 |                 gt.chrom,
59 |                 gt.start,
60 |                 gt."end",
61 |                 gt.alt
62 |              FROM analysis.called_genotypes gt
63 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.read_depth > 14) t) AS intersection_ab,
64 |
65 |     -- variants present in a but not in b
66 |     ( SELECT count(*) AS count
67 |       FROM ( SELECT
68 |                 gt.chrom,
69 |                 gt.start,
70 |                 gt."end",
71 |                 gt.alt
72 |              FROM analysis.called_genotypes gt
73 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.read_depth > 14
74 |              EXCEPT
75 |              SELECT
76 |                 gt.chrom,
77 |                 gt.start,
78 |                 gt."end",
79 |                 gt.alt
80 |              FROM analysis.called_genotypes gt
81 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.read_depth > 14) t) AS setdiff_a,
82 |
83 |     -- variants present in b but not in a
84 |     ( SELECT count(*) AS count
85 |       FROM ( SELECT
86 |                 gt.chrom,
87 |                 gt.start,
88 |                 gt."end",
89 |                 gt.alt
90 |              FROM analysis.called_genotypes gt
91 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_b AND gt.read_depth > 14
92 |              EXCEPT
93 |              SELECT
94 |                 gt.chrom,
95 |                 gt.start,
96 |                 gt."end",
97 |                 gt.alt
98 |              FROM analysis.called_genotypes gt
99 |              WHERE gt.aliquot_barcode = tp.tumor_barcode_a AND gt.read_depth > 14) t) AS setdiff_b
100 |
101 | FROM analysis.tumor_pairs tp
102 | LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = tp.tumor_barcode_a
103 | LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = tp.tumor_barcode_b
104 | WHERE b1.coverage_exclusion = 'allow'::bpchar AND b2.coverage_exclusion = 'allow'::bpchar;
105 |
--------------------------------------------------------------------------------
/sql/vaf_compare.sql:
--------------------------------------------------------------------------------
1 | -- Returns one variant per (gene_symbol, case_barcode) for silver-set tumor pairs,
2 | -- with the variant allele fraction in tumor a and tumor b.
3 | -- When a case has several qualifying variants in a gene, the one kept is chosen by
4 | -- variant_classification_priority, then by the number of tumors with a Mutect2
5 | -- call (descending), then by total read depth (descending).
6 | WITH
7 | variants_by_case_and_gene AS
8 | (
9 |     SELECT
10 |         gtc.gene_symbol,
11 |         gtc.case_barcode,
12 |         gtc.variant_classification,
13 |         sn.hgvs_p,
14 |         ROUND(alt_count_a::decimal / (alt_count_a + ref_count_a),4) AS vaf_a,
15 |         ROUND(alt_count_b::decimal / (alt_count_b + ref_count_b),4) AS vaf_b,
16 |         row_number() OVER (PARTITION BY gtc.gene_symbol, gtc.case_barcode ORDER BY vc.variant_classification_priority, mutect2_call_a::integer + mutect2_call_b::integer DESC, (alt_count_a + ref_count_a) + (alt_count_b + ref_count_b) DESC) AS priority
17 |     FROM analysis.master_genotype_comparison gtc
18 |     INNER JOIN analysis.silver_set stp ON stp.tumor_pair_barcode = gtc.tumor_pair_barcode
19 |     INNER JOIN analysis.dnds_fraction_sel_cv ds ON ds.gene_symbol = gtc.gene_symbol --AND (ds.qglobal_cv < 0.05 OR gtc.gene_symbol IN ('TERT','IDH2','NOTCH1','PDGFRA','PIK3CG','BRAF','H3F3A'))
20 |     LEFT JOIN analysis.variant_classifications vc ON gtc.variant_classification = vc.variant_classification
21 |     INNER JOIN analysis.snvs sn ON sn.chrom = gtc.chrom AND sn.pos = gtc.pos AND sn.alt = gtc.alt
22 |     WHERE
23 |         (mutect2_call_a OR mutect2_call_b) AND
24 |         (ds.qglobal_cv < 0.05 OR ds.gene_symbol IN ('TERT','IDH2','NOTCH1','PDGFRA','PIK3CG','BRAF','H3F3A')) AND
25 |         (alt_count_a + ref_count_a) >= 5 AND (alt_count_b + ref_count_b) >= 5 AND
26 |         -- The gene-specific alternatives are wrapped in ONE group. AND binds tighter
27 |         -- than OR, so without the outer parentheses the call / dN/dS / coverage
28 |         -- filters above would apply only to the first alternative, and a hotspot row
29 |         -- with zero coverage could reach the VAF division.
30 |         (
31 |             (gtc.gene_symbol NOT IN ('TERT','IDH1','IDH2','BRAF','H3F3A') AND variant_classification_priority IS NOT NULL) OR
32 |             (gtc.gene_symbol = 'TERT' AND gtc.variant_classification = '5''Flank' AND lower(sn.pos) IN (1295228,1295250)) OR
33 |             (gtc.gene_symbol = 'IDH1' AND sn.hgvs_p IN ('p.R132C','p.R132G','p.R132H','p.R132S')) OR
34 |             (gtc.gene_symbol = 'IDH2' AND sn.hgvs_p = 'p.R172K') OR
35 |             (gtc.gene_symbol = 'BRAF' AND sn.hgvs_p = 'p.V600E') OR
36 |             (gtc.gene_symbol = 'H3F3A' AND sn.hgvs_p = 'p.G35R')
37 |         )
38 | )
39 | SELECT gene_symbol, case_barcode, variant_classification, hgvs_p, vaf_a, vaf_b
40 | FROM variants_by_case_and_gene vg
41 | WHERE priority = 1
--------------------------------------------------------------------------------
/sql/variant_status_leeds.sql:
--------------------------------------------------------------------------------
1 | -- Exports variants from analysis.master_genotype_comparison that are called by
2 | -- Mutect2 in at least one tumor of a pair and covered by >= 10 reads in both.
3 | -- variant_status is 'S' when called in both tumors, 'P' when called only in
4 | -- tumor a and 'R' when called only in tumor b.
5 | -- NOTE(review): the selected_tumor_pairs CTE is defined but not referenced by the
6 | -- final SELECT; the join that would use it is commented out on the last line
7 | -- (and sits after the WHERE clause, so it cannot simply be uncommented in place).
8 | WITH selected_tumor_pairs AS
9 | (
10 |     SELECT
11 |         tumor_pair_barcode,
12 |         case_barcode,
13 |         tumor_barcode_a,
14 |         tumor_barcode_b,
15 |         row_number() OVER (PARTITION BY case_barcode ORDER BY surgical_interval_mo DESC, portion_a ASC, portion_b ASC, substring(tumor_pair_barcode from 27 for 3) ASC) AS priority
16 |     FROM analysis.tumor_pairs ps
17 |     LEFT JOIN analysis.blocklist b1 ON b1.aliquot_barcode = ps.tumor_barcode_a
18 |     LEFT JOIN analysis.blocklist b2 ON b2.aliquot_barcode = ps.tumor_barcode_b
19 |     WHERE
20 |         comparison_type = 'longitudinal' AND
21 |         sample_type_b <> 'M1' AND -- exclude metastatic samples here because this is outside the scope of our study
22 |         b1.fingerprint_exclusion = 'allow' AND b2.fingerprint_exclusion = 'allow' AND
23 |         b1.coverage_exclusion = 'allow' AND b2.coverage_exclusion = 'allow'
24 | )
25 | SELECT
26 |     mgt.tumor_pair_barcode,
27 |     (CASE WHEN mutect2_call_a AND mutect2_call_b THEN 'S' WHEN mutect2_call_a AND NOT mutect2_call_b THEN 'P' WHEN mutect2_call_b AND NOT mutect2_call_a THEN 'R' END) AS variant_status,
28 |     mgt.case_barcode,
29 |     mgt.tumor_barcode_a,
30 |     mgt.tumor_barcode_b,
31 |     mgt.gene_symbol,
32 |     mgt.variant_type,
33 |     mgt.variant_classification,
34 |     mgt.chrom::varchar(2),
35 |     lower(mgt.pos) AS start_pos,
36 |     upper(mgt.pos) -1 AS end_pos,
37 |     ref,
38 |     mgt.alt,
39 |     ref_count_a,
40 |     ref_count_b,
41 |     alt_count_a,
42 |     alt_count_b,
43 |     mutect2_call_a,
44 |     mutect2_call_b,
45 |     vaf_corrected_call_a,
46 |     vaf_corrected_call_b,
47 |     logr_copy_number_a,
48 |     logr_copy_number_b,
49 |     corrected_copy_number_a,
50 |     corrected_copy_number_b,
51 |     corrected_call_a::varchar(5),
52 |     corrected_call_b::varchar(5)
53 | FROM analysis.master_genotype_comparison mgt
54 | LEFT JOIN analysis.snvs snvs ON snvs.chrom = mgt.chrom AND snvs.pos = mgt.pos AND snvs.alt = mgt.alt
55 | WHERE (mutect2_call_a OR mutect2_call_b) AND (ref_count_a + alt_count_a) >= 10 AND (ref_count_b + alt_count_b) >= 10
56 | --INNER JOIN selected_tumor_pairs stp ON stp.tumor_pair_barcode = mgt.tumor_pair_barcode
--------------------------------------------------------------------------------
/sql/variants/passanno.sql:
--------------------------------------------------------------------------------
1 | /*
2 | - - - - - - - - - -
3 | variants.passanno
4 | - - - - - - - - - -
5 | Limited variant annotations (only variants that PASS filters)
6 | Define a list of variants for which we want to preserve annotations
7 | In this case meaning all PASS variants and IDH/TERT variants
8 | We have to specifically retain IDH/TERT due to GATK 4.1.0.0 bug with force-calling
9 | */
10 | -- t1: ids of variants to keep, i.e. anything with filter = 'PASS' plus a fixed
11 | -- list of positions on chromosomes 2, 5 and 15.
12 | -- NOTE(review): per the header these positions are presumably the IDH/TERT
13 | -- sites; confirm the coordinates and genome build.
14 | WITH t1 AS (
15 |     SELECT DISTINCT info.variant_id
16 |     FROM variants.info
17 |     WHERE info.filter = 'PASS' OR
18 |         (info.chrom = 2 AND lower(info.pos) IN (209113112, 209113113)) OR
19 |         (info.chrom = 5 AND lower(info.pos) IN (1295169, 1295228, 1295242, 1295250)) OR
20 |         (info.chrom = 15 AND lower(info.pos) IN (90631837, 90631838, 90631839))
21 | )
22 | SELECT
23 |     anno.variant_id,
24 |     chrom,
25 |     pos,
26 |     ref,
27 |     alt,
28 |     gene_symbol,
29 |     variant_classification,
30 |     secondary_variant_classification,
31 |     variant_type,
32 |     genome_change,
33 |     transcript,
34 |     transcript_strand,
35 |     transcript_exon,
36 |     transcript_position,
37 |     cdna_change,
38 |     cds_change,
39 |     protein_change,
40 |     gc_content,
41 |     reference_context,
42 |     -- three characters of reference_context starting at character 10
43 |     "substring"(reference_context::text, 10, 3) AS trinucleotide_context
44 | FROM variants.anno
45 | INNER JOIN t1 ON t1.variant_id = anno.variant_id
--------------------------------------------------------------------------------