├── .gitignore ├── .gitmodules ├── .travis.yml ├── Makefile ├── Makefile.inc ├── README.md ├── aligners ├── align.inc ├── align.mk ├── bowtieAligner.mk ├── bwaAligner.mk ├── bwamemAligner.mk ├── gsnapAligner.mk ├── gsnapIADB.mk ├── hisatAligner.mk ├── novoalignIADB.mk ├── pBwaAligner.mk ├── starAligner.mk ├── starFusionAligner.mk ├── tmapAligner.mk └── tophatAligner.mk ├── bam_tools ├── fix_bam.mk ├── fix_mate.mk ├── fix_rg.mk ├── get_bam_data_mirror.mk ├── get_bam_irb_mirror.mk ├── merge_bam.mk ├── processBam.mk └── put_bam_data_mirror.mk ├── clonality ├── absoluteSeq.mk ├── clonehd.mk ├── pyclone_13.mk ├── pyclone_vi.mk └── tableToCloneHDFormat.pl ├── conda_env ├── R3.2.2.txt ├── delly_env.txt ├── jrflab_modules_env.txt ├── mutsig_report_env.txt ├── sum_reads_env.txt └── varscan_env.txt ├── config.inc ├── config ├── defuse.conf └── snpEff.conf ├── contamination ├── clusterSampleVcf.R ├── clusterSamples.mk └── contest.mk ├── copy_number ├── CytoBand.RData ├── absCNseq.R ├── absCNseq.mk ├── annotateFacets2Vcf.R ├── annotateFacetsCCF2Vcf.R ├── annotateFreeC.R ├── annotateTitanLOHVcf.R ├── ascat.R ├── ascat.mk ├── cbindCNVs.R ├── cghCall.R ├── cnvkit.mk ├── compare_facets_cncf.py ├── controlFreeC.mk ├── controlFreeCLOHTN.mk ├── controlFreeCTN.mk ├── convert_basecount_to_snp_pileup.py ├── createFacetsSummary.R ├── exomeCNV.R ├── exomeCNV.mk ├── exomeCNVLOH.R ├── exomeCNVLOH.mk ├── exomeCNVLOHHeatmap.mk ├── facets.mk ├── facetsFillGeneCN.R ├── facetsGeneCN.R ├── facetsGeneCNPlot.R ├── facetsPlotSampleLRR.R ├── facets_merge_tn.py ├── facets_suite.mk ├── gistic.mk ├── gisticFacets.mk ├── hg19_chrominfo.txt ├── hg19_cytoBandIdeo.txt ├── hg19_gaps.txt ├── hmmCopy.R ├── hmmCopy.mk ├── makeControlFreeCGraph.R ├── medicc2.mk ├── normaliseCopyNum.R ├── normalisedCopyNum.mk ├── oncosnp.mk ├── oncosnpseq.mk ├── plotFacets.R ├── plotFreeCCopyNum.R ├── plotFreeCLogRatio.R ├── plotGisticHeatmap.R ├── recenter_base_count.py ├── runFacets.R ├── runTitan.R ├── segmentVarscanCNV.R ├── 
summarizeTitan.R ├── titan.inc ├── titan.mk ├── varscanCNV.mk └── varscanCNVGeneCN.R ├── db ├── chasm_db.yaml ├── create_mysql_docker_images.sh ├── ensembl-hs-core-85-37_db.yaml ├── fathmm_config-cpu-6-2.ini ├── fathmm_config-e01.ini ├── fathmm_config-ika.ini ├── fathmm_config-lilac.ini ├── fathmm_config-swan.ini ├── fathmm_db.yaml ├── run_mysql_docker_images.sh ├── snv_box-cpu-6-2.conf ├── snv_box-e01.conf ├── snv_box-ika.conf ├── snv_box-lilac.conf └── snv_box-swan.conf ├── default_yaml ├── project_config.yaml ├── sample_attr.yaml └── summary_config.yaml ├── export └── cbioportal.mk ├── external └── SNVBox │ ├── README │ ├── db │ ├── ARFFutil.py │ ├── CodonMap.py │ ├── Config.py │ ├── DBUtil.py │ ├── DataSet.py │ ├── FeatureDb.py │ └── __init__.py │ ├── doc │ ├── CHASM_VEST_SNVGet_UserManual.doc │ ├── CHASM_VEST_SNVGet_UserManual.pdf │ └── License.txt │ ├── genomicToProtein │ ├── snvGetGenomic │ ├── snvGetTranscript │ └── snvGetTranscriptList ├── fastq_tools ├── bamtoFasta.mk ├── blastReads.mk ├── extractFastq.mk ├── extractReads.mk ├── extractunmappedpairs.mk ├── fastq.mk ├── fixFastqReadNames.mk ├── fixFastqReadNames.py ├── mergeFastq.mk ├── mergeSplitFastq.mk └── trimFastq.pl ├── genome_inc ├── GRCm38.inc ├── b37.inc ├── hg18.inc ├── hg19.inc └── hg38.inc ├── isoforms └── miso.mk ├── ploidy ├── bicseq.mk ├── expands.mk └── pyloh.mk ├── qc ├── TEQC.R ├── TEQCreport.R ├── TEQCreportFun.R ├── bamStats.mk ├── bam_interval_metrics.mk ├── bam_metrics.mk ├── fastqc.mk ├── fastqcSummaryPlot.R ├── intervalBamQC.R ├── intervalBamQC.mk ├── nonRefFreqFromPileup.pl ├── plotHsMetrics.R ├── plotRnaseqMetrics.R ├── qualimap.mk ├── readDepth.mk ├── rnaseqMetrics.mk ├── rseqc.mk ├── summarize_hs_metrics.py ├── summarize_idxstats.py ├── teqc.mk ├── variantEvalGatkReport.R └── wgs_metrics.mk ├── recurrent_mutations └── report.mk ├── reference ├── gene_lists │ ├── Kandoth_127genes.bed │ ├── Kandoth_127genes.hg19.bed │ ├── Lawrence_cancer5000-S.bed │ ├── 
Lawrence_cancer5000-S.hg19.bed │ ├── cancer_gene_census.b37.2016-10-14.bed │ ├── cancer_gene_census.b37.2017-05-25.bed │ ├── cancer_gene_census.hg19.2017-05-25.bed │ └── haplo_insuff_genes.bed └── hotspots │ ├── hotspot-dedup.vcf │ ├── hotspot-v1.hg19.vcf.gz │ ├── hotspot-v1.hg19.vcf.gz.tbi │ ├── hotspot-v1.vcf.gz │ ├── hotspot-v1.vcf.gz.tbi │ ├── hotspot-v2.hg19.vcf.gz │ ├── hotspot-v2.hg19.vcf.gz.tbi │ ├── hotspot-v2.vcf.gz │ ├── hotspot-v2.vcf.gz.tbi │ ├── hotspot-v3.vcf.gz │ └── hotspot-v3.vcf.gz.tbi ├── rnaseq ├── immunedeconv.mk ├── kallisto.mk └── sumreads.mk ├── scripts ├── Rshell ├── Sweave.R ├── add_dbsnp_gmaf.py ├── annotateSummaryVcf.R ├── bam_metrics.R ├── classify_indel_pathogenicity_vcf.py ├── classify_pathogenicity_vcf.py ├── classify_snv_pathogenicity_vcf.py ├── cnvkit.R ├── configure.py ├── convert_sample_txt2yaml.py ├── create_iontorrent_sample_merge_yaml.py ├── create_iontorrent_sample_yaml.py ├── create_sample_sets.pl ├── create_sample_yaml.py ├── create_sample_yaml2.py ├── drmaa_job.py ├── extract_signatures.R ├── facets_suite.R ├── filter_dbsnp_gmaf.py ├── filter_sv.R ├── get_basecounts.R ├── get_insert_size.py ├── hr_detect.R ├── immunedeconv.R ├── init_project.pl ├── job.py ├── join_eff.pl ├── knit.R ├── launcher_sql_db.py ├── medicc2.R ├── merge.R ├── mimsi.R ├── monitorMySQL.sh ├── monitor_gfserver.sh ├── mutation_taster_query.py ├── normalFilterVCF.pl ├── posnGeneLookup.pl ├── prepareFastq.sh ├── prepareFastq2.sh ├── prepareMultirunFastq.sh ├── provean_query.py ├── provean_vcf.py ├── pyclone_13.R ├── pyclone_vi.R ├── qmake.pl ├── qsub.pl ├── qsub.py ├── qsubClient.pl ├── qsubDaemon.pl ├── qsub_pbs.py ├── rbind.R ├── recurrent_mutations_plot.py ├── recurrent_mutations_sufam.ipynb ├── remote_provean_query.py ├── run.py ├── somaticFilterVCF.pl ├── split_bed.py ├── split_vcf.py ├── star_fish.R ├── sufam_gt.R ├── summarize_rnaseqreads.R ├── summarize_rnaseqreads_byexon.R ├── summarize_rnaseqreads_byintron.R ├── summarize_sleuth.R ├── 
sv_signature.R ├── swapvcf.R ├── tsvToExcel.py ├── vcfToTable.R └── wgs_metrics.R ├── signatures ├── deconstruct_sigs.mk ├── hr_detect.mk ├── star_fish.mk └── sv_signature.mk ├── snp6 ├── absolute.R ├── hapseg.R └── snp6.mk ├── summary ├── cravat_summary.R ├── cravat_summary.mk ├── cravat_summary.py ├── delmh_summary.R ├── delmh_summary.mk ├── genome_summary_excel.py ├── genomesummary.R ├── genomesummary.mk ├── hotspot_summary_excel.py ├── hotspotsummary.R ├── hotspotsummary.mk ├── mouse_summary_excel.py ├── mousesummary.R ├── mutation_summary_excel.py └── mutationsummary.mk ├── sv_callers ├── brass.mk ├── chimerascan.mk ├── crest.mk ├── defuse.mk ├── defuse2usv.py ├── defuseOncofuse.R ├── delly.mk ├── destruct.mk ├── ericScript.mk ├── ericscript2usv.py ├── extractCoordsFromDefuse.pl ├── filterDefuse.pl ├── fusioncatcher.mk ├── fusionfinder.mk ├── gridss_tumor_normal.mk ├── hydra.mk ├── integrate.mk ├── integrate2usv.py ├── integrateOncofuse.R ├── integrateRnaseq.mk ├── lumpy.mk ├── manta.inc ├── manta.mk ├── mantaRnaseq.mk ├── manta_config.py.ini ├── manta_hs_config.py.ini ├── manta_tumor_normal.mk ├── mapsplice.mk ├── mapsplice2usv.py ├── nfuseDNA.mk ├── nfuseWGSSWTSS.mk ├── normalFilterChimerascan.pl ├── normalFilterDefuse.pl ├── normalFilterSoapFuse.pl ├── oncofuse.mk ├── prepareSoapFuse.pl ├── recurrentFusions.R ├── soapFuse.mk ├── starFusion.mk ├── starfusion2usv.py ├── svaba_tumor_normal.mk └── tophatFusion.mk ├── variant_callers ├── dindel.mk ├── fixVarscanVcf.pl ├── gatk.inc ├── gatk.mk ├── get_basecounts.mk ├── haplotypeCaller.mk ├── hotspot.mk ├── museq.mk ├── pindel.mk ├── qsnp.mk ├── samtoolsHet.mk ├── somatic │ ├── crest.mk │ ├── dindelTNFilter.mk │ ├── gatkTNFilter.mk │ ├── gatkValidation.mk │ ├── hla_summary.R │ ├── lancet.mk │ ├── mimsi.mk │ ├── msisensor.mk │ ├── museqTN.mk │ ├── mutect.mk │ ├── mutect2.mk │ ├── mutectReport.Rmd │ ├── pindelTN.mk │ ├── platypus.mk │ ├── plotSeqLogoFromMutect.R │ ├── polysolver.mk │ ├── scalpel.mk │ ├── 
somaticIndelDetector.mk │ ├── somaticIndels.mk │ ├── somaticSniper.mk │ ├── somaticSniperFixAD.R │ ├── somaticVariants.mk │ ├── strelka.mk │ ├── strelkaVarscanIndels.mk │ ├── tvcTN.mk │ ├── varscanTN.mk │ └── varscanTNtoVcf.pl ├── sufam_gt.mk ├── sufamsampleset.mk ├── tvc.mk ├── variantEvalGatkReport.R ├── varscan.mk ├── varscanFpfilter.mk └── varscanToVcf.pl ├── vcf_tools ├── addGeneListAnnotationToVcf.R ├── annotateExtVcf.mk ├── annotateSomaticVcf.mk ├── annotateSummaryVcf.mk ├── annotateVcf.mk ├── annotate_source_vcf.py ├── annotate_sufam_gt_vcf.py ├── annotate_sv.mk ├── annotate_vcf2maf.py ├── bed_annotate_vcf.py ├── chasmVcf.R ├── combine_vcf.R ├── common_filter_vcf.py ├── compare_vcf.py ├── concat_vcf.py ├── cravat_annotation.mk ├── fathmmVcf.R ├── filter_vcf.R ├── gemini.mk ├── hotspot_vcf.py ├── indel_filter_vcf.py ├── interval_depth_filter_vcf.py ├── interval_filter_vcf.py ├── merge_indel_vcf.py ├── merge_sv.mk ├── merge_uvcf_vcf.py ├── merge_vcf.py ├── mutAssVcf.R ├── mutation_taster_vcf.py ├── oncokb_vcf.py ├── parsSNPVcf.R ├── pass_filter_vcf.py ├── proveanVcf.R ├── recurVcf.R ├── recurVcf.mk ├── snp_filter_vcf.py ├── somatic_ad_filter_vcf.py ├── somatic_vcf2tsv.py ├── split_snps_indels_vcf.py ├── summary_vcf.R ├── transficVcf.R ├── tumor_variant_read_filter_vcf.py ├── vcfAnnotations.mk ├── vcfCompare.mk ├── vcfCompareTN.mk ├── vcfFilters.mk ├── vcfMerge.mk ├── vcfMergePlatform.mk ├── vcfMergeTN.mk ├── vcfPostAnnotations.mk ├── vcfPostFilters.mk ├── vcfsorter.pl └── vcftools.mk └── virus ├── krona_classify.mk └── virus_detection_bowtie2.mk /.gitignore: -------------------------------------------------------------------------------- 1 | .*.tmp 2 | *.pyc 3 | .DS_Store 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "external/fathmm"] 2 | path = external/fathmm 3 | url = https://github.com/jrflab/fathmm.git 4 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
# CI recipe: install Miniconda, create the two conda environments described in
# conda_env/, then run the shell test scripts listed under `script`.
language: python
python:
  - "2.7"
sudo: false
install:
  # Choose the Miniconda installer that matches the Python major version.
  - case "$TRAVIS_PYTHON_VERSION" in
    2*)
      wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh;
      PYTHON="python"
      ;;
    3*)
      wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
      PYTHON="python3"
      ;;
    esac
  - bash miniconda.sh -b -p $HOME/miniconda
  - export PATH="$HOME/miniconda/bin:$PATH"
  - hash -r
  - conda config --set always_yes yes --set changeps1 no
  - conda update -q conda
  - conda info -a
  - conda config --add channels r
  - conda config --add channels bioconda
  - conda config --add channels auto
  - conda config --add channels jrderuiter
  - conda config --add channels biobuilds
  - conda create -q -n jrflab --file conda_env/jrflab_modules_env.txt
  - conda create -q -n r-env --file conda_env/R3.2.2.txt
script:
  # TODO: copy number heatmap test not working:
  # source activate r-env
  # bash -x test/copy_number/test_copynumber_heatmap.sh
  - source activate jrflab
  - bash -x test/vcf_tools/test_common_filter.sh
  - bash -x test/vcf_tools/test_hotspot.sh
  - bash -x test/vcf_tools/test_pathogenicity.sh
  - bash -x test/scripts/test_create_sample_yaml.sh
  - bash -x test/scripts/test_configure.sh

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# modules

--------------------------------------------------------------------------------
/aligners/align.inc:
--------------------------------------------------------------------------------
# Shared alignment defaults. The ifndef/endif pair is an include guard: the
# defaults and BAM_SUFFIX are only evaluated the first time this file is read.
ifndef ALIGN_INC
BAM_PHRED64 ?= false
BAM_DUP_TYPE ?= markdup
BAM_NO_FILTER ?= false
BAM_NO_RECAL ?= false
BAM_NO_REALN ?= false
SPLIT_CHR ?= true
SPLIT_FASTQ ?= false
BAM_NO_SORT ?= false
BAM_FIX_RG ?= false
SEQ_PLATFORM ?= illumina

# BAM_SUFFIX lists the enabled processing steps in order, always ending in
# "bam", with the separating whitespace replaced by ".".
# NOTE(review): relies on $( ) expanding to a single space; that variable is not
# defined in this file - presumably it comes from Makefile.inc, confirm.
BAM_SUFFIX := $(subst $( ),.,$(strip \
    $(if $(findstring false,$(BAM_NO_SORT)),sorted)\
    $(if $(findstring false,$(BAM_NO_FILTER)),filtered)\
    $(if $(findstring true,$(PDX)),pdx_filtered)\
    $(if $(findstring true,$(BAM_FIX_RG)),rg)\
    $(if $(findstring false,$(BAM_NO_REALN)),realn)\
    $(if $(findstring rmdup,$(BAM_DUP_TYPE)),rmdup)\
    $(if $(findstring markdup,$(BAM_DUP_TYPE)),markdup)\
    $(if $(findstring false,$(BAM_NO_RECAL)),recal)\
    bam))
endif
ALIGN_INC = true

--------------------------------------------------------------------------------
/aligners/align.mk:
--------------------------------------------------------------------------------
# Per-sample rules that combine the split BAMs named in split.<sample> into a
# single <aligner>/bam/<sample>.<aligner>.sorted.bam.

# bam-header(sample, splits): write <aligner>/sam/<sample>.header.sam from the
# non-@RG header lines of the first split BAM followed by the @RG lines of every
# split BAM. Repeated lines are dropped with an order-preserving awk filter;
# plain `uniq` only removes adjacent repeats, so identical @RG lines coming from
# different split BAMs used to survive into the merged header.
define bam-header
$$(ALIGNER)/sam/$1.header.sam : $$(foreach split,$2,$$(ALIGNER)/bam/$$(split).$$(ALIGNER).sorted.bam)
	$$(INIT) $$(SAMTOOLS) view -H $$< | grep -v '^@RG' > $$@.tmp; \
	for bam in $$^; do $$(SAMTOOLS) view -H $$$$bam | grep '^@RG' >> $$@.tmp; done; \
	awk '!seen[$$$$0]++' $$@.tmp > $$@ && $$(RM) $$@.tmp
endef
$(foreach sample,$(SAMPLES),\
	$(eval $(call bam-header,$(sample),$(split.$(sample)))))

# merged-bam(sample, splits): samtools-merge the split BAMs using the header
# built above, then remove all prerequisites (header and split BAMs).
define merged-bam
$$(ALIGNER)/bam/$1.$$(ALIGNER).sorted.bam : $$(ALIGNER)/sam/$1.header.sam $$(foreach split,$2,$$(ALIGNER)/bam/$$(split).$$(ALIGNER).sorted.bam)
	$$(call RUN,-s 12G -m 15G,"$$(SAMTOOLS) merge -f -h $$< $$(@) $$(filter %.bam,$$^) && $$(RM) $$^")
endef
# rename-bam(sample, split): a sample with exactly one split is simply renamed.
define rename-bam
$$(ALIGNER)/bam/$1.$$(ALIGNER).bam : $$(ALIGNER)/bam/$2.$$(ALIGNER).bam
	mv $$< $$@
$$(ALIGNER)/bam/$1.$$(ALIGNER).sorted.bam : $$(ALIGNER)/bam/$2.$$(ALIGNER).sorted.bam
	mv $$< $$@
endef
# Two or more splits -> merge; exactly one -> rename; none -> no rule generated.
$(foreach sample,$(SAMPLES),\
	$(if $(word 2,$(split.$(sample))),\
	$(eval $(call merged-bam,$(sample),$(split.$(sample)))),\
	$(if $(split.$(sample)),\
	$(eval $(call rename-bam,$(sample),$(split.$(sample)))))))

--------------------------------------------------------------------------------
/aligners/gsnapIADB.mk:
--------------------------------------------------------------------------------
# Extract reads selected by REQUIRED_FLAGS / BAM_FILTER_FLAGS from bam/<sample>.bam,
# convert them to paired FASTQ and align them with GSNAP against the IADB database.
include modules/Makefile.inc

OPTS = -d IADB -D ${GSNAP_REF} -B 4 -t 4 -A sam --novelsplicing=1 --pairexpect=200 -n 1 --quiet-if-excessive --nofails
ifeq ($(BAM_PHRED64),true)
OPTS += -J 64 -j -31
endif
GSNAP_SGE_RREQ = $(call MEM_FREE,2G,4G) -q all.q -pe $(PARALLEL_ENV) 4 -now n

REQUIRED_FLAGS = 4
BAM_FILTER_FLAGS = 1536

SAMPLE_FILE = samples.txt
SAMPLES = $(shell cat $(SAMPLE_FILE))

VPATH = bam

LOGDIR = iadb/log

.DELETE_ON_ERROR:
.SECONDARY:
.PHONY : all

all : $(foreach sample,$(SAMPLES),iadb/bam/$(sample).bam)

iadb/unaln_bam/%.bam : %.bam
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,1G,2G)" $(MKDIR) $(@D) $(LOGDIR); $(SAMTOOLS) view -f $(REQUIRED_FLAGS) -F $(BAM_FILTER_FLAGS) -bh $< > $@ 2> $(LOGDIR)/$(@F).log


iadb/fastq/%.1.fastq iadb/fastq/%.2.fastq : iadb/unaln_bam/%.bam
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,5G,7G)" $(MKDIR) $(@D) $(LOGDIR); \
	$(call SAM_TO_FASTQ_MEM,5G) I=$< FASTQ=iadb/fastq/$*.1.fastq SECOND_END_FASTQ=iadb/fastq/$*.2.fastq &> $(LOGDIR)/$(@F).log

# The output/log directories are created once (the original recipe issued the
# same MKDIR twice in a row).
iadb/bam/%.gsnap.bam : iadb/fastq/%.1.fastq iadb/fastq/%.2.fastq
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,1.5G,2G) -pe $(PARALLEL_ENV) 4" $(MKDIR) $(@D) $(LOGDIR); \
	$(GSNAP) $(OPTS) --read-group-id=$* $^ 2> $(LOGDIR)/$(@F).log | $(SAMTOOLS) view -bhS - > $@

iadb/bam/%.bam : iadb/bam/%.gsnap.sorted.filtered.markdup.bam
	$(MKDIR) $(@D); ln -v $< $@

include modules/bam_tools/processBam.mk

--------------------------------------------------------------------------------
/aligners/novoalignIADB.mk:
--------------------------------------------------------------------------------
# Re-align the unaligned reads of bam/<sample>.bam against the IADB reference
# with novoalignMPI.
include modules/Makefile.inc

NOVOALIGN = $(HOME)/share/usr/bin/novoalignMPI
NOVOINDEX = $(HOME)/share/usr/bin/novoindex

REF_FASTA = $(HOME)/share/references/IADB_feb2011.fa
IADB_NIX = $(HOME)/share/references/IADB_feb2011.fa.nix

SAMPLE_FILE = samples.txt
SAMPLES = $(shell cat $(SAMPLE_FILE))

VPATH = bam

LOGDIR = iadb/log

.DELETE_ON_ERROR:
.SECONDARY:
.PHONY : all

all : $(foreach sample,$(SAMPLES),iadb/processed_bam/$(sample).bam)

# bam2fastq writes <stem>_1.fastq / <stem>_2.fastq; they are renamed to the
# <stem>.N.fastq convention and gzipped.
iadb/fastq/%.1.fastq.gz iadb/fastq/%.2.fastq.gz : %.bam
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,10G,12G)" $(MKDIR) $(@D) $(LOGDIR); $(BAM2FASTQ) --no-aligned -o $(@D)/$*#.fastq $< &> $(LOGDIR)/$(@F).log && mv $(@D)/$*_1.fastq $(@D)/$*.1.fastq && mv $(@D)/$*_2.fastq $(@D)/$*.2.fastq && gzip $(@D)/$*.1.fastq $(@D)/$*.2.fastq

iadb/bam/%.novoalign.bam : iadb/fastq/%.1.fastq.gz iadb/fastq/%.2.fastq.gz
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,14G,18G) -pe openmpi 5" $(MKDIR) $(@D) $(LOGDIR); mpiexec -np 5 $(NOVOALIGN) -i 200,50 -r A -R 0 -d $(IADB_NIX) -f $^ -o SAM $$'@RG\tID:$*\tPU:illumina\tLB:$*' 2> $(LOGDIR)/$(@F).log | $(SAMTOOLS) view -bS - > $@

iadb/processed_bam/%.bam : iadb/bam/%.novoalign.sorted.filtered.fixmate.markdup.bam
	$(MKDIR) $(@D); ln -v $< $@

include modules/bam_tools/processBam.mk

--------------------------------------------------------------------------------
/aligners/pBwaAligner.mk:
--------------------------------------------------------------------------------
# Paired-end alignment with pBWA (aln + sampe), starting from gsc_bam/<sample>.bam.
include modules/Makefile.inc

SAM_TO_FASTQ = $(JAVA) -Xmx2G -jar $(JARDIR)/SamToFastq.jar VALIDATION_STRINGENCY=LENIENT
SAMPLE_FILE = samples.txt
SAMPLES = $(shell cat $(SAMPLE_FILE))

LOGDIR = log/pBwa.$(NOW)

BWA_BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam)
SAMTOOLS_SORT_MEM = 2000000000
SEQ_PLATFORM = illumina

.SECONDARY:
.DELETE_ON_ERROR:
.PHONY : all bwa_bams

all : bwa_bams
bwa_bams : $(BWA_BAMS) $(addsuffix .bai,$(BWA_BAMS))

bwa/sai/%.1.sai bwa/sai/%.2.sai : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,8G,9G) -pe openmpi 10-100" $(MKDIR) $(@D) $(LOGDIR); $(PBWA) aln -f $(@D)/$* $(REF_FASTA) $^ 2> $(LOGDIR)/$(@F).log

%.bam.bai : %.bam
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,1G,2G)" $(MKDIR) $(@D) $(LOGDIR); $(SAMTOOLS) index $< &> $(LOGDIR)/$(@F).log

fastq/%.1.fastq.gz fastq/%.2.fastq.gz : gsc_bam/%.bam
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,10G,12G)" $(MKDIR) $(@D) $(LOGDIR); $(BAM2FASTQ) -o fastq/$*#.fastq $< &> $(LOGDIR)/$(@F).log && mv fastq/$*_1.fastq fastq/$*.1.fastq && mv fastq/$*_2.fastq fastq/$*.2.fastq && gzip fastq/$*.1.fastq fastq/$*.2.fastq

# LBID is the stem with its first "_<digits>" run removed; it is used as the
# read-group library (LB) value.
bwa/bam/%.bwa.sam : bwa/sai/%.1.sai bwa/sai/%.2.sai fastq/%.1.fastq.gz fastq/%.2.fastq.gz
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,7G,9G) -pe openmpi 10-100" $(MKDIR) $(@D) $(LOGDIR); \
	LBID=`echo "$*" | sed 's/_[0-9]\+//'`; \
	$(PBWA) sampe -f $@ -P -r "@RG\tID:$*\tLB:$${LBID}\tPL:${SEQ_PLATFORM}\tSM:$*" $(REF_FASTA) $(basename $(word 1,$^)) $(basename $(word 2,$^)) $(word 3,$^) $(word 4,$^) 2> $(LOGDIR)/$(@F).log


bam/%.bam : bwa/bam/%.bwa.sorted.filtered.fixmate.markdup.bam
	$(MKDIR) $(@D); ln -f $< $@

include modules/bam_tools/processBam.mk

--------------------------------------------------------------------------------
/aligners/starFusionAligner.mk:
--------------------------------------------------------------------------------
# Run STAR-Fusion on the chimeric junctions produced by the STAR aligner module
# and convert the result to a usv table.
include modules/Makefile.inc

STAR_CHIMERIC = true

STAR_FUSION = STAR-Fusion

STAR_FUSION_ENV = $(HOME)/share/usr/anaconda-envs/star-fusion-1.0.0

STAR_FUSION_TO_USV = python modules/sv_callers/starfusion2usv.py

# Abort at parse time when the CTAT genome library directory is not configured.
$(if $(STAR_CTAT_DIR),,$(error no STAR CTAT dir))

PHONY += star_fusion
star_fusion : $(foreach sample,$(SAMPLES),star_fusion/$(sample).star_fusion_timestamp)

# STAR-Fusion writes into star_fusion/<sample>/; the timestamp file marks completion.
star_fusion/%.star_fusion_timestamp : star/%.Chimeric.out.junction
	$(call RUN,-v $(STAR_FUSION_ENV) -s 8G -m 12G,"$(STAR_FUSION) --genome_lib_dir $(STAR_CTAT_DIR) -J $< --output_dir $(@D)/$* && touch $@")

# The converter reads the STAR-Fusion result on stdin and writes the table to $@.
# NOTE(review): the input redirection was lost in extraction (the recipe read
# "< $( $@", an unterminated reference). The path below is reconstructed from
# the --output_dir used above; the result file name is assumed - confirm it
# against the STAR-Fusion version in STAR_FUSION_ENV.
usv/%.star_fusion.tsv : star_fusion/%.star_fusion_timestamp
	$(call RUN,,"$(STAR_FUSION_TO_USV) < $(<D)/$*/star-fusion.fusion_candidates.final.abridged > $@")

include modules/aligners/starAligner.mk

--------------------------------------------------------------------------------
/aligners/tmapAligner.mk:
--------------------------------------------------------------------------------
# Ion Torrent alignment with tmap, from unprocessed BAMs or from (split) FASTQ.
include modules/Makefile.inc
include modules/variant_callers/gatk.inc
include modules/aligners/align.inc

ALIGNER := tmap
LOGDIR := log/tmap.$(NOW)


SAMTOOLS_SORT_MEM = 2000000000

FASTQ_CHUNKS := 10
FASTQ_CHUNK_SEQ := $(shell seq 1 $(FASTQ_CHUNKS))
FASTQUTILS = $(HOME)/share/usr/ngsutils/bin/fastqutils

TMAP = $(HOME)/share/usr/bin/tmap
TMAP_MODE ?= map3
TMAP_OPTS =

SEQ_PLATFORM = IONTORRENT

.SECONDARY:
.DELETE_ON_ERROR:
.PHONY: tmap

TMAP_BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam)
tmap : $(TMAP_BAMS) $(addsuffix .bai,$(TMAP_BAMS))

# Publish the processed BAM as a hard link. The original recipe ran
# "cp $< $@" first and then replaced that copy with the link, so the copy was
# pure wasted I/O.
bam/%.bam : tmap/bam/%.tmap.$(BAM_SUFFIX)
	$(INIT) ln -f $(<) $(@)

# Keep only the @HD and @RG header lines of the unprocessed BAM.
tmap/sam/%.header.sam : unprocessed_bam/%.bam
	$(INIT) $(SAMTOOLS) view -H $< | grep -e '^@HD' -e '^@RG' > $@

tmap/bam/%.$(TMAP_MODE).bam : tmap/sam/%.header.sam unprocessed_bam/%.bam
	$(call RUN,-c -n 4 -s 6G -m 8G,"$(SAMTOOLS) reheader $^ | $(TMAP) $(TMAP_MODE) $(TMAP_OPTS) -Q 2 -f $(REF_FASTA) -i bam -s $(@) -o 1 -n 4 ")

# align-split-fastq(sample, split, fastq): align one split FASTQ; the read group
# ID is the split name and SM is the parent sample.
define align-split-fastq
tmap/bam/$2.tmap.bam : $3
	$$(call RUN,-c -n 4 -s 6G -m 8G,"zcat $$< | $$(TMAP) $$(TMAP_MODE) $$(TMAP_OPTS) -f $$(REF_FASTA) -i fastq -s $$(@) -Q 0 -o 1 -n 4 -R ID:$2 -R SM:$1 -R PL:$$(SEQ_PLATFORM) -R PU:00000000")
endef
$(foreach ss,$(SPLIT_SAMPLES),\
	$(if $(fq.$(ss)), \
	$(eval $(call align-split-fastq,$(split.$(ss)),$(ss),$(fq.$(ss))))))

tmap/bam/%.tmap.bam : fastq/%.fastq.gz
	$(call RUN,-c -n 4 -s 6G -m 8G,"zcat $< | $(TMAP) $(TMAP_MODE) $(TMAP_OPTS) -f $(REF_FASTA) -i fastq -s $(@) -Q 0 -o 1 -n 4 -R ID:$* -R SM:$* -R PL:$(SEQ_PLATFORM) -R PU:00000000 ")

include modules/bam_tools/processBam.mk
include modules/fastq_tools/fastq.mk
include modules/aligners/align.mk

--------------------------------------------------------------------------------
/bam_tools/fix_mate.mk:
--------------------------------------------------------------------------------
# Run Picard FixMateInformation on every gsc_bam/<sample>.bam listed in SAMPLE_FILE.
include modules/Makefile.inc
# hg19.inc lives under genome_inc/ in this repository; the previous path
# (modules/hg19.inc) does not exist in the tree.
include modules/genome_inc/hg19.inc

FIXMATE = $(JAVA) -Xmx10G -jar $(JARDIR)/FixMateInformation.jar VALIDATION_STRINGENCY=LENIENT

SAMPLE_FILE = samplesToFixMate.txt
SAMPLES = $(shell cat $(SAMPLE_FILE))

LOGDIR = gsc_bam/logs

.DELETE_ON_ERROR:

.SECONDARY:

.PHONY : all

all : $(foreach sample,$(SAMPLES),gsc_bam/$(sample).fixmate.bam)

gsc_bam/%.fixmate.bam : gsc_bam/%.bam
	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,10G,40G)" $(MKDIR) $(LOGDIR);\
	$(FIXMATE) I=$< O=$@ &> ${LOGDIR}/$(@F).fixmate.log

--------------------------------------------------------------------------------
/bam_tools/fix_rg.mk:
--------------------------------------------------------------------------------
# Link unprocessed_bam/<sample>.rg.bam (built by processBam.mk) to bam/<sample>.bam.
include modules/Makefile.inc
include modules/variant_callers/gatk.inc
include modules/aligners/align.inc

LOGDIR ?= log/fix_rg.$(NOW)

BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam)

.PHONY : fixed_bams

fixed_bams : $(BAMS) $(addsuffix .bai,$(BAMS))

bam/%.bam : unprocessed_bam/%.rg.bam
	$(INIT) ln -f $(<) $(@)

include modules/bam_tools/processBam.mk

--------------------------------------------------------------------------------
/bam_tools/get_bam_data_mirror.mk:
--------------------------------------------------------------------------------
# Fetch <sample>.bam, .bam.bai and .bai for every sample from the data mirror
# with rsync over ssh. The remote directory is named after the current
# working directory (PROJECT_NAME).
include modules/Makefile.inc

LOGDIR = log/getbam_data_mirror.$(NOW)

get_bam : $(foreach sample,$(SAMPLES),bam/$(sample).bam) \
          $(foreach sample,$(SAMPLES),bam/$(sample).bam.bai) \
          $(foreach sample,$(SAMPLES),bam/$(sample).bai)

PROJECT_NAME = $(shell basename $(PWD))

define get-bam
bam/$1.bam :
	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
					  rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam \
					  bam/")

bam/$1.bam.bai :
	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
					  rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam.bai \
					  bam/")

bam/$1.bai :
	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
					  rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bai \
					  bam/")


endef
$(foreach sample,$(SAMPLES),\
	$(eval $(call get-bam,$(sample))))

# Record the transfer tool actually used by the recipes above (rsync; the
# stamp previously recorded scp, which this module never calls).
..DUMMY := $(shell mkdir -p version; \
	     which rsync > version/getbam_data_mirror.txt)
.SECONDARY:
.DELETE_ON_ERROR:
.PHONY: get_bam
--------------------------------------------------------------------------------
/bam_tools/get_bam_irb_mirror.mk:
--------------------------------------------------------------------------------
# Copy <sample>.bam from the IRB mirror with scp, index it locally and provide
# the index under both the .bam.bai and .bai names. The remote path is
# sharded by the first and second characters of the sample name.
include modules/Makefile.inc

LOGDIR = log/getbam_irb_mirror.$(NOW)

get_bam : $(foreach sample,$(SAMPLES),bam/$(sample).bam) \
          $(foreach sample,$(SAMPLES),bam/$(sample).bam.bai) \
          $(foreach sample,$(SAMPLES),bam/$(sample).bai)

define get-bam
bam/$1.bam :
	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
					  scp $(USER)@juno-xfer01.mskcc.org:/juno/dmp/share/irb12_245/`echo $1 | cut -c 1-1`/`echo $1 | cut -c 2-2`/$1.bam \
					  bam/")

bam/$1.bam.bai : bam/$1.bam
	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
					  $(SAMTOOLS) index $$(<)")

bam/$1.bai : bam/$1.bam bam/$1.bam.bai
	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
					  cp $$(<<) $$(@)")


endef
$(foreach sample,$(SAMPLES),\
	$(eval $(call get-bam,$(sample))))

..DUMMY := $(shell mkdir -p version; \
	     which scp > version/getbam_irb_mirror.txt)
.SECONDARY:
.DELETE_ON_ERROR:
.PHONY: get_bam
--------------------------------------------------------------------------------
/bam_tools/merge_bam.mk:
--------------------------------------------------------------------------------
# Merge the BAMs listed in merge.<sample> into merged_bam/<sample>.bam; the
# read-group-fixed result is linked to bam/<sample>.bam.
include modules/Makefile.inc

LOGDIR = log/merge.$(NOW)

merged_bam : $(foreach sample,$(MERGE_SAMPLES),bam/$(sample).bam bam/$(sample).bam.bai)

# merged-bam(sample): extract a header per input BAM, merge the headers with
# Picard MergeSamFiles, then samtools-merge the BAMs using that header.
define merged-bam
%.header.sam : %.bam
	$$(INIT) $$(SAMTOOLS2) view -H $$< > $$@

merged_bam/$1.header.sam : $$(merge.$1:.bam=.header.sam)
	$$(call RUN,-s 16G -m 18G,"$$(call PICARD,MergeSamFiles,13G) $$(foreach sam,$$^,I=$$(sam) ) O=$$@")

merged_bam/$1.bam : merged_bam/$1.header.sam $$(merge.$1)
	$$(call RUN,-s 12G -m 15G,"$$(SAMTOOLS2) merge -f -h $$< $$(@) $$(filter %.bam,$$^)")
endef
# rename-bam(sample, bam): a sample with a single input BAM is hard-linked as is.
define rename-bam
bam/$1.bam : $2
	$$(INIT) ln -f $$< $$@
endef
# Two or more inputs -> merge; exactly one -> link; none -> no rule generated.
$(foreach sample,$(MERGE_SAMPLES),\
	$(if $(word 2,$(merge.$(sample))),\
	$(eval $(call merged-bam,$(sample))),\
	$(if $(merge.$(sample)),\
	$(eval $(call rename-bam,$(sample),$(merge.$(sample)))))))


bam/%.bam : merged_bam/%.rg.bam
	$(INIT) ln -f $< $@

.SECONDARY:
.DELETE_ON_ERROR:
.PHONY : merged_bam

include modules/bam_tools/processBam.mk

-------------------------------------------------------------------------------- /bam_tools/put_bam_data_mirror.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR = log/putbam_data_mirror.$(NOW) 4 | 5 | put_bam : $(foreach sample,$(SAMPLES),bam/$(sample).taskcomplete) 6 | 7 | PROJECT_NAME = $(shell basename $(PWD)) 8 | 9 | define put-bam 10 | bam/$1.taskcomplete : bam/$1.bam 11 | $$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \ 12 | rsync -aP -e ssh bam/$1.bam $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam && \ 13 | rsync -aP -e ssh bam/$1.bam.bai $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam.bai && \ 14 | rsync -aP -e ssh bam/$1.bam.bai $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bai && \ 15 | echo 'finished!' > $$(@)") 16 | 17 | endef 18 | $(foreach sample,$(SAMPLES),\ 19 | $(eval $(call put-bam,$(sample)))) 20 | 21 | ..DUMMY := $(shell mkdir -p version; \ 22 | which scp > version/putbam_data_mirror.txt) 23 | .SECONDARY: 24 | .DELETE_ON_ERROR: 25 | .PHONY: put_bam -------------------------------------------------------------------------------- /clonality/tableToCloneHDFormat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use Getopt::Std; 7 | my %opt; 8 | getopts('h', \%opt); 9 | 10 | my $usage = < output.txt 12 | ENDL 13 | 14 | sub HELP_MESSAGE { 15 | print STDERR $usage; 16 | exit(1); 17 | } 18 | 19 | HELP_MESSAGE if $opt{h}; 20 | 21 | my @samples = @ARGV; 22 | 23 | my $headerLine = ; 24 | chomp $headerLine; 25 | $headerLine =~ s/^#//; 26 | my @header = split/\t/, $headerLine; 27 | 28 | while () { 29 | my @F = split /\t/; 30 | my %F = map { $_ => shift @F } @header; 31 | my $line = "$F{CHROM}\t$F{POS}"; 32 | for my $s (@samples) { 33 
| my $ad = $F{$s . ".AD"}; 34 | my @ad = ($ad ne ".")? split /,/, $ad : qw/0 0/; 35 | my $dp = $ad[0] + $ad[1]; 36 | $line .= "\t$ad[0]\t$dp"; 37 | } 38 | print $line . "\n"; 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /conda_env/R3.2.2.txt: -------------------------------------------------------------------------------- 1 | R=3.2.2 2 | r-optparse=1.3.0 3 | -------------------------------------------------------------------------------- /conda_env/delly_env.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | @EXPLICIT 5 | https://conda.anaconda.org/bioconda/linux-64/bcftools-1.3.1-1.tar.bz2 6 | https://repo.continuum.io/pkgs/free/linux-64/click-6.6-py27_0.tar.bz2 7 | https://repo.continuum.io/pkgs/free/linux-64/curl-7.49.0-1.tar.bz2 8 | https://conda.anaconda.org/bioconda/linux-64/delly-0.7.6-0.tar.bz2 9 | https://repo.continuum.io/pkgs/free/linux-64/h5py-2.6.0-np111py27_2.tar.bz2 10 | https://repo.continuum.io/pkgs/free/linux-64/hdf5-1.8.17-1.tar.bz2 11 | https://conda.anaconda.org/bioconda/linux-64/htslib-1.3.1-1.tar.bz2 12 | https://repo.continuum.io/pkgs/free/linux-64/libgcc-5.2.0-0.tar.bz2 13 | https://repo.continuum.io/pkgs/free/linux-64/mkl-11.3.3-0.tar.bz2 14 | https://repo.continuum.io/pkgs/free/linux-64/numpy-1.11.2-py27_0.tar.bz2 15 | https://repo.continuum.io/pkgs/free/linux-64/openssl-1.0.2j-0.tar.bz2 16 | https://repo.continuum.io/pkgs/free/linux-64/pip-9.0.1-py27_0.tar.bz2 17 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.9.1.4-py27_0.tar.bz2 18 | https://conda.anaconda.org/conda-forge/linux-64/python-2.7.12-1.tar.bz2 19 | https://repo.continuum.io/pkgs/free/linux-64/readline-6.2-2.tar.bz2 20 | https://conda.anaconda.org/bioconda/linux-64/samtools-1.3.1-5.tar.bz2 21 | 
https://repo.continuum.io/pkgs/free/linux-64/setuptools-27.2.0-py27_0.tar.bz2 22 | https://repo.continuum.io/pkgs/free/linux-64/six-1.10.0-py27_0.tar.bz2 23 | https://repo.continuum.io/pkgs/free/linux-64/sqlite-3.13.0-0.tar.bz2 24 | http://repo.continuum.io/pkgs/free/linux-64/tk-8.5.18-0.tar.bz2 25 | https://conda.anaconda.org/conda-forge/linux-64/wheel-0.29.0-py27_0.tar.bz2 26 | https://repo.continuum.io/pkgs/free/linux-64/zlib-1.2.8-3.tar.bz2 27 | -------------------------------------------------------------------------------- /contamination/clusterSamples.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | include modules/variant_callers/gatk.inc 3 | 4 | LOGDIR = log/cluster_samples.$(NOW) 5 | 6 | VPATH ?= bam 7 | ifeq ($(EXOME),true) 8 | DBSNP_SUBSET ?= $(HOME)/share/reference/dbsnp_137_exome.bed 9 | else 10 | DBSNP_SUBSET = $(HOME)/share/reference/dbsnp_tseq_intersect.bed 11 | endif 12 | 13 | CLUSTER_VCF = modules/contamination/clusterSampleVcf.R 14 | 15 | snp_cluster : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf) \ 16 | snp_vcf/snps.vcf \ 17 | snp_vcf/snps_ft.vcf \ 18 | snp_vcf/snps_ft.pdf 19 | 20 | snp_vcf/%.snps.vcf : bam/%.bam 21 | $(call RUN,-n 4 -s 2.5G -m 3G,"set -o pipefail && \ 22 | $(call GATK_MEM,8G) \ 23 | -T UnifiedGenotyper \ 24 | -rf BadCigar \ 25 | -nt 4 \ 26 | -R $(REF_FASTA) \ 27 | --dbsnp $(DBSNP) \ 28 | $(foreach bam,$(filter %.bam,$^),-I $(bam) ) \ 29 | -L $(DBSNP_SUBSET) \ 30 | -o $@ \ 31 | --output_mode EMIT_ALL_SITES") 32 | 33 | 34 | snp_vcf/snps.vcf : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf) 35 | $(call RUN,-s 16G -m 20G,"set -o pipefail && \ 36 | $(call GATK_MEM,14G) -T CombineVariants \ 37 | $(foreach vcf,$^,--variant $(vcf) ) \ 38 | -o $@ \ 39 | --genotypemergeoption UNSORTED \ 40 | -R $(REF_FASTA)") 41 | 42 | snp_vcf/snps_ft.vcf : snp_vcf/snps.vcf 43 | $(INIT) grep '^#' $< > $@ && grep -e '0/1' -e '1/1' $< >> $@ 44 | 45 | 
# Run clusterSampleVcf.R on the filtered SNP VCF to produce the PDF.
snp_vcf/snps_ft.pdf : snp_vcf/snps_ft.vcf
	$(call RUN,-n 1 -s 16G -m 20G -v $(VARIANT_ANNOTATION_ENV),"set -o pipefail && \
		$(RSCRIPT) modules/contamination/clusterSampleVcf.R \
		--input_file $(<) \
		--output_file $(@) \
		--sample_pairs '$(SAMPLE_PAIRS)' \
		--genome b37")


..DUMMY := $(shell mkdir -p version; \
	echo "GATK" > version/cluster_samples.txt;)
.SECONDARY:
.DELETE_ON_ERROR:
.PHONY : snp_cluster

include modules/vcf_tools/vcftools.mk
--------------------------------------------------------------------------------
/contamination/contest.mk:
--------------------------------------------------------------------------------
# This module runs ContEst on snp vcf files from gatk
# Author: inodb, limr

##### MAKE INCLUDES #####
include modules/Makefile.inc
include modules/variant_callers/gatk.inc

LOGDIR ?= log/contest.$(NOW)

.SECONDARY:
.DELETE_ON_ERROR:
.PHONY: contest

contest : contest/all_contest.txt

# ContEst doing on-the-fly genotyping
# $1 = tumor sample, $2 = normal sample; one rule is generated per pair.
define contest-tumor-normal
contest/$1_$2.contest.txt : bam/$1.bam bam/$2.bam
	$$(call RUN,-s 12G -m 12G,"$$(call GATK_MEM2,4G) -T ContEst -I:eval $$(<) -I:genotype $$(<<) \
		-pf $$(HAPMAP_POP_FILE) -o $$(@) -R $$(REF_FASTA)")
endef
$(foreach pair,$(SAMPLE_PAIRS),$(eval $(call contest-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))

# Merge the per-pair reports into one table: the header of the first report
# with a leading "sample" column, then every META row prefixed with the pair
# name derived from the report's file name.
#  - The suffix stripped by basename must be ".contest.txt" (the inputs are
#    contest/<pair>.contest.txt); the previous "_contamination.txt" never
#    matched, so the full file name ended up in the table.
#  - The name is followed by a tab, mirroring the "sample\t" added to the
#    header, so data rows line up with the header and the 5th-field sort key
#    refers to the same column in both.
contest/all_contest.txt : $(foreach pair,$(SAMPLE_PAIRS),contest/$(pair).contest.txt)
	( \
		head -1 $< | sed "s/^/sample\t/"; \
		for s in $(^); do \
			grep -P "META\t" $$s | sed "s/^/`basename $$s .contest.txt`\t/"; \
		done | sort -rnk5,5; \
	) > $@
--------------------------------------------------------------------------------
/copy_number/CytoBand.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/copy_number/CytoBand.RData -------------------------------------------------------------------------------- /copy_number/absCNseq.mk: -------------------------------------------------------------------------------- 1 | # run absCNseq on varscan segmentation data 2 | include modules/Makefile.inc 3 | include modules/variant_callers/gatk.inc 4 | 5 | LOGDIR = log/absCN.$(NOW) 6 | ABS_CN_SEQ = $(RSCRIPT) modules/copy_number/absCNseq.R 7 | 8 | # file containing positions: chr:start-stop 9 | SNV_POS_FILE = snv_posn.intervals 10 | 11 | .DELETE_ON_ERROR: 12 | .SECONDARY: 13 | .PHONY : all 14 | 15 | all : $(foreach pair,$(SAMPLE_PAIRS),absCN/$(pair).absCN.txt) 16 | 17 | define abs-gatk-tumor-normal 18 | absCN/$1_$2.gatk.vcf : bam/$1.bam bam/$2.bam 19 | $$(call RUN,-s 9G -m 12G,"$$(call GATK_MEM,8G) -T UnifiedGenotyper -o $$@ -I $$< -I $$(<<) -R $$(REF_FASTA) --output_mode EMIT_ALL_SITES -L $$(SNV_POS_FILE)") 20 | endef 21 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call abs-gatk-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 22 | 23 | absCN/%.absCN.txt absCN/%.absSNV.txt : varscan/segment/%.varscan2copynumber.txt absCN/%.gatk.vcf 24 | $(call RUN,-s 4G -m 6G,"$(ABS_CN_SEQ) --genome $(REF) --outPrefix absCN/$* $^") 25 | -------------------------------------------------------------------------------- /copy_number/compare_facets_cncf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import sys 5 | import argparse 6 | import math 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser(prog='compare_facets_cncf.py', 10 | description='compare two facets cncf files') 11 | parser.add_argument('cncf1') 12 | parser.add_argument('cncf2') 13 | args = parser.parse_args() 14 | 15 | df1 = pd.read_table(args.cncf1).fillna(0) 16 | df2 = pd.read_table(args.cncf2).fillna(0) 17 | mafr_diff = 0 18 | 
for i, row1 in df1.iterrows(): 19 | for j, row2 in df2.iterrows(): 20 | if (row1['loc.start'] >= row2['loc.start'] and 21 | row1['loc.start'] <= row2['loc.end']) or \ 22 | (row1['loc.end'] >= row2['loc.start'] and 23 | row1['loc.end'] <= row2['loc.start']): 24 | mafr_diff += math.fabs(row1['mafR.clust'] - row2['mafR.clust']) 25 | break 26 | if mafr_diff < 20: 27 | print(("success, CNCF files are similar: {} {}".format(args.cncf1, args.cncf2))) 28 | sys.exit(0) 29 | else: 30 | print(("failed, files have high mafR difference: {} {}".format(args.cncf1, args.cncf2))) 31 | sys.exit(1) 32 | -------------------------------------------------------------------------------- /copy_number/convert_basecount_to_snp_pileup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import pandas as pd 4 | import sys 5 | import argparse 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(prog='convert_basecount_to_snp_pileup.py', 9 | description='convert old basecount facets files to snp-pileup') 10 | parser.add_argument('basecount') 11 | args = parser.parse_args() 12 | 13 | bc = pd.read_table(args.basecount, dtype={'Chrom': str}) 14 | sp = pd.DataFrame() 15 | sp['Chromosome'] = bc.Chrom 16 | sp['Position'] = bc.Pos 17 | sp['Ref'] = bc.Ref 18 | sp['Alt'] = bc.Alt 19 | for i, row in bc.iterrows(): 20 | sp.ix[i, 'File1R'] = int(bc.ix[i, "NOR.{}p".format(sp.ix[i, 'Ref'])] + 21 | bc.ix[i, "NOR.{}n".format(sp.ix[i, 'Ref'])]) 22 | sp.ix[i, 'File1A'] = int(bc.ix[i, "NOR.{}p".format(sp.ix[i, 'Alt'])] + 23 | bc.ix[i, "NOR.{}n".format(sp.ix[i, 'Alt'])]) 24 | sp.ix[i, 'File1E'] = 0 25 | sp.ix[i, 'File1D'] = 0 26 | sp.ix[i, 'File2R'] = int(bc.ix[i, "TUM.{}p".format(sp.ix[i, 'Ref'])] + 27 | bc.ix[i, "TUM.{}n".format(sp.ix[i, 'Ref'])]) 28 | sp.ix[i, 'File2A'] = int(bc.ix[i, "TUM.{}p".format(sp.ix[i, 'Alt'])] + 29 | bc.ix[i, "TUM.{}n".format(sp.ix[i, 'Alt'])]) 30 | sp.ix[i, 'File2E'] = 0 31 | sp.ix[i, 'File2D'] = 0 32 
| 33 | for col in sp.columns[4:]: 34 | sp[col] = sp[col].astype(int) 35 | 36 | sp.to_csv(sys.stdout, index=False) 37 | -------------------------------------------------------------------------------- /copy_number/createFacetsSummary.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | for (lib in c("optparse", "dplyr")) { 4 | suppressPackageStartupMessages(library(lib, character.only=TRUE)) 5 | } 6 | 7 | optList <- list(make_option("--outFile", default = NULL, help = "output file")) 8 | parser <- OptionParser(usage = "%prog [options] [facets files]", option_list = optList) 9 | arguments <- parse_args(parser, positional_arguments = T) 10 | opt <- arguments$options 11 | 12 | if (length(arguments$args) < 1) { 13 | cat("Need facets output files\n") 14 | print_help(parser) 15 | stop() 16 | } else if (is.null(opt$outFile)) { 17 | cat("Need output file\n") 18 | print_help(parser) 19 | stop() 20 | } else { 21 | facetsFiles <- arguments$args 22 | } 23 | 24 | 25 | Df <- data.frame() 26 | for (facetsFile in facetsFiles) { 27 | load(facetsFile) 28 | tumorName <- facetsFile %>% sub('.*/', '', .) %>% sub('_.*', '', .) %>% sub('\\..*', '', .) 29 | normalName <- facetsFile %>% sub('.*/', '', .) %>% sub('^.*_', '', .) %>% sub('\\..*', '', .) 
30 | n <- paste(tumorName, normalName, sep = '_') 31 | Df[n, 'tumorName'] <- tumorName 32 | if (tumorName != normalName) { 33 | Df[n, 'normalName'] <- normalName 34 | } 35 | Df[n, 'purity'] <- fit$purity 36 | Df[n, 'ploidy'] <- fit$ploidy 37 | Df[n, 'dipLogR'] <- fit$dipLogR 38 | } 39 | Df <- mutate(Df, bad = purity <= 0.3 | is.na(purity)) 40 | 41 | write.table(Df, file = opt$outFile, sep = '\t', quote = F, row.names = F) 42 | -------------------------------------------------------------------------------- /copy_number/exomeCNVLOH.mk: -------------------------------------------------------------------------------- 1 | # Use ExomeCNV to detect copy number variants and LOH 2 | # vim: set ft=make : 3 | 4 | include modules/Makefile.inc 5 | 6 | LOGDIR = log/exomeCNVLOH.$(NOW) 7 | EXOMECNV = modules/copy_number/exomeCNV.R 8 | EXOMECNVLOH = modules/copy_number/exomeCNVLOH.R 9 | CREATE_BAF = $(PERL) $(HOME)/share/usr/bin/for.loh.files.pl 10 | 11 | .SECONDARY: 12 | .DELETE_ON_ERROR: 13 | .PHONY: all loh 14 | 15 | all : loh 16 | 17 | ifdef SAMPLE_PAIRS 18 | LOH += $(foreach pair,$(SAMPLE_PAIRS),exomecnv/loh/$(pair).loh.txt) 19 | 20 | define exomecnv-baf-tumor-normal-set 21 | exomecnv/baf/$1_$2.baf_timestamp : vcf/$(subst $( ),_,$3).gatk_snps.target_ft.dp_ft.pass.vcf 22 | normal=`grep -m1 '^#CHROM' $$< | cut -f10- | tr '\t' '\n' | grep -n '^$2$$$$' | cut -f1 -d:`; \ 23 | tumor=`grep -m1 '^#CHROM' $$< | cut -f10- | tr '\t' '\n' | grep -n '^$1$$$$' | cut -f1 -d:`; \ 24 | $$(INIT) $$(CREATE_BAF) $$< exomecnv/baf/$1_$2.baf_1.txt exomecnv/baf/$1_$2.baf_2.txt $$$$normal $$$$tumor && touch $$@ 25 | exomecnv/baf/$1_$2.baf_1.txt : exomecnv/baf/$1_$2.baf_timestamp 26 | exomecnv/baf/$1_$2.baf_2.txt : exomecnv/baf/$1_$2.baf_timestamp 27 | endef 28 | $(foreach pair,$(SAMPLE_PAIRS),\ 29 | $(eval $(call exomecnv-baf-tumor-normal-set,$(tumor.$(pair)),$(normal.$(pair)),$(set.$(pair))))) 30 | 31 | 32 | define exomecnv-loh-pair 33 | exomecnv/loh/$1.loh.txt : exomecnv/baf/$1.baf_1.txt 
exomecnv/baf/$1.baf_2.txt 34 | $$(call RUN,-s 4G -m 6G,"$$(RSCRIPT) $$(EXOMECNVLOH) --tumor $$< --normal $$(word 2,$$^) --outPrefix $$(@D)/$1") 35 | endef 36 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call exomecnv-loh-pair,$(pair)))) 37 | 38 | else 39 | LOH += $(foreach sample,$(SAMPLES),exomecnv/loh/$(sample).loh.txt) 40 | 41 | 42 | endif 43 | 44 | loh : $(LOH) 45 | 46 | include modules/vcf_tools/vcftools.mk 47 | include modules/variant_callers/gatk.mk 48 | -------------------------------------------------------------------------------- /copy_number/exomeCNVLOHHeatmap.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR = log/exome_cnv_loh_heatmap.$(NOW) 4 | 5 | SHELL = modules/scripts/Rshell 6 | .SHELLFLAGS = -m $(MEM) -p $(PE) -n $(@F) -l $(LOGDIR) -e 7 | 8 | .ONESHELL: 9 | .DELETE_ON_ERROR: 10 | .SECONDARY: 11 | .PHONY: all 12 | 13 | MEM := 2G 14 | PE := 1 15 | 16 | all : exomecnv/lohheatmap.png 17 | 18 | exomecnv/loh/lohmat.Rdata : $(foreach pair,$(SAMPLE_PAIRS),exomecnv/loh/$(pair).loh.txt) 19 | lohFiles <- unlist(strsplit("$^", " ")) 20 | lohNames <- sub(".*/", "", sub("\\..*", "", lohFiles)) 21 | suppressPackageStartupMessages(library("rtracklayer")); 22 | targets <- import('$(TARGETS_FILE)'); 23 | for (i in 1:length(lohFiles)) { 24 | lohFile <- lohFiles[i] 25 | lohName <- lohNames[i] 26 | s <- read.delim(lohFile, header = T, as.is = T) 27 | lohGR <- GRanges(seqnames = sub('chr', '', s[, "chr"],), ranges = IRanges(start = s[, "position.start"], end = s[, "position.end"]), loh = s[, "LOH"]) 28 | x <- suppressWarnings(findOverlaps(targets, lohGR)) 29 | mcols(targets)[queryHits(x), lohName] <- lohGR[subjectHits(x)]$$loh 30 | } 31 | names(targets) <- paste(seqnames(targets), start(targets), sep="_") 32 | lohmat <- as.matrix(mcols(targets)) 33 | rownames(lohmat) <- names(targets) 34 | lohmat[lohmat] <- 1 35 | lohmat[which(!lohmat | is.na(lohmat))] <- 0 36 | dir.create('$(@D)', 
showWarnings = F) 37 | save(lohmat, file = "$@") 38 | 39 | exomecnv/lohheatmap.png : exomecnv/loh/lohmat.Rdata 40 | load("$<") 41 | suppressPackageStartupMessages(library("RColorBrewer")); 42 | suppressPackageStartupMessages(library("gplots")); 43 | cols <- c(brewer.pal(8, "Dark2"), brewer.pal(8, "Set1"), brewer.pal(8, "Set2")) 44 | chr <- unlist(lapply(rownames(lohmat), function(x) {strsplit(x, split="_", fixed=T)[[1]][1]})) 45 | dir.create('$(@D)', showWarnings = F) 46 | png("$@", height=600, width=1200, type="cairo") 47 | heatmap.2(t(lohmat), trace="none", scale = 'none', Colv = NA, col=c("white", "red"), margin=c(5,15), labCol="", ColSideColors=cols[as.integer(as.factor(chr))], cexCol=1.4, dendrogram = 'row', key = F) 48 | null <- dev.off() 49 | 50 | -------------------------------------------------------------------------------- /copy_number/hmmCopy.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR = log/hmmcopy.$(NOW) 4 | 5 | HMMCOPY_WINDOW_SIZE ?= 1000 6 | HMMCOPY = $(RSCRIPT) modules/copy_number/hmmCopy.R 7 | 8 | READ_COUNTER = $(HOME)/share/usr/bin/readCounter 9 | MAP_COUNTER = $(HOME)/share/usr/bin/mapCounter 10 | GC_COUNTER = $(HOME)/share/usr/bin/gcCounter 11 | 12 | MAP_BW = $(HOME)/share/references/genomes/wgEncodeCrgMapabilityAlign100mer.bigWig 13 | 14 | .SECONDARY: 15 | .DELETE_ON_ERROR: 16 | .PHONY : hmmcopy 17 | 18 | hmmcopy : $(foreach pair,$(SAMPLE_PAIRS),hmmcopy/results.w$(HMMCOPY_WINDOW_SIZE)/$(tumor.$(pair))_$(normal.$(pair)).hmmcopy_seg.txt) 19 | 20 | hmmcopy/wig/%.w$(HMMCOPY_WINDOW_SIZE).wig : bam/%.bam bam/%.bam.bai 21 | $(call RUN,-s 6G -m 8G,"$(READ_COUNTER) -w $(HMMCOPY_WINDOW_SIZE) -c $(subst $( ),$(,),$(strip $(CHROMOSOMES))) $< > $@") 22 | 23 | hmmcopy/wig/gc.w$(HMMCOPY_WINDOW_SIZE).wig : 24 | $(call RUN,-s 6G -m 8G,"$(GC_COUNTER) -w $(HMMCOPY_WINDOW_SIZE) -c $(subst $( ),$(,),$(strip $(CHROMOSOMES))) $(REF_FASTA) > $@") 25 | 26 | 
hmmcopy/wig/map.w$(HMMCOPY_WINDOW_SIZE).wig : 27 | $(call RUN,-s 6G -m 8G,"$(MAP_COUNTER) -w $(HMMCOPY_WINDOW_SIZE) -c $(subst $( ),$(,),$(strip $(CHROMOSOMES))) $(MAP_BIGWIG) > $@") 28 | 29 | define hmmcopy-tumor-normal 30 | hmmcopy/results.w$$(HMMCOPY_WINDOW_SIZE)/$1_$2.hmmcopy_seg.txt : hmmcopy/wig/$1.w$$(HMMCOPY_WINDOW_SIZE).wig hmmcopy/wig/$2.w$$(HMMCOPY_WINDOW_SIZE).wig hmmcopy/wig/gc.w$$(HMMCOPY_WINDOW_SIZE).wig hmmcopy/wig/map.w$$(HMMCOPY_WINDOW_SIZE).wig 31 | $$(call RUN,-s 8G -m 12G,"$$(HMMCOPY) --normalWig $$(<<) --gcWig $$(<<<) --mapWig $$(<<<<) --outPrefix $$(@D)/$1_$2 $$<") 32 | endef 33 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call hmmcopy-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 34 | -------------------------------------------------------------------------------- /copy_number/normalisedCopyNum.mk: -------------------------------------------------------------------------------- 1 | # MSK Impact procedure 2 | include modules/Makefile.inc 3 | include modules/variant_callers/gatk.inc 4 | 5 | LOGDIR = log/norm_copynum.$(NOW) 6 | NORMALISE_COPYNUM = $(RSCRIPT) modules/copy_number/normaliseCopyNum.R 7 | NORMALISE_MIN_COV ?= 50 8 | NORMALISE_UNDO_SD ?= 2 9 | NORMALISE_WINDOW_SIZE ?= 100 10 | NORMALISE_OUTLIER_SD_SCALE ?= 2.5 11 | NORMALISE_ALPHA ?= 0.05 12 | NORMALISE_TRIM ?= 0.05 13 | NORMALISE_COPYNUM_OPTS = --undoSD $(NORMALISE_UNDO_SD) --outlierSDscale $(NORMALISE_OUTLIER_SD_SCALE) --alpha $(NORMALISE_ALPHA) --minCov $(NORMALISE_MIN_COV) --trim $(NORMALISE_TRIM) 14 | 15 | .SECONDARY: 16 | .DELETE_ON_ERROR: 17 | .PHONY : norm_copynum 18 | 19 | norm_copynum : norm_copynum/seg.txt 20 | 21 | norm_copynum/targets.bed : $(TARGETS_FILE) 22 | $(INIT) $(BEDTOOLS) makewindows -w $(NORMALISE_WINDOW_SIZE) -s $$(($(NORMALISE_WINDOW_SIZE) + 1)) -b $< > $@ 23 | 24 | %.nuc.bed : %.bed 25 | $(INIT) $(BEDTOOLS) nuc -fi $(REF_FASTA) -bed $< > $@ 26 | 27 | norm_copynum/doc/%.doc : bam/%.bam norm_copynum/targets.bed 28 | $(call RUN,-s 9G -m 15G,"$(call 
GATK_MEM,7G) -T DepthOfCoverage -R $(REF_FASTA) -I $< -L $(<<) -o $@") 29 | 30 | norm_copynum/seg.txt : norm_copynum/targets.nuc.bed $(foreach sample,$(SAMPLES),norm_copynum/doc/$(sample).doc) 31 | $(call RUN,-s 8G -m 10G,"$(NORMALISE_COPYNUM) $(NORMALISE_COPYNUM_OPTS) --sampleSetsFile $(SAMPLE_SET_FILE) --nucFile $< --centromereFile $(CENTROMERE_TABLE2) --outDir $(@D) $(addsuffix .sample_interval_summary,$(filter %.doc,$^))") 32 | -------------------------------------------------------------------------------- /copy_number/titan.inc: -------------------------------------------------------------------------------- 1 | ifndef TITAN_INC 2 | EXTRACT_ALLELE_READ_COUNTS = $(ANACONDA_PYTHON) $(HOME)/share/usr/TITANRunner-0.0.3/scripts/count.py 3 | TITAN = $(RSCRIPT) modules/copy_number/runTitan.R 4 | TITAN_SEG = $(PERL) $(HOME)/share/usr/TITANRunner-0.0.3/scripts/createTITANsegmentfiles.pl 5 | SUMMARIZE_TITAN = $(RSCRIPT) modules/copy_number/summarizeTitan.R 6 | ANNOTATE_TITAN_LOH_VCF = $(RSCRIPT) modules/copy_number/annotateTitanLOHVcf.R 7 | NUM_CLUSTERS ?= $(shell seq 1 5) 8 | PLOIDY_PRIORS = 2 3 4 9 | DEFAULT_PLOIDY_PRIOR ?= 2 10 | 11 | BQ_THRESHOLD ?= 20 12 | MQ_THRESHOLD ?= 20 13 | TITAN_WINDOW_SIZE ?= 10000 14 | 15 | TITAN_SELF_TRANSITION ?= 1e15 16 | TITAN_CLONAL_CLUSTER_TRANSITION ?= 5e5 17 | 18 | 19 | override TITAN_OPTS := $(if $(UCSC_REF),--genomeStyle UCSC,--genomeStyle NCBI) $(if $(TARGETS_FILE),--targetBed $(TARGETS_FILE)) 20 | READ_COUNTER = $(HOME)/share/usr/bin/readCounter 21 | MAP_COUNTER = $(HOME)/share/usr/bin/mapCounter 22 | GC_COUNTER = $(HOME)/share/usr/bin/gcCounter 23 | 24 | #VCF_FIELDS += titanCN titanMinorCN titanMajorCN titanCall titanMedianRatio titanMedianLogR 25 | endif 26 | TITAN_INC = true 27 | -------------------------------------------------------------------------------- /db/chasm_db.yaml: -------------------------------------------------------------------------------- 1 | host: cpu-6-2 2 | db: CHASM 3 | port: 9991 4 | docker_repo: 
limr/chasm-db 5 | data_dir: /cbio/ski/reis-filho/home/limr/share/usr/chasm3-db 6 | user: chasm 7 | password: chasm 8 | -------------------------------------------------------------------------------- /db/create_mysql_docker_images.sh: -------------------------------------------------------------------------------- 1 | docker run --name fathmm-db \ 2 | -v ~/downloads/fathmm:/docker-entrypoint-initdb.d \ 3 | -e MYSQL_ROOT_PASSWORD=fathmm \ 4 | -e MYSQL_DATABASE=fathmm \ 5 | -e MYSQL_USER=fathmm \ 6 | -e MYSQL_PASSWORD=fathmm \ 7 | -p 9990:3306 \ 8 | -v ~/share/usr/fathmm-db:/var/lib/mysql \ 9 | -d mysql:5.7.14 10 | docker run --name chasm-db \ 11 | -v ~/downloads/chasm:/docker-entrypoint-initdb.d \ 12 | -e MYSQL_ROOT_PASSWORD=chasm \ 13 | -e MYSQL_DATABASE=CHASM \ 14 | -e MYSQL_USER=chasm \ 15 | -e MYSQL_PASSWORD=chasm \ 16 | -p 9991:3306 \ 17 | -v ~/share/usr/chasm3-db:/var/lib/mysql \ 18 | -d mysql:5.7.14 19 | docker run --name ensembl-hs-core-85-37-db \ 20 | -v ~/downloads/homo_sapiens_core_85_37:/docker-entrypoint-initdb.d \ 21 | -e MYSQL_ROOT_PASSWORD=embl \ 22 | -e MYSQL_DATABASE=homo_sapiens_core_85_37 \ 23 | -e MYSQL_USER=embl \ 24 | -e MYSQL_PASSWORD=embl \ 25 | -e MYSQL_ALLOW_EMPTY_PASSWORD=yes \ 26 | -p 9992:3306 \ 27 | -v ~/share/usr/ensembl-hs-core-85-37-db:/var/lib/mysql \ 28 | -d mysql:5.5 29 | -------------------------------------------------------------------------------- /db/ensembl-hs-core-85-37_db.yaml: -------------------------------------------------------------------------------- 1 | host: cpu-6-2 2 | db: homo_sapiens_core_85_37 3 | port: 9992 4 | docker_repo: limr/ensembl-hs-core-85-37-db 5 | data_dir: /cbio/ski/reis-filho/home/limr/share/usr/ensembl-hs-core-85-37-db/ 6 | user: embl 7 | password: embl 8 | -------------------------------------------------------------------------------- /db/fathmm_config-cpu-6-2.ini: -------------------------------------------------------------------------------- 1 | [DATABASE] 2 | HOST = cpu-6-2 3 | PORT = 9990 
4 | USER = fathmm 5 | PASSWD = fathmm 6 | DB = fathmm 7 | -------------------------------------------------------------------------------- /db/fathmm_config-e01.ini: -------------------------------------------------------------------------------- 1 | [DATABASE] 2 | HOST = 10.0.200.48 3 | PORT = 3306 4 | USER = fathmm_user 5 | PASSWD = CSred74pop 6 | DB = fathmm 7 | -------------------------------------------------------------------------------- /db/fathmm_config-ika.ini: -------------------------------------------------------------------------------- 1 | [DATABASE] 2 | HOST = 10.0.200.44 3 | PORT = 3306 4 | USER = fathmm_user 5 | PASSWD = CSred74pop 6 | DB = fathmm 7 | -------------------------------------------------------------------------------- /db/fathmm_config-lilac.ini: -------------------------------------------------------------------------------- 1 | [DATABASE] 2 | HOST = 10.230.1.20 3 | PORT = 3306 4 | USER = fathmm_user 5 | PASSWD = CSred74pop 6 | DB = fathmm 7 | -------------------------------------------------------------------------------- /db/fathmm_config-swan.ini: -------------------------------------------------------------------------------- 1 | [DATABASE] 2 | HOST = 10.230.1.43 3 | PORT = 3306 4 | USER = fathmm_user 5 | PASSWD = CSred74pop 6 | DB = fathmm 7 | -------------------------------------------------------------------------------- /db/fathmm_db.yaml: -------------------------------------------------------------------------------- 1 | host: cpu-6-2 2 | db: fathmm 3 | port: 9990 4 | docker_repo: limr/fathmm-db 5 | data_dir: /cbio/ski/reis-filho/home/limr/share/usr/fathmm-db 6 | user: fathmm 7 | password: fathmm 8 | -------------------------------------------------------------------------------- /db/run_mysql_docker_images.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # qsub -I -l walltime=999:99:99 -l nodes=1:docker 3 | docker run -d -v 
/cbio/ski/reis-filho/home/limr/share/usr/fathmm-db:/var/lib/mysql -p 9990:3306 limr/fathmm-db 4 | docker run -d -v /cbio/ski/reis-filho/home/limr/share/usr/chasm3-db:/var/lib/mysql -p 9991:3306 limr/chasm-db 5 | docker run -d -v /cbio/ski/reis-filho/home/limr/share/usr/ensembl-hs-core-85-37-db/:/var/lib/mysql -p 9992:3306 limr/ensembl-hs-core-85-37-db 6 | -------------------------------------------------------------------------------- /db/snv_box-cpu-6-2.conf: -------------------------------------------------------------------------------- 1 | ; SNV-Box configuration file 2 | ; Contains the Database specifications 3 | chasmDB=CHASM 4 | db.user=chasm 5 | db.password=chasm 6 | db.host=cpu-6-2 7 | db.port=9991 8 | ;db.unix_socket=/tmp/mysql.sock 9 | -------------------------------------------------------------------------------- /db/snv_box-e01.conf: -------------------------------------------------------------------------------- 1 | ; SNV-Box configuration file 2 | ; Contains the Database specifications 3 | chasmDB=CHASM 4 | db.user=chasm_user 5 | db.password=password 6 | db.host=10.0.200.71 7 | db.port=38493 8 | ;db.unix_socket=/tmp/mysql.sock 9 | -------------------------------------------------------------------------------- /db/snv_box-ika.conf: -------------------------------------------------------------------------------- 1 | ; SNV-Box configuration file 2 | ; Contains the Database specifications 3 | chasmDB=CHASM 4 | db.user=chasm_user 5 | db.password=password 6 | db.host=10.0.200.44 7 | db.port=38493 8 | ;db.unix_socket=/tmp/mysql.sock 9 | -------------------------------------------------------------------------------- /db/snv_box-lilac.conf: -------------------------------------------------------------------------------- 1 | ; SNV-Box configuration file 2 | ; Contains the Database specifications 3 | chasmDB=CHASM 4 | db.user=chasm_user 5 | db.password=password 6 | db.host=10.230.1.20 7 | db.port=38493 8 | ;db.unix_socket=/tmp/mysql.sock 9 | 
-------------------------------------------------------------------------------- /db/snv_box-swan.conf: -------------------------------------------------------------------------------- 1 | ; SNV-Box configuration file 2 | ; Contains the Database specifications 3 | chasmDB=CHASM 4 | db.user=chasm_user 5 | db.password=password 6 | db.host=10.230.1.43 7 | db.port=38493 8 | ;db.unix_socket=/tmp/mysql.sock 9 | -------------------------------------------------------------------------------- /default_yaml/project_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | cluster_engine: LSF ## PBS ## SGE 3 | 4 | use_cluster: true 5 | 6 | ref: b37 7 | 8 | aligner: bwamem ## tophat ## hisat ## bwa ## bowtie ## tmap 9 | bam_chr1_base_recal: true 10 | bam_dup_type: markdup 11 | bam_no_filter: false 12 | bam_no_recal: false 13 | bam_no_realn: false 14 | bam_no_sort: false 15 | bam_fix_rg: false 16 | bam_phred64: false 17 | bam_reprocess: false 18 | 19 | snv_type: mutect # or mutect_snps 20 | mutect_split_chr: true 21 | mutect_use_contest: false 22 | indel_types: varscan_indels strelka_indels scalpel_indels lancet_indels platypus_indels #pindel mutect_indels to use mutect2 23 | 24 | vcf_post_ann_filter_expression: ExAC_AF > 0.01 25 | 26 | # vcf annotations 27 | ann_facets: true 28 | ann_mut_taste: false 29 | ann_provean: false 30 | ann_pathogen: true 31 | 32 | # target panels 33 | targets_file: ~/share/reference/target_panels/ 34 | 35 | # gatk options 36 | gatk_hard_filter_snps: true 37 | gatk_pool_snp_recal: false 38 | 39 | # facets options 40 | # pre-processing crit val 41 | facets_pre_cval: 50 42 | # crit val for estimating diploid log ratio 43 | facets_cval1: 150 44 | # starting crit val for segmentation 45 | facets_cval2: 50 46 | # min number of het snps in a segment used for bivariate t-statistic during clustering of segement 47 | facets_min_nhet: 25 48 | # union of gatk and dbsnp for snp-pileup 49 | facets_union_gatk_dbsnp: 
false 50 | 51 | # slack_channel: 52 | 53 | qsub_priority: 0 54 | ... 55 | -------------------------------------------------------------------------------- /default_yaml/sample_attr.yaml: -------------------------------------------------------------------------------- 1 | #facets_diplogr: 2 | # S18_Pt08N: 3 3 | #facets_cval1: 4 | # S18_Pt08N: 500 5 | -------------------------------------------------------------------------------- /default_yaml/summary_config.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | # Uncomment if you want to add a sample order to heatmaps 3 | # sample_order: [AWT, AWX1, AWX2, AWX3, AWX4, AWL, AWM1, AWM2, AWX5] 4 | # blacklisted genes, don't report these 5 | gene_blacklist: [TTN] 6 | # Uncomment if you want to report names of samples differently in the 7 | # summary/plots 8 | # sample_rename: 9 | # AWM1: AWM1BR 10 | # AWM2: AWM2BO 11 | ... 12 | -------------------------------------------------------------------------------- /export/cbioportal.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | include modules/variant_callers/gatk.inc 3 | SHELL = /bin/bash 4 | 5 | LOGDIR = log/cbioportal.$(NOW) 6 | 7 | .PHONY: mafs 8 | 9 | mafs : $(foreach pair,$(SAMPLE_PAIRS),$(foreach caller,mutect_snps mutect_indels,export/cbioportal/$(pair).$(caller).maf)) 10 | 11 | define CBIOPORTAL_VCF_RULE 12 | unset PERL5LIB PERL_MB_OPT PERLBREW_ROOT PERL_LOCAL_LIB_ROOT PERL_MM_OPT && \ 13 | source activate /ifs/e63data/reis-filho/usr/anaconda-envs/vcf2maf && \ 14 | mkdir -p $(@D) && \ 15 | /opt/common/CentOS_6-dev/perl/perl-5.22.0/bin/perl \ 16 | /ifs/e63data/reis-filho/usr/vcf2maf/vcf2maf.pl \ 17 | --vep-path /opt/common/CentOS_6/vep/v84 \ 18 | --vep-data /opt/common/CentOS_6/vep/v84 \ 19 | --ref-fasta $(REF_FASTA) \ 20 | --input-vcf $< \ 21 | --output-maf $@ \ 22 | --tumor-id $(tumor.$*) \ 23 | --normal-id $(normal.$*) 24 | endef 25 | 26 | 
export/cbioportal/%.maf: vcf/%.vcf 27 | $(CBIOPORTAL_VCF_RULE) 28 | 29 | include modules/vcf_tools/vcftools.mk 30 | -------------------------------------------------------------------------------- /external/SNVBox/README: -------------------------------------------------------------------------------- 1 | SNV-Box v3.0 2 | ============ 3 | Thank you for using our software. Just a couple of things to take note before you get started: 4 | 5 | 6 | FOR INSTALLATION INSTRUCTIONS, SUPPORT AND DOWNLOADS 7 | ---------------------------------------------------- 8 | Please visit our website at http://wiki.chasmsoftware.org 9 | 10 | 11 | TO JOIN OUR MAILING LIST 12 | ------------------------ 13 | Please email chasm-beta-testers@lists.johnshopkins.edu 14 | 15 | 16 | TO REPORT A BUG 17 | --------------- 18 | Please create an account at http://bugzilla.chasmsoftware.org/ and file a bug report. 19 | 20 | 21 | ABOUT SOFTWARE LICENSING 22 | ------------------------ 23 | Please note that our software is licensed under the JHU Academic Software License Agreement. For more details please refer to the license in the doc folder. 24 | 25 | 26 | Thank you! 

From KarchinLab
--------------------------------------------------------------------------------
/external/SNVBox/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/external/SNVBox/db/__init__.py
--------------------------------------------------------------------------------
/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.doc
--------------------------------------------------------------------------------
/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.pdf
--------------------------------------------------------------------------------
/fastq_tools/bamtoFasta.mk:
--------------------------------------------------------------------------------
# Convert the per-sample BAMs under unmapped_reads/ to FASTA.
# input: $(SAMPLES)
include modules/Makefile.inc

LOGDIR ?= log/bam_to_fasta.$(NOW)
# The goal of this module is bam_to_fasta; that is the name that has to be
# phony (previously the output directory name was listed instead).
PHONY += bam_to_fasta

bam_to_fasta : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).fasta)

# A single pattern rule covers every sample. It used to be wrapped in a
# define and $(eval)-ed once per sample even though the template never used
# its argument, and RUN was expanded when the template was instantiated
# rather than when the recipe runs.
unmapped_reads/%.fasta : unmapped_reads/%.bam
	$(call RUN,-n 4 -s 4G -m 9G,"$(SAMTOOLS2) fasta $< > $@")


.DELETE_ON_ERROR:
.SECONDARY:
.PHONY: $(PHONY)
--------------------------------------------------------------------------------
/fastq_tools/blastReads.mk:
--------------------------------------------------------------------------------
include modules/Makefile.inc
| 3 | LOGDIR ?= log/blast_reads.$(NOW) 4 | PHONY += unmapped_read 5 | 6 | blast_reads : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).blast) 7 | 8 | define blast-reads 9 | unmapped_reads/%.blast : unmapped_reads/%.fasta 10 | $(call RUN,-n 32 -s 3G -m 4G -w 360,"blastn -num_threads 32 -evalue 0.001 -word_size 28 -db ~/share/reference/ncbi_nt/nt -query $$< -outfmt 7 -out unmapped_reads/$$*.blast") 11 | endef 12 | $(foreach sample,$(SAMPLES),\ 13 | $(eval $(call blast-reads,$(sample)))) 14 | 15 | 16 | .DELETE_ON_ERROR: 17 | .SECONDARY: 18 | .PHONY: $(PHONY) 19 | -------------------------------------------------------------------------------- /fastq_tools/extractFastq.mk: -------------------------------------------------------------------------------- 1 | # This module extract fastq files from a bam file. It will use either the Picard (SamToFastq.jar) or bam2fastq programs to extract the fastq. You can specify which program to use with the EXTRACT_TOOL variable 2 | # input: $(SAMPLES) 3 | # Author: Fong Chun Chan 4 | 5 | include modules/Makefile.inc 6 | 7 | LOGDIR ?= log/extract_fastq.$(NOW) 8 | 9 | .DELETE_ON_ERROR: 10 | .SECONDARY: 11 | .PHONY: extract_fastq 12 | 13 | VPATH = rawdata unprocessed_bam 14 | 15 | extract_fastq : $(foreach sample,$(SAMPLES),fastq/$(sample).1.fastq.gz) 16 | 17 | fastq/%.1.fastq.gz fastq/%.2.fastq.gz : %.bam 18 | $(call RUN,-n 4 -s 4G -m 9G,"$(SAMTOOLS2) sort -T $((gzip -c > fastq/$*.1.fastq.gz) -2 >(gzip -c > fastq/$*.2.fastq.gz) -") 19 | -------------------------------------------------------------------------------- /fastq_tools/extractReads.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/extract_unmapped.$(NOW) 4 | PHONY += unmapped_reads 5 | 6 | extract_unmapped : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).bam) 7 | 8 | define extract-unmapped-reads 9 | unmapped_reads/%.bam : bam/%.bam 10 | $(call RUN,-n 4 -s 4G -m 
9G,"$(SAMTOOLS2) view -f 0x04 -h -@ 4 -b $$< -o unmapped_reads/$$*.bam") 11 | endef 12 | $(foreach sample,$(SAMPLES),\ 13 | $(eval $(call extract-unmapped-reads,$(sample)))) 14 | 15 | 16 | .DELETE_ON_ERROR: 17 | .SECONDARY: 18 | .PHONY: $(PHONY) 19 | -------------------------------------------------------------------------------- /fastq_tools/extractunmappedpairs.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/extract_unmapped_pairs.$(NOW) 4 | 5 | .DELETE_ON_ERROR: 6 | .SECONDARY: 7 | .PHONY: extract_unmapped_pairs 8 | 9 | VPATH = bam 10 | JAVA = $(HOME)/share/usr/jdk1.8.0_121/bin/java 11 | PICARD = /lila/data/riazlab/lib/src/picard.jar 12 | 13 | extract_unmapped_pairs : $(foreach sample,$(SAMPLES),extracted_reads/unmapped_pairs/$(sample)_1.fastq) 14 | # The template below defines only %-pattern rules and never reads its argument; it is instantiated per sample like the sibling modules 15 | define extract-unmapped-pairs 16 | extracted_reads/unmapped_pairs/%.bam : extracted_reads/unmapped_pairs/%.txt 17 | $$(call RUN,-c -n 4 -s 4G -m 9G,"$(JAVA) -jar $(PICARD) FilterSamReads I=unmapped_reads/$$*.bam O=extracted_reads/unmapped_pairs/$$*.bam \ 18 | READ_LIST_FILE=extracted_reads/unmapped_pairs/$$*.txt FILTER=includeReadList") 19 | 20 | extracted_reads/unmapped_pairs/%.txt: unmapped_reads/%.bam 21 | $$(call RUN,-c -n 1 -s 4G -m 9G,"$(SAMTOOLS2) view $$< | cut -f1 | sort | uniq > extracted_reads/unmapped_pairs/$$*.txt") 22 | 23 | extracted_reads/unmapped_pairs/%_1.fastq extracted_reads/unmapped_pairs/%_2.fastq : extracted_reads/unmapped_pairs/%.bam 24 | $$(call RUN,-n 4 -s 4G -m 9G,"bamToFastq -i $$< -fq extracted_reads/unmapped_pairs/$$*_1.fastq -fq2 extracted_reads/unmapped_pairs/$$*_2.fastq") 25 | 26 | endef 27 | $(foreach sample,$(SAMPLES),\ 28 | $(eval $(call extract-unmapped-pairs,$(sample)))) 29 | -------------------------------------------------------------------------------- /fastq_tools/fixFastqReadNames.mk: -------------------------------------------------------------------------------- 1 | # This module
is used for fixing read names of paired fastq files 2 | # input: $(SAMPLES) 3 | # Author: Fong Chun Chan 4 | 5 | include modules/Makefile.inc 6 | include modules/hg19.inc 7 | 8 | FIX_FASTQ_READ_NAMES = $(PYTHON) modules/fastq_tools/fixFastqReadNames.py 9 | 10 | LOGDIR = fastq/logs 11 | 12 | .DELETE_ON_ERROR: 13 | 14 | .SECONDARY: 15 | 16 | all : $(foreach sample,${SAMPLES},fastq/$(sample).1.fixed.fastq) 17 | 18 | fastq/%.1.fixed.fastq fastq/%.2.fixed.fastq : fastq/%.1.fastq fastq/%.2.fastq 19 | SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,2G,5G)" $(MKDIR) $(LOGDIR);\ 20 | $(FIX_FASTQ_READ_NAMES) $(word 1,$^) $(word 2,$^) fastq/$*.1.fixed.fastq fastq/$*.2.fixed.fastq &> ${LOGDIR}/$(@F).log;\ 21 | -------------------------------------------------------------------------------- /fastq_tools/mergeFastq.mk: -------------------------------------------------------------------------------- 1 | # add in new fastq to existing bams 2 | include modules/Makefile.inc 3 | 4 | LOGDIR ?= log/merge_fastq.$(NOW) 5 | 6 | .PHONY: merge_fastq 7 | .DELETE_ON_ERROR: 8 | .SECONDARY: 9 | 10 | MERGE_SAMPLE_FILE ?= merge_samples.txt 11 | ifneq ($(wildcard $(MERGE_SAMPLE_FILE)),) 12 | MERGE_SAMPLES ?= $(shell sed '/^\#/d' $(MERGE_SAMPLE_FILE)) 13 | endif 14 | 15 | ALIGNER ?= bwamem 16 | 17 | merge_fastq : $(foreach sample,$(MERGE_SAMPLES),$(if $(wildcard bam/$(sample).bam),merged_bam/$(sample).bam,bam/$(sample).bam)) 18 | $(call RUN,-s 7G -m 8G,"for bam in $(filter merged_bam/%.bam,$^); do \ 19 | ln -f \$${bam} bam/\$$(basename \$${bam}) && \ 20 | $(SAMTOOLS) index \$${bam}; \ 21 | done") 22 | 23 | include modules/aligners/$(ALIGNER)Aligner.mk 24 | 25 | merged_bam/%.1.bam merged_bam/%.2.bam : $(ALIGNER)/bam/%.$(ALIGNER).$(BAM_SUFFIX) 26 | $(INIT) ln -f $( $@ 34 | 35 | merged_bam/%.bam : merged_bam/%.header.sam merged_bam/%.1.bam merged_bam/%.2.bam 36 | $(call RUN,-s 12G -m 15G,"$(SAMTOOLS) merge -f -h $< $(@) $(filter %.bam,$(^))") 37 | 38 | 
-------------------------------------------------------------------------------- /fastq_tools/mergeSplitFastq.mk: -------------------------------------------------------------------------------- 1 | # merge split fastqs for workflows like defuse 2 | 3 | include modules/Makefile.inc 4 | 5 | LOGDIR ?= log/merge_split_fastq.$(NOW) 6 | 7 | .SECONDARY: 8 | .DELETE_ON_ERROR: 9 | .PHONY : fastq 10 | 11 | fastq: $(foreach sample,$(SAMPLES),fastq/$(sample).1.fastq.gz fastq/$(sample).2.fastq.gz) 12 | 13 | define merged-fastq 14 | fastq/$1.%.fastq.gz : $$(foreach split,$2,fastq/$$(split).%.fastq.gz) 15 | $$(call RUN,,"zcat $$(^) | gzip -c > $$(@)") 16 | endef 17 | $(foreach sample,$(SAMPLES),$(eval $(call merged-fastq,$(sample),$(split.$(sample))))) 18 | 19 | define merged-fastq2 20 | fastq/$1.1.fastq.gz : $$(foreach split,$2,$$(word 1, $$(fq.$$(split)))) 21 | $$(call RUN,,"zcat $$(^) | gzip -c > $$(@)") 22 | fastq/$1.2.fastq.gz : $$(foreach split,$2,$$(word 2, $$(fq.$$(split)))) 23 | $$(call RUN,,"zcat $$(^) | gzip -c > $$(@)") 24 | endef 25 | $(foreach sample,$(SAMPLES),$(eval $(call merged-fastq2,$(sample),$(split.$(sample))))) 26 | -------------------------------------------------------------------------------- /fastq_tools/trimFastq.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # trim fastq file 3 | 4 | use strict; 5 | use warnings; 6 | 7 | use Getopt::Std; 8 | 9 | my %opt; 10 | getopts('hs:e:l:', \%opt); 11 | 12 | my $usage = <) { 29 | chomp; 30 | if ($i % 2 == 0) { 31 | print; 32 | } else { 33 | my $ss = $_; 34 | if ($opt{s}) { 35 | $ss = substr($ss, $opt{s}); 36 | } 37 | if ($opt{e}) { 38 | $ss = substr($ss, 0, length($ss) - $opt{e}); 39 | } 40 | if ($opt{l}) { 41 | $ss = substr($ss, 0, $opt{l}); 42 | } 43 | print $ss; 44 | } 45 | print "\n"; 46 | $i++; 47 | } 48 | -------------------------------------------------------------------------------- /genome_inc/GRCm38.inc: 
-------------------------------------------------------------------------------- 1 | # vim: set ft=make: 2 | ifndef GRCM38_INC 3 | REF = GRCm38 4 | TOPHAT_REF = GRCm38 5 | REF_DIR = $(HOME)/share/reference 6 | 7 | INTEGRATE_ANN = $(REF_DIR)/mm10_annot.ucsc.txt 8 | INTEGRATE_BWTS =$(REF_DIR)/Mus_musculus_GRCm38/bwts 9 | 10 | GENES_GTF = $(REF_DIR)/Mus_musculus/NCBI/$(TOPHAT_REF)/Annotation/Genes/genes.gtf 11 | # SNP_EFF_GENOME (below): GRCm38.86 == mm10; the note is kept off the assignment line so the value carries no trailing space 12 | REF_FASTA := $(REF_DIR)/Mus_musculus_GRCm38/Mus_musculus.GRCm38.71.dna.chromosome.genome.fa 13 | REF_DICT := $(REF_DIR)/Mus_musculus_GRCm38/Mus_musculus.GRCm38.71.dna.chromosome.genome.dict 14 | SNP_EFF_GENOME = GRCm38.86 15 | DBSNP := $(REF_DIR)/mgp.v5.merged.snps_all.dbSNP142.vcf.gz 16 | MGP_SNP_DBSNP := $(DBSNP) 17 | MGP_INDEL_DBSNP := $(REF_DIR)/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz 18 | 19 | CENTROMERE_TABLE = $(REF_DIR)/centromere_mm10.txt 20 | 21 | EXOME_BED = $(REF_DIR)/mus_musculus_known_genes_exons_GRCm38.bed 22 | EXOME_BED_NOHEADER = $(REF_DIR)/mus_musculus_known_genes_exons_GRCm38_noheader.bed 23 | 24 | FREEC_REF := $(REF_DIR)/Mus_musculus_GRCm38/freec 25 | CHR_LEN = $(REF_DIR)/Mus_musculus_GRCm38/mm10.len 26 | 27 | BOWTIE_REF = $(REF_DIR)/Mus_musculus_GRCm38/Mus_musculus.GRCm38.71.dna.chromosome.genome 28 | 29 | ENSEMBL_TXDB = $(REF_DIR)/mus_musculus_ensembl_biomart.2014-04-28.sqlite 30 | 31 | #TXDB = $(HOME)/ensmusg70.08032013.sqlite 32 | 33 | RIBOSOMAL_INTERVALS = $(REF_DIR)/mm10_rrna_intervals.txt 34 | GENE_REF_FLAT = $(REF_DIR)/mm10_genes.refFlat.txt 35 | 36 | EXOME ?= false 37 | ifeq ($(EXOME),true) 38 | TARGETS_FILE = $(EXOME_BED_NOHEADER) 39 | QUALIMAP_TARGETS_FILE = $(TARGETS_FILE) 40 | endif 41 | 42 | INCLUDE_CHR_Y ?= true 43 | ifneq ($(and $(TARGETS_FILE),$(findstring false,$(EXOME))),) 44 | CHROMOSOMES := $(shell grep -v '@' $(TARGETS_FILE) | cut -f1 | sort | uniq) 45 | else 46 | CHROMOSOMES := $(shell seq 1 19) X $(if $(findstring true,$(INCLUDE_CHR_Y)),Y) MT 47 | endif 48 | 49 | 50 | endif 51 | GRCM38_INC = true
52 | -------------------------------------------------------------------------------- /genome_inc/hg18.inc: -------------------------------------------------------------------------------- 1 | REF = hg18 2 | REF_FASTA = ~/share/references/genomes/hg18.fa 3 | # hg36.54 == hg18; the note is kept off the assignment line so the value carries no trailing space 4 | SNP_EFF_GENOME = hg36.54 5 | DBSNP = ~/share/references/dbsnp/dbsnp_132.hg18.vcf 6 | KNOWN_INDELS = ~/share/references/1000g/1000G_biallelic.indels.hg18.vcf 7 | OMNI = ~/share/references/1000g/1000G_omni2.5.hg18.sites.vcf 8 | HAPMAP = ~/share/references/hapmap3/hapmap_3.3.hg18.sites.vcf 9 | GC_WIG = ~/share/references/gc.hg18.wig 10 | MAP_WIG = ~/share/references/map.hg18.wig 11 | -------------------------------------------------------------------------------- /genome_inc/hg38.inc: -------------------------------------------------------------------------------- 1 | # vim: set ft=make: 2 | 3 | ifndef HG38_INC 4 | 5 | REF?= hg38 6 | REF_FASTA ?= $(REF_DIR)/hg38_gatk_bundle/Homo_sapiens_assembly38.fasta 7 | 8 | DBSNP ?= $(REF_DIR)/hg38_gatk_bundle/dbsnp_146.hg38.vcf.gz 9 | HAPMAP ?= $(REF_DIR)/hg38_gatk_bundle/hapmap_3.3_grch38_pop_stratified_af.vcf.gz 10 | KNOWN_INDELS ?= $(REF_DIR)/hg38_gatk_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz 11 | OMNI ?= $(REF_DIR)/hg38_gatk_bundle/1000G_omni2.5.hg38.vcf.gz 12 | 13 | TOPHAT_REF = $(REF) 14 | 15 | ANNOVAR_REF = hg38 16 | 17 | ERICSCRIPT_DB = $(REF_DIR)/ericscript_db_hg38_84 18 | ERICSCRIPT_SPECIES = homo_sapiens 19 | 20 | endif 21 | HG38_INC = true 22 | -------------------------------------------------------------------------------- /ploidy/bicseq.mk: -------------------------------------------------------------------------------- 1 | # run bicseq for cnvs 2 | 3 | include modules/Makefile.inc 4 | 5 | LOGDIR = log/bicseq.$(NOW) 6 | 7 | SHELL = modules/scripts/Rshell 8 | .SHELLFLAGS = -s -m $(MEM) -n $(@F) -l $(LOGDIR) -e 9 | 10 | .ONESHELL: 11 | .DELETE_ON_ERROR: 12 | .SECONDARY: 13 | .PHONY: all 14 | 15 | MEM = 9G 16 | 17 | all : $(foreach
pair,$(SAMPLE_PAIRS),expands/rdata/$(pair).cbs_snv.Rdata) 18 | 19 | 20 | # R recipe (SHELL is Rshell): the first prerequisite is the mutect table, the second the varscan segment table 21 | expands/rdata/%.cbs_snv.Rdata : mutect/tables/%.mutect.txt varscan/segment/%.varscan2copynumber.txt 22 | library(expands) 23 | snv <- read.table("$<", header = T, sep = "\t") 24 | snv <- subset(snv, judgement != "REJECT") 25 | colnames(snv)[1:2] <- c("chr", "startpos") 26 | cbs <- read.table("$(word 2,$^)", header = T, sep = "\t") 27 | cbs <- transform(cbs, CN_estimate = 2^Segmented) 28 | colnames(cbs)[c(2,3,4)] <- c("chr", "startpos", "endpos") 29 | dir.create("$(@D)", recursive = T) 30 | save(snv, cbs, file = "$@") 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /ploidy/expands.mk: -------------------------------------------------------------------------------- 1 | # run expands for determining tumor ploidy 2 | 3 | include modules/Makefile.inc 4 | 5 | LOGDIR = log/expands.$(NOW) 6 | 7 | SHELL = modules/scripts/Rshell 8 | .SHELLFLAGS = -s -m $(MEM) -n $(@F) -l $(LOGDIR) -e 9 | 10 | .ONESHELL: 11 | .DELETE_ON_ERROR: 12 | .SECONDARY: 13 | .PHONY: all 14 | 15 | MEM = 20G 16 | 17 | all : $(foreach pair,$(SAMPLE_PAIRS),expands/rdata/$(pair).cbs_snv.Rdata) 18 | # NOTE(review): length(rm) in the recipe below tests the R builtin rm, not keep -- confirm the intended guard 19 | expands/rdata/%.cbs.Rdata : varscan/copycall/%.copycall 20 | cn <- read.table("$<", header=T, as.is=T) 21 | keep <- which(cn[,1] %in% c(1:22, "X")) 22 | if (length(rm) > 0) { cn <- cn[keep,]} 23 | cn[which(cn[,1]=="X"),1] <- 23 24 | cn[,1] <- as.numeric(cn[,1]) 25 | cn <- cn[order(cn[,1], cn[,2]),] 26 | cn <- cbind(name = paste(cn[,1], cn[,2], sep="_"), cn[,c(1:3,7)]) 27 | cgh <- make_cghRaw(cn) 28 | normalized <- normalize(cgh) 29 | segmented <- segmentData(normalized, relSDlong=2, undo.splits="sdundo", undo.SD=1.5) 30 | calls <- CGHcall(segmented, nclass=3) 31 | excalls <- ExpandCGHcall(calls, segmented) 32 | cbs <- with(fData(excalls), data.frame(chr = as.character(Chromosome[calls[[5]][,"wm"]]), startpos = Start[calls[[5]][,"wm"]], endpos = End[calls[[5]][,"wmend"]], CN_Estimate =
2^calls[[5]][, "smwh"], stringsAsFactors = F)) 33 | cbs <- transform(cbs, segmentLength = endpos - startpos) 34 | 35 | # NOTE(review): the recipe below reads cbs, which is not assigned inside this rule -- confirm where it is expected to come from 36 | expands/rdata/%.snv.Rdata : mutect/tables/%.mutect.txt 37 | library(expands) 38 | snv <- read.table("$<", header = T, sep = "\t", stringsAsFactors = F) 39 | snv <- subset(snv, judgement != "REJECT") 40 | colnames(snv)[1:2] <- c("chr", "startpos") 41 | snv <- subset(snv, select = c('chr', 'startpos')) 42 | snv$$chr <- as.integer(snv$$chr) 43 | snv <- as.matrix(snv[!is.na(snv$$chr), ]) 44 | dir.create("$(@D)", recursive = T) 45 | dm <- assignQuantityToMutation(snv, cbs, "CN_Estimate") 46 | save(snv, cbs, file = "$@") 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /qc/bamStats.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | LOGDIR ?= log/bam_stats.$(NOW) 3 | 4 | BAM_STATS_USE_REF ?= true 5 | 6 | .SECONDARY: 7 | .DELETE_ON_ERROR: 8 | .PHONY: bam_stats 9 | 10 | bam_stats: $(foreach sample,$(SAMPLES),metrics/$(sample).bam_stats.html) 11 | 12 | metrics/%.bc : bam/%.bam 13 | $(call RUN,-s 6G -m 8G,"samtools stats $(if $(findstring true,$(BAM_STATS_USE_REF)),-r $(REF_FASTA)) $< > $@") 14 | metrics/%.bam_stats.html : metrics/%.bc 15 | $(call RUN,-s 6G -m 8G,"plot-bamstats -p $(@D)/$* $<") 16 | -------------------------------------------------------------------------------- /qc/fastqc.mk: -------------------------------------------------------------------------------- 1 | # vim: set ft=make : 2 | # Run Fastqc on bam files 3 | 4 | include modules/Makefile.inc 5 | 6 | FASTQC_SUMMARY_PLOT = $(RSCRIPT) modules/qc/fastqcSummaryPlot.R 7 | 8 | LOGDIR ?= log/fastqc.$(NOW) 9 | 10 | .PHONY: fastqc 11 | .SECONDARY: 12 | 13 | fastqc : $(foreach sample,$(SAMPLES),fastqc/$(sample)_fastqc/summary.txt) fastqc/all_summary.txt 14 | 15 | fastqc/%_fastqc.zip : bam/%.bam 16 | $(call RUN,-N $*_fastqc -s 4G -m 12G,"$(FASTQC) -o fastqc $^") 17 | 18 |
fastqc/%_fastqc/summary.txt : fastqc/%_fastqc.zip 19 | $(INIT) $(UNZIP) -o -d fastqc $< &> $(LOG) && touch $@ 20 | 21 | fastqc/all_summary.txt : $(foreach sample,$(SAMPLES),fastqc/$(sample)_fastqc/summary.txt) 22 | $(INIT) $(FASTQC_SUMMARY_PLOT) --outPrefix fastqc/all_summary $^ &> $(LOG) 23 | -------------------------------------------------------------------------------- /qc/fastqcSummaryPlot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("plyr")) 5 | suppressPackageStartupMessages(library("dplyr")) 6 | suppressPackageStartupMessages(library("tidyr")) 7 | suppressPackageStartupMessages(library("magrittr")) 8 | suppressPackageStartupMessages(library("stringr")) 9 | suppressPackageStartupMessages(library("ggplot2")) 10 | 11 | if (!interactive()) { 12 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 13 | } 14 | 15 | optList <- list( 16 | make_option("--outPrefix", default = 'summary', type = "character", action = "store", help ="Output file prefix (default %default)"), 17 | make_option("--width", default = 1000, action = "store", help ="width of heatmap image (default %default)"), 18 | make_option("--height", default = 1000, action = "store", help ="height of heatmap image (default %default)")) 19 | parser <- OptionParser(usage = "%prog [options] summary file(s)", option_list = optList); 20 | arguments <- parse_args(parser, positional_arguments = T); 21 | opt <- arguments$options; 22 | 23 | summ <- arguments$args %>% 24 | llply(read.table, sep = '\t', stringsAsFactors = F) %>% 25 | bind_rows %>% 26 | setNames(c("Status", "Metric", "Sample")) %>% 27 | mutate(Sample = str_replace(Sample, '\\.bam', '')) %>% 28 | mutate_each(funs(as.factor)) 29 | 30 | fn <- str_c(opt$outPrefix, '.png') 31 | png(fn, width = opt$width + 10 * nlevels(summ$Sample), height = opt$height, type = 
'cairo-png'); 32 | p <- summ %>% ggplot(aes(x = Sample, y = Metric)) + 33 | geom_tile(aes(fill = Status)) + 34 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) 35 | p 36 | dev.off(); 37 | 38 | fn <- str_c(opt$outPrefix, '.txt') 39 | summ %>% spread(Metric, Status) %>% write.table(file = fn, sep = '\t', quote = F, row.names = F) 40 | -------------------------------------------------------------------------------- /qc/intervalBamQC.mk: -------------------------------------------------------------------------------- 1 | # vim: set ft=make : 2 | # amplicon qc using bams and gatk vcf results 3 | 4 | include modules/Makefile.inc 5 | 6 | INTERVAL_FILE ?= intervals.bed 7 | 8 | VPATH ?= bam 9 | 10 | TEQC = modules/qc/TEQC.R 11 | INTERVAL_BAM_QC = modules/qc/intervalBamQC.R 12 | VARIANT_EVAL_REPORT = modules/qc/variantEvalGatkReport.R 13 | 14 | LOGDIR ?= log/interval_qc.$(NOW) 15 | 16 | .SECONDARY: 17 | .DELETE_ON_ERROR: 18 | .PHONY: rdata all 19 | 20 | all : interval_qc/coverage/index.html # interval_qc/variant_eval/index.html 21 | rdata : $(foreach sample,$(SAMPLES),interval_qc/rdata/$(sample).Rdata) 22 | 23 | # load each bam file into R and create R data files 24 | interval_qc/rdata/%.Rdata : %.bam 25 | $(call RUN,-s 8G -m 18G,"$(RSCRIPT) $(TEQC) --outFile $@ --ref $(REF) $< $(INTERVAL_FILE)") 26 | 27 | # GATK variant eval for each sample variants.vcf file 28 | # stratified by intervals and filter 29 | interval_qc/variantEval.grp : $(foreach sample,$(SAMPLES),gatk/vcf/$(sample).variants.vcf) 30 | $(call RUN,-n 4 -s 1G -m 1.5G,"$(call GATK_MEM,4G) -T VariantEval -nt 4 -o $@ -R $(REF_FASTA) --stratIntervals $(INTERVAL_FILE) --dbsnp $(DBSNP) $(foreach vcf,$^, --eval:$(call strip-suffix,$(notdir $(vcf))) $(vcf)) -ST IntervalStratification -ST Filter") 31 | 32 | # Create interval coverage plots using R script INTERVAL_BAM_QC 33 | interval_qc/coverage/index.html : $(foreach sample,$(SAMPLES),interval_qc/rdata/$(sample).Rdata) 34 | $(call RUN,-s 2G -m 4G,"$(RSCRIPT)
$(INTERVAL_BAM_QC) --outDir $(@D) $^") 35 | 36 | # Create variant evaluation plots using R script VARIANT_EVAL_REPORT 37 | interval_qc/variant_eval/index.html : interval_qc/variantEval.grp 38 | $(call RUN,-s 2G -m 4G,"$(RSCRIPT) $(VARIANT_EVAL_REPORT) --outDir $(@D) $< &> $(LOG)") 39 | -------------------------------------------------------------------------------- /qc/nonRefFreqFromPileup.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use Getopt::Std; 7 | my %opt; 8 | getopts('hb:d:', \%opt); 9 | 10 | my $usage = <) { 29 | chomp; 30 | my @F = split /\t/; 31 | next unless $F[4]; 32 | my $depth = length($F[4]); 33 | next if $depth < $minDepth; 34 | next if $F[4] =~ /[-+]/; 35 | $F[4] =~ s/[-+][ATCGNatcgn]+//g; 36 | for my $base (@bases) { 37 | my $count = () = $F[4] =~ /$base/i; 38 | my $freq = $count / $depth; 39 | $nonRefFreq{$F[2]}{$base}[int($freq / $binSize)]++ if $freq > $freqThreshold; 40 | } 41 | } 42 | 43 | print "Ref\tVar"; 44 | for my $bin (0..$lastBin) { 45 | print "\tBin$bin"; 46 | } 47 | print "\n"; 48 | for my $refBase (@bases) { 49 | for my $varBase (@bases) { 50 | next if $refBase eq $varBase; 51 | print "$refBase\t$varBase"; 52 | for my $bin (0..$lastBin) { 53 | print "\t";; 54 | if (defined $nonRefFreq{$refBase}{$varBase}[$bin]) { 55 | print $nonRefFreq{$refBase}{$varBase}[$bin]; 56 | } else { 57 | print "0"; 58 | } 59 | } 60 | print "\n"; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /qc/plotHsMetrics.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("hwriter")) 5 | 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 7 | 8 | optList <- list( 9 | make_option("--outDir", default = ".", help = "Output 
dir")) 10 | 11 | parser <- OptionParser(usage = "%prog [options] [hs_metrics.txt]", option_list = optList); 12 | 13 | arguments <- parse_args(parser, positional_arguments = T); 14 | opt <- arguments$options; 15 | 16 | if (length(arguments$args) < 1) { 17 | cat("Need input hs metrics file\n"); 18 | print_help(parser); 19 | stop(); 20 | } else { 21 | f <- arguments$args[1]; 22 | } 23 | 24 | hsMetrics <- read.table(f, header = T, row.names = 1, sep = '\t') 25 | colsToPlot <- c("TOTAL_READS", "PCT_PF_UQ_READS", "PCT_PF_UQ_READS_ALIGNED", "PCT_OFF_BAIT", "MEAN_TARGET_COVERAGE", "FOLD_ENRICHMENT", "PCT_TARGET_BASES_10X", "PCT_TARGET_BASES_30X", "PCT_TARGET_BASES_50X", "AT_DROPOUT", "GC_DROPOUT") 26 | 27 | for (cp in colsToPlot) { 28 | gfn <- paste(opt$outDir, "/", tolower(cp), "_barplot.pdf", sep = "") 29 | pdf(gfn, height = 3 + nrow(hsMetrics) / 2, width = 6) 30 | par(mar = c(5,10,5,5)) 31 | barplot(hsMetrics[,cp], names.arg = rownames(hsMetrics), horiz = T, las = 2, main = cp) 32 | null <- dev.off() 33 | } 34 | 35 | -------------------------------------------------------------------------------- /qc/qualimap.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | QUALIMAP_BAMQC_OPTS = -gd HUMAN 4 | QUALIMAP = unset DISPLAY; $(JAVA) -Xmx16G -classpath $(HOME)/share/usr/qualimap/qualimap.jar:$(HOME)/share/usr/qualimap/lib/* org.bioinfo.ngs.qc.qualimap.main.NgsSmartMain 5 | 6 | LOGDIR = log/qualimap.$(NOW) 7 | 8 | ifdef QUALIMAP_TARGETS_FILE 9 | QUALIMAP_BAMQC_OPTS += -gff $(QUALIMAP_TARGETS_FILE) -os 10 | endif 11 | 12 | .SECONDARY: 13 | .DELETE_ON_ERROR: 14 | .PHONY : all 15 | 16 | all : $(foreach sample,$(SAMPLES),qualimap/$(sample)_bamqc.timestamp) 17 | 18 | qualimap/%_bamqc.timestamp : bam/%.bam 19 | $(call RUN,-n 4 -s 4.5G -m 5G,"$(QUALIMAP) bamqc $(QUALIMAP_BAMQC_OPTS) -bam $< -nr 6 -nt 8 -outdir qualimap/$*_bamqc && touch $@") 20 | 21 | 22 | include modules/bam_tools/processBam.mk 23 | 
-------------------------------------------------------------------------------- /qc/readDepth.mk: -------------------------------------------------------------------------------- 1 | # Run gatk depth of coverage on bam files 2 | 3 | include modules/Makefile.inc 4 | include modules/variant_callers/gatk.inc 5 | 6 | LOGDIR = log/read_depth.$(NOW) 7 | 8 | EXOME ?= false 9 | 10 | ifeq ($(EXOME),true) 11 | READ_DEPTH_ARGS += -L $(EXOME_BED) 12 | endif 13 | 14 | .PHONY: all 15 | 16 | all: $(foreach sample,$(SAMPLES),gatk/read_depth/$(sample).read_depth) 17 | 18 | #ifeq ($(SPLIT_CHR),true) 19 | #define read-depth-chr 20 | #gatk/chr_read_depth/%.$1.read_depth : %.bam 21 | # $$(call INIT_MEM,8G,12G) $$(call GATK_MEM,7G) -T DepthOfCoverage -R $$(REF_FASTA) -L $1 -o $$@ -I $$< &> $$(LOG) 22 | #endef 23 | #$(foreach chr,$(CHROMOSOMES),$(eval $(call read-depth-chr,$(chr)))) 24 | #gatk/read_depth/%.read_depth : $(foreach chr,$(CHROMOSOMES),gatk/chr_read_depth/%.$(chr).read_depth) 25 | # $(INIT) head -1 $< > $@ && for x in $^; do sed '1d' $$x >> $@; done 26 | #else 27 | 28 | gatk/read_depth/%.read_depth : %.bam 29 | $(call RUN,-s 8G -m 12G,"$(call GATK_MEM,7G) -T DepthOfCoverage -R $(REF_FASTA) $(READ_DEPTH_ARGS) -o $@ -I $<") 30 | 31 | -------------------------------------------------------------------------------- /qc/rnaseqMetrics.mk: -------------------------------------------------------------------------------- 1 | ## defaults 2 | VPATH ?= bam 3 | LOGDIR = log/rnaseq_metrics.$(NOW) 4 | 5 | ## includes 6 | include modules/Makefile.inc 7 | include modules/variant_callers/gatk.inc 8 | 9 | PLOT_RNASEQ_METRICS = $(RSCRIPT) modules/qc/plotRnaseqMetrics.R 10 | 11 | .DELETE_ON_ERROR: 12 | .SECONDARY: 13 | .PHONY: all report 14 | 15 | COLLECT_RNASEQ_METRICS = $(JAVA) -Xmx7G -jar $(JARDIR)/CollectRnaSeqMetrics.jar VALIDATION_STRINGENCY=LENIENT 16 | STRAND_SPECIFICITY ?= NONE 17 | 18 | all : $(foreach sample,$(SAMPLES),metrics/$(sample).rnaseq_metrics) metrics/all.rnaseq_metrics 
metrics/all.normalized_coverage.rnaseq_metrics report 19 | 20 | report : metrics/rnaseq_report/index.html 21 | 22 | 23 | metrics/%.rnaseq_metrics : bam/%.bam 24 | $(call RUN,-c -s 8G -m 12G,"$(COLLECT_RNASEQ_METRICS) REF_FLAT=$(GENE_REF_FLAT) RIBOSOMAL_INTERVALS=$(RIBOSOMAL_INTERVALS) STRAND_SPECIFICITY=$(STRAND_SPECIFICITY) INPUT=$< REFERENCE_SEQUENCE=$(REF_FASTA) OUTPUT=$@ CHART_OUTPUT=$@.pdf VERBOSITY=ERROR") 25 | 26 | metrics/all.rnaseq_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).rnaseq_metrics) 27 | grep '^PF_BASES' $< > $@ && for i in $^; do sample=`echo $$i | sed 's:.*/::; s/\..*//'`; grep -A1 '^PF_BASES' $$i | tail -1 | awk -v sample=$$sample 'BEGIN { OFS = "\t" } { $$23=sample; print $$0 }' >> $@; done 28 | 29 | metrics/all.normalized_coverage.rnaseq_metrics : $(foreach sample,$(SAMPLES),metrics/$(sample).rnaseq_metrics) 30 | grep -A101 '^normalized_position' $< | cut -f1 > $@ && for i in $^; do sample=`echo $$i | sed 's:.*/::; s/\..*//'`; grep -A101 '^normalized_position' $$i | cut -f2 | sed "s/All_Reads/$$sample/" | paste $@ - > $@.tmp && mv $@.tmp $@; done 31 | 32 | metrics/rnaseq_report/index.html : metrics/all.rnaseq_metrics metrics/all.normalized_coverage.rnaseq_metrics 33 | $(PLOT_RNASEQ_METRICS) --outDir $(@D) $^ 34 | -------------------------------------------------------------------------------- /qc/rseqc.mk: -------------------------------------------------------------------------------- 1 | LOGDIR = log/rseqc.$(NOW) 2 | ## includes 3 | include modules/Makefile.inc 4 | # one variable per script entry point; each name should match the script it runs 5 | INFER_EXPT = python $(HOME)/share/usr/anaconda/bin/infer_experiment.py 6 | INNER_DIST = python $(HOME)/share/usr/anaconda/bin/inner_distance.py 7 | JUNC_ANNOT = python $(HOME)/share/usr/anaconda/bin/junction_annotation.py 8 | BAM_STAT = python $(HOME)/share/usr/anaconda/bin/bam_stat.py 9 | CLIP_PROFILE = python $(HOME)/share/usr/anaconda/bin/clipping_profile.py 10 | MISMATCH_PROFILE = python $(HOME)/share/usr/anaconda/bin/mismatch_profile.py 11 |
INSERTION_PROFILE = python $(HOME)/share/usr/anaconda/bin/insertion_profile.py 12 | DELETION_PROFILE = python $(HOME)/share/usr/anaconda/bin/deletion_profile.py 13 | GENEBODY_COV = python $(HOME)/share/usr/anaconda/bin/geneBody_coverage.py 14 | READ_HEXAMER = python $(HOME)/share/usr/anaconda/bin/read_hexamer.py 15 | READ_QUALITY = python $(HOME)/share/usr/anaconda/bin/read_quality.py 16 | READ_NVC = python $(HOME)/share/usr/anaconda/bin/read_NVC.py 17 | READ_GC = python $(HOME)/share/usr/anaconda/bin/read_GC.py 18 | READ_DUP = python $(HOME)/share/usr/anaconda/bin/read_duplication.py 19 | READ_DIST = python $(HOME)/share/usr/anaconda/bin/read_distribution.py 20 | RPKM_SAT = python $(HOME)/share/usr/anaconda/bin/RPKM_saturation.py 21 | RPKM_COUNT = python $(HOME)/share/usr/anaconda/bin/RPKM_count.py 22 | 23 | 24 | .DELETE_ON_ERROR: 25 | .SECONDARY: 26 | .PHONY: rseqc 27 | 28 | rseqc : $(foreach sample,$(SAMPLES), \ 29 | rseqc/infer/$(sample).infer \ 30 | rseqc/gene_body_cov/$(sample).geneBodyCoverage.txt \ 31 | rseqc/inner_dist/$(sample).innerDistance.txt) 32 | 33 | rseqc/infer/%.infer : bam/%.bam bam/%.bam.bai 34 | $(call RUN,-s 7G -m 8G,"$(INFER_EXPT) -i $< -r $(REF_HOUSEKEEPING_GENE_BED) > $@") 35 | 36 | rseqc/gene_body_cov/%.geneBodyCoverage.txt : bam/%.bam 37 | $(call RUN,-s 7G -m 8G,"$(GENEBODY_COV) -i $< -r $(REF_HOUSEKEEPING_GENE_BED) -o rseqc/gene_body_cov/$*") 38 | 39 | rseqc/inner_dist/%.innerDistance.txt : bam/%.bam 40 | $(call RUN,-s 7G -m 8G,"$(INNER_DIST) -i $< -r $(REF_HOUSEKEEPING_GENE_BED) -o rseqc/inner_dist/$*") 41 | 42 | -------------------------------------------------------------------------------- /qc/teqc.mk: -------------------------------------------------------------------------------- 1 | # Run TEQC R library on bams 2 | # vim: set ft=make : 3 | include modules/Makefile.inc 4 | 5 | LOGDIR ?= teqc/log 6 | 7 | .PHONY: teqc 8 | .DELETE_ON_ERROR: 9 | .SECONDARY: 10 | 11 | teqc : teqc_report/index.html 12 | 13 | teqc/%.Rdata : bam/%.bam 
bam/%.bam.bai 14 | $(call INIT_MEM,12G,14G) $(RSCRIPT) modules/qc/TEQC.R --ref=$(REF) --outFile $@ $< $(TARGETS_FILE) &> $(LOGDIR)/$(@F).log 15 | 16 | teqc_report/index.html : $(foreach sample,$(SAMPLES),teqc/$(sample).Rdata) 17 | $(call INIT_MEM,12G,14G) $(MKDIR) teqc_report; $(RSCRIPT) modules/qc/TEQCreport.R --outDir=$(@D) $^ 18 | 19 | -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v1.hg19.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.hg19.vcf.gz -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v1.hg19.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.hg19.vcf.gz.tbi -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v1.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.vcf.gz -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v1.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.vcf.gz.tbi -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v2.hg19.vcf.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.hg19.vcf.gz -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v2.hg19.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.hg19.vcf.gz.tbi -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v2.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.vcf.gz -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v2.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.vcf.gz.tbi -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v3.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v3.vcf.gz -------------------------------------------------------------------------------- /reference/hotspots/hotspot-v3.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v3.vcf.gz.tbi -------------------------------------------------------------------------------- /rnaseq/immunedeconv.mk: -------------------------------------------------------------------------------- 1 | 
include modules/Makefile.inc 2 | 3 | LOGDIR = log/immunedeconv.$(NOW) 4 | 5 | immunedeconv : immunedeconv/quantiseq.txt \ 6 | immunedeconv/mcpcounter.txt \ 7 | immunedeconv/cibersort.txt 8 | 9 | immunedeconv/quantiseq.txt : kallisto/tpm_by_gene.txt 10 | $(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \ 11 | $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option 1 --input_file $(<) --output_file $(@)") 12 | 13 | immunedeconv/mcpcounter.txt : kallisto/tpm_by_gene.txt 14 | $(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \ 15 | $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option 2 --input_file $(<) --output_file $(@)") 16 | 17 | immunedeconv/cibersort.txt : kallisto/tpm_by_gene.txt 18 | $(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \ 19 | $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option 3 --input_file $(<) --output_file $(@)") 20 | 21 | ..DUMMY := $(shell mkdir -p version; \ 22 | ~/share/usr/env/r-immunedeconv-2.1.0/bin/R --version >> version/immunedeconv.txt;) 23 | .SECONDARY: 24 | .DELETE_ON_ERROR: 25 | .PHONY: immunedeconv 26 | -------------------------------------------------------------------------------- /rnaseq/sumreads.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR = log/sum_reads.$(NOW) 4 | 5 | sumreads : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) \ 6 | $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) \ 7 | sumreads/rpkm_by_gene.txt \ 8 | sumreads/rpkm_by_exon.txt \ 9 | sumreads/counts_by_gene.txt \ 10 | sumreads/counts_by_exon.txt 11 | 12 | SUM_READS_OPTS = 13 | REF ?= b37 14 | 15 | sumreads/%.sumreads.by_gene.txt : bam/%.bam bam/%.bam.bai 16 | $(call RUN,-v $(SUMREADS_ENV) -s 24G -m 48G,"$(SUM_READS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<") 17 | 18 | sumreads/%.sumreads.by_exon.txt : bam/%.bam bam/%.bam.bai 19 | $(call RUN,-v $(SUMREADS_ENV) -s 24G 
-m 48G,"$(SUM_EXONS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<") 20 | 21 | sumreads/rpkm_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) 22 | cut -f 2 $< > $@; \ 23 | for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 7 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done 24 | 25 | sumreads/rpkm_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) 26 | cut -f 1-2 $< > $@; \ 27 | for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 6 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done 28 | 29 | sumreads/counts_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) 30 | cut -f 2 $< > $@; \ 31 | for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 3 $$x | sed "s/countsByGene/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done 32 | 33 | sumreads/counts_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) 34 | cut -f 1-2 $< > $@; \ 35 | for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 4 $$x | sed "s/exonCount/$$sample/" | paste $@ - > $@.tmp; mv $@.tmp $@; done 36 | 37 | ..DUMMY := $(shell mkdir -p version; \ 38 | $(SUMREADS_ENV)/bin/R --version >> version/sumreads.txt;) 39 | .SECONDARY: 40 | .DELETE_ON_ERROR: 41 | .PHONY: sumreads 42 | -------------------------------------------------------------------------------- /scripts/Rshell: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o pipefail 4 | 5 | RCMD="Rscript" 6 | NAME="R" 7 | TMPDIR="$HOME/share/tmp" 8 | QUEUE="jrf.q" 9 | LOGDIR=. 
10 | MEM="1G" 11 | PARALLEL=1 12 | QSUB="perl modules/scripts/qsub.pl" 13 | 14 | while getopts "e:sm:n:l:p:" opt; do 15 | case $opt in 16 | l) 17 | LOGDIR=$OPTARG 18 | ;; 19 | e) 20 | R=$OPTARG 21 | ;; 22 | m) 23 | MEM=$OPTARG 24 | ;; 25 | p) 26 | PARALLEL=$OPTARG 27 | ;; 28 | n) 29 | NAME=$OPTARG 30 | ;; 31 | s) 32 | SGE=true 33 | ;; 34 | \:) 35 | echo "Argument missing: -$OPTARG" >&2 36 | exit -1 37 | ;; 38 | \?) 39 | echo "Invalid option: -$OPTARG" >&2 40 | exit -1 41 | ;; 42 | esac 43 | done 44 | 45 | echo "#---------------------------------"; 46 | 47 | if [[ $PARALLEL -gt 1 ]]; then 48 | PE="-pe smp $PARALLEL" 49 | fi 50 | 51 | umask 002 52 | mkdir -p $LOGDIR 53 | 54 | if [[ -n "$R" ]]; then 55 | TMP=$(mktemp --tmpdir=${TMPDIR}) 56 | #TMP=$(mktemp -t x) 57 | trap "{ rm -f ${TMP}; exit 255; }" SIGINT 58 | echo "Sys.umask('002')" > ${TMP} 59 | echo "${R}" >> ${TMP} 60 | echo "${R}" > $LOGDIR/$NAME.R 61 | chmod +rx ${TMP} 62 | source ${HOME}/.bashrc 63 | if [[ -n "$SGE" ]]; then 64 | mkdir -p $LOGDIR 65 | echo "umask 002; ${RCMD} ${TMP}" | $QSUB -- -cwd -V -now n -q $QUEUE -N X$NAME $PE -l virtual_free=$MEM,h_vmem=$MEM -o $LOGDIR/$NAME.log -j y -b n 66 | RET_CODE=$? 67 | else 68 | $RCMD ${TMP} | tee -a $LOGDIR/$NAME.log 69 | RET_CODE=$? 
70 | fi 71 | rm -f ${TMP} 72 | exit ${RET_CODE} 73 | else 74 | echo "Undefined script" >&2 75 | exit -1 76 | fi 77 | 78 | -------------------------------------------------------------------------------- /scripts/Sweave.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | 5 | allArguments <- commandArgs(trailingOnly = T) 6 | 7 | if (length(allArguments) < 1) { 8 | cat("Need Rnw file"); 9 | stop(); 10 | } 11 | 12 | rnwFile <- allArguments[1]; 13 | arguments <<- allArguments[-1]; 14 | 15 | Sweave(rnwFile); 16 | -------------------------------------------------------------------------------- /scripts/add_dbsnp_gmaf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # add INFO field for RARE variants, GMAF < 0.01 3 | 4 | import argparse 5 | import vcf 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser(prog='add_dbsnp_gmaf.py', 10 | description='add GMAF field to dbsnp using CAF field') 11 | parser.add_argument('vcf_infile') 12 | 13 | args = parser.parse_args() 14 | 15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 16 | 17 | vcf_reader.infos['GMAF'] = vcf.parser._Info(id='GMAF', num=1, type='Float', 18 | desc="global minor allele frequency from 1000g", 19 | source=None, version=None) 20 | 21 | 22 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 23 | 24 | for record in vcf_reader: 25 | if 'CAF' in record.INFO: 26 | caf_str = [re.sub(r'^\.$', '0', re.sub(r'[\[\]]', '', x)) for x in record.INFO['CAF'] if x is not None] 27 | caf = sorted([float(x) for x in caf_str]) 28 | gmaf = caf[len(caf) - 2] 29 | record.INFO['GMAF'] = gmaf 30 | vcf_writer.write_record(record) 31 | 32 | vcf_writer.close() 33 | -------------------------------------------------------------------------------- /scripts/classify_snv_pathogenicity_vcf.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ classify pathogenicity of vcf records 4 | """ 5 | 6 | import argparse 7 | import vcf 8 | import sys 9 | import classify_pathogenicity_vcf as cp 10 | 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(prog='classify_snv_pathogenicity_vcf.py', 14 | description='Add pathogenicity to vcf file') 15 | parser.add_argument('vcf_infile') 16 | args = parser.parse_args() 17 | 18 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 19 | 20 | assert "ANN" in vcf_reader.infos 21 | assert "HOTSPOT" in vcf_reader.infos or "hotspot" in vcf_reader.infos 22 | assert "FATHMM_pred" in vcf_reader.infos 23 | assert "facetsLOH" in vcf_reader.infos or "LOH" in vcf_reader.infos 24 | assert "MutationTaster_pred" in vcf_reader.infos 25 | 26 | # add necessary info headers 27 | vcf_reader.infos['pathogenicity'] = vcf.parser._Info(id='pathogenicity', num=-1, type='String', 28 | desc="Classification of pathogenicity", 29 | source=None, version=None) 30 | records = [x for x in vcf_reader] 31 | for record in records: 32 | if len(record.FILTER) == 0 and record.is_snp: 33 | cp.classify_pathogenicity(record) 34 | 35 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 36 | for record in records: 37 | vcf_writer.write_record(record) 38 | vcf_writer.close() 39 | -------------------------------------------------------------------------------- /scripts/convert_sample_txt2yaml.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Convert a space-space delimited samples.txt/sample_sets.txt file to samples.yaml 4 | """ 5 | 6 | import sys 7 | import yaml 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser(prog='convert_sample_txt2yaml.py', 11 | description='Convert samples.txt/sample_sets.txt to yaml') 12 | parser.add_argument('sample_txt_file') 13 | args = parser.parse_args() 14 | 15 | 
samples = [] 16 | with open(args.sample_txt_file, 'r') as f: 17 | for line in f: 18 | split_sp = line.rstrip().split() 19 | if (len(split_sp) == 1): 20 | samples.append({'tumor': split_sp}) 21 | else: 22 | samples.append({'normal': split_sp[-1], 'tumor': split_sp[:-1]}) 23 | 24 | yaml.dump(samples, sys.stdout) 25 | -------------------------------------------------------------------------------- /scripts/create_sample_sets.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my %tumorSamples; 7 | my %normalSamples; 8 | for my $s (<>) { 9 | chomp $s; 10 | my $id; 11 | if ($s =~ /(\d+)/) { 12 | $id = $1; 13 | if ($s =~ m/N$/) { 14 | print STDERR "Warning: sample $id ($s) has two normals\n" if (exists $normalSamples{$id}); 15 | $normalSamples{$id} = $s; 16 | } else { 17 | push @{$tumorSamples{$id}}, $s; 18 | } 19 | } 20 | } 21 | 22 | while (my ($id, $normal) = each %normalSamples) { 23 | unless (exists $tumorSamples{$id}) { print STDERR "Warning: no tumor samples for $id ($normal)\n"; next; } # warn, then skip: 'next and print' never reached the print 24 | print join(" ", @{$tumorSamples{$id}}) . " $normal\n"; 25 | } 26 | 27 | for my $id (keys %tumorSamples) { 28 | unless (exists $normalSamples{$id}) { 29 | print STDERR "Warning: no normal sample for $id (" . join(" ", @{$tumorSamples{$id}}) . 
")\n"; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /scripts/extract_signatures.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("readr")) 5 | suppressPackageStartupMessages(library("deconstructSigs")) 6 | suppressPackageStartupMessages(library("dplyr")) 7 | suppressPackageStartupMessages(library("magrittr")) 8 | 9 | if (!interactive()) { 10 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 11 | } 12 | 13 | args_list <- list( 14 | make_option("--sample_name", default = NA, type = 'character', help = "tumor sample name") 15 | ) 16 | 17 | parser <- OptionParser(usage = "%prog", option_list = args_list) 18 | arguments <- parse_args(parser, positional_arguments = T) 19 | opt <- arguments$options 20 | 21 | mutation_summary = read_tsv(file="summary/tsv/mutation_summary.tsv", col_types = cols(.default = col_character())) %>% 22 | type_convert() %>% 23 | filter(variantCaller=="mutect") %>% 24 | filter(TUMOR_SAMPLE==opt$sample_name) %>% 25 | mutate(CHROM = paste0("chr", CHROM)) %>% 26 | select(sample_id = TUMOR_SAMPLE, chrom=CHROM, pos=POS, ref=REF, alt=ALT) 27 | 28 | signature_input = mut.to.sigs.input(mut.ref = data.frame(mutation_summary), 29 | sample.id = "sample_id", 30 | chr = "chrom", 31 | pos = "pos", 32 | ref = "ref", 33 | alt = "alt") 34 | 35 | extracted_signatures = whichSignatures(tumor.ref = signature_input, 36 | signatures.ref = signatures.cosmic, 37 | contexts.needed = TRUE) 38 | 39 | save(list=ls(all=TRUE), file=paste0("deconstructsigs/signatures/", opt$sample_name, ".RData")) 40 | -------------------------------------------------------------------------------- /scripts/facets_suite.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 
suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("dplyr")) 5 | suppressPackageStartupMessages(library("readr")) 6 | suppressPackageStartupMessages(library("magrittr")) 7 | 8 | if (!interactive()) { 9 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 10 | } 11 | 12 | args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"), 13 | make_option("--sample_pairs", default = NA, type = 'character', help = "sample pairs")) 14 | parser <- OptionParser(usage = "%prog", option_list = args_list) 15 | arguments <- parse_args(parser, positional_arguments = T) 16 | opt <- arguments$options 17 | 18 | if (as.numeric(opt$option) == 1) { 19 | sample_names = unlist(strsplit(as.character(opt$sample_pairs), split = " ", fixed = TRUE)) 20 | CN = list() 21 | for (i in 1:length(sample_names)) { 22 | CN[[i]] = readr::read_tsv(file = paste0("facets_suite/", sample_names[i], "/", sample_names[i], ".gene_level.txt"), 23 | col_names = TRUE, col_types = cols(.default = col_character())) %>% 24 | readr::type_convert() 25 | } 26 | CN = do.call(rbind, CN) 27 | readr::write_tsv(x = CN, path = "facets_suite/summary.txt", col_names = TRUE, append = FALSE) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /scripts/filter_dbsnp_gmaf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # filter SNPs that are rare and have a global minor allele frequency less than 0.01 3 | 4 | import argparse 5 | import vcf 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser(prog='filter_dbsnp_gmaf.py', 10 | description='filter dbsnp gmaf (CAF info)') 11 | parser.add_argument('vcf_infile') 12 | 13 | args = parser.parse_args() 14 | 15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 16 | 17 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 18 | 19 | for record in vcf_reader: 20 | 
if 'CAF' not in record.INFO: 21 | vcf_writer.write_record(record) 22 | else: 23 | caf1 = float(re.sub(r'[\[\]]', '', record.INFO['CAF'][0])) 24 | if caf1 < 0.99: 25 | vcf_writer.write_record(record) 26 | vcf_writer.close() 27 | -------------------------------------------------------------------------------- /scripts/filter_sv.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("readr")) 5 | suppressPackageStartupMessages(library("dplyr")) 6 | suppressPackageStartupMessages(library("magrittr")) 7 | 8 | if (!interactive()) { 9 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 10 | } 11 | 12 | optList = list(make_option("--input_file", default = NA, type = 'character', help = "Input VCF file"), 13 | make_option("--output_file", default = NA, type = 'character', help = "Output VCF file")) 14 | parser = OptionParser(usage = "%prog", option_list = optList) 15 | arguments = parse_args(parser, positional_arguments = T) 16 | opt = arguments$options 17 | 18 | vcf = readr::read_tsv(file = as.character(opt$input_file), comment = "#", col_names = FALSE, col_types = cols(.default = col_character())) %>% 19 | readr::type_convert() %>% 20 | dplyr::filter(!grepl("SUPP_VEC=110", X8, fixed = TRUE)) %>% 21 | dplyr::mutate(X3 = X12) %>% 22 | dplyr::mutate(X3 = unlist(lapply(X3, function(x) { unlist(strsplit(x, split = ":", fixed = TRUE))[8] }))) %>% 23 | dplyr::mutate(X3 = gsub(pattern = "_", replacement = ":", x = X3, fixed = TRUE)) %>% 24 | dplyr::mutate(X5 = case_when( # ALT becomes the symbolic allele matching the SV type named in the ID; NOTE(review): angle-bracket tokens restored, confirm against upstream 25 | grepl("DUP", X3, fixed = TRUE) ~ "<DUP>", 26 | grepl("DEL", X3, fixed = TRUE) ~ "<DEL>", 27 | grepl("INV", X3, fixed = TRUE) ~ "<INV>", 28 | TRUE ~ X5 29 | )) %>% 30 | dplyr::mutate(X8 = case_when( 31 | grepl("DUP", X3, fixed = TRUE) ~ gsub("SVTYPE=INV", "SVTYPE=DUP", X8), 32 | grepl("DEL", X3, fixed = TRUE) ~ gsub("SVTYPE=INV", "SVTYPE=DEL", 
X8), 33 | TRUE ~ X8 34 | )) %>% 35 | dplyr::rename(`#CHROM` = X1, 36 | POS = X2, 37 | ID = X3, 38 | REF = X4, 39 | ALT = X5, 40 | QUAL = X6, 41 | FILTER = X7, 42 | INFO = X8, 43 | FORMAT = X9, 44 | SVABA = X10, 45 | GRIDSS = X11, 46 | MANTA = X12) 47 | 48 | readr::write_tsv(x = vcf, path = as.character(opt$output_file), append = TRUE, col_names = TRUE) 49 | 50 | 51 | -------------------------------------------------------------------------------- /scripts/get_basecounts.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("readr")) 5 | suppressPackageStartupMessages(library("dplyr")) 6 | suppressPackageStartupMessages(library("magrittr")) 7 | 8 | if (!interactive()) { 9 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 10 | } 11 | 12 | args_list <- list(make_option("--option", default = NA, type = 'character', help = "Which option?"), 13 | make_option("--sample_name", default = NA, type = 'character', help = "sample name")) 14 | parser <- OptionParser(usage = "%prog", option_list = args_list) 15 | arguments <- parse_args(parser, positional_arguments = T) 16 | opt <- arguments$options 17 | 18 | if (as.numeric(opt$option) == 1) { 19 | sample_names = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE)) 20 | data = list() 21 | for (i in 1:length(sample_names)) { 22 | data[[i]] = readr::read_tsv(file = paste0("gbc/", sample_names[i], ".txt.gz"), col_names = TRUE, col_types = cols(.default = col_character())) %>% 23 | readr::type_convert() %>% 24 | dplyr::mutate(sample_name = sample_names[i]) 25 | } 26 | data = do.call(bind_rows, data) 27 | readr::write_tsv(x = data, path = "gbc/summary.txt", append = FALSE, col_names = TRUE) 28 | } 29 | 30 | -------------------------------------------------------------------------------- /scripts/init_project.pl: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Cwd; 6 | use File::Copy; 7 | 8 | my $MAKEFILE = <Makefile"; 14 | print OUT $MAKEFILE; 15 | } 16 | close OUT; 17 | 18 | unless (-e "project_config.yaml") { 19 | copy("modules/default_yaml/project_config.yaml", "project_config.yaml") or die "Unable to create project_config.yaml: $!"; 20 | } 21 | 22 | unless (-e "summary_config.yaml") { 23 | copy("modules/default_yaml/summary_config.yaml", "summary_config.yaml") or die "Unable to create summary_config.yaml: $!"; 24 | } 25 | -------------------------------------------------------------------------------- /scripts/join_eff.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use List::MoreUtils qw(first_index indexes); 5 | 6 | my $line = <>; 7 | print $line; 8 | chomp $line; 9 | my @header = split /\t/, $line; 10 | my @effIndexes = indexes { /^ANN\[/ } @header; 11 | 12 | my %lines; 13 | while (<>) { 14 | chomp; 15 | my @F = split /\t/, $_, -1; 16 | for my $i (0..$#F) { 17 | $F[$i] = "." unless $F[$i] =~ /\S/; 18 | } 19 | push @{$lines{$F[0]}{$F[1]}}, \@F; 20 | } 21 | 22 | foreach my $chrom (sort keys %lines) { 23 | foreach my $posn (sort keys %{$lines{$chrom}}) { 24 | my $F = pop @{$lines{$chrom}{$posn}}; 25 | while (my $Fn = pop @{$lines{$chrom}{$posn}}) { 26 | for my $i (@effIndexes) { 27 | $F->[$i] .= "|" . $Fn->[$i]; 28 | } 29 | } 30 | print join("\t", @{$F}) . 
"\n"; 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /scripts/knit.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library(knitr)) 4 | suppressPackageStartupMessages(library(markdown)) 5 | 6 | 7 | args <- commandArgs(T) 8 | 9 | if (length(args) < 2) stop("Need input and output script") 10 | 11 | input <- args[1] 12 | outPrefix <- args[2] 13 | args <- args[c(-1,-2)] 14 | 15 | figPath <- file.path(outPrefix, 'figure/') 16 | cachePath <- file.path(outPrefix, 'cache/') 17 | dir.create(figPath, showWarnings = F, recursive = T) 18 | dir.create(cachePath, showWarnings = F, recursive = T) 19 | 20 | opts_chunk$set(dev = c("png", 'pdf'), cache.path = cachePath) # , fig.path = file.path('mutsig_report/figure/')) 21 | opts_knit$set(root.dir = getwd(), base.dir = file.path(paste(outPrefix, '/', sep = '')), progress = F, verbose = T) 22 | 23 | #options(warn = -1, error = quote({ traceback(2); q('no', status = 1) })) 24 | 25 | knit(input, paste(outPrefix, '/index.md', sep = '')) 26 | markdownToHTML(paste(outPrefix, '/index.md', sep = ''), paste(outPrefix, '/index.html', sep = '')) 27 | 28 | -------------------------------------------------------------------------------- /scripts/launcher_sql_db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ launches the mysql server on a docker node and returns the server ip 3 | """ 4 | 5 | import argparse 6 | import MySQLdb 7 | import yaml 8 | from qsub_pbs import Job 9 | import sys 10 | import time 11 | 12 | parser = argparse.ArgumentParser(description='launch mysql server on docker node') 13 | parser.add_argument('server_yaml') 14 | 15 | args = parser.parse_args() 16 | 17 | server_info = yaml.load(open(args.server_yaml, 'r')) 18 | host = server_info['host'] 19 | db = server_info['db'] 20 | 21 | con = None 22 | for 
attempt in range(2): 23 | try: 24 | con = MySQLdb.connect(host=server_info['host'], 25 | user=server_info['user'], 26 | passwd=server_info['password'], 27 | port=server_info['port'], 28 | db=server_info['db']) 29 | break 30 | except: 31 | print(("Failed to connect to {} mysql server. Running docker".format(server_info['db']))) 32 | docker_cmd = "docker run -d -v {}:/var/lib/mysql -p {}:3306 {}".format(server_info['data_dir'], 33 | server_info['port'], 34 | server_info['docker_repo']) 35 | print((docker_cmd + "\n")) 36 | #job = Job(docker_cmd, '-I -l nodes=1:docker -l host={}'.format(server_info['host'])) 37 | #job.run_job() 38 | #job.wait() 39 | #time.sleep(90) # wait for mysqld to start 40 | 41 | if not con: 42 | print("Failed to initialize mysql server") 43 | sys.exit(1) 44 | -------------------------------------------------------------------------------- /scripts/mimsi.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("dplyr")) 5 | suppressPackageStartupMessages(library("readr")) 6 | suppressPackageStartupMessages(library("magrittr")) 7 | 8 | if (!interactive()) { 9 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 10 | } 11 | 12 | args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"), 13 | make_option("--sample_names", default = NA, type = 'character', help = "sample name")) 14 | parser <- OptionParser(usage = "%prog", option_list = args_list) 15 | arguments <- parse_args(parser, positional_arguments = T) 16 | opt <- arguments$options 17 | 18 | if (as.numeric(opt$option)==1) { 19 | sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE)) 20 | smry = list() 21 | for (i in 1:length(sample_names)) { 22 | smry[[i]] = readr::read_tsv(file = paste0("mimsi/", sample_names[i], "/", sample_names[i], 
".txt"), 23 | col_names = TRUE, col_types = cols(.default = col_character())) %>% 24 | readr::type_convert() 25 | } 26 | smry = do.call(rbind, smry) 27 | write_tsv(smry, path="mimsi/summary.txt", append = FALSE, col_names = TRUE) 28 | 29 | } 30 | -------------------------------------------------------------------------------- /scripts/monitorMySQL.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | UP=$(pgrep -u limr mysqld | wc -l); 4 | if [ "$UP" -ne 1 ]; 5 | then 6 | echo "MySQL is down."; 7 | mysqld --defaults-file=/home/limr/share/usr/mysql/my.cnf 8 | else 9 | echo "All is well."; 10 | fi 11 | -------------------------------------------------------------------------------- /scripts/monitor_gfserver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | UP=$(pgrep -u limr gfServer | wc -l); 4 | if [ "$UP" -ne 1 ]; 5 | then 6 | echo "gfServer is down."; 7 | /home/limr/share/usr/bin/gfServer start localhost 88878 -stepSize=5 -log=/home/limr/.blatserver.log /home/limr/share/reference/GATK_bundle/2.3/human_g1k_v37.2bit 8 | else 9 | echo "All is well."; 10 | fi 11 | -------------------------------------------------------------------------------- /scripts/mutation_taster_query.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | 5 | _prediction_priority = ['disease_causing_automatic', 6 | 'disease_causing', 7 | 'polymorphism_automatic', 8 | 'polymorphism'] 9 | 10 | 11 | def query(chrom, pos, ref, alt): 12 | pred = 'none' 13 | score = None 14 | url = "http://www.mutationtaster.org/cgi-bin/MutationTaster/MT_ChrPos.cgi" \ 15 | "?chromosome={}&position={}&ref={}&alt={}".format(chrom, pos, ref, alt) 16 | sys.stderr.write("Querying: {}\n".format(url)) 17 | dfs = pd.read_html(url) 18 | if (len(dfs) < 2): 19 | raise Exception('query failed') 20 | else: 21 | # summary is the second 
dataframe 22 | summary = dfs[1][1:] 23 | summary.columns = dfs[1].iloc[0] 24 | if 'prediction' in summary.columns: 25 | pred_score = {} 26 | for i, row in summary.iterrows(): 27 | if row['prediction'] not in pred_score: 28 | pred_score[row['prediction']] = [] 29 | pred_score[row['prediction']].append(float(row['probability'])) 30 | for p in _prediction_priority: 31 | if p in pred_score: 32 | pred = p 33 | score = max(pred_score[p]) 34 | break 35 | return (pred, score) 36 | -------------------------------------------------------------------------------- /scripts/normalFilterVCF.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # filter tumor based on normal 3 | # usage: normalFilterVCF.pl [tumor.vcf] [normal.vcf] 4 | 5 | use strict; 6 | 7 | if (@ARGV != 2) { 8 | print "Usage: normalFilterVCF.pl [tumor.vcf] [normal.vcf]\n" and exit(1); 9 | } 10 | 11 | my $tumorVCF = $ARGV[0]; 12 | my $normalVCF = $ARGV[1]; 13 | 14 | my $varPosn = {}; 15 | open IN, $normalVCF or die("Unable to open " . $normalVCF . "\n"); 16 | while (<IN>) { # read the normal VCF line by line; an empty condition would loop forever without reading 17 | next if /^#/; 18 | my @F = split /\t/; 19 | my $chr = $F[0]; 20 | my $posn = $F[1]; 21 | my $alt = $F[3]; # NOTE(review): $F[3] is the REF column in VCF (ALT is $F[4]) -- confirm which is intended 22 | $varPosn->{$chr} = {} unless exists $varPosn->{$chr}; 23 | $varPosn->{$chr}{$posn} = {} unless exists $varPosn->{$chr}{$posn}; 24 | $varPosn->{$chr}{$posn}{$alt} = 1; 25 | } 26 | close IN; 27 | 28 | open IN, $tumorVCF or die("Unable to open " . $tumorVCF . 
"\n"); 29 | while (<IN>) { # stream the tumor VCF; header lines pass through, records seen in the normal are dropped 30 | print and next if /^#/; 31 | my @F = split /\t/; 32 | my $chr = $F[0]; 33 | my $posn = $F[1]; 34 | my $alt = $F[3]; 35 | print unless (exists $varPosn->{$chr}{$posn}{$alt}); 36 | } 37 | close IN; 38 | -------------------------------------------------------------------------------- /scripts/posnGeneLookup.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # lookup the gene(s) at a base position (first column of the input) 3 | 4 | use strict; 5 | 6 | use DBI; 7 | use Bio::EnsEMBL::DBSQL::DBAdaptor; 8 | use Bio::EnsEMBL::Registry; 9 | 10 | print STDERR "Connecting to Ensembl core...\n"; 11 | my $dbCore = Bio::EnsEMBL::DBSQL::DBAdaptor->new( 12 | -user => "anonymous", 13 | -dbname => "homo_sapiens_core_65_37", 14 | -host => "localhost", 15 | -pass => "", 16 | -driver => 'mysql', 17 | -port => 33387 18 | ); 19 | print STDERR "Connected.\n"; 20 | 21 | my $slice_adaptor = $dbCore->get_SliceAdaptor(); 22 | while (<>) { 23 | chomp; 24 | my @F = split / /; 25 | print STDERR "Looking up position $F[0]\n"; 26 | my ($chr, $pos) = split /:/, $F[0]; 27 | $chr =~ s/chr//; 28 | my $slice = $slice_adaptor->fetch_by_region('chromosome', $chr, $pos, $pos); 29 | my @genes = @{$slice->get_all_Genes()}; 30 | my @ids; 31 | my @strands; 32 | while (my $gene = shift @genes) { 33 | my $stable_id = $gene->stable_id(); 34 | my $strand = $gene->strand(); 35 | push @ids, $stable_id; 36 | push @strands, $strand; 37 | } 38 | print "$chr:$pos " . ((@ids > 0)? join("|", @ids) . ' ' . join("|", @strands): "NA NA") . ' ' . join(" ", @F[1..$#F]) . 
"\n"; 39 | } 40 | -------------------------------------------------------------------------------- /scripts/prepareFastq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # rename fastq files and create samples.txt and possibly samples.split.txt 3 | # find -name '*.gz' | xargs prename 's:./([A-Z]+)_([0-9]{4})/(.*)\.fastq\.gz:./$1_$2/$3$2.fastq.gz:' 4 | mkdir -p fastq 5 | find rawdata -name '*.gz' | xargs ln -t fastq 6 | 7 | for x in fastq/*.fastq.gz; do 8 | if [ `grep -o "_" <<< "$x" | wc -l` -gt 1 ]; then 9 | prename 's/-//g; s/(.+)_([ATGCN]{6,8}|S\d+)_L([^_])+_R([12])_([0-9]+)/$1-$2$3$5.$4/; s/(.+)_[^_]+_L([^_])+_R([12])_([0-9]+)/$1_$2$4.$3/; s/_//g; s/-/_/g' $x; 10 | fi; 11 | done 12 | prename 's/IGO[^_]*//' fastq/*.gz 13 | paste <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/[._].*//') <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/\..*//') | awk 'BEGIN { OFS = "\t" } $1 != $2 { print }' | uniq > samples.split.txt 14 | 'ls' fastq/*.fastq.gz | sed 's:.*/::; s/_.*//' | sort | uniq 15 | #if [ -z `cut -f1 samples.split.txt | uniq -d` ]; then 16 | # prename 's/_.*//' fastq/*.fastq.gz 17 | # rm samples.split.txt 18 | #fi 19 | #for x in `cut -f1 samples.split.txt | uniq -u`; do 20 | #prename 's/_.*//' fastq/${x}_*.fastq.gz 21 | #sed -i "/^$x\t/d" samples.split.txt 22 | #done 23 | -------------------------------------------------------------------------------- /scripts/prepareFastq2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # rename fastq files and create samples.txt and possibly samples.split.txt 3 | 4 | mkdir -p fastq 5 | find rawdata -name '*.gz' | xargs ln -t fastq 6 | 7 | for x in fastq/*.fastq.gz; do 8 | if [ `grep -o "_" <<< "$x" | wc -l` -gt 1 ]; then 9 | prename 's/-//g; s/(.+)_(S\d+)_L([^_])+_R([12])_([0-9]+)/$1-$2$3$5.$4/; s/(.+)_[^_]+_L([^_])+_R([12])_([0-9]+)/$1_$2$4.$3/; s/_//g; s/-/_/g' $x; 10 | fi; 11 | done 12 | paste <('ls' fastq/*.fastq.gz | 
sed 's:.*/::; s/[._].*//') <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/\..*//') | awk 'BEGIN { OFS = "\t" } $1 != $2 { print }' | uniq > samples.split.txt 13 | 'ls' fastq/*.fastq.gz | sed 's:.*/::; s/_.*//' | sort | uniq 14 | -------------------------------------------------------------------------------- /scripts/prepareMultirunFastq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #rename fastq files and create samples.txt and possibly samples.split.txt 3 | 4 | mkdir -p fastq 5 | cd rawdata 6 | find -name '*.gz' | xargs prename 's:./([A-Z]+)_([0-9]{4})/(.*)\.fastq\.gz:./$1_$2/$3$2.fastq.gz:' 7 | find -name '*.gz' | xargs ln -t ../fastq 8 | cd .. 9 | 10 | for x in fastq/*.fastq.gz; do 11 | if [ `grep -o "_" <<< "$x" | wc -l` -gt 1 ]; then 12 | prename 's/-//g; s/(.+)_([NATGC]{6,8}|S\d+)_L([^_])+_R([12])_([0-9]+)/$1-$2$3$5.$4/; s/(.+)_[^_]+_L([^_])+_R([12])_([0-9]+)/$1_$2$4.$3/; s/_//g; s/-/_/g' $x; 13 | fi; 14 | done 15 | prename 's/IGO[^_]*//' fastq/*.gz 16 | paste <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/[._].*//') <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/\..*//') | awk 'BEGIN { OFS = "\t" } $3 != $2 { print }' | uniq > samples.split.txt 17 | 'ls' fastq/*.fastq.gz | sed 's:.*/::; s/_.*//' | sort | uniq 18 | -------------------------------------------------------------------------------- /scripts/provean_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ classify provean of vcf records 4 | """ 5 | 6 | import vcf 7 | import argparse 8 | import sys 9 | import remote_provean_query as rpq 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser(prog='provean_vcf.py', 13 | description='Add provean to vcf file') 14 | parser.add_argument('vcf_infile') 15 | args = parser.parse_args() 16 | 17 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 18 | vcf_reader.infos['provean_protein_id'] = 
vcf.parser._Info(id='provean_protein_id', num=1, type='String', 19 | desc="provean protein id", 20 | source=None, version=None) 21 | vcf_reader.infos['provean_pred'] = vcf.parser._Info(id='provean_pred', num=1, type='String', 22 | desc="Mutation taster prediction using webquery if indel", 23 | source=None, version=None) 24 | vcf_reader.infos['provean_score'] = vcf.parser._Info(id='provean_score', num=1, type='Float', 25 | desc="Mutation taster score using webquery if indel", 26 | source=None, version=None) 27 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 28 | 29 | records = [record for record in vcf_reader] 30 | query_records = [] 31 | 32 | for record in records: 33 | if record.is_indel: 34 | query_records.append(record) 35 | 36 | if len(query_records) > 0: 37 | query = rpq.RemoteProveanQuery(query_records) 38 | query.run_query() 39 | 40 | for record in records: 41 | vcf_writer.write_record(record) 42 | vcf_writer.close() 43 | -------------------------------------------------------------------------------- /scripts/split_bed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import math 5 | 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser(prog='split_bed.py', 8 | description='split a bed file into chunks') 9 | parser.add_argument('--num_chunks', '-c', type=int, default=100, help='number of chunks') 10 | parser.add_argument('--out_prefix', '-o', required=True, help='output prefix') 11 | parser.add_argument('bed_file', help='bed file to split') 12 | args = parser.parse_args() 13 | 14 | bed = [line for line in open(args.bed_file, 'r')] 15 | 16 | n = int(math.ceil(len(bed) / args.num_chunks)) 17 | x = 1 18 | for i in range(0, len(bed), n): 19 | f = open(args.out_prefix + '{0:03d}.bed'.format(x), 'w') 20 | if x == args.num_chunks: 21 | # last chunk, write everything 22 | for line in bed[i:]: 23 | f.write(line) 24 | f.close() 25 | break 26 | else: 27 | for line 
in bed[i:i + n]: 28 | f.write(line) 29 | f.close() 30 | x = x + 1 31 | -------------------------------------------------------------------------------- /scripts/split_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import math 5 | import vcf 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser(prog='split_vcf.py', 9 | description='split a vcf file into chunks') 10 | parser.add_argument('--num_chunks', '-c', type=int, default=100, help='number of chunks') 11 | parser.add_argument('--out_prefix', '-o', required=True, help='output prefix') 12 | parser.add_argument('vcf_file', help='bed file to split') 13 | args = parser.parse_args() 14 | 15 | vcf_reader = vcf.Reader(open(args.vcf_file, 'r')) 16 | records = [record for record in vcf_reader] 17 | 18 | n = int(math.ceil(len(records) / args.num_chunks)) 19 | x = 1 20 | for i in range(0, len(records), n): 21 | vcf_writer = vcf.Writer(open(args.out_prefix + '{0:03d}.vcf'.format(x), 'w'), vcf_reader) 22 | if x == args.num_chunks: 23 | # last chunk, write everything 24 | for record in records[i:]: 25 | vcf_writer.write_record(record) 26 | vcf_writer.close() 27 | break 28 | else: 29 | for record in records[i:i + n]: 30 | vcf_writer.write_record(record) 31 | vcf_writer.close() 32 | x = x + 1 33 | -------------------------------------------------------------------------------- /scripts/summarize_sleuth.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("readr")) 5 | suppressPackageStartupMessages(library("dplyr")) 6 | suppressPackageStartupMessages(library("magrittr")) 7 | suppressPackageStartupMessages(library("sleuth")) 8 | 9 | 10 | if (!interactive()) { 11 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 12 | } 13 | 14 | optList 
= list(make_option('--annotation', type = 'character', default = NA, help = 'path to annotation file'), 15 | make_option('--samples', type = 'character', default = NA, help = 'list of samples names')) 16 | parser = OptionParser(usage = "%prog", option_list=optList) 17 | arguments = parse_args(parser, positional_arguments = T) 18 | opt = arguments$options 19 | 20 | sample_names = unlist(strsplit(x=opt$samples, split=" ", fixed=TRUE)) 21 | annotation = readr::read_tsv(file=opt$annotation, col_names=TRUE, col_types=cols(.default=col_character())) 22 | manifest = dplyr::tibble(sample = sample_names, 23 | condition = rep(1, length(sample_names)), 24 | path = paste0("kallisto/", sample_names)) 25 | data = sleuth::sleuth_prep(sample_to_covariates = manifest, 26 | extra_bootstrap_summary = TRUE, 27 | read_bootstrap_tpm = TRUE, 28 | target_mapping = annotation, 29 | aggregation_column = "hugo", 30 | gene_mode = TRUE) 31 | res = as.data.frame(sleuth_to_matrix(data, "obs_norm", "tpm")) 32 | tpm_bygene = dplyr::tibble(gene_symbol = rownames(res)) %>% 33 | dplyr::bind_cols(dplyr::as_tibble(res)) 34 | write_tsv(x=tpm_bygene, path="kallisto/tpm_by_gene.txt", append=FALSE, col_names=TRUE, quote_escape=FALSE) 35 | -------------------------------------------------------------------------------- /scripts/swapvcf.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | 5 | if (!interactive()) { 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 7 | } 8 | 9 | args_list <- list(make_option("--file", default = NA, type = 'character', help = "input file name"), 10 | make_option("--tumor", default = NA, type = 'character', help = "tumor sample name"), 11 | make_option("--normal", default = NA, type = 'character', help = "normal sample name")) 12 | 13 | parser <- OptionParser(usage = "%prog", option_list = args_list) 14 | arguments <- 
parse_args(parser, positional_arguments = T) 15 | opt <- arguments$options 16 | 17 | vcf = read.table(file=opt$file, header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE) 18 | n = ncol(vcf) 19 | n1 = vcf[,n,drop=TRUE] 20 | n2 = vcf[,n-1,drop=TRUE] 21 | vcf[,n-1] = n1 22 | vcf[,n] = n2 23 | colnames(vcf) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", opt$tumor, opt$normal) 24 | system(paste0("grep '##' ", opt$file, " > ", opt$file, ".tmp")) 25 | write.table(vcf, file=paste0(opt$file, ".tmp"), append=TRUE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE) 26 | warnings() 27 | -------------------------------------------------------------------------------- /scripts/tsvToExcel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Write tsv to excel sheet 4 | """ 5 | import argparse 6 | import pandas as pd 7 | import os 8 | from openpyxl import load_workbook 9 | 10 | 11 | def write_to_excel(tsv_file, excel_file, sheet_name, column_names, delimiter, overwrite): 12 | df = pd.read_csv(tsv_file, sep=delimiter) 13 | if not overwrite and os.path.isfile(excel_file): 14 | book = load_workbook(excel_file) 15 | writer = pd.ExcelWriter(excel_file, engine='openpyxl') 16 | writer.book = book 17 | writer.sheets = dict((ws.title, ws) for ws in book.worksheets) 18 | else: 19 | writer = pd.ExcelWriter(excel_file) 20 | if column_names: 21 | df.to_excel(writer, sheet_name, columns=column_names, index=False) 22 | else: 23 | df.to_excel(writer, sheet_name, index=False) 24 | if not overwrite and os.path.isfile(excel_file): 25 | writer.close() 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser(description=__doc__, 30 | formatter_class=argparse.RawDescriptionHelpFormatter) 31 | parser.add_argument("tsv_file", type=str, help="TSV") 32 | parser.add_argument("excel_file", type=str, help="Excel output") 33 | parser.add_argument("sheet_name", type=str, help="Sheet 
name") 34 | parser.add_argument("-c", "--column_names", type=str, default=None, help="Which columns to write (comma separated)") 35 | parser.add_argument("-d", "--delimiter", type=str, default="\t", help="Set delimiter") 36 | parser.add_argument("--overwrite", action="store_true", help="Overwrite existing excel") 37 | args = parser.parse_args() 38 | if args.column_names: 39 | column_names = args.column_names.split(",") 40 | else: 41 | column_names = None 42 | sheet_name = (args.sheet_name[:25] + '..') if len(args.sheet_name) > 25 else args.sheet_name 43 | write_to_excel(args.tsv_file, args.excel_file, sheet_name, column_names, args.delimiter, args.overwrite) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /signatures/deconstruct_sigs.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR = log/deconstruct_sigs.$(NOW) 4 | 5 | deconstructsigs : $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/signatures/$(sample).RData) \ 6 | $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/plots/context/$(sample).pdf) 7 | # The rules below are generated through eval, so automatic variables are written with a doubled dollar sign; both recipes pass the pattern stem as the sample name. 8 | define extract-signatures 9 | deconstructsigs/signatures/%.RData : summary/tsv/mutation_summary.tsv 10 | $$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"set -o pipefail && \ 11 | $(RSCRIPT) modules/signatures/extract_signatures.R \ 12 | --sample_name $$(*)") 13 | 14 | deconstructsigs/plots/context/%.pdf : deconstructsigs/signatures/%.RData 15 | $$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"set -o pipefail && \ 16 | $(RSCRIPT) modules/signatures/plot_signatures.R \ 17 | --sample_name $$(*)") 18 | 19 | endef 20 | $(foreach sample,$(TUMOR_SAMPLES),\ 21 | $(eval $(call extract-signatures,$(sample)))) 22 | 23 | ..DUMMY := $(shell mkdir -p version; \ 24 | $(DECONSTRUCTSIGS_ENV)/bin/R --version > version/deconstruct_sigs.txt) 25 | .SECONDARY: 26 | .DELETE_ON_ERROR: 27 | .PHONY: deconstructsigs 
-------------------------------------------------------------------------------- /snp6/absolute.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("ABSOLUTE")) 5 | 6 | options(warn = -1, error = quote({ traceback(2); q('no', status = 1) })) 7 | 8 | optList <- list( 9 | make_option("--disease", default = 'breastcancer', help = "disease [default %default]"), 10 | make_option("--platform", default = "SNP_6.0", help = "platform [default %default]"), 11 | make_option("--tumour", default = NULL, help = "tumour sample name"), 12 | make_option("--mafFile", default = NULL, help = "MAF file"), 13 | make_option("--minMutAF", default = NULL, help = "Minimum Mutation Allele Frequency"), 14 | make_option("--resultsDir", default = NULL, help = "results directory"), 15 | make_option("--outPrefix", default = NULL, help = "output prefix") 16 | ) 17 | parser <- OptionParser(usage = "%prog segDat.Rdata", option_list = optList); 18 | arguments <- parse_args(parser, positional_arguments = T); 19 | opt <- arguments$options; 20 | 21 | if (is.null(opt$resultsDir)) { 22 | cat("Need results dir\n"); 23 | print_help(parser); 24 | stop(); 25 | } else if (is.null(opt$tumour)) { 26 | cat("Need tumour sample name\n"); 27 | print_help(parser); 28 | stop(); 29 | } else if (length(arguments$args) != 1) { 30 | cat("Need hapseg data file\n"); 31 | print_help(parser); 32 | stop(); 33 | } 34 | 35 | 36 | fn <- arguments$args[1]; 37 | RunAbsolute(seg.dat.fn = fn, output.fn.base = opt$outPrefix, 38 | sigma.p=0, max.sigma.h=0.02, 39 | min.ploidy=0.95, max.ploidy=10, primary.disease=opt$disease, 40 | platform=opt$platform, sample.name=opt$tumour, 41 | results.dir=opt$resultsDir, 42 | maf.fn = opt$mafFile, min.mut.af=opt$minMutAF, 43 | max.as.seg.count=1500, copy_num_type="allelic", 44 | max.neg.genome=0, max.non.clonal=0, 45 | verbose=TRUE) 46 | 
-------------------------------------------------------------------------------- /summary/cravat_summary.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | 5 | if (!interactive()) { 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 7 | } 8 | 9 | args_list = list(make_option("--sample_names", default = NA, type = 'character', help = "sample name")) 10 | 11 | parser = OptionParser(usage = "%prog", option_list = args_list) 12 | arguments = parse_args(parser, positional_arguments = T) 13 | opt = arguments$options 14 | 15 | sample_names = unlist(strsplit(x=opt$sample_names, split=" ", fixed=TRUE)) 16 | tsv = list() 17 | for (i in 1:length(sample_names)) { 18 | tsv[[i]] = read.csv(file=paste0("cravat/", sample_names[i], ".txt"), header=TRUE, sep="\t", stringsAsFactors=FALSE) 19 | } 20 | tsv = do.call(rbind, tsv) 21 | write.table(tsv, file="summary/tsv/cravat_summary.tsv", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=FALSE) 22 | -------------------------------------------------------------------------------- /summary/cravat_summary.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/cravat_summary.$(NOW) 4 | PHONY += cravat_summary cravat summary summary/tsv 5 | 6 | cravat_summary : summary/tsv/cravat_summary.tsv summary/cravat_summary.xlsx 7 | # SAMPLES is a space-separated list, so the per-sample input path is built word by word; wildcard keeps only the files that exist. 8 | summary/tsv/cravat_summary.tsv : $(wildcard $(foreach sample,$(SAMPLES),cravat/$(sample).txt)) 9 | $(call RUN,-c -s 24G -m 48G -w 7200,"$(RSCRIPT) modules/summary/cravat_summary.R --sample_names '$(SAMPLES)'") 10 | 11 | summary/cravat_summary.xlsx : summary/tsv/cravat_summary.tsv 12 | $(call RUN,-c -s 24G -m 48G -w 7200,"python modules/summary/cravat_summary.py") 13 | 14 | .DELETE_ON_ERROR: 15 | .SECONDARY: 16 | .PHONY: $(PHONY) 17 | -------------------------------------------------------------------------------- 
/summary/cravat_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from xlsxwriter.workbook import Workbook 4 | 5 | tsv_file = 'summary/tsv/cravat_summary.tsv' 6 | xlsx_file = 'summary/cravat_summary.xlsx' 7 | 8 | workbook = Workbook(xlsx_file) 9 | worksheet = workbook.add_worksheet() 10 | 11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t') 12 | 13 | for row, data in enumerate(tsv_reader): 14 | worksheet.write_row(row, 0, data) 15 | 16 | workbook.close() 17 | -------------------------------------------------------------------------------- /summary/delmh_summary.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/delmh_summary.$(NOW) 4 | PHONY += delmh_summary 5 | 6 | delmh_summary : summary/tsv/delmh_summary.tsv 7 | 8 | summary/tsv/delmh_summary.tsv : summary/tsv/mutation_summary.tsv 9 | $(call RUN,-n 1 -s 8G -m 8G,"set -o pipefail && \ 10 | $(RSCRIPT) modules/summary/delmh_summary.R --input_file $(<)") 11 | 12 | .DELETE_ON_ERROR: 13 | .SECONDARY: 14 | .PHONY: $(PHONY) 15 | -------------------------------------------------------------------------------- /summary/genome_summary_excel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from xlsxwriter.workbook import Workbook 4 | 5 | tsv_file = 'summary/tsv/genome_summary.tsv' 6 | xlsx_file = 'summary/genome_summary.xlsx' 7 | 8 | workbook = Workbook(xlsx_file) 9 | worksheet = workbook.add_worksheet() 10 | 11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t') 12 | 13 | for row, data in enumerate(tsv_reader): 14 | worksheet.write_row(row, 0, data) 15 | 16 | workbook.close() 17 | -------------------------------------------------------------------------------- /summary/hotspot_summary_excel.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from xlsxwriter.workbook import Workbook 4 | 5 | tsv_file = 'summary/tsv/hotspot_summary.tsv' 6 | xlsx_file = 'summary/hotspot_summary.xlsx' 7 | 8 | workbook = Workbook(xlsx_file) 9 | worksheet = workbook.add_worksheet() 10 | 11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t') 12 | 13 | for row, data in enumerate(tsv_reader): 14 | worksheet.write_row(row, 0, data) 15 | 16 | workbook.close() 17 | -------------------------------------------------------------------------------- /summary/hotspotsummary.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/hotspot_summary.$(NOW) 4 | PHONY += hotspot_summary hotspot summary summary/tsv 5 | 6 | HOTSPOT ?= $(wildcard $(foreach sample,$(SAMPLES),hotspot/$(sample).txt)) 7 | 8 | hotspot_summary : summary/tsv/hotspot_summary.tsv summary/hotspot_summary.xlsx 9 | # The prerequisites are the same per-sample file list that the recipe passes to --in_file. 10 | summary/tsv/hotspot_summary.tsv : $(HOTSPOT) 11 | $(call RUN,-n 1 -s 4G -m 4G,"$(RSCRIPT) modules/summary/hotspotsummary.R --in_file '$(HOTSPOT)' --out_file summary/tsv/hotspot_summary.tsv") 12 | 13 | summary/hotspot_summary.xlsx : summary/tsv/hotspot_summary.tsv 14 | $(call RUN,-n 1 -s 4G -m 4G,"python modules/summary/hotspot_summary_excel.py") 15 | 16 | .DELETE_ON_ERROR: 17 | .SECONDARY: 18 | .PHONY: $(PHONY) 19 | -------------------------------------------------------------------------------- /summary/mouse_summary_excel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import csv 3 | from xlsxwriter.workbook import Workbook 4 | 5 | tsv_file = 'summary/tsv/mouse_summary.tsv' 6 | xlsx_file = 'summary/mouse_summary.xlsx' 7 | 8 | workbook = Workbook(xlsx_file) 9 | worksheet = workbook.add_worksheet() 10 | 11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t') 
12 | 13 | for row, data in enumerate(tsv_reader): 14 | worksheet.write_row(row, 0, data) 15 | 16 | workbook.close() 17 | -------------------------------------------------------------------------------- /summary/mousesummary.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | 5 | if (!interactive()) { 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 7 | } 8 | 9 | args_list <- list(make_option("--sample_names", default = NA, type = 'character', help = "input file name"), 10 | make_option("--out_file", default = NA, type = 'character', help = "output file name")) 11 | parser <- OptionParser(usage = "%prog", option_list = args_list) 12 | arguments <- parse_args(parser, positional_arguments = T) 13 | opt <- arguments$options 14 | 15 | sample_names = unlist(strsplit(x=opt$sample_names, split=" ", fixed=TRUE)) 16 | out_file_name = opt$out_file 17 | DP = AD = MAF = list() 18 | for (i in 1:length(sample_names)) { 19 | tmp = read.csv(file=paste0("sufam/", sample_names[i], ".txt"), header=TRUE, sep="\t", stringsAsFactors=FALSE) 20 | DP[[i]] = tmp[,"cov"] 21 | AD[[i]] = tmp[,"val_al_count"] 22 | MAF[[i]] = tmp[,"val_maf"] 23 | } 24 | DP = do.call(cbind, DP) 25 | colnames(DP) = paste0("DP_", sample_names) 26 | AD = do.call(cbind, AD) 27 | colnames(AD) = paste0("AD_", sample_names) 28 | MAF = do.call(cbind, MAF) 29 | colnames(MAF) = paste0("MAF_", sample_names) 30 | vcf = read.table(file="sufam/pdx.vcf", header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE) 31 | chr = vcf[,1] 32 | pos = vcf[,2] 33 | ref = vcf[,4] 34 | alt = vcf[,5] 35 | gene_symbol = unlist(lapply(strsplit(vcf[,7], "p.", fixed=TRUE), function(x) { x[1] })) 36 | hgvsp_short = paste0("p.", unlist(lapply(strsplit(vcf[,7], "p.", fixed=TRUE), function(x) { x[2] }))) 37 | res = cbind(chr, pos, ref, alt, gene_symbol, hgvsp_short, DP, AD, MAF) 38 | 
colnames(res)[1:6] = c("Chromosome", "Position", "Reference_Allele", "Alternate_Allele", "Gene_Symbol", "HGVSp") 39 | write.table(res, file=out_file_name, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) 40 | -------------------------------------------------------------------------------- /sv_callers/crest.mk: -------------------------------------------------------------------------------- 1 | # run crest 2 | 3 | include modules/Makefile.inc 4 | 5 | LOGDIR = log/crest.$(NOW) 6 | 7 | CREST_DIR = $(HOME)/share/usr/crest 8 | CREST = PERL5LIB=/home/limr/share/usr/crest:/home/limr/share/usr/src/bioperl-live:/home/limr/share/usr/perl5/lib/perl5:/home/limr/share/usr/src/ensembl/modules:/home/limr/share/usr/src/ensembl-compara/modules:/home/limr/share/usr/src/ensembl-variation/modules:/home/limr/share/usr/src/ensembl-funcgen/modules $(PERL) $(CREST_DIR)/CREST.pl 9 | CREST_OPTS = --blat $(BLAT) --cap3 $(CAP3) --blatclient $(GFCLIENT) --blatserver 140.163.153.48 --blatport 88878 -t $(REF_2BIT) --ref_genome $(REF_FASTA) 10 | EXTRACT_SCLIP = PERL5LIB=$(CREST_DIR):$(PERL5LIB) $(CREST_DIR)/extractSClip.pl 11 | EXTRACT_SCLIP_OPTS = --ref_genome $(REF_FASTA) 12 | 13 | .SECONDARY: 14 | .DELETE_ON_ERROR: 15 | 16 | 17 | ifdef SAMPLE_PAIRS 18 | PHONY += crestTN 19 | crestTN : $(foreach pair,$(SAMPLE_PAIRS),crest/$(pair).crest_timestamp) 20 | else 21 | PHONY += crest 22 | crest : $(foreach sample,$(SAMPLES),crest/$(sample).crest_timestamp) 23 | endif 24 | 25 | crest/%.read_len : bam/%.bam 26 | $(call RUN,-s 3G -m 5G,"$(SAMTOOLS) view $< | tail -n+100000 | head -1 | awk '{ print length(\$$10) }' > $@") 27 | 28 | crest/%.sclip.txt : bam/%.bam 29 | $(call RUN,-s 6G -m 8G,"$(EXTRACT_SCLIP) $(EXTRACT_SCLIP_OPTS) -p $(@D)/$* -i $<") 30 | 31 | crest/%.crest_timestamp : bam/%.bam crest/%.sclip.txt crest/%.read_len 32 | $(call RUN,-s 15G -m 60G,"$(CREST) $(CREST_OPTS) -f $(<<) -d $< -p $(@D)/$* -l `cat $(<<<)` && touch $@") 33 | 34 | define crest-tumor-normal 35 | 
crest/$1_$2.crest_timestamp : bam/$1.bam bam/$2.bam crest/$1.sclip.txt crest/$1.read_len 36 | $$(call RUN,-s 15G -m 60G,"$$(CREST) $$(CREST_OPTS) -f $$(<<<) -d $$< -g $$(<<) -p $$(@D)/$1_$2 -l `cat $$(<<<<)` && touch $$@") 37 | endef 38 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call crest-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 39 | 40 | .PHONY: $(PHONY) 41 | -------------------------------------------------------------------------------- /sv_callers/delly.mk: -------------------------------------------------------------------------------- 1 | # run delly 2 | LOGDIR = log/delly.$(NOW) 3 | 4 | include modules/Makefile.inc 5 | 6 | DELLY_ENV = $(HOME)/share/usr/anaconda-envs/delly-0.7.6 7 | 8 | SUAVE_BAM_TO_H5 = python $(HOME)/share/usr/delly/vis/suave/suave_bam_to_h5.py 9 | 10 | DELLY_TYPES = DEL DUP INV TRA INS 11 | 12 | .SECONDARY: 13 | .DELETE_ON_ERROR: 14 | 15 | .PHONY: delly 16 | 17 | delly: $(foreach pair,$(SAMPLE_PAIRS),$(foreach type,$(DELLY_TYPES),delly/bcf/$(pair).delly_$(type).bcf)) \ 18 | $(foreach sample,$(SAMPLES),delly/h5/$(sample).h5) 19 | # One rule per tumor/normal pair and SV type; memory limits are passed as -s/-m inside the options argument, as in the h5 rule below (NOTE(review): confirm against the RUN definition in Makefile.inc). 20 | define delly-pair-type 21 | delly/bcf/$1_$2.delly_$3.bcf : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai 22 | $$(call RUN,-v $$(DELLY_ENV) -w 256:00:00 -s 8G -m 12G,"delly call -t $3 -o $$@ -g $$(REF_FASTA) $$< $$(<<)") 23 | endef 24 | $(foreach pair,$(SAMPLE_PAIRS),\ 25 | $(foreach type,$(DELLY_TYPES),\ 26 | $(eval $(call delly-pair-type,$(tumor.$(pair)),$(normal.$(pair)),$(type))))) 27 | 28 | delly/h5/%.h5 : bam/%.bam 29 | $(call RUN,-v $(DELLY_ENV) -s 8G -m 10G,"$(SUAVE_BAM_TO_H5) -s $* -c gzip -o $@ $<") 30 | 31 | 32 | -------------------------------------------------------------------------------- /sv_callers/destruct.mk: -------------------------------------------------------------------------------- 1 | # Run destruct 2 | # 3 | # 4 | # Author: Fong Chun Chan 5 | # 6 | 7 | SHELL := /bin/bash 8 | 9 | include modules/Makefile.inc 10 | 11 | LOGDIR = destruct/log 12 | 13 | DESTRUCT_CONFIG_FILE = 
$(HOME)/share/usr/destruct/destruct/config.txt 14 | #ANALYZE_DNA_BAM = $(HOME)/share/usr/nfuse-0.1.2/scripts/analyze_dna_bam.pl -c $(HOME)/usr/nfuse-0.1.2/scripts/config.txt 15 | #DESTRUCT = /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/python2.7 /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/destruct.py /scratch/sohrab_temp/amcpherson_tmp/forray/genesis_config.ini 16 | #DESTRUCT = /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/python2.7 $(HOME)/share/usr/destruct/destruct/destruct.py /scratch/sohrab_temp/amcpherson_tmp/config.ini 17 | DESTRUCT = $(PYTHON) $(HOME)/share/usr/destruct/destruct/destruct.py $(DESTRUCT_CONFIG_FILE) 18 | 19 | VPATH = bam 20 | 21 | SAMPLE_FILE = samples.txt 22 | 23 | .SECONDARY: 24 | .DELETE_ON_ERROR: 25 | .PHONY : all 26 | 27 | all : destruct/all.timestamp 28 | 29 | #### 30 | ## Build the destruct file list 31 | #### 32 | destruct/all_file_list.txt : $(foreach sample,$(SAMPLES),$(sample).bam) 33 | mkdir -p $(@D); rm -f $@; for bam in $^; do \ 34 | sample=`echo "$$bam" | sed 's/.*\///; s/\..*//;'`; \ 35 | echo -e "$$sample\t$$bam" >> $@; \ 36 | done 37 | 38 | destruct/%.timestamp : destruct/%_file_list.txt 39 | mkdir -p $(@D)/$*.tmp $(@D)/breakpoints $(@D)/breakreads $(LOGDIR); $(DESTRUCT) $< $(@D)/$*.tmp $(@D)/breakpoints/$*.breakpoints $(@D)/breakreads/$*.breakreads qsub -p 100 &> $(LOGDIR)/$*.log && touch $@ 40 | -------------------------------------------------------------------------------- /sv_callers/ericScript.mk: -------------------------------------------------------------------------------- 1 | # run EricScript 2 | # author: Raymond Lim 3 | 4 | ERICSCRIPT_ENV = $(HOME)/share/usr/anaconda-envs/ericscript-0.5.5/ 5 | ERICSCRIPT = ericscript.pl 6 | ERICSCRIPT_OPTS ?= --refid $(ERICSCRIPT_SPECIES) -db $(ERICSCRIPT_DB) --remove 7 | ERICSCRIPT_TO_USV = python modules/sv_callers/ericscript2usv.py 8 | 9 | .PHONY: ericscript 10 | .SECONDARY: 11 | .DELETE_ON_ERROR: 12 | 13 | ericscript: $(foreach 
sample,$(SAMPLES),usv/$(sample).ericscript.tsv) 14 | 15 | ericscript/%_ericscript.timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz 16 | $(call RUN,-s 14G -m 14G -n 7 -N $*_ericscript,"$(ERICSCRIPT) $(ERICSCRIPT_OPTS) -p 7 -name $* -o $(@D)/$* $^ && \ 17 | touch $@") 18 | # Convert the filtered results that the rule above writes under ericscript/<sample>/. 19 | usv/%.ericscript.tsv : ericscript/%_ericscript.timestamp 20 | $(call RUN,,"$(ERICSCRIPT_TO_USV) < ericscript/$*/$*.results.filtered.tsv > $@") 21 | -------------------------------------------------------------------------------- /sv_callers/extractCoordsFromDefuse.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | use Getopt::Std; 7 | 8 | my %opt; 9 | getopts('t:', \%opt); 10 | 11 | my $usage = <) { 31 | chomp $line; 32 | 33 | if ($line =~ /^cluster_id/) { 34 | @header = split /\t/, $line; 35 | next; 36 | } 37 | 38 | my @arr = split /\t/, $line; 39 | my %F = map { $_ => shift @arr } @header; 40 | 41 | my $upstream = ($F{upstream_gene} eq $F{gene_name1})? 1 : 2; 42 | my $downstream = ($F{downstream_gene} eq $F{gene_name1})? 1 : 2; 43 | 44 | my $upstreamChr = "chr" . $F{"gene_chromosome" . $upstream }; 45 | my $downstreamChr = "chr" . $F{"gene_chromosome" . $downstream }; 46 | 47 | # give first/last nt lost 48 | my $upstreamPosn = ($F{"gene_strand" . $upstream} eq "+")? $F{"genomic_break_pos" . $upstream } + 1 : $F{"genomic_break_pos" . $upstream } - 1; 49 | my $downstreamPosn = ($F{"gene_strand" . $downstream} eq "+")? $F{"genomic_break_pos" . $downstream } - 1 : $F{"genomic_break_pos" . $downstream } + 1; 50 | 51 | print join("\t", ($upstreamChr, $upstreamPosn, $downstreamChr, $downstreamPosn, $tissueType)) . 
"\n"; 52 | } 53 | -------------------------------------------------------------------------------- /sv_callers/filterDefuse.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | #use strict; 4 | #use warnings; 5 | 6 | #use Getopt::Std; 7 | #my %opt; 8 | #getopts('h', \%opt); 9 | 10 | #my $usage <) { 20 | chomp $line; 21 | 22 | if ($line =~ /^cluster_id/) { 23 | @header = split /\t/, $line; 24 | push @header, "upstream_gene"; 25 | push @header, "downstream_gene"; 26 | #print join ("\t", 0..$#header), "\n"; 27 | print join("\t", @header) . "\n"; 28 | next; 29 | } 30 | 31 | my @arr = split /\t/, $line; 32 | my %F = map { $_ => shift @arr } @header; 33 | 34 | my $gene1 = $F{"gene_strand1"}; 35 | my $gene2 = $F{"gene_strand2"}; 36 | my $genomic1 = $F{"genomic_strand1"}; 37 | my $genomic2 = $F{"genomic_strand2"}; 38 | 39 | if ($gene1 eq $genomic1 && $gene2 ne $genomic2) { 40 | $F{"upstream_gene"} = $F{"gene_name1"}; 41 | $F{"downstream_gene"} = $F{"gene_name2"}; 42 | } elsif ($gene1 ne $genomic1 && $gene2 eq $genomic2) { 43 | $F{"upstream_gene"} = $F{"gene_name2"}; 44 | $F{"downstream_gene"} = $F{"gene_name1"}; 45 | } else { 46 | $F{"upstream_gene"} = ""; 47 | $F{"downstream_gene"} = ""; 48 | } 49 | 50 | if ($F{'orf'} eq "N") { 51 | print join("\t", @F{@header}) . "\n"; 52 | next; 53 | } 54 | 55 | 56 | if ($gene1 eq $genomic1) { 57 | $p5 = $F{"gene_location1"}; 58 | } else { 59 | $p3 = $F{"gene_location2"}; 60 | } 61 | 62 | if ($gene2 eq $genomic2) { 63 | $p5 = $F{"gene_location1"}; 64 | } else { 65 | $p3 = $F{"gene_location2"}; 66 | } 67 | if ($p5 eq "utr3p" || $p5 eq "downstream") { 68 | $F{"orf"} = "Y (UTR mid-fusion)"; 69 | } elsif ($p3 eq "utr5p" || $p3 eq "upstream") { 70 | $F{"orf"} = "Y (UTR mid-fusion)"; 71 | } 72 | print join("\t", @F{@header}) . 
"\n"; 73 | } 74 | -------------------------------------------------------------------------------- /sv_callers/fusioncatcher.mk: -------------------------------------------------------------------------------- 1 | # Run fusioncatcher 2 | ##### DEFAULTS ###### 3 | 4 | LOGDIR = log/fusioncatcher.$(NOW) 5 | 6 | ##### MAKE INCLUDES ##### 7 | include modules/Makefile.inc 8 | 9 | FUSIONCATCHER = $(HOME)/share/usr/fusioncatcher/fusioncatcher_v0.99.2/fusioncatcher 10 | FUSIONCATCHER_OPTS = -d $(HOME)/share/usr/fusioncatcher/data/current --extract-buffer-size=35000000000 11 | 12 | .DELETE_ON_ERROR: 13 | .SECONDARY: 14 | .PHONY: all 15 | 16 | all : $(foreach sample,$(SAMPLES),fusioncatcher/$(sample).fusioncatcher_timestamp) 17 | 18 | fusioncatcher/%.fusioncatcher_timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz 19 | $(call RUN,-n 8 -s 1G -m 4G,"$(FUSIONCATCHER) $(FUSIONCATCHER_OPTS) -p 8 -o $(@D)/$* -i $<$(,)$(<<) && touch $@") 20 | -------------------------------------------------------------------------------- /sv_callers/fusionfinder.mk: -------------------------------------------------------------------------------- 1 | # Run fusionfinder 2 | ##### DEFAULTS ###### 3 | 4 | LOGDIR = log/fusionfinder.$(NOW) 5 | 6 | ##### MAKE INCLUDES ##### 7 | include modules/Makefile.inc 8 | 9 | FUSIONFINDER = 10 | FUSIONCATCHER_OPTS = --phred33 --cref --ncref 11 | 12 | .DELETE_ON_ERROR: 13 | .SECONDARY: 14 | .PHONY: all 15 | 16 | all : $(foreach sample,$(SAMPLES),fusioncatcher/$(sample).fusioncatcher_timestamp) 17 | 18 | fusioncatcher/%.fusioncatcher_timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz 19 | $(call RUN,-n 8 -s 1G -m 4G,"$(FUSIONCATCHER) $(FUSIONCATCHER_OPTS) -p 8 -o $(@D)/$* -i $<$(,)$(<<) && touch $@") 20 | -------------------------------------------------------------------------------- /sv_callers/hydra.mk: -------------------------------------------------------------------------------- 1 | # run hydra 2 | 3 | LOGDIR = log/hydra.$(NOW) 4 | 5 | HYDRA = 
$(HOME)/share/usr/bin/hydra 6 | override HYDRA_OPTS ?= -mld 500 -mn 1500 7 | BAM_TO_FASTQ = $(HOME)/share/usr/bin/bamToFastq 8 | BAM_TO_BED = /opt/common/bedtools/bedtools-2.17.0/bin/bamToBed 9 | DEDUP_DISCORDANTS = $(HOME)/share/usr/bin/dedupDiscordants.py 10 | PAIR_DISCORDANTS = $(HOME)/share/usr/bin/pairDiscordants.py 11 | 12 | include modules/Makefile.inc 13 | include modules/variant_callers/gatk.inc 14 | 15 | .SECONDARY: 16 | .DELETE_ON_ERROR: 17 | .PHONY: all 18 | 19 | all : $(foreach sample,$(SAMPLES),hydra/breaks/$(sample).breaks) 20 | 21 | 22 | #hydra/disc_fastq/%.disc.1.fastq.gz hydra/disc_fastq/%.disc.2.fastq.gz : bam/%.bam 23 | #$(INIT) $(SAMTOOLS) view -uF 2 $< | $(BAM_TO_FASTQ) -bam stdin -fq1 >( gzip -c > hydra/disc_fastq/$*.disc.1.fastq.gz) -fq2 >( gzip -c > hydra/disc_fastq/$*.disc.2.fastq.gz) 24 | 25 | hydra/bam/%.disc.bam : bam/%.bam 26 | $(call RUN,,"$(SAMTOOLS) view -bF 2 $< > $@") 27 | 28 | hydra/bed/%.disc.bedpe : hydra/bam/%.disc.bam 29 | $(call RUN,,"$(BAM_TO_BED) -i $< -tag NM | $(PAIR_DISCORDANTS) -i stdin -m hydra -z 800 > $@") 30 | 31 | hydra/bed/%.disc.dedup.bedpe : hydra/bed/%.disc.bedpe 32 | $(call RUN,,"$(DEDUP_DISCORDANTS) -i $< -s 3 > $@") 33 | 34 | hydra/breaks/%.breaks : hydra/bed/%.disc.dedup.bedpe 35 | $(call RUN,,"$(HYDRA) -in $< -out $@ $(HYDRA_OPTS)") 36 | -------------------------------------------------------------------------------- /sv_callers/manta.inc: -------------------------------------------------------------------------------- 1 | ifndef MANTA_INC 2 | CONFIG_MANTA = $(HOME)/share/usr/manta-0.29.6.centos5_x86_64/bin/configManta.py 3 | # CONFIG_MANTA_OPTS is assembled once below and appended (+=), so flags set before this include (e.g. --rna from mantaRnaseq.mk) are kept. 4 | MANTA_HS_CONFIG = modules/sv_callers/manta_hs_config.py.ini 5 | MANTA_CONFIG = modules/sv_callers/manta_config.py.ini 6 | MANTA_HIGH_SENS ?= false 7 | CONFIG_MANTA_OPTS += --referenceFasta $(REF_FASTA) \ 8 | --config $(if $(findstring 
true,$(MANTA_HIGH_SENS)),\ 9 | $(MANTA_HS_CONFIG),$(MANTA_CONFIG)) \ 10 | $(if $(TARGETS_FILE),--exome) \ 11 | $(if $(MANTA_REGION),--region $(MANTA_REGION)) 12 | endif 13 | MANTA_INC = true 14 | 15 | -------------------------------------------------------------------------------- /sv_callers/manta.mk: -------------------------------------------------------------------------------- 1 | # run manta on tumour-normal matched pairs 2 | 3 | include modules/Makefile.inc 4 | include modules/sv_callers/manta.inc 5 | 6 | LOGDIR ?= log/manta.$(NOW) 7 | PHONY += manta manta_vcfs 8 | 9 | manta : manta_vcfs 10 | 11 | manta_vcfs: $(foreach sample,$(SAMPLES),vcf/$(sample).manta_sv.eff.vcf vcf/$(sample).manta_indels.eff.vcf vcf/$(sample).manta_candidate_sv.eff.vcf) 12 | 13 | manta/%/runWorkflow.py : bam/%.bam bam/%.bam.bai 14 | $(INIT) $(CONFIG_MANTA) $(CONFIG_MANTA_OPTS) --tumorBam $< --runDir $(@D) 15 | 16 | manta/%/results/variants/tumorSV.vcf.gz manta/%/results/variants/candidateSmallIndels.vcf.gz manta/%/results/variants/candidateSV.vcf.gz : manta/%/runWorkflow.py 17 | $(call RUN,-n 8 -s 2G -m 2G,"python $< -m local -j 8") 18 | 19 | vcf/%.manta_sv.vcf : manta/%/results/variants/tumorSV.vcf.gz 20 | $(INIT) zcat $< > $@ 21 | 22 | vcf/%.manta_indels.vcf : manta/%/results/variants/candidateSmallIndels.vcf.gz 23 | $(INIT) zcat $< > $@ 24 | 25 | vcf/%.manta_candidate_sv.vcf : manta/%/results/variants/candidateSV.vcf.gz 26 | $(INIT) zcat $< > $@ 27 | 28 | .PHONY: $(PHONY) 29 | 30 | include modules/vcf_tools/vcftools.mk 31 | -------------------------------------------------------------------------------- /sv_callers/mantaRnaseq.mk: -------------------------------------------------------------------------------- 1 | # run manta on rna-seq 2 | 3 | CONFIG_MANTA_OPTS += --rna 4 | 5 | include modules/sv_callers/manta.mk 6 | -------------------------------------------------------------------------------- /sv_callers/manta_config.py.ini: 
-------------------------------------------------------------------------------- 1 | 2 | # 3 | # This section contains all configuration settings for the top-level manta workflow, 4 | # 5 | [manta] 6 | 7 | referenceFasta = /illumina/development/Isis/Genomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa 8 | 9 | # Run discovery and candidate reporting for all SVs/indels at or above this size: 10 | minCandidateVariantSize = 8 11 | 12 | # Remove all edges from the graph unless they're supported by this many 'observations'. 13 | # Note that one supporting read pair or split read usually equals one observation, but evidence is sometimes downweighted. 14 | minEdgeObservations = 3 15 | 16 | # Run discovery and candidate reporting for all SVs/indels with at least this 17 | # many spanning support observations 18 | minCandidateSpanningCount = 3 19 | 20 | # After candidate identification, only score and report SVs/indels at or above this size: 21 | minScoredVariantSize = 51 22 | 23 | # minimum VCF "QUAL" score for a variant to be included in the diploid vcf: 24 | minDiploidVariantScore = 10 25 | 26 | # VCF "QUAL" score below which a variant is marked as filtered in the diploid vcf: 27 | minPassDiploidVariantScore = 20 28 | 29 | # minimum genotype quality score below which single samples are filtered for a variant in the diploid vcf: 30 | minPassDiploidGTScore = 15 31 | 32 | # somatic quality scores below this level are not included in the somatic vcf: 33 | minSomaticScore = 10 34 | 35 | # somatic quality scores below this level are filtered in the somatic vcf: 36 | minPassSomaticScore = 30 37 | -------------------------------------------------------------------------------- /sv_callers/manta_hs_config.py.ini: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # This section contains all configuration settings for the top-level manta workflow, 4 | # 5 | [manta] 6 | 7 | referenceFasta = 
/home/limr/share/reference/GATK_bundle/2.3/human_g1k_v37.fa 8 | 9 | # Run discovery and candidate reporting for all SVs/indels at or above this size: 10 | minCandidateVariantSize = 8 11 | 12 | # Remove all edges from the graph unless they're supported by this many 'observations'. 13 | # Note that one supporting read pair or split read usually equals one observation, but evidence is sometimes downweighted. 14 | minEdgeObservations = 1 15 | 16 | # Run discovery and candidate reporting for all SVs/indels with at least this 17 | # many spanning support observations 18 | minCandidateSpanningCount = 1 19 | 20 | # After candidate identification, only score and report SVs/indels at or above this size: 21 | minScoredVariantSize = 51 22 | 23 | # minimum VCF "QUAL" score for a variant to be included in the diploid vcf: 24 | minDiploidVariantScore = 10 25 | 26 | # VCF "QUAL" score below which a variant is marked as filtered in the diploid vcf: 27 | minPassDiploidVariantScore = 20 28 | 29 | # minimum genotype quality score below which single samples are filtered for a variant in the diploid vcf: 30 | minPassDiploidGTScore = 15 31 | 32 | # somatic quality scores below this level are not included in the somatic vcf: 33 | minSomaticScore = 10 34 | 35 | # somatic quality scores below this level are filtered in the somatic vcf: 36 | minPassSomaticScore = 30 37 | -------------------------------------------------------------------------------- /sv_callers/manta_tumor_normal.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | include modules/sv_callers/manta.inc 3 | 4 | LOGDIR ?= log/manta_tumor_normal.$(NOW) 5 | 6 | manta : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).manta_sv.vcf) 7 | 8 | define manta-tumor-normal 9 | manta/$1_$2/runWorkflow.py : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai 10 | $$(INIT) $$(CONFIG_MANTA) $$(CONFIG_MANTA_OPTS) --tumorBam $$(<) --normalBam $$(<<) --runDir $$(@D) 11 | 12 | 
manta/$1_$2.manta_timestamp : manta/$1_$2/runWorkflow.py 13 | $$(call RUN,-n 8 -s 2G -m 4G -w 72:00:00,"set -o pipefail && \ 14 | python $$(<) -m local -j 8 && touch $$(@)") 15 | 16 | manta/$1_$2/results/variants/somaticSV.vcf.gz : manta/$1_$2.manta_timestamp 17 | 18 | vcf/$1_$2.manta_sv.vcf : manta/$1_$2/results/variants/somaticSV.vcf.gz 19 | $$(INIT) zcat $$(<) > $$(@) 20 | 21 | endef 22 | $(foreach pair,$(SAMPLE_PAIRS), \ 23 | $(eval $(call manta-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 24 | 25 | ..DUMMY := $(shell mkdir -p version; \ 26 | python --version &> version/manta_tumor_normal.txt) 27 | .SECONDARY: 28 | .DELETE_ON_ERROR: 29 | .PHONY: manta 30 | -------------------------------------------------------------------------------- /sv_callers/mapsplice.mk: -------------------------------------------------------------------------------- 1 | # Run mapsplice 2 | ##### DEFAULTS ###### 3 | 4 | LOGDIR = log/mapsplice.$(NOW) 5 | 6 | ##### MAKE INCLUDES ##### 7 | include modules/Makefile.inc 8 | 9 | MAPSPLICE_TO_USV = python modules/sv_callers/mapsplice2usv.py 10 | 11 | MAPSPLICE_ENV = $(HOME)/share/usr/anaconda-envs/mapsplice-2.2.1 12 | MAPSPLICE = mapsplice.py 13 | MAPSPLICE_OPTS = -c $(MAPSPLICE_REF_DIR) -x $(MAPSPLICE_REF_BASENAME) --bam --gene-gtf $(GENES_GTF) --fusion 14 | 15 | ifeq ($(BAM_PHRED64),true) 16 | MAPSPLICE_OPTS += --qual-scal phred64 17 | else 18 | MAPSPLICE_OPTS += --qual-scal phred33 19 | endif 20 | 21 | .DELETE_ON_ERROR: 22 | .SECONDARY: 23 | .PHONY: mapsplice 24 | 25 | mapsplice : $(foreach sample,$(SAMPLES),usv/$(sample).mapsplice.tsv) 26 | 27 | mapsplice/%_mapsplice.timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz 28 | $(call RUN,-n 6 -s 2G -m 3G,"TMP1=`mktemp --tmpdir=$(TMPDIR)`.1.fastq; \ 29 | TMP2=`mktemp --tmpdir=$(TMPDIR)`.2.fastq; \ 30 | gzip -dc $< > \$$TMP1; \ 31 | gzip -dc $(<<) > \$$TMP2; \ 32 | mkdir -p mapsplice/$*; \ 33 | $(MAPSPLICE) $(MAPSPLICE_OPTS) -p 6 -o mapsplice/$* -1 \$$TMP1 -2 \$$TMP2 && touch $@; \ 34 | rm 
\$$TMP1 \$$TMP2") 35 | 36 | usv/%.mapsplice.tsv : mapsplice/%_mapsplice.timestamp 37 | $(call RUN,,"$(MAPSPLICE_TO_USV) < mapsplice/$*/fusions_not_well_annotated.txt mapsplice/$*/fusions_well_annotated.txt > $@") 38 | -------------------------------------------------------------------------------- /sv_callers/nfuseDNA.mk: -------------------------------------------------------------------------------- 1 | # Run nfuse on dna bams 2 | # This module is defunct now. Use destruct to call rearrangements for DNA only 3 | # 4 | # 5 | # Author: Raymond Lim 6 | # 7 | 8 | SHELL := /bin/bash 9 | 10 | include modules/Makefile.inc 11 | 12 | LOGDIR = nfuse/log 13 | ANALYZE_DNA_BAM = $(HOME)/share/usr/nfuse-0.1.2/scripts/analyze_dna_bam.pl -c $(HOME)/usr/nfuse-0.1.2/scripts/config.txt 14 | 15 | ANALYZE_DNA_BAM = /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/python2.7 /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/destruct.py /scratch/sohrab_temp/amcpherson_tmp/forray/genesis_config.ini 16 | 17 | VPATH = bam 18 | 19 | SAMPLE_FILE = samples.txt 20 | SAMPLES = $(shell cat $(SAMPLE_FILE)) 21 | 22 | .SECONDARY: 23 | .DELETE_ON_ERROR: 24 | .PHONY : all 25 | 26 | all : nfuse/timestamp 27 | 28 | nfuse/file_list.txt : $(foreach sample,$(SAMPLES),$(sample).bam) 29 | mkdir -p $(@D); rm -f $@; for bam in $^; do \ 30 | sample=`echo "$$bam" | sed 's/.*\///; s/\..*//;'`; \ 31 | echo -e "$$sample\t$$bam" >> $@; \ 32 | done 33 | 34 | nfuse/timestamp : nfuse/file_list.txt 35 | mkdir -p tmp $(LOGDIR); $(ANALYZE_DNA_BAM) $< tmp $(@D) sge -p 100 &> $(LOGDIR)/nfuse.log && touch $@ 36 | 37 | -------------------------------------------------------------------------------- /sv_callers/nfuseWGSSWTSS.mk: -------------------------------------------------------------------------------- 1 | # vim: set ft=make : 2 | 3 | include modules/Makefile.inc 4 | 5 | WGSS_WTSS_PAIR_FILE ?= wgss_wtss_pairs.txt 6 | 7 | WGSS_SAMPLES ?= $(shell cut -f 1 $(WGSS_WTSS_PAIR_FILE)) 8 | WTSS_SAMPLES ?= $(shell cut -f 2 
$(WGSS_WTSS_PAIR_FILE)) 9 | SAMPLES ?= $(WGSS_SAMPLES) $(WTSS_SAMPLES) 10 | NSAMPLES ?= $(words $(WGSS_SAMPLES)) 11 | 12 | $(foreach i,$(shell seq 1 $(NSAMPLES)),$(eval wgss_lookup.$(word $i,$(WGSS_SAMPLES)) := $(word $i,$(WTSS_SAMPLES)))) 13 | 14 | NFUSE = $(HOME)/share/usr/nfuse-0.2.0/scripts/nfuse.pl -c $(HOME)/share/usr/nfuse-0.2.0/scripts/config.txt -s sge -p 100 15 | 16 | #VPATH = ../WTSS/bam ../WGSS/bam 17 | 18 | LOGDIR ?= log/nfuse.$(NOW) 19 | 20 | .SECONDARY: 21 | .DELETE_ON_ERROR: 22 | .PHONY: all 23 | 24 | all : $(foreach wgss_sample,$(WGSS_SAMPLES),nfuse/$(wgss_sample)_$(wgss_lookup.$(wgss_sample)).timestamp) 25 | 26 | #all : $(foreach i,$(shell seq 1 $(NSAMPLES)),nfuse/$(word $i,$(WGSS_SAMPLES))_$(word $i,$(WTSS_SAMPLES)).timestamp) 27 | 28 | #$(call nfuse-wgss-wtss,wgss-sample,wtss-sample) 29 | define nfuse-wgss-wtss 30 | nfuse/$1_$2.timestamp : fastq/$1.1.fastq.gz fastq/$1.2.fastq.gz fastq/$2.1.fastq.gz fastq/$2.2.fastq.gz 31 | $$(INIT) $$(NFUSE) --dnafq1 $$(word 1,$$^) --dnafq2 $$(word 2,$$^) --rnafq1 $$(word 3,$$^) --rnafq2 $$(word 4,$$^) -o nfuse/$1_$2 -n $1_$2 &> $(LOGDIR)/$1_$2.log && touch $$@ 32 | endef 33 | $(foreach i,$(shell seq 1 $(NSAMPLES)),$(eval $(call nfuse-wgss-wtss,$(word $i,$(WGSS_SAMPLES)),$(word $i,$(WTSS_SAMPLES))))) 34 | 35 | 36 | #include modules/fastq_tools/fastq.mk 37 | -------------------------------------------------------------------------------- /sv_callers/oncofuse.mk: -------------------------------------------------------------------------------- 1 | # run oncofuse 2 | # b37 only 3 | 4 | ONCOFUSE_MEM = $(JAVA7) -Xmx$1 -jar $(ONCOFUSE_JAR) 5 | ONCOFUSE_TISSUE_TYPE ?= EPI 6 | 7 | %.oncofuse.txt : %.coord.txt 8 | $(call RUN,-s 8G -m 12G,"$(call ONCOFUSE_MEM,7G) $< coord $(ONCOFUSE_TISSUE_TYPE) $@") 9 | 10 | %.oncofuse.merged.txt : %.txt %.oncofuse.txt 11 | $(INIT) head -1 $< | sed 's/^/RowID\t/' > $<.tmp && awk 'BEGIN {OFS = "\t" } NR > 1 { print NR-1, $$0 }' $< >> $<.tmp ;\ 12 | cut -f 2- $(<<) > $(<<).tmp; \ 13 | 
$(RSCRIPT) $(MERGE) -X --byColX 1 --byColY 1 -H $<.tmp $(<<).tmp > $@ && rm -f $<.tmp $(<<).tmp 14 | 15 | -------------------------------------------------------------------------------- /sv_callers/prepareSoapFuse.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # prepare soapFuse file structure using samples.txt file 3 | # print out soapfuse samples file 4 | 5 | use strict; 6 | use warnings; 7 | 8 | use File::Path qw/make_path/; 9 | 10 | use Getopt::Std; 11 | my %opt; 12 | getopts('h', \%opt); 13 | 14 | my $usage = <; 29 | return length(); 30 | } 31 | 32 | 33 | while (my $line = <>) { 34 | chomp $line; 35 | my $sample = $line; 36 | my $fq1 = "fastq/$sample.1.fastq.gz"; 37 | my $fq2 = "fastq/$sample.2.fastq.gz"; 38 | die "Cannot find fastq files ($fq1 and $fq2)" unless (-e $fq1 && -e $fq2); 39 | my $sampleDir = "soapfuse/$sample/$sample"; 40 | make_path($sampleDir); 41 | system "ln -f $fq1 $sampleDir/${sample}_1.fastq.gz"; 42 | system "ln -f $fq2 $sampleDir/${sample}_2.fastq.gz"; 43 | my $readLength = &getReadLength($fq1); 44 | 45 | print "$sample\t$sample\t$sample\t$readLength\n"; 46 | } 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /sv_callers/starFusion.mk: -------------------------------------------------------------------------------- 1 | # run star fusion on fastqs 2 | include modules/Makefile.inc 3 | 4 | LOGDIR = log/star_fusion.$(NOW) 5 | 6 | $(if $(STAR_CTAT_DIR),,$(error no STAR CTAT dir)) 7 | STAR_FUSION = STAR-Fusion 8 | STAR_FUSION_ENV = $(HOME)/share/usr/anaconda-envs/star-fusion-1.0.0 9 | STAR_FUSION_OPTS = --genome_lib_dir $(STAR_CTAT_DIR) 10 | 11 | STAR_FUSION_TO_USV = python modules/sv_callers/starfusion2usv.py 12 | 13 | 14 | PHONY += star_fusion 15 | star_fusion : $(foreach sample,$(SAMPLES),usv/$(sample).star_fusion.tsv) 16 | 17 | star_fusion/%.star_fusion_timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz 18 | $(call RUN,-v 
$(STAR_FUSION_ENV) -n 8 -s 2G -m 5G,"$(STAR_FUSION) \ 19 | --CPU 8 \ 20 | --output_dir $(@D)/$* \ 21 | --genome_lib_dir $(STAR_CTAT_DIR) \ 22 | --verbose_level 2 \ 23 | --left_fq $< --right_fq $(<<) && touch $@") 24 | 25 | usv/%.star_fusion.tsv : star_fusion/%.star_fusion_timestamp 26 | $(call RUN,,"$(STAR_FUSION_TO_USV) < $( $@") 27 | 28 | .PHONY: $(PHONY) 29 | .SECONDARY: 30 | .DELETE_ON_ERROR: 31 | 32 | include modules/fastq_tools/mergeSplitFastq.mk 33 | -------------------------------------------------------------------------------- /sv_callers/svaba_tumor_normal.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR = log/svaba_tumor_normal.$(NOW) 4 | 5 | SVABA_CORES ?= 8 6 | SVABA_MEM_CORE ?= 6G 7 | SVABA_REF ?= $(REF_FASTA) 8 | SVABA_DBSNP ?= $(HOME)/share/lib/resource_files/svaba/dbsnp_indel.vcf 9 | SVABA_BLACKLIST ?= $(HOME)/share/lib/resource_files/svaba/wgs_blacklist_meres.bed 10 | SVABA ?= svaba 11 | 12 | svaba : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).svaba_sv.vcf) 13 | 14 | define svaba-tumor-normal 15 | svaba/$1_$2.svaba.somatic.sv.vcf : bam/$1.bam bam/$2.bam 16 | $$(call RUN,-c -n $(SVABA_CORES) -s 4G -m $(SVABA_MEM_CORE) -v $(SVABA_ENV) -w 72:00:00,"set -o pipefail && \ 17 | mkdir -p svaba && \ 18 | cd svaba && \ 19 | $$(SVABA) run \ 20 | -t ../bam/$1.bam \ 21 | -n ../bam/$2.bam \ 22 | -p $$(SVABA_CORES) \ 23 | -D $$(SVABA_DBSNP) \ 24 | -L 100000 \ 25 | -x 25000 \ 26 | -k $$(SVABA_BLACKLIST) \ 27 | -a $1_$2 \ 28 | -G $$(SVABA_REF)") 29 | 30 | vcf/$1_$2.svaba_sv.vcf : svaba/$1_$2.svaba.somatic.sv.vcf 31 | $$(INIT) cat $$< > $$@ 32 | 33 | endef 34 | $(foreach pair,$(SAMPLE_PAIRS),\ 35 | $(eval $(call svaba-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 36 | 37 | 38 | ..DUMMY := $(shell mkdir -p version; \ 39 | $(SVABA) --help &> version/svaba_tumor_normal.txt) 40 | .SECONDARY: 41 | .DELETE_ON_ERROR: 42 | .PHONY: svaba 43 | 
-------------------------------------------------------------------------------- /sv_callers/tophatFusion.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | include modules/variant_callers/gatk.inc 3 | 4 | TOPHAT := $(HOME)/share/usr/bin/tophat2 5 | TOPHAT_OPTS := --no-coverage-search --fusion-ignore-chromosomes MT --fusion-search --keep-fasta-order 6 | 7 | ifeq ($(BAM_PHRED64),true) 8 | TOPHAT_OPTS += --solexa1.3-quals 9 | endif 10 | 11 | .SECONDARY: 12 | .DELETE_ON_ERROR: 13 | .PHONY: all 14 | 15 | all : $(foreach sample,$(SAMPLES),tophat/$(sample)/fusions.out) 16 | 17 | tophat/%/fusions.out : fastq/%.1.fastq.gz fastq/%.2.fastq.gz tophat/ins_size/%.insert_size.txt 18 | DIST_OPTS=`perl -e '$$text = do {local $$/ ; <>}; $$text =~ m/Read length: mean (\d+).*\nRead span: mean (\d+).*STD=(\d+)/; print "--mate-inner-dist " . ($$2 - $$1 * 2) . " --mate-std-dev $$3";' $(word 3,$^)`; \ 19 | $(call RUN,-N $*_tophat -n 4 -s 6G -m 10G,"$(TOPHAT) $(TOPHAT_OPTS) -p 4 $$DIST_OPTS -o $(@D) $(BOWTIE_REF) $(<) $(word 2,$(^))") 20 | 21 | tophat/ins_size/%.insert_size.txt : bam/%.bam 22 | $(call RUN,,"$(SAMTOOLS) view $< | $(GET_INSERT_SIZE) - > $@") 23 | 24 | tophat/fusions/%.fusions.ft.txt : tophat/%/fusions.out 25 | mkdir -p $(@D) && awk '$$5 > 100 { print }' $< > $@ 26 | 27 | -------------------------------------------------------------------------------- /variant_callers/gatk.inc: -------------------------------------------------------------------------------- 1 | ifndef GATK_INC 2 | DEFAULT_JAVA_MEM = 18G 3 | GATK = $(call GATK_MEM,$(DEFAULT_JAVA_MEM)) 4 | GATK_MEM = $(JAVA7) -Xmx$(1) -jar $(GATK_JAR) -S LENIENT 5 | GATK2 = $(call GATK_MEM2,$(DEFAULT_JAVA_MEM)) 6 | GATK_MEM2 = $(JAVA8) -Xmx$(1) -jar $(GATK_JAR2) -S LENIENT 7 | 8 | endif 9 | GATK_INC = true 10 | -------------------------------------------------------------------------------- /variant_callers/get_basecounts.mk: 
-------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/get_basecount.$(NOW) 4 | 5 | MAPQ := 0 6 | BAQ := 0 7 | COV := 0 8 | 9 | getbasecount : $(foreach sample,$(SAMPLES),gbc/$(sample).txt.gz) \ 10 | gbc/summary.txt 11 | 12 | define get-basecount 13 | gbc/$1.txt.gz : bam/$1.bam vcf/dataSilentNoPoleNotTertPromot.vcf 14 | $$(call RUN,-n 6 -s 3G -m 6G,"set -o pipefail && \ 15 | $(GBC) --fasta $(REF_FASTA) \ 16 | --bam $$(<) \ 17 | --vcf $$(<<) \ 18 | --output $$(@) \ 19 | --thread 6 \ 20 | --sort_output \ 21 | --compress_output \ 22 | --maq $(MAPQ) \ 23 | --baq $(BAQ) \ 24 | --cov $(COV) \ 25 | --filter_duplicate 0 \ 26 | --filter_improper_pair 0 \ 27 | --filter_qc_failed 1 \ 28 | --filter_indel 0 \ 29 | --filter_non_primary 1") 30 | 31 | endef 32 | $(foreach sample,$(SAMPLES),\ 33 | $(eval $(call get-basecount,$(sample)))) 34 | 35 | 36 | gbc/summary.txt : $(foreach sample,$(SAMPLES),gbc/$(sample).txt.gz) 37 | $(call RUN,-n 1 -s 24G -m 32G,"set -o pipefail && \ 38 | $(RSCRIPT) $(SCRIPTS_DIR)/get_basecounts.R \ 39 | --option 1 \ 40 | --sample_name '$(SAMPLES)'") 41 | 42 | 43 | ..DUMMY := $(shell mkdir -p version; \ 44 | ${GBC} &> version/get_basecount.txt;) 45 | .SECONDARY: 46 | .DELETE_ON_ERROR: 47 | .PHONY: getbasecount 48 | -------------------------------------------------------------------------------- /variant_callers/hotspot.mk: -------------------------------------------------------------------------------- 1 | # run unified genotyper on hotspots 2 | 3 | include modules/Makefile.inc 4 | include modules/variant_callers/gatk.inc 5 | 6 | LOGDIR ?= log/hotspot.$(NOW) 7 | PHONY += hotspot hotspot_vcfs hotspot_tables 8 | 9 | .DELETE_ON_ERROR: 10 | .SECONDARY: 11 | .PHONY: $(PHONY) 12 | 13 | HOTSPOT_GATK_OPTS = --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -R $(REF_FASTA) -stand_call_conf 0 14 | 15 | 16 | hotspot : hotspot_vcfs hotspot_tables 17 | hotspot_vcfs : 
$(if $(SAMPLE_PAIRS),\ 18 | $(foreach pair,$(SAMPLE_PAIRS),vcf_ann/$(pair).hotspot.vcf),\ 19 | $(foreach sample,$(SAMPLES),vcf_ann/$(sample).hotspot.vcf)) 20 | hotspot_tables : $(if $(SAMPLE_PAIRS),alltables/allTN.hotspot.tab.txt) 21 | 22 | vcf_ann/%.hotspot.vcf : vcf/%.hotspot.ac_ft.hotspot_int_ann.hotspot_ext_ann.vcf 23 | $(INIT) cp $< $@ 24 | 25 | vcf/%.hotspot.vcf : $(foreach i,int ext,hotspot/%.hotspot-$i.vcf.gz hotspot/%.hotspot-$i.vcf.gz.tbi) 26 | $(call RUN,-c -s 2G -m 3G,"$(BCFTOOLS2) concat -a $(filter %.vcf.gz,$^) > $@.tmp && \ 27 | $(call VERIFY_VCF,$@.tmp,$@)") 28 | 29 | define hotspot-vcf-tumor-normal-i 30 | hotspot/$1_$2.hotspot-$3.vcf : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai 31 | $$(call RUN,-c -s 9G -m 12G,"$$(call GATK_MEM2,4G) \ 32 | -T UnifiedGenotyper $$(HOTSPOT_GATK_OPTS) -I $$(<) -I $$(<<) \ 33 | -alleles $$(HOTSPOT_VCF.$3) -L $$(HOTSPOT_VCF.$3) -o $$@.tmp && \ 34 | $$(call VERIFY_VCF,$$@.tmp,$$@)") 35 | endef 36 | $(if $(SAMPLE_PAIRS),$(foreach pair,$(SAMPLE_PAIRS),\ 37 | $(foreach i,int ext,\ 38 | $(eval $(call hotspot-vcf-tumor-normal-i,$(tumor.$(pair)),$(normal.$(pair)),$i))))) 39 | 40 | define hotspot-vcf-sample-i 41 | hotspot/$1.hotspot-$2.vcf : bam/$1.bam bam/$1.bam.bai 42 | $$(call RUN,-c -s 9G -m 12G,"$$(call GATK_MEM2,4G) \ 43 | -T UnifiedGenotyper $$(HOTSPOT_GATK_OPTS) -I $$(<) \ 44 | -alleles $$(HOTSPOT_VCF.$2) -L $$(HOTSPOT_VCF.$2) \ 45 | -o $$@.tmp && $$(call VERIFY_VCF,$$@.tmp,$$@)") 46 | endef 47 | $(foreach sample,$(SAMPLES),\ 48 | $(foreach i,int ext,\ 49 | $(eval $(call hotspot-vcf-sample-i,$(sample),$i)))) 50 | 51 | include modules/vcf_tools/vcftools.mk 52 | -------------------------------------------------------------------------------- /variant_callers/qsnp.mk: -------------------------------------------------------------------------------- 1 | # run pyrohmmvar: realignment-based variant calling method for 454 and ion torrent 2 | 3 | include modules/Makefile.inc 4 | include modules/variant_callers/gatk.inc 5 
| 6 | .SECONDARY: 7 | .DELETE_ON_ERROR: 8 | .PHONY: all 9 | 10 | LOGDIR = log/qsnp.$(NOW) 11 | QSNP = $(JAVA) $(JARDIR)/qsnp-1.0.jar 12 | 13 | define QSNP_CONFIG 14 | [general]\n\ 15 | chrFiles=$(FREEC_REF)\n\ 16 | chrLenFile=$(CHR_LEN)\n\ 17 | maxThreads=$(FREEC_THREADS)\n\ 18 | samtools=$(SAMTOOLS)\n\ 19 | outputDir=$3\n\ 20 | noisyData=$(NOISY_DATA)\n\ 21 | ploidy=2\n\ 22 | window=$(FREEC_WINDOW_SIZE)\n\ 23 | gemMappabilityFile=$(GEM_MAP_FILE)\n\ 24 | printNA=$(PRINT_NA)\n\ 25 | [sample]\n\ 26 | mateFile=$1\n\ 27 | inputFormat=BAM\n\ 28 | mateOrientation=FR\n\ 29 | [control]\n\ 30 | mateFile=$2\n\ 31 | inputFormat=BAM\n\ 32 | mateOrientation=FR\n\ 33 | [BAF]\n\ 34 | shiftInQuality=33\n\ 35 | SNPfile=$(SNP_TXT)\n\ 36 | $(FREEC_TARGET_CONFIG) 37 | endef 38 | 39 | -------------------------------------------------------------------------------- /variant_callers/samtoolsHet.mk: -------------------------------------------------------------------------------- 1 | # Run samtools to detect heterozygous positions 2 | ##### DEFAULTS ###### 3 | include modules/Makefile.inc 4 | include modules/variant_callers/gatk.inc 5 | 6 | LOGDIR ?= log/samtools_het.$(NOW) 7 | 8 | .DELETE_ON_ERROR: 9 | .SECONDARY: 10 | .PHONY : het_snps 11 | 12 | het_snps : $(foreach s,$(SAMPLES),vcf/$s.het_snp.vcf) 13 | 14 | include modules/vcf_tools/vcftools.mk 15 | 16 | define hetsnp-chr 17 | chr_vcf/%.$1.het_snp.vcf : bam/%.bam 18 | $$(call RUN,-s 6G -m 8G,"$$(SAMTOOLS2) mpileup -r $1 -f $$(REF_FASTA) -g -I $$< | $$(BCFTOOLS2) call -c | $$(BCFTOOLS2) view -g het | $$(VCFUTILS) varFilter -d 10 -a 5 - > $$@") 19 | endef 20 | $(foreach chr,$(CHROMOSOMES),$(eval $(call hetsnp-chr,$(chr)))) 21 | 22 | vcf/%.het_snp.vcf : $(foreach chr,$(CHROMOSOMES),chr_vcf/%.$(chr).het_snp.vcf) 23 | $(INIT) { \ 24 | grep -P '^#' $<; \ 25 | sed '/^#/d' $^ | sort -V; \ 26 | } > $@ 27 | 28 | -------------------------------------------------------------------------------- /variant_callers/somatic/crest.mk: 
-------------------------------------------------------------------------------- 1 | # Run CREST (extractSClip.pl + CREST.pl) to produce per-chromosome predSV tables and concatenate them 2 | # Runs per tumour-normal pair when SAMPLE_PAIRS is defined, otherwise per sample 3 | ##### DEFAULTS ###### 4 | 5 | ##### MAKE INCLUDES ##### 6 | include modules/Makefile.inc 7 | 8 | LOGDIR = log/crest.$(NOW) 9 | CREST_DIR = $(HOME)/share/usr/crest 10 | CREST = PERL5LIB=$(PERL5LIB):$(CREST_DIR) $(PERL) $(CREST_DIR)/CREST.pl 11 | 12 | EXTRACT_SCLIP = PERL5LIB=$(PERL5LIB):$(CREST_DIR) $(PERL) $(CREST_DIR)/extractSClip.pl 13 | 14 | .SECONDARY: 15 | .DELETE_ON_ERROR: 16 | .PHONY: all 17 | 18 | ifdef SAMPLE_PAIRS 19 | all : $(foreach pair,$(SAMPLE_PAIRS),crest/sv/$(pair).predSV.txt) 20 | else 21 | all : $(foreach sample,$(SAMPLES),crest/sv/$(sample).predSV.txt) 22 | endif 23 | 24 | define sclip-chr 25 | crest/sclip/%.$1.sclip.txt crest/sclip/%.$1.cover : bam/%.bam 26 | $$(call RUN,-c -s 4G -m 6G,"$$(EXTRACT_SCLIP) -i $$< --ref_genome $$(REF_FASTA) -r $1 -o $$(@D) -p $$*") 27 | endef 28 | $(foreach chr,$(CHROMOSOMES),$(eval $(call sclip-chr,$(chr)))) 29 | 30 | crest/sclip/%.cover : $(foreach chr,$(CHROMOSOMES),crest/sclip/%.$(chr).cover) 31 | $(INIT) cat $^ > $@ 32 | 33 | crest/sclip/%.sclip.txt : $(foreach chr,$(CHROMOSOMES),crest/sclip/%.$(chr).sclip.txt) 34 | $(INIT) cat $^ > $@ 35 | 36 | define sv-tumor-normal-chr 37 | crest/sv/$1_$2.$3.predSV.txt : bam/$1.bam bam/$2.bam crest/sclip/$1.cover 38 | $$(call RUN,-c -s 4G -m 6G,"$$(CREST) -p $1_$2.$3 -f $$(<<<) -d $$< -g $$(<<) --ref_genome $$(REF_FASTA) -t $$(REF_2BIT) -r $3") 39 | endef 40 | $(foreach pair,$(SAMPLE_PAIRS),\ 41 | $(foreach chr,$(CHROMOSOMES),\ 42 | $(eval $(call sv-tumor-normal-chr,$(tumor.$(pair)),$(normal.$(pair)),$(chr))))) 43 | 44 | define sv-chr 45 | crest/sv/%.$1.predSV.txt : bam/%.bam crest/sclip/%.cover 46 | $$(call RUN,-c -s 4G -m 6G,"$$(CREST) -p $$*.$1 -f $$(<<) -d $$< --ref_genome $$(REF_FASTA) -t $$(REF_2BIT) -r $1") 47 | endef 48 | $(foreach chr,$(CHROMOSOMES),$(eval $(call sv-chr,$(chr)))) 49 | 50 | 
crest/sv/%.predSV.txt : $(foreach chr,$(CHROMOSOMES),crest/sv/%.$(chr).predSV.txt) 51 | $(INIT) cat $^ > $@ 52 | 53 | -------------------------------------------------------------------------------- /variant_callers/somatic/dindelTNFilter.mk: -------------------------------------------------------------------------------- 1 | # Create tumour-normal dindel vcf files 2 | include modules/Makefile.inc 3 | 4 | SAMPLE_PAIR_FILE ?= sample_pairs.txt 5 | 6 | TUMOR_SAMPLES := $(shell cut -f 1 $(SAMPLE_PAIR_FILE)) 7 | NORMAL_SAMPLES := $(shell cut -f 2 $(SAMPLE_PAIR_FILE)) 8 | NSAMPLES = $(words $(TUMOR_SAMPLES)) 9 | 10 | VPATH = dindel/vcf 11 | 12 | LOGDIR = log/dindel.$(NOW) 13 | 14 | .SECONDARY: 15 | .DELETE_ON_ERROR: 16 | .PHONY : all 17 | 18 | VCFS = $(foreach sample,$(TUMOR_SAMPLES),dindel/vcf/$(sample).dindel.sorted.annotated.tnFiltered.vcf) 19 | 20 | all : $(VCFS) $(addsuffix .idx,$(VCFS)) 21 | 22 | include modules/tnFilter.mk 23 | include modules/dindel.mk 24 | -------------------------------------------------------------------------------- /variant_callers/somatic/gatkTNFilter.mk: -------------------------------------------------------------------------------- 1 | # naive tumour-normal filter for gatk indels 2 | 3 | include modules/Makefile.inc 4 | 5 | SAMPLE_PAIR_FILE = sample_pairs.txt 6 | 7 | TUMOR_SAMPLES := $(shell cut -f 1 $(SAMPLE_PAIR_FILE)) 8 | NORMAL_SAMPLES := $(shell cut -f 2 $(SAMPLE_PAIR_FILE)) 9 | NSAMPLES = $(words $(TUMOR_SAMPLES)) 10 | 11 | VPATH = gatk/vcf 12 | 13 | .SECONDARY: 14 | .DELETE_ON_ERROR: 15 | .PHONY : all 16 | 17 | VCFS = $(foreach sample,$(TUMOR_SAMPLES),gatk/vcf/$(sample).indels.annotated.filtered.tnFiltered.vcf) 18 | TABLES = $(foreach sample,$(TUMOR_SAMPLES),gatk/tables/$(sample).indels.annotated.filtered.tnFiltered.novel.txt) 19 | 20 | all : $(VCFS) $(addsuffix .idx,$(VCFS)) $(TABLES) 21 | 22 | include modules/tnFilter.mk 23 | include modules/gatkVariantCaller.mk 24 | 
-------------------------------------------------------------------------------- /variant_callers/somatic/gatkValidation.mk: -------------------------------------------------------------------------------- 1 | # use GATK to gather allelic depths at somatic positions 2 | include modules/Makefile.inc 3 | include modules/variant_callers/gatk.inc 4 | 5 | LOGDIR ?= log/gatk_validation.$(NOW) 6 | 7 | SOMATIC_BED ?= somatic.bed 8 | 9 | 10 | .DELETE_ON_ERROR: 11 | .SECONDARY: 12 | .PHONY : val_vcfs 13 | 14 | val_vcfs : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).gatkval.vcf) 15 | 16 | define gatkval-tumor-normal 17 | vcf/$1_$2.gatkval.vcf : bam/$1.bam bam/$2.bam 18 | $$(call RUN,-n 4 -s 2.5G -m 3G,"$$(call GATK_MEM,8G) -T UnifiedGenotyper -glm BOTH -nt 4 -R $(REF_FASTA) --dbsnp $(DBSNP) -I $$< -I $$(<<) -L $(SOMATIC_BED) -o $$@ --output_mode EMIT_ALL_SITES") 19 | endef 20 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call gatkval-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 21 | 22 | -------------------------------------------------------------------------------- /variant_callers/somatic/mimsi.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/mimsi.$(NOW) 4 | 5 | mimsi: $(foreach pair,$(SAMPLE_PAIRS),mimsi/$(pair)/$(pair).txt) \ 6 | mimsi/summary.txt 7 | 8 | MICROSATELLITES_LIST = $(HOME)/share/lib/resource_files/mimsi/microsatellites_impact_only.list 9 | MODEL = $(HOME)/share/lib/resource_files/mimsi/mi_msi_v0_4_0_200x.model 10 | 11 | define mimsi-tumor-normal 12 | mimsi/$1_$2/$1_$2.txt : bam/$1.bam bam/$2.bam 13 | $$(call RUN,-c -n 8 -s 1G -m 2G -v $(MIMSI_ENV),"set -o pipefail && \ 14 | mkdir -p mimsi/$1_$2/ && \ 15 | analyze \ 16 | --tumor-bam $$(<) \ 17 | --normal-bam $$(<<) \ 18 | --case-id $1 \ 19 | --norm-case-id $2 \ 20 | --microsatellites-list $$(MICROSATELLITES_LIST) \ 21 | --save-location mimsi/$1_$2/ \ 22 | --model $$(MODEL) \ 23 | --save && \ 24 | mv 
mimsi/$1_$2/BATCH_results.txt $$(@)") 25 | 26 | endef 27 | $(foreach pair,$(SAMPLE_PAIRS),\ 28 | $(eval $(call mimsi-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 29 | 30 | mimsi/summary.txt : $(foreach pair,$(SAMPLE_PAIRS),mimsi/$(pair)/$(pair).txt) 31 | $(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \ 32 | $(RSCRIPT) $(SCRIPTS_DIR)/mimsi.R --option 1 --sample_names '$(SAMPLE_PAIRS)'") 33 | 34 | 35 | .SECONDARY: 36 | .DELETE_ON_ERROR: 37 | .PHONY: mimsi 38 | -------------------------------------------------------------------------------- /variant_callers/somatic/msisensor.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/msisensor.$(NOW) 4 | 5 | msisensor: $(foreach pair,$(SAMPLE_PAIRS),msisensor/$(pair).msi) \ 6 | msisensor/msi.tsv 7 | 8 | MICROSATELLITES_LIST = $(HOME)/share/lib/resource_files/MSIsensor/microsatellites.list 9 | MSI_REGIONS = $(HOME)/share/lib/resource_files/MSIsensor/msiregions.bed 10 | 11 | define msisensor-tumor-normal 12 | msisensor/$1_$2.msi : bam/$1.bam bam/$2.bam 13 | $$(call RUN,-c -n 8 -s 1G -m 2G -v $(MSISENSOR_ENV),"set -o pipefail && \ 14 | msisensor msi $$(MSISENSOR_OPTS) \ 15 | -d $$(MICROSATELLITES_LIST) \ 16 | -e $$(MSI_REGIONS) \ 17 | -n $$(<<) \ 18 | -t $$(<) \ 19 | -b 8 \ 20 | -o $$(@)") 21 | endef 22 | $(foreach pair,$(SAMPLE_PAIRS),\ 23 | $(eval $(call msisensor-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 24 | 25 | msisensor/msi.tsv : $(foreach pair,$(SAMPLE_PAIRS),msisensor/$(pair).msi) 26 | $(INIT) (head -1 $< | sed 's/^/sample\t/'; for x in $^; do sed "1d; s/^/$$(basename $$x)\t/" $$x; done | sed 's/_.*msi//' ) > $@ 27 | 28 | .SECONDARY: 29 | .DELETE_ON_ERROR: 30 | .PHONY: msisensor 31 | -------------------------------------------------------------------------------- /variant_callers/somatic/platypus.mk: -------------------------------------------------------------------------------- 1 | include 
modules/Makefile.inc 2 | include modules/variant_callers/gatk.inc 3 | 4 | .DELETE_ON_ERROR: 5 | .SECONDARY: 6 | 7 | PLATYPUS_ENV = $(HOME)/share/usr/anaconda-envs/platypus-0.8.1 8 | 9 | PHONY += platypus_indels 10 | .PHONY : $(PHONY) 11 | platypus_indels: $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).platypus_indels.vcf) 12 | 13 | define platypus-tumor-normal-chr 14 | platypus/chr_vcf/$1_$2.$3.platypus.vcf : bam/$1.bam bam/$2.bam 15 | $$(call RUN,-v $$(PLATYPUS_ENV) -n 4 -s 2G -m 3G,"platypus callVariants --regions=$3 \ 16 | --bamFiles=$$(<)$$(,)$$(<<) --nCPU 4 --refFile=$$(REF_FASTA) --output=$$@ --logFileName platypus/$1_$2.$3.log") 17 | endef 18 | $(foreach chr,$(CHROMOSOMES),\ 19 | $(foreach pair,$(SAMPLE_PAIRS),\ 20 | $(eval $(call platypus-tumor-normal-chr,$(tumor.$(pair)),$(normal.$(pair)),$(chr))))) 21 | 22 | INDEL_FILTER_VCF = python modules/vcf_tools/indel_filter_vcf.py 23 | SNP_FILTER_VCF = python modules/vcf_tools/snp_filter_vcf.py 24 | PLATYPUS_SOURCE_ANN_VCF = python modules/vcf_tools/annotate_source_vcf.py --source platypus 25 | 26 | vcf/%.platypus_indels.vcf : $(foreach chr,$(CHROMOSOMES),platypus/chr_vcf/%.$(chr).platypus.vcf) 27 | $(call RUN,-c -s 4G -m 8G,"(grep '^#' $<; cat $^ | grep -v '^#' | \ 28 | $(VCF_SORT) $(REF_DICT) - ) | $(INDEL_FILTER_VCF) | $(PLATYPUS_SOURCE_ANN_VCF) > $@.tmp && \ 29 | $(call VERIFY_VCF,$@.tmp,$@)") 30 | 31 | vcf/%.platypus_snps.vcf : $(foreach chr,$(CHROMOSOMES),platypus/chr_vcf/%.$(chr).platypus.vcf) 32 | $(call RUN,-c -s 4G -m 8G,"(grep '^#' $<; cat $^ | grep -v '^#' | \ 33 | $(VCF_SORT) $(REF_DICT) - ) | $(SNP_FILTER_VCF) | $(PLATYPUS_SOURCE_ANN_VCF) > $@.tmp && \ 34 | $(call VERIFY_VCF,$@.tmp,$@)") 35 | 36 | include modules/vcf_tools/vcftools.mk 37 | -------------------------------------------------------------------------------- /variant_callers/somatic/somaticVariants.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | LOGDIR = 
log/somatic_variants.$(NOW) 3 | 4 | SNV_TYPE ?= mutect 5 | INDEL_TYPE ?= somatic_indels 6 | VARIANT_TYPES ?= $(SNV_TYPE) $(INDEL_TYPE) 7 | 8 | PHONY += all_somatic 9 | all_somatic: somatic_vcfs somatic_tables facets 10 | 11 | CONCAT_VCF = python modules/vcf_tools/concat_vcf.py 12 | 13 | vcf/%.somatic_variants.vcf : vcf/%.$(SNV_TYPE).vcf vcf/%.$(INDEL_TYPE).vcf 14 | $(call RUN,-s 9G -m 12G,"$(CONCAT_VCF) $^ | $(VCF_SORT) $(REF_DICT) - > $@") 15 | 16 | include modules/variant_callers/somatic/mutect.mk 17 | include modules/variant_callers/somatic/somaticIndels.mk 18 | include modules/copy_number/facets.mk 19 | include modules/vcf_tools/annotateSomaticVcf.mk 20 | 21 | .DELETE_ON_ERROR: 22 | .SECONDARY: 23 | .PHONY: $(PHONY) 24 | 25 | -------------------------------------------------------------------------------- /variant_callers/somatic/strelkaVarscanIndels.mk: -------------------------------------------------------------------------------- 1 | # Run VarScan and strelka on tumour-normal matched pairs for indels 2 | # 3 | include modules/Makefile.inc 4 | LOGDIR = log/strelkaVarscan.$(NOW) 5 | 6 | 7 | .PHONY : strelka_varscan_merge_vcfs strelka_varscan_merge 8 | strelka_varscan_merge : strelka_varscan_merge_vcfs 9 | strelka_varscan_merge_vcfs : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).strelka_varscan_indels.vcf) 10 | #strelka_varscan_merge_mafs : $(foreach pair,$(SAMPLE_PAIRS),maf/$(pair).strelka_varscan_indels.vcf) 11 | 12 | vcf/%.strelka_varscan_indels.vcf : vcf/%.varscan_indels.vcf vcf/%.strelka_indels.vcf 13 | $(call RUN,-s 9G -m 12G,"(grep -P '^#' $<; $(BEDTOOLS) intersect -a $< -b <($(PASS_FILTER_VCF) $(<<))) | uniq > $@") 14 | 15 | 16 | include modules/variant_callers/somatic/strelka.mk 17 | include modules/variant_callers/somatic/varscanTN.mk 18 | -------------------------------------------------------------------------------- /variant_callers/somatic/tvcTN.mk: -------------------------------------------------------------------------------- 1 | # use torrent 
variant caller on ion torrent bams 2 | 3 | LOGDIR ?= log/tvcTN.$(NOW) 4 | 5 | include modules/Makefile.inc 6 | include modules/variant_callers/gatk.inc 7 | 8 | VPATH ?= bam 9 | 10 | TVC_OPTS ?= -r $(REF_FASTA) $(if $(TARGETS_FILE),-t $(TARGETS_FILE)) 11 | 12 | .DELETE_ON_ERROR: 13 | .SECONDARY: 14 | 15 | PHONY += tvc tvc_vcfs tvc_tables 16 | 17 | tvc : tvc_vcfs tvc_tables 18 | 19 | tvc_vcfs : $(call SOMATIC_VCFS,tvc_snps_indels) 20 | tvc_tables : $(call SOMATIC_TABLES,tvc_snps_indels) 21 | 22 | %.contig.vcf : %.vcf 23 | $(INIT) awk '{print "##contig="'} $(REF_FASTA).fai | $(BCFTOOLS2) annotate -h - $< > $@ 24 | 25 | %.vcf.gz : %.vcf 26 | $(call RUN,,"bgzip -c $< > $@") 27 | 28 | define tvc-tumor 29 | vcf/$1.tvc_snps_indels.vcf : bam/$1.bam bam/$1.bam.bai 30 | $$(call RUN,-n 4 -s 1G -m 2G,"$$(TVC) $$(TVC_OPTS) -b $$< -o $$@ -n 4") 31 | endef 32 | $(foreach tumor,$(TUMOR_SAMPLES),$(eval $(call tvc-tumor,$(tumor)))) 33 | 34 | define tvc-normal 35 | vcf/$1.tumor_tvc_snps_indels.vcf : $$(foreach tumor,$2,vcf/$$(tumor).tvc_snps_indels.vcf.gz vcf/$$(tumor).tvc_snps_indels.vcf.gz.tbi) 36 | $$(INIT) $$(BCFTOOLS2) merge $$(filter %.vcf.gz,$$^) > $$@ 37 | 38 | vcf/$1.tvc_snps_indels.vcf : bam/$1.bam vcf/$1.tumor_tvc_snps_indels.vcf bam/$1.bam.bai 39 | $$(call RUN,-n 4 -s 1G -m 2G,"$$(TVC) $$(TVC_OPTS) -c $$(<<) -b $$< -o $$@ -n 4") 40 | endef 41 | $(foreach normal,$(NORMAL_SAMPLES),$(eval $(call tvc-normal,$(normal),$(tumor.$(normal))))) 42 | 43 | define tvc-tumor-normal 44 | vcf/$1_$2.tvc_snps_indels.vcf : vcf/$1.tvc_snps_indels.vcf.gz vcf/$2.tvc_snps_indels.vcf.gz vcf/$1.tvc_snps_indels.vcf.gz.tbi vcf/$2.tvc_snps_indels.vcf.gz.tbi 45 | $$(INIT) $$(BCFTOOLS2) merge $$(filter %.vcf.gz,$$^) > $$@ 46 | endef 47 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call tvc-tumor-normal,$(tumor.$(pair)),$(normal.$(pair))))) 48 | 49 | .PHONY: $(PHONY) 50 | 51 | include modules/vcf_tools/vcftools.mk 52 | -------------------------------------------------------------------------------- 
/variant_callers/sufamsampleset.mk: -------------------------------------------------------------------------------- 1 | LOGDIR = log/sufam_ss.$(NOW) 2 | 3 | include modules/Makefile.inc 4 | 5 | SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev 6 | SUFAM_OPTS = --format vcf --mpileup-parameters='-A -q 15 -Q 15 -d 15000' 7 | SOMATIC_VCF2TSV = python modules/vcf_tools/somatic_vcf2tsv.py 8 | 9 | ANNOTATE_SUFAM_GT_VCF = python modules/vcf_tools/annotate_sufam_gt_vcf.py 10 | 11 | .DELETE_ON_ERROR: 12 | .SECONDARY: 13 | .PHONY: 14 | 15 | sufam_sampleset: $(foreach set,$(SAMPLE_SETS),vcf_ann/$(set).sufam.vcf) tsv/sufam_variants.tsv 16 | 17 | define sufam-set 18 | sufam/$1.set.vcf : $$(foreach pair,$$(pairs.$1),vcf_ann/$$(pair).somatic_variants.vcf.gz vcf_ann/$$(pair).somatic_variants.vcf.gz.tbi) 19 | $$(call RUN,-s 6G -m 8G,"bcftools merge -O v --force-samples $$(filter %.vcf.gz,$$^) > $$@") 20 | 21 | vcf/$1.sufam.vcf : sufam/$1.set.vcf $$(foreach sample,$$(set.$1),bam/$$(sample).bam) 22 | $$(call RUN,-v $$(SUFAM_ENV) -s 2G -m 3G,"sufam --sample_name $$(set.$1) $$(SUFAM_OPTS) $$(REF_FASTA) $$^ > $$@") 23 | 24 | vcf_ann/$1.sufam.vcf : vcf/$1.sufam.vcf $$(foreach pair,$$(pairs.$1),vcf_ann/$$(pair).somatic_variants.vcf.gz) 25 | $$(call RUN,-s 2G -m 3G,"$$(ANNOTATE_SUFAM_GT_VCF) $$^ > $$@") 26 | 27 | tsv/$1.sufam.tsv : vcf_ann/$1.sufam.vcf.gz 28 | $$(call RUN,-s 4G -m 6G,"$$(SOMATIC_VCF2TSV) --normal $$(normal.$1) $$< > $$@") 29 | 30 | endef 31 | $(foreach set,$(SAMPLE_SETS),$(eval $(call sufam-set,$(set)))) 32 | 33 | tsv/sufam_variants.tsv : $(foreach set,$(SAMPLE_SETS),tsv/$(set).sufam.tsv) 34 | $(call RUN,-s 4G -m 6G,"(sed -n 1p $<; for x in $^; do sed 1d \$$x; done) > $@") 35 | 36 | 37 | include modules/vcf_tools/vcftools.mk 38 | -------------------------------------------------------------------------------- /variant_callers/tvc.mk: -------------------------------------------------------------------------------- 1 | # use torrent variant caller on ion torrent bams 2 
| 3 | LOGDIR ?= log/tvc.$(NOW) 4 | 5 | include modules/Makefile.inc 6 | 7 | VPATH ?= bam 8 | 9 | TVC_OPTS ?= -r $(REF_FASTA) $(if $(TARGETS_FILE),-t $(TARGETS_FILE)) 10 | 11 | .DELETE_ON_ERROR: 12 | .SECONDARY: 13 | 14 | PHONY += tvc tvc_vcfs 15 | 16 | tvc : tvc_vcfs 17 | 18 | tvc_vcfs : $(foreach sample,$(SAMPLES),vcf/$(sample).tvc_snps_indels.vcf) 19 | 20 | vcf/%.tvc_snps_indels.vcf : bam/%.bam bam/%.bam.bai 21 | $(call RUN,-n 4 -s 1G -m 2G,"$(TVC) $(TVC_OPTS) -n 4 -o $(@) -b $<") 22 | 23 | .PHONY: $(PHONY) 24 | 25 | -------------------------------------------------------------------------------- /vcf_tools/annotateExtVcf.mk: -------------------------------------------------------------------------------- 1 | # annotate external vcfs 2 | include modules/Makefile.inc 3 | 4 | LOGDIR ?= log/ext_vcf.$(NOW) 5 | 6 | EXT_NAME ?= ext 7 | 8 | ext_ann : ext_vcfs ext_tables 9 | 10 | ext_vcfs : $(foreach pair,$(SAMPLE_PAIRS),vcf_ann/$(pair).$(EXT_NAME).vcf) 11 | ext_tables : alltables/allTN.$(EXT_NAME).tab.txt 12 | 13 | LOGDIR ?= log/annotate_ext_vcf.$(NOW) 14 | 15 | ANN_MUT_TASTE ?= false 16 | ANN_PROVEAN ?= false 17 | SOMATIC_ANN1 = fathmm chasm dbsnp hotspot_ann eff exac_nontcga cosmic clinvar cn_reg gene_ann nsfp $(ANNOVAR_REF)_multianno \ 18 | $(if $(findstring true,$(ANN_MUT_TASTE)),mut_taste) $(if $(findstring true,$(ANN_PROVEAN)),provean) 19 | SOMATIC_ANN2 = $(if $(findstring true,$(ANN_PATHOGEN)),snp_pathogen indel_pathogen) 20 | 21 | # target filter 22 | 23 | PHONY += all vcfs 24 | all : vcfs 25 | vcfs : $(foreach type,$(VARIANT_TYPES),$(foreach sample,$(SAMPLES),vcf_ann/$(sample).$(type).vcf)) 26 | 27 | MERGE_VCF = $(PYTHON) modules/vcf_tools/merge_vcf.py 28 | MERGE_SCRIPT = $(call RUN,-c -s 6G -m 7G,"$(MERGE_VCF) --ignore_filter $^ | $(VCF_SORT) $(REF_DICT) - > $@") 29 | 30 | # first filter round 31 | # first annotation round 32 | vcf/%.$(EXT_NAME).ann.vcf : $(foreach ann,$(SOMATIC_ANN1),vcf/%.$(EXT_NAME).$(ann).vcf) 33 | $(MERGE_SCRIPT) 34 | # second annotation 
round 35 | vcf_ann/%.$(EXT_NAME).vcf : $(if $(strip $(SOMATIC_ANN2)),$(foreach ann,$(SOMATIC_ANN2),vcf/%.$(EXT_NAME).ann.$(ann).vcf),vcf/%.$(EXT_NAME).ann.vcf) 36 | $(MERGE_SCRIPT) 37 | 38 | 39 | .DELETE_ON_ERROR: 40 | .SECONDARY: 41 | .PHONY: $(PHONY) 42 | 43 | include modules/vcf_tools/vcftools.mk 44 | include modules/variant_callers/gatk.inc 45 | -------------------------------------------------------------------------------- /vcf_tools/annotateSummaryVcf.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | include modules/genome_inc/b37.inc 3 | 4 | LOGDIR ?= log/annotate_smry_maf.$(NOW) 5 | 6 | annotate_smry_maf : vcf2maf/mutation_summary.vcf \ 7 | vcf2maf/mutation_summary.maf \ 8 | vcf2maf/mutation_summary.txt 9 | 10 | vcf2maf/mutation_summary.vcf : summary/tsv/mutation_summary.tsv 11 | $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ 12 | $(RSCRIPT) $(SCRIPTS_DIR)/annotateSummaryVcf.R --option 1 --input $(<) --output $(@)") 13 | 14 | vcf2maf/mutation_summary.maf : vcf2maf/mutation_summary.vcf 15 | $(call RUN, -c -n 12 -s 2G -m 3G -v $(VEP_ENV) -w 72:00:00,"set -o pipefail && \ 16 | $(VCF2MAF) \ 17 | --input-vcf $(<) \ 18 | --output-maf $(@) \ 19 | --tmp-dir $(TMPDIR) \ 20 | --tumor-id NA \ 21 | --normal-id NA \ 22 | --vep-path $(VEP_ENV)/bin \ 23 | --vep-data $(HOME)/share/reference/vep/v86/ \ 24 | --vep-forks 12 \ 25 | --ref-fasta $(HOME)/share/reference/vep/v86/homo_sapiens/86_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa \ 26 | --filter-vcf $(HOME)/share/reference/vep/v86/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz \ 27 | --species homo_sapiens \ 28 | --ncbi-build GRCh37 \ 29 | --maf-center MSKCC && \ 30 | $(RM) $(TMPDIR)/mutation_summary.vep.vcf") 31 | 32 | vcf2maf/mutation_summary.txt : summary/tsv/mutation_summary.tsv vcf2maf/mutation_summary.maf 33 | $(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \ 34 | $(RSCRIPT) $(SCRIPTS_DIR)/annotateSummaryVcf.R --option 2 --input 
$(<) --maf $(<<) --output $(@)") 35 | 36 | ..DUMMY := $(shell mkdir -p version; \ 37 | source $(VCF2MAF_ENV)/bin/activate $(VCF2MAF_ENV) && $(VCF2MAF) --man >> version/annotate_smry_maf.txt) 38 | .DELETE_ON_ERROR: 39 | .SECONDARY: 40 | .PHONY: annotate_smry_maf 41 | -------------------------------------------------------------------------------- /vcf_tools/annotate_source_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ add variant source (name of caller) to a vcf 3 | """ 4 | 5 | import argparse 6 | import vcf 7 | import sys 8 | 9 | if __name__ == "__main__": 10 | parser = argparse.ArgumentParser(prog='annotate_source_vcf.py', 11 | description='source annotation to add to each variant INFO') 12 | parser.add_argument('--source', required=True, help='name of source') 13 | parser.add_argument('--vcf_infile', required=False, type=argparse.FileType('r'), default=sys.stdin) 14 | 15 | args = parser.parse_args() 16 | 17 | vcf_reader = vcf.Reader(args.vcf_infile) 18 | 19 | vcf_reader.infos['variantCaller'] = vcf.parser._Info(id='variantCaller', num='.', type='String', 20 | desc="variant caller(s) used to find the variant", 21 | source=None, version=None) 22 | 23 | if args.source == 'lancet': 24 | vcf_reader.infos['LEN'] = vcf.parser._Info(id='LEN', num='1', type='Integer', 25 | desc="length of insertion/deletion", 26 | source=None, version=None) 27 | vcf_reader.infos['TYPE'] = vcf.parser._Info(id='TYPE', num='1', type='String', 28 | desc="insertion or deletion", 29 | source=None, version=None) 30 | 31 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 32 | 33 | for record in vcf_reader: 34 | record.INFO['variantCaller'] = [args.source] 35 | vcf_writer.write_record(record) 36 | vcf_writer.close() 37 | -------------------------------------------------------------------------------- /vcf_tools/annotate_sufam_gt_vcf.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | """ 3 | Annotate genotype to sufam vcf file 4 | """ 5 | 6 | import argparse 7 | import vcf 8 | import collections 9 | import sys 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser(description=__doc__, 13 | formatter_class=argparse.RawDescriptionHelpFormatter) 14 | parser.add_argument('sufam_vcf_file', help='multi-sample sufam file') 15 | parser.add_argument('vcf_files', nargs='+', help='sample pair vcf files') 16 | args = parser.parse_args() 17 | 18 | sample_variants = collections.defaultdict(set) 19 | for f in args.vcf_files: 20 | vcf_reader = vcf.Reader(open(f, 'r')) 21 | for record in vcf_reader: 22 | recid = "{}:{}:{}/{}".format(record.CHROM, record.POS, record.REF, record.ALT) 23 | s = record.samples[0].sample 24 | sample_variants[s].add(recid) 25 | 26 | sufam_vcf_reader = vcf.Reader(open(args.sufam_vcf_file, 'r')) 27 | sufam_vcf_reader.infos['samples_called_in'] = vcf.parser._Info(id='samples_called_in', num='.', 28 | type='String', 29 | desc='samples called in', 30 | source=None, 31 | version=None) 32 | vcf_writer = vcf.Writer(sys.stdout, sufam_vcf_reader) 33 | for record in sufam_vcf_reader: 34 | recid = "{}:{}:{}/{}".format(record.CHROM, record.POS, record.REF, record.ALT) 35 | for s, v in list(sample_variants.items()): 36 | if recid in v: 37 | if 'samples_called_in' not in record.INFO: 38 | record.INFO['samples_called_in'] = [] 39 | record.INFO['samples_called_in'].append(s) 40 | vcf_writer.write_record(record) 41 | vcf_writer.close() 42 | -------------------------------------------------------------------------------- /vcf_tools/annotate_sv.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/anotate_sv.$(NOW) 4 | 5 | SV_CALLERS = svaba manta gridss merged 6 | ANNOTATE_SV ?= $(HOME)/share/usr/env/annot_sv-3.1.3/opt/AnnotSV/bin/AnnotSV 7 | 8 | annotate_sv : $(foreach pair,$(SAMPLE_PAIRS), \ 9 | $(foreach 
caller,$(SV_CALLERS),annotate_sv/$(pair)/$(pair).$(caller)_sv.tsv)) \ 10 | $(foreach pair,$(SAMPLE_PAIRS), \ 11 | $(foreach caller,$(SV_CALLERS),annotate_sv/$(pair)/$(pair).$(caller)_sv.maf)) 12 | 13 | define annotate-sv 14 | annotate_sv/$1/$2/$1.$2_sv.tsv : vcf/$1.$2_sv.vcf 15 | $$(call RUN,-c -n 1 -s 4G -m 8G -v $(ANNOTATE_SV_ENV),"set -o pipefail && \ 16 | mkdir -p annotate_sv/$1/$2 && \ 17 | $$(ANNOTATE_SV) \ 18 | -SVinputFile $$(<) \ 19 | -outputFile ./annotate_sv/$1/$2/$1.$2_sv.tsv \ 20 | -genomeBuild GRCh37") 21 | 22 | annotate_sv/$1/$1.$2_sv.tsv : annotate_sv/$1/$2/$1.$2_sv.tsv 23 | $$(INIT) cat $$(<) > $$(@) 24 | 25 | annotate_sv/$1/$1.$2_sv.maf : vcf/$1.$2_sv.vcf 26 | $$(call RUN,-c -n 12 -s 1G -m 2G -v $(VEP_ENV),"set -o pipefail && \ 27 | $$(VCF2MAF) \ 28 | --input-vcf $$(<) \ 29 | --tumor-id $1 \ 30 | --filter-vcf $$(EXAC_NONTCGA) \ 31 | --ref-fasta $$(REF_FASTA) \ 32 | --vep-path $$(VEP_PATH) \ 33 | --vep-data $$(VEP_DATA) \ 34 | --tmp-dir `mktemp -d` \ 35 | --output-maf $$(@)") 36 | 37 | endef 38 | $(foreach pair,$(SAMPLE_PAIRS),\ 39 | $(foreach caller,$(SV_CALLERS), \ 40 | $(eval $(call annotate-sv,$(pair),$(caller))))) 41 | 42 | .DELETE_ON_ERROR: 43 | .SECONDARY: 44 | .PHONY: annotate_sv 45 | -------------------------------------------------------------------------------- /vcf_tools/bed_annotate_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ annotate off-target and filter off-target with low depth 3 | """ 4 | 5 | import argparse 6 | import vcf 7 | import sys 8 | import pandas as pd 9 | import numpy as np 10 | import intervaltree 11 | import re 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser(prog='bed_annotate_vcf.py', 15 | description='annotate vcf file using a bed file') 16 | parser.add_argument('--info_tag', help='info tag for annotation') 17 | parser.add_argument('interval_file') 18 | parser.add_argument('vcf_infile') 19 | args = 
parser.parse_args() 20 | 21 | intervals = pd.read_table(args.interval_file, header=None, dtype={0: str, 1: np.int32, 2: np.int32}) 22 | intervals = intervals.rename(columns={0: 'chr', 1: 'start', 2: 'end'}) 23 | trees = {} 24 | for chrom, interval in intervals.groupby('chr'): 25 | chrom = re.sub(r'chr', '', chrom) 26 | trees[chrom] = intervaltree.IntervalTree.from_tuples(list(zip(interval.start, interval.end))) 27 | 28 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 29 | vcf_reader.infos[args.info_tag] = vcf.parser._Info(id=args.info_tag, num='0', type='Flag', 30 | desc='{}: overlap'.format(args.info_tag), 31 | source=None, 32 | version=None) 33 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 34 | 35 | for record in vcf_reader: 36 | chrom = re.sub(r'chr', '', record.CHROM) 37 | if record.FILTER is None: 38 | record.FILTER = [] 39 | if chrom in trees: 40 | query = trees[chrom].search(record.POS) 41 | if len(query) == 0 & len(record.REF) > 1: 42 | query = trees[chrom].search(record.POS + len(record.REF)) 43 | if len(query) != 0: 44 | record.INFO[args.info_tag] = True 45 | vcf_writer.write_record(record) 46 | 47 | vcf_writer.close() 48 | -------------------------------------------------------------------------------- /vcf_tools/combine_vcf.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | 5 | if (!interactive()) { 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 7 | } 8 | 9 | args_list = list(make_option("--sample_name", default = NA, type = 'character', help = "sample name")) 10 | 11 | parser = OptionParser(usage = "%prog", option_list = args_list) 12 | arguments = parse_args(parser, positional_arguments = T) 13 | opt = arguments$options 14 | 15 | vcf_snp = read.csv(file=paste0("vcf_ann/", opt$sample_name, ".gatk_snps.vcf"), header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE) 16 | vcf_indel = 
read.csv(file=paste0("vcf_ann/", opt$sample_name, ".gatk_indels.vcf"), header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE) 17 | vcf = rbind(vcf_snp, vcf_indel) 18 | pos = as.numeric(vcf[,2]) 19 | index = order(pos, decreasing=FALSE) 20 | vcf = vcf[index,,drop=FALSE] 21 | chr = as.character(vcf[,1]) 22 | chr[chr=="X"] = 23 23 | chr[chr=="Y"] = 24 24 | chr = as.numeric(chr) 25 | index = is.na(chr) 26 | chr = chr[!index] 27 | vcf = vcf[!index,,drop=FALSE] 28 | index = order(chr, decreasing=FALSE) 29 | vcf = vcf[index,,drop=FALSE] 30 | vcf = vcf[,1:7,drop=FALSE] 31 | colnames(vcf) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER") 32 | vcf = cbind(vcf, "INFO"=rep(".", nrow(vcf))) 33 | index = grepl(",", vcf[,"REF"]) | grepl(",", vcf[,"ALT"]) 34 | vcf = vcf[!index,,drop=FALSE] 35 | index = duplicated(paste0(vcf[,1], ":", vcf[,2])) 36 | vcf = vcf[!index,,drop=FALSE] 37 | 38 | cat("##fileformat=VCFv4.1\n", file=paste0("cravat/", opt$sample_name, ".vcf"), append=FALSE) 39 | write.table(vcf, file=paste0("cravat/", opt$sample_name, ".vcf"), sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=TRUE) 40 | -------------------------------------------------------------------------------- /vcf_tools/common_filter_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # retain only novel variants 3 | 4 | import argparse 5 | import vcf 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser(prog='common_filter_vcf.py', 10 | description='filter vcf file for novel variants (no rs-id/GMAF < 0.01 or cosmic id)') 11 | parser.add_argument('vcf_infile') 12 | 13 | args = parser.parse_args() 14 | 15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 16 | 17 | vcf_reader.filters['Common'] = vcf.parser._Filter(id='Common', 18 | desc='no cosmic id, or has dbsnp ID and GMAF is > 0.01') 19 | 20 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 21 | 22 | for record in vcf_reader: 23 | 
if record.ID is not None: 24 | # ignore entries with cosmic IDs 25 | cosm_match = re.search(r'COSM', record.ID) 26 | if cosm_match is None: 27 | # filter entries with dbsnp IDs unless GMAF > 0.01 28 | rs_match = re.search(r'rs', record.ID) 29 | if rs_match is not None and ('GMAF' not in record.INFO or record.INFO['GMAF'] > 0.01): 30 | record.FILTER.append('Common') 31 | vcf_writer.write_record(record) 32 | 33 | vcf_writer.close() 34 | -------------------------------------------------------------------------------- /vcf_tools/concat_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import vcf 5 | import sys 6 | 7 | parser = argparse.ArgumentParser(prog='merge_vcf.py', 8 | description='merge vcf files') 9 | parser.add_argument('vcf_files', nargs='+', help='vcf files to merge') 10 | parser.add_argument('--out_file', nargs='?', help='merged vcf output file', default=sys.stdout, 11 | type=argparse.FileType('w')) 12 | 13 | args = parser.parse_args() 14 | 15 | vcf_readers = [vcf.Reader(open(f, 'r')) for f in args.vcf_files] 16 | vcf_reader = vcf_readers[0] 17 | # merge header 18 | if len(vcf_readers) > 1: 19 | for vcf_reader2 in vcf_readers[1:]: 20 | for form in vcf_reader2.formats: 21 | if form not in vcf_reader.formats: 22 | vcf_reader.formats[form] = vcf_reader2.formats[form] 23 | for inf in vcf_reader2.infos: 24 | if inf not in vcf_reader.infos: 25 | vcf_reader.infos[inf] = vcf_reader2.infos[inf] 26 | for filt in vcf_reader2.filters: 27 | if filt not in vcf_reader.infos: 28 | vcf_reader.filters[filt] = vcf_reader2.filters[filt] 29 | 30 | vcf_writer = vcf.Writer(args.out_file, vcf_reader) 31 | 32 | cpra = set() 33 | for vcf_reader in vcf_readers: 34 | for record in vcf_reader: 35 | chr_pos_ref_alt = ":".join([record.CHROM, str(record.POS), record.REF, str(record.ALT)]) 36 | if chr_pos_ref_alt not in cpra: 37 | vcf_writer.write_record(record) 38 | cpra.add(chr_pos_ref_alt) 39 | 
40 | vcf_writer.close() 41 | -------------------------------------------------------------------------------- /vcf_tools/filter_vcf.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | 5 | if (!interactive()) { 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) })) 7 | } 8 | 9 | args_list = list(make_option("--sample_name", default = NA, type = 'character', help = "sample name")) 10 | 11 | parser = OptionParser(usage = "%prog", option_list = args_list) 12 | arguments = parse_args(parser, positional_arguments = T) 13 | opt = arguments$options 14 | 15 | vcf = read.csv(file=paste0("cravat/", opt$sample_name, ".vcf"), header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE) 16 | maf = read.csv(file=paste0("cravat/", opt$sample_name, ".maf"), header=TRUE, sep="\t", comment.char="#", stringsAsFactors=FALSE) 17 | index = maf[,"Variant_Classification"] %in% c("Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site") 18 | vcf = vcf[index,,drop=FALSE] 19 | vcf[,1] = paste0("chr", vcf[,1]) 20 | colnames(vcf) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO") 21 | cat("##fileformat=VCFv4.1\n", file=paste0("cravat/", opt$sample_name, ".cravat.vcf"), append=FALSE) 22 | write.table(vcf, file=paste0("cravat/", opt$sample_name, ".cravat.vcf"), sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=TRUE) 23 | -------------------------------------------------------------------------------- /vcf_tools/gemini.mk: -------------------------------------------------------------------------------- 1 | 2 | # vim: set ft=make : 3 | # sub module containing vcf related tools 4 | 5 | include modules/Makefile.inc 6 | 7 | LOGDIR = log/gemini.$(NOW) 8 | GEMINI = unset PYTHONPATH; $(HOME)/share/usr/bin/gemini 9 | GEMINI_LOAD_OPTS = -t snpEff 10 | 11 | .DELETE_ON_ERROR: 
12 | .SECONDARY: 13 | .PHONY: gemini 14 | 15 | GEMINI_DB = gemini/gemini.db 16 | gemini : $(if $(SAMPLE_PAIRS),gemini/mutect_gemini.timestamp gemini/strelka_gemini.timestamp gemini/varscan_gemini.timestamp) 17 | 18 | gemini/samples.ped : $(SAMPLE_SET_FILE) 19 | $(INIT) perl -t$$' ' -lane 'BEGIN { print "#Family_ID\tIndividual_ID\tPaternal_ID\tMaternal_ID\tSex\tPhenotype\tEthnicity"; } $$i = 0; while ($$f = pop @F) { print "$$.\t$$f\t-9\t-9\t0\t" . (($$i++ == 0)? "1" : "2") . "\t-9"; }' $< > $@ 20 | 21 | 22 | GATK_VCFS = $(foreach sample,$(SAMPLES),\ 23 | $(foreach type,gatk_snps gatk_indels,\ 24 | vcf_ann/$(sample).$(type).norm.vcf.gz)) 25 | gemini/gatk_gemini.timestamp : $(if $(SAMPLE_PAIRS),gemini/samples.ped) $(GATK_VCFS) $(addsuffix .tbi,$(GATK_VCFS)) 26 | $(call RUN,-n 8 -s 1G -m 2G,"for vcf in $(filter %.vcf.gz,$^); do $(GEMINI) load --cores 8 $(GEMINI_LOAD_OPTS) -v \$$vcf $(if $(SAMPLE_PAIRS),-p $(filter %.ped,$^)) $(GEMINI_DB) ; done && touch $@") 27 | 28 | ifdef SAMPLE_PAIRS 29 | MUTECT_VCFS = $(foreach pair,$(SAMPLE_PAIRS),vcf_ann/$(pair).mutect_snps.norm.vcf.gz vcf_ann/$(pair).mutect_indels.norm.vcf.gz) 30 | gemini/mutect_gemini.timestamp : gemini/samples.ped $(MUTECT_VCFS) $(addsuffix .tbi,$(MUTECT_VCFS)) 31 | $(call RUN,-n 8 -s 1G -m 2G,"for vcf in $(filter %.vcf.gz,$^); do $(GEMINI) load --cores 8 $(GEMINI_LOAD_OPTS) -v \$$vcf -p $< $(GEMINI_DB); done && touch $@") 32 | endif 33 | 34 | 35 | include modules/vcf_tools/vcftools.mk 36 | -------------------------------------------------------------------------------- /vcf_tools/indel_filter_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ output indels only 3 | """ 4 | 5 | import vcf 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | vcf_reader = vcf.Reader(sys.stdin) 10 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 11 | 12 | for record in vcf_reader: 13 | if record.is_indel: 14 | vcf_writer.write_record(record) 15 | 16 | 
vcf_writer.close() 17 | -------------------------------------------------------------------------------- /vcf_tools/interval_filter_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import vcf 5 | import sys 6 | import pandas as pd 7 | import numpy as np 8 | import intervaltree 9 | import re 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser(prog='interval_filter_vcf.py', 13 | description='filter vcf file according to a bed file') 14 | parser.add_argument('interval_file') 15 | parser.add_argument('vcf_infile') 16 | args = parser.parse_args() 17 | 18 | intervals = pd.read_table(args.interval_file, header=None, dtype={0: str, 1: np.int32, 2: np.int32}) 19 | intervals = intervals.rename(columns={0: 'chr', 1: 'start', 2: 'end'}) 20 | trees = {} 21 | for chrom, interval in intervals.groupby('chr'): 22 | chrom = re.sub(r'chr', '', chrom) 23 | trees[chrom] = intervaltree.IntervalTree.from_tuples(list(zip(interval.start - 50, interval.end + 50))) 24 | 25 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 26 | vcf_reader.filters['targetInterval'] = vcf.parser._Filter(id='targetInterval', 27 | desc='no overlap with intervals') 28 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 29 | 30 | for record in vcf_reader: 31 | chrom = re.sub(r'chr', '', record.CHROM) 32 | if record.FILTER is None: 33 | record.FILTER = [] 34 | if chrom not in trees: 35 | record.FILTER.append('targetInterval') 36 | else: 37 | query = trees[chrom].search(record.POS) 38 | if len(query) == 0: 39 | record.FILTER.append('targetInterval') 40 | vcf_writer.write_record(record) 41 | 42 | vcf_writer.close() 43 | -------------------------------------------------------------------------------- /vcf_tools/merge_sv.mk: -------------------------------------------------------------------------------- 1 | include modules/Makefile.inc 2 | 3 | LOGDIR ?= log/merge_sv.$(NOW) 4 | 5 | SV_CALLERS = svaba 
gridss manta 6 | MAX_DIST = 500 7 | NUM_CALLERS = 2 8 | TYPE = 0 9 | STRAND = 0 10 | MIN_SIZE = 30 11 | 12 | merge_sv : $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/samples.txt) \ 13 | $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/$(pair).merged_sv.vcf) \ 14 | $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/$(pair).merged_sv_ft.vcf) \ 15 | $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).merged_sv.vcf) 16 | 17 | define merge-sv 18 | merge_sv/$1_$2/samples.txt : $(foreach caller,$(SV_CALLERS),vcf/$1_$2.$(caller)_sv.vcf) 19 | mkdir -p merge_sv/$1_$2 && $$(RM) $$(@) && \ 20 | $(foreach caller,$(SV_CALLERS),echo vcf/$1_$2.$(caller)_sv.vcf >> $$(@);) 21 | 22 | merge_sv/$1_$2/$1_$2.merged_sv.vcf : merge_sv/$1_$2/samples.txt 23 | $$(call RUN,-c -n 1 -s 4G -m 8G -v $(SURVIVOR_ENV),"set -o pipefail && \ 24 | SURVIVOR merge $$(<) \ 25 | $(MAX_DIST) $(NUM_CALLERS) $(TYPE) $(STRAND) 0 $(MIN_SIZE) $$(@)") 26 | 27 | merge_sv/$1_$2/$1_$2.merged_sv_ft.vcf : merge_sv/$1_$2/$1_$2.merged_sv.vcf 28 | $$(call RUN,-c -n 1 -s 4G -m 8G -v $(INNOVATION_ENV),"set -o pipefail && \ 29 | grep '##' $$(<) > $$(@) && \ 30 | $$(RSCRIPT) modules/scripts/filter_sv.R --input_file $$(<) --output_file $$(@)") 31 | 32 | 33 | vcf/$1_$2.merged_sv.vcf : merge_sv/$1_$2/$1_$2.merged_sv_ft.vcf 34 | $$(INIT) cat $$(<) > $$(@) 35 | 36 | endef 37 | $(foreach pair,$(SAMPLE_PAIRS),\ 38 | $(eval $(call merge-sv,$(tumor.$(pair)),$(normal.$(pair))))) 39 | 40 | .DELETE_ON_ERROR: 41 | .SECONDARY: 42 | .PHONY: merge_sv 43 | -------------------------------------------------------------------------------- /vcf_tools/merge_uvcf_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ add ups-coordinate to INFO of vcf file 3 | """ 4 | 5 | import argparse 6 | import vcf 7 | import pandas as pd 8 | import sys 9 | import copy 10 | 11 | if __name__ == "__main__": 12 | parser = argparse.ArgumentParser(prog='merge_uvcf_vcf.py', 13 | description='add ups-coordinate to INFO of vcf 
file') 14 | parser.add_argument('uvcf_infile') 15 | parser.add_argument('vcf_infile') 16 | args = parser.parse_args() 17 | 18 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 19 | uvcf = pd.read_csv(args.uvcf_infile, comment='#', sep='\t') 20 | uvcf.columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'UPS-COORDINATE', 'INFO'] 21 | 22 | vcf_reader.infos['UPS_Coord'] = vcf.parser._Info(id='UPS_Coord', num='.', type='String', 23 | desc="UPS-coordinate", source=None, version=None) 24 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 25 | 26 | ups_map = {} 27 | for i, row in uvcf.iterrows(): 28 | x = '{}:{}_{}/{}'.format(row['CHROM'], row['POS'], row['REF'], row['ALT']) 29 | ups_map[x] = row['UPS-COORDINATE'].replace(" ", "") 30 | 31 | for record in vcf_reader: 32 | ups_coords = [] 33 | for alt in record.ALT: 34 | alt = str(alt) 35 | x = '{}:{}_{}/{}'.format(record.CHROM, record.POS, record.REF, alt) 36 | if x in ups_map: 37 | ups_coords.append(ups_map[x]) 38 | else: 39 | ups_coords.append('N/A[]') 40 | record.INFO['UPS_Coord'] = ups_coords 41 | vcf_writer.write_record(record) 42 | vcf_writer.close() 43 | -------------------------------------------------------------------------------- /vcf_tools/oncokb_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import vcf 5 | import sys 6 | import pandas as pd 7 | import re 8 | try: 9 | from itertools import izip as zip 10 | except ImportError: 11 | pass 12 | 13 | if __name__ == "__main__": 14 | parser = argparse.ArgumentParser(prog='oncokb_vcf.py', 15 | description='Add oncoKB annotation to vcf') 16 | parser.add_argument('--oncokb', help='oncoKB annotation file') 17 | parser.add_argument('vcf_infile') 18 | 19 | args = parser.parse_args() 20 | 21 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 22 | 23 | oncokb = pd.read_table(args.oncokb) 24 | 25 | vcf_reader.infos['oncoKB_level'] = 
vcf.parser._Info(id='oncoKB_level', num='.', type='String', 26 | desc="OncoKB level(s)", source=None, version=None) 27 | vcf_reader.infos['oncoKB_cancer_type'] = vcf.parser._Info(id='oncoKB_cancer_type', num='.', type='String', 28 | desc="OncoKB cancer type", source=None, version=None) 29 | 30 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 31 | 32 | for record in vcf_reader: 33 | if 'SYMBOL' in record.INFO and 'HGVSp_Short' in record.INFO: 34 | assert len(record.INFO['SYMBOL']) == len(record.INFO['HGVSp_Short']) 35 | for symb, hgvsp in zip(record.INFO['SYMBOL'], record.INFO['HGVSp_Short']): 36 | hgvsp = re.sub(r'^p\.', r'', hgvsp) 37 | q = oncokb.query( 38 | 'Gene == "{}" and Alteration == "{}"'.format(symb, hgvsp)) 39 | if len(q) > 0: 40 | if 'oncoKB_level' not in record.INFO: 41 | record.INFO['oncoKB_level'] = [] 42 | record.INFO['oncoKB_cancer_type'] = [] 43 | record.INFO['oncoKB_level'].extend(q['Level']) 44 | record.INFO['oncoKB_cancer_type'].extend(map(lambda x: re.sub(' ', '_', x), q['Cancer Type'])) 45 | vcf_writer.write_record(record) 46 | vcf_writer.close() 47 | -------------------------------------------------------------------------------- /vcf_tools/pass_filter_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # retain only PASS variants 3 | 4 | import argparse 5 | import vcf 6 | import re 7 | import sys 8 | 9 | parser = argparse.ArgumentParser(prog='pass_filter_vcf.py', 10 | description='filter vcf file for PASS variants') 11 | parser.add_argument('vcf_infile') 12 | 13 | args = parser.parse_args() 14 | 15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 16 | 17 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 18 | 19 | for record in vcf_reader: 20 | if len(record.FILTER) == 0: 21 | vcf_writer.write_record(record) 22 | 23 | vcf_writer.close() 24 | -------------------------------------------------------------------------------- /vcf_tools/recurVcf.R: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("optparse")) 4 | suppressPackageStartupMessages(library("plyr")) 5 | suppressPackageStartupMessages(library("VariantAnnotation")) 6 | 7 | #options(warn = -1, error = quote({ traceback(2); q('no', status = 1) })) 8 | #options(error = recover) 9 | options(error = quote(dump.frames("testdump", TRUE))) 10 | 11 | optList <- list( 12 | make_option("--genome", default = 'b37', help = "genome build [default %default]"), 13 | make_option("--tumor", default = NULL, help = "tumor sample"), 14 | make_option("--outFile", default = NULL, help = "output file [default %default]")) 15 | 16 | parser <- OptionParser(usage = "%prog vcf.files", option_list = optList); 17 | arguments <- parse_args(parser, positional_arguments = T); 18 | opt <- arguments$options; 19 | 20 | if (is.null(opt$outFile)) { 21 | cat("Need output file\n"); 22 | print_help(parser); 23 | stop(); 24 | } else if (length(arguments$args) <= 1) { 25 | cat("Need vcf files\n"); 26 | print_help(parser); 27 | stop(); 28 | } 29 | 30 | files <- arguments$args; 31 | 32 | vcfs <- list() 33 | for (f in files) { 34 | vcf <- readVcf(f, genome = opt$genome) 35 | tum <- ifelse(opt$tumor %in% colnames(geno(vcf)$GT), opt$tumor, "TUMOR") 36 | gt <- geno(vcf)$GT[, tum] 37 | vcf <- vcf[gt != "./." 
& gt != "0/0" & gt != "0", ] 38 | vcfs <- append(vcfs, vcf) 39 | } 40 | 41 | all <- do.call('rbind', lapply(vcfs, function(x) as.data.frame(subset(rowRanges(x), FILTER == "PASS")))) 42 | all <- all[, c("seqnames", "start", "end")] 43 | cnt <- ddply(all, .(seqnames, start, end), nrow) 44 | 45 | cnt <- subset(cnt, V1 > 1) 46 | write(paste(cnt[,1], ":", cnt[,2], "-", cnt[,3], sep = ''), file = opt$outFile) 47 | #write.table(cnt[,c(1:3)], file = opt$outFile, sep = '\t', quote = F, row.names = F, col.names = F) 48 | -------------------------------------------------------------------------------- /vcf_tools/recurVcf.mk: -------------------------------------------------------------------------------- 1 | # intersect vcf files 2 | ##### DEFAULTS ###### 3 | 4 | ##### MAKE INCLUDES ##### 5 | include modules/Makefile.inc 6 | include modules/variant_callers/gatk.inc 7 | 8 | LOGDIR = log/recur_vcf.$(NOW) 9 | 10 | SET_VCF_SUFFIXES = gatk_snps.dp_ft.som_ft 11 | PAIR_VCF_SUFFIXES = som_sniper.ss_dp_ft.ss_ft.pass mutect.som_ad_ft.pass 12 | SAMPLE_SET_PAIR_VCF = $(foreach suff,$(SET_VCF_SUFFIXES),vcf/$(get_set.$1).$(suff).vcf) $(foreach suff,$(PAIR_VCF_SUFFIXES),vcf/$(get_pair.$1).$(suff).vcf) 13 | 14 | RECUR_VCF = $(RSCRIPT) modules/vcf_tools/recurVcf.R 15 | 16 | .DELETE_ON_ERROR: 17 | .SECONDARY: 18 | .PHONY : all pileups 19 | 20 | all : $(foreach sample,$(TUMOR_SAMPLES),recur_pos/$(sample).recur.txt) 21 | pileups : $(foreach sample,$(TUMOR_SAMPLES),pileup/$(sample).pileup) 22 | 23 | define recur-pos-tumor 24 | recur_pos/$1.recur.txt : $$(call SAMPLE_SET_PAIR_VCF,$1) 25 | $$(INIT) $$(RECUR_VCF) --tumor $1 --outFile $$@ $$^ 26 | endef 27 | $(foreach tumor,$(TUMOR_SAMPLES),$(eval $(call recur-pos-tumor,$(tumor)))) 28 | 29 | 30 | -------------------------------------------------------------------------------- /vcf_tools/snp_filter_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ output snps only 3 | """ 4 | 5 | 
import vcf 6 | import sys 7 | 8 | if __name__ == "__main__": 9 | vcf_reader = vcf.Reader(sys.stdin) 10 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader) 11 | 12 | for record in vcf_reader: 13 | if record.is_snp: 14 | vcf_writer.write_record(record) 15 | 16 | vcf_writer.close() 17 | -------------------------------------------------------------------------------- /vcf_tools/split_snps_indels_vcf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ split_snps_indels_vcf.py 4 | split a vcf file into snps and indels/everything else 5 | """ 6 | 7 | import argparse 8 | import vcf 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser(prog='split_snps_indels_vcf.py', 12 | description='split vcf file into snps and indels') 13 | parser.add_argument('vcf_infile') 14 | parser.add_argument('--snps', '-s', nargs='?', required=True, help='snp output vcf file') 15 | parser.add_argument('--indels', '-i', nargs='?', required=True, help='indel/everything else output vcf file') 16 | args = parser.parse_args() 17 | 18 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r')) 19 | 20 | snp_vcf_writer = vcf.Writer(open(args.snps, 'w'), vcf_reader) 21 | indel_vcf_writer = vcf.Writer(open(args.indels, 'w'), vcf_reader) 22 | 23 | for record in vcf_reader: 24 | if record.is_snp: 25 | snp_vcf_writer.write_record(record) 26 | else: 27 | indel_vcf_writer.write_record(record) 28 | 29 | snp_vcf_writer.close() 30 | indel_vcf_writer.close() 31 | -------------------------------------------------------------------------------- /vcf_tools/vcfCompare.mk: -------------------------------------------------------------------------------- 1 | # Compare vcf files 2 | ##### DEFAULTS ###### 3 | include modules/Makefile.inc 4 | 5 | LOGDIR = log/vcfComp.$(NOW) 6 | ##### MAKE INCLUDES ##### 7 | 8 | .DELETE_ON_ERROR: 9 | .SECONDARY: 10 | .PHONY : all variant_eval gt_concordance 11 | 12 | FILTER_SUFFIX := dp_ft.target_ft 13 | ifdef 
TARGETS_FILE 14 | FILTER_SUFFIX := $(FILTER_SUFFIX).target_ft 15 | endif 16 | ifdef NORMAL_VCF 17 | FILTER_SUFFIX := nft.$(FILTER_SUFFIX) 18 | endif 19 | #VARIANT_TYPES := gatk_snps snvmix2 20 | EVAL_TYPES ?= rnaseq_gatk_snps 21 | COMP_TYPES ?= exonseq_museq exonseq_mutect 22 | 23 | all : variant_eval 24 | 25 | gt_concordance : $(foreach sample,$(SAMPLES),cmp_vcf/grp/$(sample).gt_concord.grp) 26 | 27 | variant_eval : $(foreach sample,$(SAMPLES),cmp_vcf/grp/$(sample).variant_eval.grp) 28 | 29 | cmp_vcf/grp/%.gt_concord.grp : $(foreach type,$(VARIANT_TYPES),vcf/%.$(type).$(FILTER_SUFFIX).vcf) 30 | $(call RUN,-s 9G -m 12G,"$(call GATK_MEM,8G) -T GenotypeConcordance -R $(REF_FASTA) $(foreach i,$^,--eval:$(notdir $(i:.$(FILTER_SUFFIX).vcf=)) $i ) $(foreach i,$^,--comp:$(notdir $(i:.$(FILTER_SUFFIX).vcf=)) $i ) -o $@") 31 | 32 | cmp_vcf/grp/%.variant_eval.grp : $(foreach type,$(EVAL_TYPES),vcf/%.$(type).vcf) $(foreach type,$(COMP_TYPES),vcf/%.$(type).vcf) 33 | $(call RUN,-s 9G -m 12G,"$(call GATK_MEM,8G) -T VariantEval --dbsnp $(DBSNP) -R $(REF_FASTA) $(foreach i,$(EVAL_TYPES),--eval:$i vcf/$*.$i.vcf ) $(foreach i,$(COMP_TYPES),--comp:$i vcf/$*.$i.vcf ) -o $@") 34 | #$(call RUN,-s 4G -m 6G,"$(call GATK_MEM,4G) -T VariantEval --dbsnp $(DBSNP) -R $(REF_FASTA) --eval:$( 0.1 3 | vcf/%.cft.vcf : vcf/%.vcf 4 | $(call CHECK_VCF,$(call RUN,-c -s 8G -m 12G,"$(call GATK_MEM,8G) -T VariantFiltration -U LENIENT_VCF_PROCESSING -R $(REF_FASTA) -V $< -o $@.tmp \ 5 | --filterExpression '$(VCF_POST_ANN_FILTER_EXPRESSION)' --filterName customFilter && $(call VERIFY_VCF,$@.tmp,$@)")) 6 | 7 | COMMON_FILTER_VCF = $(PYTHON) modules/vcf_tools/common_filter_vcf.py 8 | vcf/%.common_ft.vcf : vcf/%.vcf 9 | $(call CHECK_VCF,$(call RUN,-c -s 4G -m 5G,"$(COMMON_FILTER_VCF) $< > $@.tmp && $(call VERIFY_VCF,$@.tmp,$@)")) 10 | -------------------------------------------------------------------------------- /virus/krona_classify.mk: 
# Classify BLAST hits of unmapped reads with Krona and render one HTML report
# per sample: unmapped_reads/<sample>.blast -> .tax -> .html
include modules/Makefile.inc

LOGDIR ?= log/krona_classify.$(NOW)

# krona_classify is a goal name, not a file, so it has to be phony; otherwise a
# stray file called "krona_classify" would make the goal look up to date.
# NOTE(review): unmapped_reads names the output directory, not a goal defined
# in this file - confirm it still needs to be listed as phony.
PHONY += krona_classify unmapped_reads

# Default goal: one Krona HTML report for every entry in SAMPLES.
krona_classify : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).html)

# Written once as a plain pattern rule: nothing in it depends on a particular
# sample name, so generating it per sample through define/eval only redefined
# the same rule repeatedly. As a plain rule, $(call RUN,...) is expanded when
# the recipe runs, so $< and $* (and any automatic variables RUN itself may
# reference) are resolved for the target being built.
# The intermediate .tax file is written next to the report as a by-product.
unmapped_reads/%.html : unmapped_reads/%.blast
	$(call RUN,-n 1 -s 4G -m 9G,"ktClassifyBLAST -s $< -o unmapped_reads/$*.tax && ktImportTaxonomy -m 1 unmapped_reads/$*.tax -o unmapped_reads/$*.html")

.DELETE_ON_ERROR:
.SECONDARY:
.PHONY: $(PHONY)