├── .gitignore
├── .gitmodules
├── .travis.yml
├── Makefile
├── Makefile.inc
├── README.md
├── aligners
    ├── align.inc
    ├── align.mk
    ├── bowtieAligner.mk
    ├── bwaAligner.mk
    ├── bwamemAligner.mk
    ├── gsnapAligner.mk
    ├── gsnapIADB.mk
    ├── hisatAligner.mk
    ├── novoalignIADB.mk
    ├── pBwaAligner.mk
    ├── starAligner.mk
    ├── starFusionAligner.mk
    ├── tmapAligner.mk
    └── tophatAligner.mk
├── bam_tools
    ├── fix_bam.mk
    ├── fix_mate.mk
    ├── fix_rg.mk
    ├── get_bam_data_mirror.mk
    ├── get_bam_irb_mirror.mk
    ├── merge_bam.mk
    ├── processBam.mk
    └── put_bam_data_mirror.mk
├── clonality
    ├── absoluteSeq.mk
    ├── clonehd.mk
    ├── pyclone_13.mk
    ├── pyclone_vi.mk
    └── tableToCloneHDFormat.pl
├── conda_env
    ├── R3.2.2.txt
    ├── delly_env.txt
    ├── jrflab_modules_env.txt
    ├── mutsig_report_env.txt
    ├── sum_reads_env.txt
    └── varscan_env.txt
├── config.inc
├── config
    ├── defuse.conf
    └── snpEff.conf
├── contamination
    ├── clusterSampleVcf.R
    ├── clusterSamples.mk
    └── contest.mk
├── copy_number
    ├── CytoBand.RData
    ├── absCNseq.R
    ├── absCNseq.mk
    ├── annotateFacets2Vcf.R
    ├── annotateFacetsCCF2Vcf.R
    ├── annotateFreeC.R
    ├── annotateTitanLOHVcf.R
    ├── ascat.R
    ├── ascat.mk
    ├── cbindCNVs.R
    ├── cghCall.R
    ├── cnvkit.mk
    ├── compare_facets_cncf.py
    ├── controlFreeC.mk
    ├── controlFreeCLOHTN.mk
    ├── controlFreeCTN.mk
    ├── convert_basecount_to_snp_pileup.py
    ├── createFacetsSummary.R
    ├── exomeCNV.R
    ├── exomeCNV.mk
    ├── exomeCNVLOH.R
    ├── exomeCNVLOH.mk
    ├── exomeCNVLOHHeatmap.mk
    ├── facets.mk
    ├── facetsFillGeneCN.R
    ├── facetsGeneCN.R
    ├── facetsGeneCNPlot.R
    ├── facetsPlotSampleLRR.R
    ├── facets_merge_tn.py
    ├── facets_suite.mk
    ├── gistic.mk
    ├── gisticFacets.mk
    ├── hg19_chrominfo.txt
    ├── hg19_cytoBandIdeo.txt
    ├── hg19_gaps.txt
    ├── hmmCopy.R
    ├── hmmCopy.mk
    ├── makeControlFreeCGraph.R
    ├── medicc2.mk
    ├── normaliseCopyNum.R
    ├── normalisedCopyNum.mk
    ├── oncosnp.mk
    ├── oncosnpseq.mk
    ├── plotFacets.R
    ├── plotFreeCCopyNum.R
    ├── plotFreeCLogRatio.R
    ├── plotGisticHeatmap.R
    ├── recenter_base_count.py
    ├── runFacets.R
    ├── runTitan.R
    ├── segmentVarscanCNV.R
    ├── summarizeTitan.R
    ├── titan.inc
    ├── titan.mk
    ├── varscanCNV.mk
    └── varscanCNVGeneCN.R
├── db
    ├── chasm_db.yaml
    ├── create_mysql_docker_images.sh
    ├── ensembl-hs-core-85-37_db.yaml
    ├── fathmm_config-cpu-6-2.ini
    ├── fathmm_config-e01.ini
    ├── fathmm_config-ika.ini
    ├── fathmm_config-lilac.ini
    ├── fathmm_config-swan.ini
    ├── fathmm_db.yaml
    ├── run_mysql_docker_images.sh
    ├── snv_box-cpu-6-2.conf
    ├── snv_box-e01.conf
    ├── snv_box-ika.conf
    ├── snv_box-lilac.conf
    └── snv_box-swan.conf
├── default_yaml
    ├── project_config.yaml
    ├── sample_attr.yaml
    └── summary_config.yaml
├── export
    └── cbioportal.mk
├── external
    └── SNVBox
    │   ├── README
    │   ├── db
    │       ├── ARFFutil.py
    │       ├── CodonMap.py
    │       ├── Config.py
    │       ├── DBUtil.py
    │       ├── DataSet.py
    │       ├── FeatureDb.py
    │       └── __init__.py
    │   ├── doc
    │       ├── CHASM_VEST_SNVGet_UserManual.doc
    │       ├── CHASM_VEST_SNVGet_UserManual.pdf
    │       └── License.txt
    │   ├── genomicToProtein
    │   ├── snvGetGenomic
    │   ├── snvGetTranscript
    │   └── snvGetTranscriptList
├── fastq_tools
    ├── bamtoFasta.mk
    ├── blastReads.mk
    ├── extractFastq.mk
    ├── extractReads.mk
    ├── extractunmappedpairs.mk
    ├── fastq.mk
    ├── fixFastqReadNames.mk
    ├── fixFastqReadNames.py
    ├── mergeFastq.mk
    ├── mergeSplitFastq.mk
    └── trimFastq.pl
├── genome_inc
    ├── GRCm38.inc
    ├── b37.inc
    ├── hg18.inc
    ├── hg19.inc
    └── hg38.inc
├── isoforms
    └── miso.mk
├── ploidy
    ├── bicseq.mk
    ├── expands.mk
    └── pyloh.mk
├── qc
    ├── TEQC.R
    ├── TEQCreport.R
    ├── TEQCreportFun.R
    ├── bamStats.mk
    ├── bam_interval_metrics.mk
    ├── bam_metrics.mk
    ├── fastqc.mk
    ├── fastqcSummaryPlot.R
    ├── intervalBamQC.R
    ├── intervalBamQC.mk
    ├── nonRefFreqFromPileup.pl
    ├── plotHsMetrics.R
    ├── plotRnaseqMetrics.R
    ├── qualimap.mk
    ├── readDepth.mk
    ├── rnaseqMetrics.mk
    ├── rseqc.mk
    ├── summarize_hs_metrics.py
    ├── summarize_idxstats.py
    ├── teqc.mk
    ├── variantEvalGatkReport.R
    └── wgs_metrics.mk
├── recurrent_mutations
    └── report.mk
├── reference
    ├── gene_lists
    │   ├── Kandoth_127genes.bed
    │   ├── Kandoth_127genes.hg19.bed
    │   ├── Lawrence_cancer5000-S.bed
    │   ├── Lawrence_cancer5000-S.hg19.bed
    │   ├── cancer_gene_census.b37.2016-10-14.bed
    │   ├── cancer_gene_census.b37.2017-05-25.bed
    │   ├── cancer_gene_census.hg19.2017-05-25.bed
    │   └── haplo_insuff_genes.bed
    └── hotspots
    │   ├── hotspot-dedup.vcf
    │   ├── hotspot-v1.hg19.vcf.gz
    │   ├── hotspot-v1.hg19.vcf.gz.tbi
    │   ├── hotspot-v1.vcf.gz
    │   ├── hotspot-v1.vcf.gz.tbi
    │   ├── hotspot-v2.hg19.vcf.gz
    │   ├── hotspot-v2.hg19.vcf.gz.tbi
    │   ├── hotspot-v2.vcf.gz
    │   ├── hotspot-v2.vcf.gz.tbi
    │   ├── hotspot-v3.vcf.gz
    │   └── hotspot-v3.vcf.gz.tbi
├── rnaseq
    ├── immunedeconv.mk
    ├── kallisto.mk
    └── sumreads.mk
├── scripts
    ├── Rshell
    ├── Sweave.R
    ├── add_dbsnp_gmaf.py
    ├── annotateSummaryVcf.R
    ├── bam_metrics.R
    ├── classify_indel_pathogenicity_vcf.py
    ├── classify_pathogenicity_vcf.py
    ├── classify_snv_pathogenicity_vcf.py
    ├── cnvkit.R
    ├── configure.py
    ├── convert_sample_txt2yaml.py
    ├── create_iontorrent_sample_merge_yaml.py
    ├── create_iontorrent_sample_yaml.py
    ├── create_sample_sets.pl
    ├── create_sample_yaml.py
    ├── create_sample_yaml2.py
    ├── drmaa_job.py
    ├── extract_signatures.R
    ├── facets_suite.R
    ├── filter_dbsnp_gmaf.py
    ├── filter_sv.R
    ├── get_basecounts.R
    ├── get_insert_size.py
    ├── hr_detect.R
    ├── immunedeconv.R
    ├── init_project.pl
    ├── job.py
    ├── join_eff.pl
    ├── knit.R
    ├── launcher_sql_db.py
    ├── medicc2.R
    ├── merge.R
    ├── mimsi.R
    ├── monitorMySQL.sh
    ├── monitor_gfserver.sh
    ├── mutation_taster_query.py
    ├── normalFilterVCF.pl
    ├── posnGeneLookup.pl
    ├── prepareFastq.sh
    ├── prepareFastq2.sh
    ├── prepareMultirunFastq.sh
    ├── provean_query.py
    ├── provean_vcf.py
    ├── pyclone_13.R
    ├── pyclone_vi.R
    ├── qmake.pl
    ├── qsub.pl
    ├── qsub.py
    ├── qsubClient.pl
    ├── qsubDaemon.pl
    ├── qsub_pbs.py
    ├── rbind.R
    ├── recurrent_mutations_plot.py
    ├── recurrent_mutations_sufam.ipynb
    ├── remote_provean_query.py
    ├── run.py
    ├── somaticFilterVCF.pl
    ├── split_bed.py
    ├── split_vcf.py
    ├── star_fish.R
    ├── sufam_gt.R
    ├── summarize_rnaseqreads.R
    ├── summarize_rnaseqreads_byexon.R
    ├── summarize_rnaseqreads_byintron.R
    ├── summarize_sleuth.R
    ├── sv_signature.R
    ├── swapvcf.R
    ├── tsvToExcel.py
    ├── vcfToTable.R
    └── wgs_metrics.R
├── signatures
    ├── deconstruct_sigs.mk
    ├── hr_detect.mk
    ├── star_fish.mk
    └── sv_signature.mk
├── snp6
    ├── absolute.R
    ├── hapseg.R
    └── snp6.mk
├── summary
    ├── cravat_summary.R
    ├── cravat_summary.mk
    ├── cravat_summary.py
    ├── delmh_summary.R
    ├── delmh_summary.mk
    ├── genome_summary_excel.py
    ├── genomesummary.R
    ├── genomesummary.mk
    ├── hotspot_summary_excel.py
    ├── hotspotsummary.R
    ├── hotspotsummary.mk
    ├── mouse_summary_excel.py
    ├── mousesummary.R
    ├── mutation_summary_excel.py
    └── mutationsummary.mk
├── sv_callers
    ├── brass.mk
    ├── chimerascan.mk
    ├── crest.mk
    ├── defuse.mk
    ├── defuse2usv.py
    ├── defuseOncofuse.R
    ├── delly.mk
    ├── destruct.mk
    ├── ericScript.mk
    ├── ericscript2usv.py
    ├── extractCoordsFromDefuse.pl
    ├── filterDefuse.pl
    ├── fusioncatcher.mk
    ├── fusionfinder.mk
    ├── gridss_tumor_normal.mk
    ├── hydra.mk
    ├── integrate.mk
    ├── integrate2usv.py
    ├── integrateOncofuse.R
    ├── integrateRnaseq.mk
    ├── lumpy.mk
    ├── manta.inc
    ├── manta.mk
    ├── mantaRnaseq.mk
    ├── manta_config.py.ini
    ├── manta_hs_config.py.ini
    ├── manta_tumor_normal.mk
    ├── mapsplice.mk
    ├── mapsplice2usv.py
    ├── nfuseDNA.mk
    ├── nfuseWGSSWTSS.mk
    ├── normalFilterChimerascan.pl
    ├── normalFilterDefuse.pl
    ├── normalFilterSoapFuse.pl
    ├── oncofuse.mk
    ├── prepareSoapFuse.pl
    ├── recurrentFusions.R
    ├── soapFuse.mk
    ├── starFusion.mk
    ├── starfusion2usv.py
    ├── svaba_tumor_normal.mk
    └── tophatFusion.mk
├── variant_callers
    ├── dindel.mk
    ├── fixVarscanVcf.pl
    ├── gatk.inc
    ├── gatk.mk
    ├── get_basecounts.mk
    ├── haplotypeCaller.mk
    ├── hotspot.mk
    ├── museq.mk
    ├── pindel.mk
    ├── qsnp.mk
    ├── samtoolsHet.mk
    ├── somatic
    │   ├── crest.mk
    │   ├── dindelTNFilter.mk
    │   ├── gatkTNFilter.mk
    │   ├── gatkValidation.mk
    │   ├── hla_summary.R
    │   ├── lancet.mk
    │   ├── mimsi.mk
    │   ├── msisensor.mk
    │   ├── museqTN.mk
    │   ├── mutect.mk
    │   ├── mutect2.mk
    │   ├── mutectReport.Rmd
    │   ├── pindelTN.mk
    │   ├── platypus.mk
    │   ├── plotSeqLogoFromMutect.R
    │   ├── polysolver.mk
    │   ├── scalpel.mk
    │   ├── somaticIndelDetector.mk
    │   ├── somaticIndels.mk
    │   ├── somaticSniper.mk
    │   ├── somaticSniperFixAD.R
    │   ├── somaticVariants.mk
    │   ├── strelka.mk
    │   ├── strelkaVarscanIndels.mk
    │   ├── tvcTN.mk
    │   ├── varscanTN.mk
    │   └── varscanTNtoVcf.pl
    ├── sufam_gt.mk
    ├── sufamsampleset.mk
    ├── tvc.mk
    ├── variantEvalGatkReport.R
    ├── varscan.mk
    ├── varscanFpfilter.mk
    └── varscanToVcf.pl
├── vcf_tools
    ├── addGeneListAnnotationToVcf.R
    ├── annotateExtVcf.mk
    ├── annotateSomaticVcf.mk
    ├── annotateSummaryVcf.mk
    ├── annotateVcf.mk
    ├── annotate_source_vcf.py
    ├── annotate_sufam_gt_vcf.py
    ├── annotate_sv.mk
    ├── annotate_vcf2maf.py
    ├── bed_annotate_vcf.py
    ├── chasmVcf.R
    ├── combine_vcf.R
    ├── common_filter_vcf.py
    ├── compare_vcf.py
    ├── concat_vcf.py
    ├── cravat_annotation.mk
    ├── fathmmVcf.R
    ├── filter_vcf.R
    ├── gemini.mk
    ├── hotspot_vcf.py
    ├── indel_filter_vcf.py
    ├── interval_depth_filter_vcf.py
    ├── interval_filter_vcf.py
    ├── merge_indel_vcf.py
    ├── merge_sv.mk
    ├── merge_uvcf_vcf.py
    ├── merge_vcf.py
    ├── mutAssVcf.R
    ├── mutation_taster_vcf.py
    ├── oncokb_vcf.py
    ├── parsSNPVcf.R
    ├── pass_filter_vcf.py
    ├── proveanVcf.R
    ├── recurVcf.R
    ├── recurVcf.mk
    ├── snp_filter_vcf.py
    ├── somatic_ad_filter_vcf.py
    ├── somatic_vcf2tsv.py
    ├── split_snps_indels_vcf.py
    ├── summary_vcf.R
    ├── transficVcf.R
    ├── tumor_variant_read_filter_vcf.py
    ├── vcfAnnotations.mk
    ├── vcfCompare.mk
    ├── vcfCompareTN.mk
    ├── vcfFilters.mk
    ├── vcfMerge.mk
    ├── vcfMergePlatform.mk
    ├── vcfMergeTN.mk
    ├── vcfPostAnnotations.mk
    ├── vcfPostFilters.mk
    ├── vcfsorter.pl
    └── vcftools.mk
└── virus
    ├── krona_classify.mk
    └── virus_detection_bowtie2.mk


/.gitignore:
--------------------------------------------------------------------------------
1 | .*.tmp
2 | *.pyc
3 | .DS_Store
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "external/fathmm"]
2 | 	path = external/fathmm
3 | 	url = https://github.com/jrflab/fathmm.git
4 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # Travis CI configuration: installs Miniconda, creates the conda environments
 2 | # described under conda_env/, then runs the test scripts listed under "script".
 3 | language: python
 4 | python:
 5 |   - "2.7"
 6 | sudo: false
 7 | install:
 8 |   # Pick the Miniconda installer that matches the Python major version.
 9 |   # The installer is downloaded over HTTPS rather than plain HTTP.
10 |   - case "$TRAVIS_PYTHON_VERSION" in
11 |     2*)
12 |       wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh;
13 |       PYTHON="python"
14 |       ;;
15 |     3*)
16 |       wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
17 |       PYTHON="python3"
18 |       ;;
19 |     esac
20 |   # Batch-mode install into $HOME/miniconda and put it first on PATH.
21 |   - bash miniconda.sh -b -p $HOME/miniconda
22 |   - export PATH="$HOME/miniconda/bin:$PATH"
23 |   - hash -r
24 |   - conda config --set always_yes yes --set changeps1 no
25 |   - conda update -q conda
26 |   - conda info -a
27 |   # Channels used to resolve the packages in the environment files.
28 |   - conda config --add channels r
29 |   - conda config --add channels bioconda
30 |   - conda config --add channels auto
31 |   - conda config --add channels jrderuiter
32 |   - conda config --add channels biobuilds
33 |   # One environment for the Python tooling, one for R.
34 |   - conda create -q -n jrflab --file conda_env/jrflab_modules_env.txt
35 |   - conda create -q -n r-env --file conda_env/R3.2.2.txt
36 | script:
37 | # TODO: copy number heatmap test not working:
38 | # source activate r-env
39 | # bash -x test/copy_number/test_copynumber_heatmap.sh
40 |   - source activate jrflab
41 |   - bash -x test/vcf_tools/test_common_filter.sh
42 |   - bash -x test/vcf_tools/test_hotspot.sh
43 |   - bash -x test/vcf_tools/test_pathogenicity.sh
44 |   - bash -x test/scripts/test_create_sample_yaml.sh
45 |   - bash -x test/scripts/test_configure.sh
46 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # modules
2 | 
3 | 


--------------------------------------------------------------------------------
/aligners/align.inc:
--------------------------------------------------------------------------------
 1 | # Shared defaults for BAM post-processing and the derived BAM_SUFFIX.
 2 | # The body is skipped when ALIGN_INC already has a value, so including this
 3 | # file more than once does not redo the assignments below.
 4 | ifndef ALIGN_INC
 5 | # Defaults use ?= so a value assigned earlier (or on the command line) wins.
 6 | BAM_PHRED64 ?= false
 7 | BAM_DUP_TYPE ?= markdup
 8 | BAM_NO_FILTER ?= false
 9 | BAM_NO_RECAL ?= false
10 | BAM_NO_REALN ?= false
11 | SPLIT_CHR ?= true
12 | SPLIT_FASTQ ?= false
13 | BAM_NO_SORT ?= false
14 | BAM_FIX_RG ?= false
15 | SEQ_PLATFORM ?= illumina
16 | 
17 | # BAM_SUFFIX: one tag per enabled processing step, in the order listed,
18 | # followed by "bam". $(strip) collapses the tags into a space-separated list
19 | # and $(subst) replaces the first argument with "." between them.
20 | # NOTE(review): this relies on `$( )` expanding to a single space, which is
21 | # not defined in this file -- presumably set in an including makefile; confirm.
22 | # PDX has no default here; the pdx_filtered tag is added only if it contains "true".
23 | BAM_SUFFIX := $(subst $( ),.,$(strip \
24 |         $(if $(findstring false,$(BAM_NO_SORT)),sorted)\
25 |         $(if $(findstring false,$(BAM_NO_FILTER)),filtered)\
26 |         $(if $(findstring true,$(PDX)),pdx_filtered)\
27 |         $(if $(findstring true,$(BAM_FIX_RG)),rg)\
28 |         $(if $(findstring false,$(BAM_NO_REALN)),realn)\
29 |         $(if $(findstring rmdup,$(BAM_DUP_TYPE)),rmdup)\
30 |         $(if $(findstring markdup,$(BAM_DUP_TYPE)),markdup)\
31 |         $(if $(findstring false,$(BAM_NO_RECAL)),recal)\
32 |         bam))
33 | endif
34 | # Marks this file as processed for the ifndef guard above.
35 | ALIGN_INC = true
36 | 
25 | 


--------------------------------------------------------------------------------
/aligners/align.mk:
--------------------------------------------------------------------------------
 1 | # bam-header: $1 = sample name, $2 = list of split names for that sample.
 2 | # Builds $(ALIGNER)/sam/<sample>.header.sam from the per-split sorted BAMs:
 3 | # the non-@RG header lines of the first BAM, followed by the @RG lines of
 4 | # every BAM, passed through uniq (which only drops adjacent duplicates).
 5 | # `$$$$bam` survives both $(call) and $(eval) to reach the shell as `$bam`.
 6 | define bam-header
 7 | $$(ALIGNER)/sam/$1.header.sam : $$(foreach split,$2,$$(ALIGNER)/bam/$$(split).$$(ALIGNER).sorted.bam)
 8 | 	$$(INIT) $$(SAMTOOLS) view -H $$< | grep -v '^@RG' > $$@.tmp; \
 9 | 	for bam in $$^; do $$(SAMTOOLS) view -H $$$$bam | grep '^@RG' >> $$@.tmp; done; \
10 | 	uniq $$@.tmp > $$@ && $$(RM) $$@.tmp
11 | endef
12 | # Instantiate the header rule for every sample (split.<sample> lists its splits).
13 | $(foreach sample,$(SAMPLES),\
14 | 	$(eval $(call bam-header,$(sample),$(split.$(sample)))))
15 | 
16 | # merged-bam: $1 = sample name, $2 = list of split names.
17 | # Merges the per-split sorted BAMs into the sample-level sorted BAM using the
18 | # header built above, then removes every prerequisite ($$^ includes the
19 | # header file as well as the split BAMs).
20 | define merged-bam
21 | $$(ALIGNER)/bam/$1.$$(ALIGNER).sorted.bam : $$(ALIGNER)/sam/$1.header.sam $$(foreach split,$2,$$(ALIGNER)/bam/$$(split).$$(ALIGNER).sorted.bam)
22 | 	$$(call RUN,-s 12G -m 15G,"$$(SAMTOOLS) merge -f -h $$< $$(@) $$(filter %.bam,$$^) && $$(RM) $$^")
23 | endef
24 | # rename-bam: $1 = sample name, $2 = its single split name.
25 | # Moves the split's unsorted or sorted BAM to the sample-level file name.
26 | define rename-bam
27 | $$(ALIGNER)/bam/$1.$$(ALIGNER).bam : $$(ALIGNER)/bam/$2.$$(ALIGNER).bam
28 | 	mv $$< $$@
29 | $$(ALIGNER)/bam/$1.$$(ALIGNER).sorted.bam : $$(ALIGNER)/bam/$2.$$(ALIGNER).sorted.bam
30 | 	mv $$< $$@
31 | endef
32 | # Two or more splits -> merged-bam; exactly one -> rename-bam; none -> no rule.
33 | $(foreach sample,$(SAMPLES),\
34 | 	$(if $(word 2,$(split.$(sample))),\
35 | 	$(eval $(call merged-bam,$(sample),$(split.$(sample)))),\
36 | 	$(if $(split.$(sample)),\
37 | 	$(eval $(call rename-bam,$(sample),$(split.$(sample)))))))
38 | 
25 | 


--------------------------------------------------------------------------------
/aligners/gsnapIADB.mk:
--------------------------------------------------------------------------------
 1 | # Align reads selected from existing BAMs against the GSNAP database "IADB".
 2 | # Per sample: <s>.bam (found via VPATH) -> iadb/unaln_bam/<s>.bam ->
 3 | # iadb/fastq/<s>.{1,2}.fastq -> iadb/bam/<s>.gsnap.bam -> iadb/bam/<s>.bam
 4 | include modules/Makefile.inc
 5 | 
 6 | OPTS = -d IADB -D ${GSNAP_REF} -B 4 -t 4 -A sam --novelsplicing=1 --pairexpect=200 -n 1 --quiet-if-excessive --nofails
 7 | # The assignment is deliberately not tab-indented: a tab-prefixed line can be
 8 | # taken as recipe text when it follows a rule.
 9 | ifeq ($(BAM_PHRED64),true)
10 | OPTS += -J 64 -j -31
11 | endif
12 | GSNAP_SGE_RREQ = $(call MEM_FREE,2G,4G) -q all.q -pe $(PARALLEL_ENV) 4 -now n
13 | 
14 | # samtools view flag filters: -f 4 keeps reads with SAM flag 0x4 (unmapped);
15 | # -F 1536 drops reads with 0x200 (QC fail) or 0x400 (duplicate) set.
16 | REQUIRED_FLAGS = 4
17 | BAM_FILTER_FLAGS = 1536
18 | 
19 | SAMPLE_FILE = samples.txt
20 | SAMPLES = $(shell cat $(SAMPLE_FILE))
21 | 
22 | # Bare %.bam prerequisites are searched for in bam/.
23 | VPATH = bam
24 | 
25 | LOGDIR = iadb/log
26 | 
27 | .DELETE_ON_ERROR:
28 | .SECONDARY: 
29 | .PHONY : all
30 | 
31 | all : $(foreach sample,$(SAMPLES),iadb/bam/$(sample).bam)
32 | 
33 | # Extract the reads selected by the flag filters above.
34 | iadb/unaln_bam/%.bam : %.bam
35 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,1G,2G)" $(MKDIR) $(@D) $(LOGDIR); $(SAMTOOLS) view -f $(REQUIRED_FLAGS) -F $(BAM_FILTER_FLAGS) -bh $< > $@ 2> $(LOGDIR)/$(@F).log
36 | 
37 | 
38 | # Convert the extracted reads to a pair of FASTQ files (one recipe run makes both).
39 | iadb/fastq/%.1.fastq iadb/fastq/%.2.fastq : iadb/unaln_bam/%.bam
40 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,5G,7G)" $(MKDIR) $(@D) $(LOGDIR); \
41 | 	$(call SAM_TO_FASTQ_MEM,5G) I=$< FASTQ=iadb/fastq/$*.1.fastq SECOND_END_FASTQ=iadb/fastq/$*.2.fastq &> $(LOGDIR)/$(@F).log
42 | 
43 | # Align the read pairs with GSNAP and convert its SAM output to BAM.
44 | # Output directories are created once, by the $(MKDIR) on the first line.
45 | iadb/bam/%.gsnap.bam : iadb/fastq/%.1.fastq iadb/fastq/%.2.fastq
46 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,1.5G,2G) -pe $(PARALLEL_ENV) 4" $(MKDIR) $(@D) $(LOGDIR); \
47 | 	$(GSNAP) $(OPTS) --read-group-id=$* $^ 2> $(LOGDIR)/$(@F).log | $(SAMTOOLS) view -bhS - > $@
48 | 
49 | # Publish the fully processed BAM under the plain sample name (hard link).
50 | iadb/bam/%.bam : iadb/bam/%.gsnap.sorted.filtered.markdup.bam
51 | 	$(MKDIR) $(@D); ln -v $< $@
52 | 
53 | include modules/bam_tools/processBam.mk
54 | 
41 | 


--------------------------------------------------------------------------------
/aligners/novoalignIADB.mk:
--------------------------------------------------------------------------------
 1 | # Align reads from existing BAMs against the IADB reference with novoalignMPI.
 2 | # Per sample: <s>.bam (found via VPATH) -> iadb/fastq/<s>.{1,2}.fastq.gz ->
 3 | # iadb/bam/<s>.novoalign.bam -> iadb/processed_bam/<s>.bam
 4 | include modules/Makefile.inc
 5 | 
 6 | NOVOALIGN = $(HOME)/share/usr/bin/novoalignMPI
 7 | NOVOINDEX = $(HOME)/share/usr/bin/novoindex
 8 | 
 9 | REF_FASTA = $(HOME)/share/references/IADB_feb2011.fa
10 | IADB_NIX = $(HOME)/share/references/IADB_feb2011.fa.nix
11 | 
12 | SAMPLE_FILE = samples.txt
13 | SAMPLES = $(shell cat $(SAMPLE_FILE))
14 | 
15 | # Bare %.bam prerequisites are searched for in bam/.
16 | VPATH = bam
17 | 
18 | LOGDIR = iadb/log
19 | 
20 | .DELETE_ON_ERROR:
21 | .SECONDARY: 
22 | .PHONY : all
23 | 
24 | all : $(SAMPLES:%=iadb/processed_bam/%.bam)
25 | 
26 | # bam2fastq writes <s>_1.fastq / <s>_2.fastq; rename to <s>.1 / <s>.2 and gzip.
27 | iadb/fastq/%.1.fastq.gz iadb/fastq/%.2.fastq.gz : %.bam
28 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,10G,12G)" $(MKDIR) $(@D) $(LOGDIR); \
29 | 	$(BAM2FASTQ) --no-aligned -o $(@D)/$*#.fastq $< &> $(LOGDIR)/$(@F).log && \
30 | 	mv $(@D)/$*_1.fastq $(@D)/$*.1.fastq && \
31 | 	mv $(@D)/$*_2.fastq $(@D)/$*.2.fastq && \
32 | 	gzip $(@D)/$*.1.fastq $(@D)/$*.2.fastq
33 | 
34 | # Paired-end alignment on 5 MPI ranks; SAM output is converted to BAM on the fly.
35 | iadb/bam/%.novoalign.bam : iadb/fastq/%.1.fastq.gz iadb/fastq/%.2.fastq.gz
36 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,14G,18G) -pe openmpi 5" $(MKDIR) $(@D) $(LOGDIR); \
37 | 	mpiexec -np 5 $(NOVOALIGN) -i 200,50 -r A -R 0 -d $(IADB_NIX) -f $^ -o SAM $$'@RG\tID:$*\tPU:illumina\tLB:$*' 2> $(LOGDIR)/$(@F).log | \
38 | 	$(SAMTOOLS) view -bS - > $@
39 | 
40 | # Publish the fully processed BAM under the plain sample name (hard link).
41 | iadb/processed_bam/%.bam : iadb/bam/%.novoalign.sorted.filtered.fixmate.markdup.bam
42 | 	$(MKDIR) $(@D); ln -v $< $@
43 | 
44 | include modules/bam_tools/processBam.mk
45 | 


--------------------------------------------------------------------------------
/aligners/pBwaAligner.mk:
--------------------------------------------------------------------------------
 1 | # Paired-end alignment with pBWA (aln + sampe) starting from gsc_bam/<s>.bam.
 2 | # Per sample: gsc_bam/<s>.bam -> fastq/<s>.{1,2}.fastq.gz -> bwa/sai/<s>.{1,2}.sai
 3 | # -> bwa/bam/<s>.bwa.sam -> (processBam.mk suffix chain) -> bam/<s>.bam
 4 | include modules/Makefile.inc
 5 | 
 6 | SAM_TO_FASTQ = $(JAVA) -Xmx2G -jar $(JARDIR)/SamToFastq.jar VALIDATION_STRINGENCY=LENIENT
 7 | SAMPLE_FILE = samples.txt
 8 | SAMPLES = $(shell cat $(SAMPLE_FILE))
 9 | 
10 | LOGDIR = log/pBwa.$(NOW)
11 | 
12 | BWA_BAMS = $(SAMPLES:%=bam/%.bam)
13 | SAMTOOLS_SORT_MEM = 2000000000
14 | SEQ_PLATFORM = illumina
15 | 
16 | .SECONDARY:
17 | .DELETE_ON_ERROR: 
18 | .PHONY : all bwa_bams
19 | 
20 | all : bwa_bams
21 | # Final BAMs plus their .bai indexes.
22 | bwa_bams : $(BWA_BAMS) $(BWA_BAMS:%=%.bai)
23 | 
24 | # pBWA aln is given the output prefix bwa/sai/<s>; one run covers both .sai targets.
25 | bwa/sai/%.1.sai bwa/sai/%.2.sai : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
26 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,8G,9G) -pe openmpi 10-100" $(MKDIR) $(@D) $(LOGDIR); \
27 | 	$(PBWA) aln -f $(@D)/$* $(REF_FASTA) $^ 2> $(LOGDIR)/$(@F).log
28 | 
29 | %.bam.bai : %.bam
30 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,1G,2G)" $(MKDIR) $(@D) $(LOGDIR); \
31 | 	$(SAMTOOLS) index $< &> $(LOGDIR)/$(@F).log
32 | 
33 | # bam2fastq writes <s>_1.fastq / <s>_2.fastq; rename to <s>.1 / <s>.2 and gzip.
34 | fastq/%.1.fastq.gz fastq/%.2.fastq.gz : gsc_bam/%.bam
35 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,10G,12G)" $(MKDIR) $(@D) $(LOGDIR); \
36 | 	$(BAM2FASTQ) -o fastq/$*#.fastq $< &> $(LOGDIR)/$(@F).log && \
37 | 	mv fastq/$*_1.fastq fastq/$*.1.fastq && \
38 | 	mv fastq/$*_2.fastq fastq/$*.2.fastq && \
39 | 	gzip fastq/$*.1.fastq fastq/$*.2.fastq
40 | 
41 | # LBID is the stem with the first "_<digits>" run removed; the .sai files are
42 | # passed to sampe without their extension.
43 | bwa/bam/%.bwa.sam : bwa/sai/%.1.sai bwa/sai/%.2.sai fastq/%.1.fastq.gz fastq/%.2.fastq.gz
44 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,7G,9G) -pe openmpi 10-100" $(MKDIR) $(@D) $(LOGDIR); \
45 | 			 LBID=$$(echo "$*" | sed 's/_[0-9]\+//'); \
46 | 			 $(PBWA) sampe -f $@ -P -r "@RG\tID:$*\tLB:$${LBID}\tPL:$(SEQ_PLATFORM)\tSM:$*" $(REF_FASTA) $(basename $<) $(basename $(word 2,$^)) $(word 3,$^) $(word 4,$^) 2> $(LOGDIR)/$(@F).log
47 | 
48 | 
49 | # Publish the fully processed BAM under the plain sample name (hard link).
50 | bam/%.bam : bwa/bam/%.bwa.sorted.filtered.fixmate.markdup.bam
51 | 	$(MKDIR) $(@D); ln -f $< $@
52 | 
53 | include modules/bam_tools/processBam.mk
54 | 


--------------------------------------------------------------------------------
/aligners/starFusionAligner.mk:
--------------------------------------------------------------------------------
 1 | # Run STAR-Fusion on the chimeric junctions written by the STAR aligner and
 2 | # convert its candidate list into usv/<sample>.star_fusion.tsv.
 3 | include modules/Makefile.inc
 4 | 
 5 | STAR_CHIMERIC = true
 6 | 
 7 | STAR_FUSION = STAR-Fusion
 8 | 
 9 | STAR_FUSION_ENV = $(HOME)/share/usr/anaconda-envs/star-fusion-1.0.0
10 | 
11 | STAR_FUSION_TO_USV = python modules/sv_callers/starfusion2usv.py
12 | 
13 | # Abort at parse time when STAR_CTAT_DIR expands to nothing.
14 | $(if $(STAR_CTAT_DIR),,$(error no STAR CTAT dir))
15 | 
16 | PHONY += star_fusion
17 | star_fusion : $(SAMPLES:%=star_fusion/%.star_fusion_timestamp)
18 | 
19 | # Results go to star_fusion/<sample>/; the timestamp file marks completion.
20 | star_fusion/%.star_fusion_timestamp : star/%.Chimeric.out.junction
21 | 	$(call RUN,-v $(STAR_FUSION_ENV) -s 8G -m 12G,"$(STAR_FUSION) --genome_lib_dir $(STAR_CTAT_DIR) -J $< --output_dir $(@D)/$* && touch $@")
22 | 
23 | # Convert the final candidate list of a finished run to the usv table.
24 | usv/%.star_fusion.tsv : star_fusion/%.star_fusion_timestamp
25 | 	$(call RUN,,"$(STAR_FUSION_TO_USV) < $(<D)/$*/star-fusion.fusion_candidates.final > $@")
26 | 
27 | include modules/aligners/starAligner.mk
28 | 


--------------------------------------------------------------------------------
/aligners/tmapAligner.mk:
--------------------------------------------------------------------------------
 1 | # Align reads with TMAP, either by re-mapping unprocessed_bam/<s>.bam or from
 2 | # FASTQ input, and publish the processed result as bam/<s>.bam.
 3 | include modules/Makefile.inc
 4 | include modules/variant_callers/gatk.inc
 5 | include modules/aligners/align.inc
 6 | 
 7 | ALIGNER := tmap
 8 | LOGDIR := log/tmap.$(NOW)
 9 | 
10 | 
11 | SAMTOOLS_SORT_MEM = 2000000000
12 | 
13 | FASTQ_CHUNKS := 10
14 | FASTQ_CHUNK_SEQ := $(shell seq 1 $(FASTQ_CHUNKS))
15 | FASTQUTILS = $(HOME)/share/usr/ngsutils/bin/fastqutils
16 | 
17 | TMAP = $(HOME)/share/usr/bin/tmap
18 | TMAP_MODE ?= map3
19 | TMAP_OPTS =
20 | 
21 | SEQ_PLATFORM = IONTORRENT
22 | 
23 | .SECONDARY:
24 | .DELETE_ON_ERROR: 
25 | .PHONY: tmap
26 | 
27 | TMAP_BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam)
28 | tmap : $(TMAP_BAMS) $(addsuffix .bai,$(TMAP_BAMS))
29 | 
30 | # Publish the processed BAM as a hard link. No copy is made first:
31 | # `ln -f` replaces the target anyway, so a preceding cp would only be
32 | # wasted I/O on a large file.
33 | bam/%.bam : tmap/bam/%.tmap.$(BAM_SUFFIX)
34 | 	$(INIT) ln -f $(<) $(@)
35 | 
36 | # Keep only the @HD and @RG header lines of the unprocessed BAM.
37 | tmap/sam/%.header.sam : unprocessed_bam/%.bam
38 | 	$(INIT) $(SAMTOOLS) view -H $< | grep -e '^@HD' -e '^@RG' > $@
39 | 
40 | # Re-map an existing BAM: reheader it with the reduced header and pipe it to TMAP.
41 | # NOTE(review): the target is named after $(TMAP_MODE) (default map3), whereas
42 | # the rules below produce %.tmap.bam -- confirm which name downstream rules expect.
43 | tmap/bam/%.$(TMAP_MODE).bam : tmap/sam/%.header.sam unprocessed_bam/%.bam
44 | 	$(call RUN,-c -n 4 -s 6G -m 8G,"$(SAMTOOLS) reheader $^ | $(TMAP) $(TMAP_MODE) $(TMAP_OPTS) -Q 2 -f $(REF_FASTA) -i bam -s $(@) -o 1 -n 4 ")
45 | 
46 | # align-split-fastq: $1 = sample name (SM), $2 = split name (ID), $3 = FASTQ file(s).
47 | define align-split-fastq
48 | tmap/bam/$2.tmap.bam : $3
49 | 	$$(call RUN,-c -n 4 -s 6G -m 8G,"zcat $$< | $$(TMAP) $$(TMAP_MODE) $$(TMAP_OPTS) -f $$(REF_FASTA) -i fastq -s $$(@) -Q 0 -o 1 -n 4 -R ID:$2 -R SM:$1 -R PL:$$(SEQ_PLATFORM) -R PU:00000000")
50 | endef
51 | # One explicit rule per split sample that has FASTQ input (fq.<split>).
52 | $(foreach ss,$(SPLIT_SAMPLES),\
53 | 	$(if $(fq.$(ss)), \
54 | 	$(eval $(call align-split-fastq,$(split.$(ss)),$(ss),$(fq.$(ss))))))
55 | 
56 | # Pattern rule for fastq/<s>.fastq.gz; the stem is used for both ID and SM.
57 | tmap/bam/%.tmap.bam : fastq/%.fastq.gz
58 | 	$(call RUN,-c -n 4 -s 6G -m 8G,"zcat $< | $(TMAP) $(TMAP_MODE) $(TMAP_OPTS) -f $(REF_FASTA) -i fastq -s $(@) -Q 0 -o 1 -n 4 -R ID:$* -R SM:$* -R PL:$(SEQ_PLATFORM) -R PU:00000000 ")
59 | 
60 | include modules/bam_tools/processBam.mk
61 | include modules/fastq_tools/fastq.mk
62 | include modules/aligners/align.mk
63 | 


--------------------------------------------------------------------------------
/bam_tools/fix_mate.mk:
--------------------------------------------------------------------------------
 1 | # Run Picard FixMateInformation on gsc_bam/<sample>.bam for every sample
 2 | # listed in samplesToFixMate.txt, writing gsc_bam/<sample>.fixmate.bam.
 3 | include modules/Makefile.inc
 4 | # The genome include files live under genome_inc/.
 5 | include modules/genome_inc/hg19.inc
 6 | 
 7 | FIXMATE = $(JAVA) -Xmx10G -jar $(JARDIR)/FixMateInformation.jar VALIDATION_STRINGENCY=LENIENT
 8 | 
 9 | SAMPLE_FILE = samplesToFixMate.txt
10 | SAMPLES = $(shell cat $(SAMPLE_FILE))
11 | 
12 | LOGDIR = gsc_bam/logs
13 | 
14 | .DELETE_ON_ERROR:
15 | 
16 | .SECONDARY:
17 | 
18 | # `all` names a goal, not a file; without this a file called "all" would
19 | # make the goal look up to date.
20 | .PHONY : all
21 | 
22 | all : $(foreach sample,$(SAMPLES),gsc_bam/$(sample).fixmate.bam)
23 | 
24 | # stdout and stderr of the tool go to $(LOGDIR)/<target file>.fixmate.log.
25 | gsc_bam/%.fixmate.bam : gsc_bam/%.bam
26 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,10G,40G)" $(MKDIR) $(LOGDIR);\
27 | 	$(FIXMATE) I=$< O=$@ &> ${LOGDIR}/$(@F).fixmate.log
28 | 


--------------------------------------------------------------------------------
/bam_tools/fix_rg.mk:
--------------------------------------------------------------------------------
 1 | # Publish read-group-fixed BAMs: bam/<sample>.bam is a hard link to
 2 | # unprocessed_bam/<sample>.rg.bam, which is expected to be produced by the
 3 | # rules in processBam.mk (included at the end) -- confirm against that file.
 4 | include modules/Makefile.inc
 5 | include modules/variant_callers/gatk.inc
 6 | include modules/aligners/align.inc
 7 | 
 8 | LOGDIR ?= log/fix_rg.$(NOW)
 9 | 
10 | BAMS = $(foreach sample,$(SAMPLES),bam/$(sample).bam)
11 | 
12 | # `fixed_bams` names a goal, not a file.
13 | .PHONY : fixed_bams
14 | 
15 | # Final BAMs plus their .bai indexes.
16 | fixed_bams : $(BAMS) $(addsuffix .bai,$(BAMS))
17 | 
18 | bam/%.bam : unprocessed_bam/%.rg.bam
19 | 	$(INIT) ln -f $(<) $(@)
20 | 
21 | include modules/bam_tools/processBam.mk
22 | 


--------------------------------------------------------------------------------
/bam_tools/get_bam_data_mirror.mk:
--------------------------------------------------------------------------------
 1 | # Fetch each sample's BAM and its two index files (.bam.bai and .bai) from
 2 | # the remote data mirror into bam/ using rsync over ssh.
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/getbam_data_mirror.$(NOW)
 6 | 
 7 | get_bam : $(foreach sample,$(SAMPLES),bam/$(sample).bam) \
 8 | 	  $(foreach sample,$(SAMPLES),bam/$(sample).bam.bai) \
 9 | 	  $(foreach sample,$(SAMPLES),bam/$(sample).bai)
10 | 
11 | # Remote project directory = basename of the working directory. Simple (:=)
12 | # assignment so the shell command runs once rather than on every expansion
13 | # inside the per-sample template below.
14 | PROJECT_NAME := $(shell basename $(PWD))
15 | 
16 | # get-bam: $1 = sample name. $(USER) and $(PROJECT_NAME) are written with a
17 | # single `$`, so they are expanded when the template is instantiated.
18 | define get-bam
19 | bam/$1.bam :
20 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
21 | 					  rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam \
22 | 					  bam/")
23 | 
24 | bam/$1.bam.bai :
25 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
26 | 					  rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam.bai \
27 | 					  bam/")
28 | 
29 | bam/$1.bai :
30 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
31 | 					  rsync -aP -e ssh $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bai \
32 | 					  bam/")
33 | 
34 | 
35 | endef
36 | $(foreach sample,$(SAMPLES),\
37 | 		$(eval $(call get-bam,$(sample))))
38 | 
39 | # Record the path of the transfer tool the recipes actually use (rsync).
40 | ..DUMMY := $(shell mkdir -p version; \
41 |              which rsync > version/getbam_data_mirror.txt)
42 | .SECONDARY: 
43 | .DELETE_ON_ERROR:
44 | .PHONY: get_bam


--------------------------------------------------------------------------------
/bam_tools/get_bam_irb_mirror.mk:
--------------------------------------------------------------------------------
 1 | # Copy each sample's BAM from the IRB mirror with scp, index it with
 2 | # samtools, and provide the index under both names (.bam.bai and .bai).
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/getbam_irb_mirror.$(NOW)
 6 | 
 7 | get_bam : $(foreach sample,$(SAMPLES),bam/$(sample).bam) \
 8 | 	  $(foreach sample,$(SAMPLES),bam/$(sample).bam.bai) \
 9 | 	  $(foreach sample,$(SAMPLES),bam/$(sample).bai)
10 | 
11 | # get-bam: $1 = sample name. The remote path is sharded by the first and
12 | # second character of the sample name (the two `cut -c` calls).
13 | define get-bam
14 | bam/$1.bam :
15 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
16 | 					  scp $(USER)@juno-xfer01.mskcc.org:/juno/dmp/share/irb12_245/`echo $1 | cut -c 1-1`/`echo $1 | cut -c 2-2`/$1.bam \
17 | 					  bam/")
18 | 
19 | bam/$1.bam.bai : bam/$1.bam
20 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
21 | 					  $(SAMTOOLS) index $$(<)")
22 | 
23 | # Copy the index (second prerequisite) to the alternative .bai name.
24 | # There is no `$$(<<)` automatic variable -- it expands to nothing and leaves
25 | # cp with a single operand -- so the prerequisite is selected explicitly.
26 | bam/$1.bai : bam/$1.bam bam/$1.bam.bai
27 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
28 | 					  cp $$(word 2,$$^) $$(@)")
29 | 
30 | 
31 | endef
32 | $(foreach sample,$(SAMPLES),\
33 | 		$(eval $(call get-bam,$(sample))))
34 | 
35 | # Record the path of the scp binary used for the transfer.
36 | ..DUMMY := $(shell mkdir -p version; \
37 |              which scp > version/getbam_irb_mirror.txt)
38 | .SECONDARY: 
39 | .DELETE_ON_ERROR:
40 | .PHONY: get_bam


--------------------------------------------------------------------------------
/bam_tools/merge_bam.mk:
--------------------------------------------------------------------------------
 1 | # Merge several input BAMs per sample (merge.<sample> lists them) into
 2 | # bam/<sample>.bam, or hard-link the single input when there is only one.
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/merge.$(NOW)
 6 | 
 7 | merged_bam : $(foreach sample,$(MERGE_SAMPLES),bam/$(sample).bam bam/$(sample).bam.bai)
 8 | 
 9 | # merged-bam: $1 = sample name.
10 | #  - %.header.sam      : header of any BAM (this pattern rule is re-declared
11 | #                        on every instantiation of the template)
12 | #  - merged_bam/$1.header.sam : the input headers combined with Picard MergeSamFiles
13 | #  - merged_bam/$1.bam : samtools merge of the input BAMs using that header
14 | define merged-bam
15 | %.header.sam : %.bam
16 | 	$$(INIT) $$(SAMTOOLS2) view -H $$< > $$@
17 | 
18 | merged_bam/$1.header.sam : $$(merge.$1:.bam=.header.sam)
19 | 	$$(call RUN,-s 16G -m 18G,"$$(call PICARD,MergeSamFiles,13G) $$(foreach sam,$$^,I=$$(sam) ) O=$$@")
20 | 
21 | merged_bam/$1.bam : merged_bam/$1.header.sam $$(merge.$1)
22 | 	$$(call RUN,-s 12G -m 15G,"$$(SAMTOOLS2) merge -f -h $$< $$(@) $$(filter %.bam,$$^)")
23 | endef
24 | # rename-bam: $1 = sample name, $2 = its single input BAM; hard-links it to bam/$1.bam.
25 | define rename-bam
26 | bam/$1.bam : $2
27 | 	$$(INIT) ln -f $$< $$@
28 | endef
29 | # Two or more inputs -> merged-bam; exactly one -> rename-bam; none -> no rule.
30 | $(foreach sample,$(MERGE_SAMPLES),\
31 | 	$(if $(word 2,$(merge.$(sample))),\
32 | 	$(eval $(call merged-bam,$(sample))),\
33 | 	$(if $(merge.$(sample)),\
34 | 	$(eval $(call rename-bam,$(sample),$(merge.$(sample)))))))
35 | 
36 | 
37 | # Merged samples are published from merged_bam/<s>.rg.bam, which is not built
38 | # here -- presumably by a rule in processBam.mk (included below); confirm.
39 | bam/%.bam : merged_bam/%.rg.bam
40 | 	$(INIT) ln -f $< $@
41 | 
42 | .SECONDARY:
43 | .DELETE_ON_ERROR: 
44 | .PHONY : merged_bam
45 | 
46 | include modules/bam_tools/processBam.mk
47 | 


--------------------------------------------------------------------------------
/bam_tools/put_bam_data_mirror.mk:
--------------------------------------------------------------------------------
 1 | # Upload each sample's BAM and index to the remote data mirror with rsync
 2 | # over ssh; bam/<sample>.taskcomplete is written after all transfers succeed.
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/putbam_data_mirror.$(NOW)
 6 | 
 7 | put_bam : $(foreach sample,$(SAMPLES),bam/$(sample).taskcomplete)
 8 | 
 9 | # Remote project directory = basename of the working directory. Simple (:=)
10 | # assignment so the shell command runs once rather than on every expansion
11 | # inside the per-sample template below.
12 | PROJECT_NAME := $(shell basename $(PWD))
13 | 
14 | # put-bam: $1 = sample name. The local .bam.bai is uploaded twice, once as
15 | # <sample>.bam.bai and once as <sample>.bai.
16 | define put-bam
17 | bam/$1.taskcomplete : bam/$1.bam
18 | 	$$(call RUN,-c -n 1 -s 2G -m 4G, "set -o pipefail && \
19 | 					  rsync -aP -e ssh bam/$1.bam $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam && \
20 | 					  rsync -aP -e ssh bam/$1.bam.bai $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bam.bai && \
21 | 					  rsync -aP -e ssh bam/$1.bam.bai $(USER)@lilac-xfer01.mskcc.org:/oscar/warm/reis-filho/by_user/$(USER)/$(PROJECT_NAME)/$1.bai && \
22 | 					  echo 'finished!' > $$(@)")
23 | 
24 | endef
25 | $(foreach sample,$(SAMPLES),\
26 | 		$(eval $(call put-bam,$(sample))))
27 | 
28 | # Record the path of the transfer tool the recipe actually uses (rsync).
29 | ..DUMMY := $(shell mkdir -p version; \
30 |              which rsync > version/putbam_data_mirror.txt)
31 | .SECONDARY: 
32 | .DELETE_ON_ERROR:
33 | .PHONY: put_bam


--------------------------------------------------------------------------------
/clonality/tableToCloneHDFormat.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Convert a tab-delimited table (header line first, optionally starting with
 4 | # '#') into one line per row: CHROM, POS, then for every sample named on the
 5 | # command line the first value of its "<sample>.AD" column followed by the
 6 | # sum of the first two values. An AD of "." is treated as "0,0".
 7 | 
 8 | use strict;
 9 | use warnings;
10 | 
11 | use Getopt::Std;
12 | my %opt;
13 | getopts('h', \%opt);
14 | 
15 | my $usage = <<ENDL;
16 | Usage: perl tableToCloneHDFormat.pl [samples] < table.txt > output.txt
17 | ENDL
18 | 
19 | sub HELP_MESSAGE {
20 |    print STDERR $usage;
21 |    exit(1);
22 | }
23 | 
24 | HELP_MESSAGE if $opt{h};
25 | 
26 | my @samples = @ARGV;
27 | 
28 | my $headerLine = <STDIN>;
29 | # Empty input: nothing to convert.
30 | exit(0) unless defined $headerLine;
31 | chomp $headerLine;
32 | $headerLine =~ s/^#//;
33 | my @header = split/\t/, $headerLine;
34 | 
35 | # Fail early when a required column is absent instead of emitting rows with
36 | # empty fields for it.
37 | my %isColumn = map { $_ => 1 } @header;
38 | for my $required ("CHROM", "POS", map { $_ . ".AD" } @samples) {
39 |     die "Column '$required' not found in header\n" unless $isColumn{$required};
40 | }
41 | 
42 | while (<STDIN>) {
43 |     # Strip the line terminator so the last column does not carry a newline
44 |     # (otherwise a trailing "." would not be recognised as missing).
45 |     chomp;
46 |     my @F = split /\t/;
47 |     # Map header names to this row's values, in column order.
48 |     my %F = map { $_ => shift @F } @header;
49 |     my $line = "$F{CHROM}\t$F{POS}";
50 |     for my $s (@samples) {
51 |         my $ad = $F{$s . ".AD"};
52 |         my @ad = ($ad ne ".")? split /,/, $ad : qw/0 0/;
53 |         my $dp = $ad[0] + $ad[1];
54 |         $line .= "\t$ad[0]\t$dp";
55 |     }
56 |     print $line . "\n";
57 | }
58 | 
59 | 
60 | 


--------------------------------------------------------------------------------
/conda_env/R3.2.2.txt:
--------------------------------------------------------------------------------
1 | R=3.2.2
2 | r-optparse=1.3.0
3 | 


--------------------------------------------------------------------------------
/conda_env/delly_env.txt:
--------------------------------------------------------------------------------
 1 | # This file may be used to create an environment using:
 2 | # $ conda create --name <env> --file <this file>
 3 | # platform: linux-64
 4 | @EXPLICIT
 5 | https://conda.anaconda.org/bioconda/linux-64/bcftools-1.3.1-1.tar.bz2
 6 | https://repo.continuum.io/pkgs/free/linux-64/click-6.6-py27_0.tar.bz2
 7 | https://repo.continuum.io/pkgs/free/linux-64/curl-7.49.0-1.tar.bz2
 8 | https://conda.anaconda.org/bioconda/linux-64/delly-0.7.6-0.tar.bz2
 9 | https://repo.continuum.io/pkgs/free/linux-64/h5py-2.6.0-np111py27_2.tar.bz2
10 | https://repo.continuum.io/pkgs/free/linux-64/hdf5-1.8.17-1.tar.bz2
11 | https://conda.anaconda.org/bioconda/linux-64/htslib-1.3.1-1.tar.bz2
12 | https://repo.continuum.io/pkgs/free/linux-64/libgcc-5.2.0-0.tar.bz2
13 | https://repo.continuum.io/pkgs/free/linux-64/mkl-11.3.3-0.tar.bz2
14 | https://repo.continuum.io/pkgs/free/linux-64/numpy-1.11.2-py27_0.tar.bz2
15 | https://repo.continuum.io/pkgs/free/linux-64/openssl-1.0.2j-0.tar.bz2
16 | https://repo.continuum.io/pkgs/free/linux-64/pip-9.0.1-py27_0.tar.bz2
17 | https://conda.anaconda.org/bioconda/linux-64/pysam-0.9.1.4-py27_0.tar.bz2
18 | https://conda.anaconda.org/conda-forge/linux-64/python-2.7.12-1.tar.bz2
19 | https://repo.continuum.io/pkgs/free/linux-64/readline-6.2-2.tar.bz2
20 | https://conda.anaconda.org/bioconda/linux-64/samtools-1.3.1-5.tar.bz2
21 | https://repo.continuum.io/pkgs/free/linux-64/setuptools-27.2.0-py27_0.tar.bz2
22 | https://repo.continuum.io/pkgs/free/linux-64/six-1.10.0-py27_0.tar.bz2
23 | https://repo.continuum.io/pkgs/free/linux-64/sqlite-3.13.0-0.tar.bz2
24 | http://repo.continuum.io/pkgs/free/linux-64/tk-8.5.18-0.tar.bz2
25 | https://conda.anaconda.org/conda-forge/linux-64/wheel-0.29.0-py27_0.tar.bz2
26 | https://repo.continuum.io/pkgs/free/linux-64/zlib-1.2.8-3.tar.bz2
27 | 


--------------------------------------------------------------------------------
/contamination/clusterSamples.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | include modules/variant_callers/gatk.inc
 3 | 
 4 | LOGDIR = log/cluster_samples.$(NOW)
 5 | 
 6 | VPATH ?= bam
 7 | ifeq ($(EXOME),true)
 8 | DBSNP_SUBSET ?= $(HOME)/share/reference/dbsnp_137_exome.bed
 9 | else
10 | DBSNP_SUBSET = $(HOME)/share/reference/dbsnp_tseq_intersect.bed
11 | endif
12 | # Script run by the snps_ft.pdf rule; the recipe references this variable so the path lives in one place.
13 | CLUSTER_VCF = modules/contamination/clusterSampleVcf.R
14 | 
15 | snp_cluster : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf) \
16 | 	      snp_vcf/snps.vcf \
17 | 	      snp_vcf/snps_ft.vcf \
18 | 	      snp_vcf/snps_ft.pdf
19 | # Genotype one BAM with GATK UnifiedGenotyper at the DBSNP_SUBSET intervals, emitting all sites.
20 | snp_vcf/%.snps.vcf : bam/%.bam 
21 | 	$(call RUN,-n 4 -s 2.5G -m 3G,"set -o pipefail && \
22 | 				       $(call GATK_MEM,8G) \
23 | 				       -T UnifiedGenotyper \
24 | 				       -rf BadCigar \
25 | 				       -nt 4 \
26 | 				       -R $(REF_FASTA) \
27 | 				       --dbsnp $(DBSNP) \
28 | 				       $(foreach bam,$(filter %.bam,$^),-I $(bam) ) \
29 | 				       -L $(DBSNP_SUBSET) \
30 | 				       -o $@ \
31 | 				       --output_mode EMIT_ALL_SITES")
32 | 
33 | # Combine the per-sample VCFs into a single VCF with GATK CombineVariants.
34 | snp_vcf/snps.vcf : $(foreach sample,$(SAMPLES),snp_vcf/$(sample).snps.vcf)
35 | 	$(call RUN,-s 16G -m 20G,"set -o pipefail && \
36 | 				  $(call GATK_MEM,14G) -T CombineVariants \
37 | 				  $(foreach vcf,$^,--variant $(vcf) ) \
38 | 				  -o $@ \
39 | 				  --genotypemergeoption UNSORTED \
40 | 				  -R $(REF_FASTA)")
41 | # Keep the header lines plus every record that contains a 0/1 or 1/1 genotype string.
42 | snp_vcf/snps_ft.vcf : snp_vcf/snps.vcf
43 | 	$(INIT) grep '^#' $< > $@ && grep -e '0/1' -e '1/1' $< >> $@
44 | # Run the CLUSTER_VCF script on the filtered VCF to produce the PDF.
45 | snp_vcf/snps_ft.pdf : snp_vcf/snps_ft.vcf
46 | 	$(call RUN,-n 1 -s 16G -m 20G -v $(VARIANT_ANNOTATION_ENV),"set -o pipefail && \
47 | 								    $(RSCRIPT) $(CLUSTER_VCF) \
48 | 								    --input_file $(<) \
49 | 								    --output_file $(@) \
50 | 								    --sample_pairs '$(SAMPLE_PAIRS)' \
51 | 								    --genome b37")
52 | 	
53 | 	
54 | ..DUMMY := $(shell mkdir -p version; \
55 |              echo "GATK" > version/cluster_samples.txt;)
56 | .SECONDARY:
57 | .DELETE_ON_ERROR:
58 | .PHONY : snp_cluster
59 | 
60 | include modules/vcf_tools/vcftools.mk
61 | 


--------------------------------------------------------------------------------
/contamination/contest.mk:
--------------------------------------------------------------------------------
 1 | # This module runs ContEst on tumor/normal BAM pairs and collects the results in one table
 2 | # Author: inodb, limr
 3 | 
 4 | ##### MAKE INCLUDES #####
 5 | include modules/Makefile.inc
 6 | include modules/variant_callers/gatk.inc
 7 | 
 8 | LOGDIR ?= log/contest.$(NOW)
 9 | 
10 | .SECONDARY:
11 | .DELETE_ON_ERROR:
12 | .PHONY: contest
13 | 
14 | contest : contest/all_contest.txt
15 | 
16 | # ContEst doing on-the-fly genotyping
17 | define contest-tumor-normal
18 | contest/$1_$2.contest.txt : bam/$1.bam bam/$2.bam
19 | 	$$(call RUN,-s 12G -m 12G,"$$(call GATK_MEM2,4G) -T ContEst -I:eval $$(<) -I:genotype $$(<<) \
20 | 		-pf $$(HAPMAP_POP_FILE) -o $$(@) -R $$(REF_FASTA)")
21 | endef
22 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call contest-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
23 | # Summary: first file's header with a leading "sample" column, then each file's META row prefixed by the pair name (file name minus .contest.txt) and a tab, sorted numerically descending on column 5.
24 | contest/all_contest.txt : $(foreach pair,$(SAMPLE_PAIRS),contest/$(pair).contest.txt)
25 | 	( \
26 | 		head -1 $< | sed "s/^/sample\t/"; \
27 | 		for s in $(^); do \
28 | 			grep -P "META\t" $$s | sed "s/^/`basename $$s .contest.txt`\t/"; \
29 | 		done | sort -rnk5,5; \
30 | 	) > $@
31 | 


--------------------------------------------------------------------------------
/copy_number/CytoBand.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/copy_number/CytoBand.RData


--------------------------------------------------------------------------------
/copy_number/absCNseq.mk:
--------------------------------------------------------------------------------
 1 | # run absCNseq on varscan segmentation data
 2 | include modules/Makefile.inc
 3 | include modules/variant_callers/gatk.inc
 4 | 
 5 | LOGDIR = log/absCN.$(NOW)
 6 | ABS_CN_SEQ = $(RSCRIPT) modules/copy_number/absCNseq.R
 7 | 
 8 | # file containing positions: chr:start-stop
 9 | SNV_POS_FILE = snv_posn.intervals
10 | 
11 | .DELETE_ON_ERROR:
12 | .SECONDARY: 
13 | .PHONY : all
14 | 
15 | all : $(foreach pair,$(SAMPLE_PAIRS),absCN/$(pair).absCN.txt)
16 | # abs-gatk-tumor-normal,<tumor>,<normal>: genotype both BAMs with GATK UnifiedGenotyper at the intervals in SNV_POS_FILE, emitting all sites.
17 | define abs-gatk-tumor-normal
18 | absCN/$1_$2.gatk.vcf : bam/$1.bam bam/$2.bam
19 | 	$$(call RUN,-s 9G -m 12G,"$$(call GATK_MEM,8G) -T UnifiedGenotyper -o $$@ -I $$< -I $$(<<) -R $$(REF_FASTA)  --output_mode EMIT_ALL_SITES -L $$(SNV_POS_FILE)")
20 | endef
21 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call abs-gatk-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
22 | # One run of the absCNseq script is declared to produce both outputs; it receives the varscan segmentation and the GATK VCF and writes under the prefix absCN/<pair>.
23 | absCN/%.absCN.txt absCN/%.absSNV.txt : varscan/segment/%.varscan2copynumber.txt absCN/%.gatk.vcf
24 | 	$(call RUN,-s 4G -m 6G,"$(ABS_CN_SEQ) --genome $(REF) --outPrefix absCN/$* $^")
25 | 


--------------------------------------------------------------------------------
/copy_number/compare_facets_cncf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | import sys
 5 | import argparse
 6 | import math
 7 | 
 8 | if __name__ == '__main__':
 9 |     parser = argparse.ArgumentParser(prog='compare_facets_cncf.py',
10 |                                      description='compare two facets cncf files')
11 |     parser.add_argument('cncf1')
12 |     parser.add_argument('cncf2')
13 |     args = parser.parse_args()
14 | 
15 |     df1 = pd.read_table(args.cncf1).fillna(0)
16 |     df2 = pd.read_table(args.cncf2).fillna(0)
17 |     mafr_diff = 0
18 |     for i, row1 in df1.iterrows():
19 |         for j, row2 in df2.iterrows():
20 |             if (row1['loc.start'] >= row2['loc.start'] and
21 |                     row1['loc.start'] <= row2['loc.end']) or \
22 |                     (row1['loc.end'] >= row2['loc.start'] and
23 |                      row1['loc.end'] <= row2['loc.start']):
24 |                 mafr_diff += math.fabs(row1['mafR.clust'] - row2['mafR.clust'])
25 |                 break
26 |     if mafr_diff < 20:
27 |         print(("success, CNCF files are similar: {} {}".format(args.cncf1, args.cncf2)))
28 |         sys.exit(0)
29 |     else:
30 |         print(("failed, files have high mafR difference: {} {}".format(args.cncf1, args.cncf2)))
31 |         sys.exit(1)
32 | 


--------------------------------------------------------------------------------
/copy_number/convert_basecount_to_snp_pileup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import pandas as pd
 4 | import sys
 5 | import argparse
 6 | 
 7 | if __name__ == '__main__':
 8 |     parser = argparse.ArgumentParser(prog='convert_basecount_to_snp_pileup.py',
 9 |                                      description='convert old basecount facets files to snp-pileup')
10 |     parser.add_argument('basecount')
11 |     args = parser.parse_args()
12 | 
13 |     bc = pd.read_table(args.basecount, dtype={'Chrom': str})
14 |     sp = pd.DataFrame()
15 |     sp['Chromosome'] = bc.Chrom
16 |     sp['Position'] = bc.Pos
17 |     sp['Ref'] = bc.Ref
18 |     sp['Alt'] = bc.Alt
19 |     for i, row in bc.iterrows():
20 |         sp.ix[i, 'File1R'] = int(bc.ix[i, "NOR.{}p".format(sp.ix[i, 'Ref'])] +
21 |                                  bc.ix[i, "NOR.{}n".format(sp.ix[i, 'Ref'])])
22 |         sp.ix[i, 'File1A'] = int(bc.ix[i, "NOR.{}p".format(sp.ix[i, 'Alt'])] +
23 |                                  bc.ix[i, "NOR.{}n".format(sp.ix[i, 'Alt'])])
24 |         sp.ix[i, 'File1E'] = 0
25 |         sp.ix[i, 'File1D'] = 0
26 |         sp.ix[i, 'File2R'] = int(bc.ix[i, "TUM.{}p".format(sp.ix[i, 'Ref'])] +
27 |                                  bc.ix[i, "TUM.{}n".format(sp.ix[i, 'Ref'])])
28 |         sp.ix[i, 'File2A'] = int(bc.ix[i, "TUM.{}p".format(sp.ix[i, 'Alt'])] +
29 |                                  bc.ix[i, "TUM.{}n".format(sp.ix[i, 'Alt'])])
30 |         sp.ix[i, 'File2E'] = 0
31 |         sp.ix[i, 'File2D'] = 0
32 | 
33 |     for col in sp.columns[4:]:
34 |         sp[col] = sp[col].astype(int)
35 | 
36 |     sp.to_csv(sys.stdout, index=False)
37 | 


--------------------------------------------------------------------------------
/copy_number/createFacetsSummary.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Gather purity, ploidy and dipLogR from the `fit` object loaded from each
 4 | # input file into one tab-delimited table, one row per input file.
 5 | 
 6 | suppressPackageStartupMessages({
 7 |     library("optparse")
 8 |     library("dplyr")
 9 | })
10 | 
11 | optList <- list(make_option("--outFile", default = NULL, help = "output file"))
12 | parser <- OptionParser(usage = "%prog [options] [facets files]", option_list = optList)
13 | parsed <- parse_args(parser, positional_arguments = TRUE)
14 | opt <- parsed$options
15 | 
16 | # Both checks end in stop(), so reaching the assignment after them means the
17 | # command line carries at least one input file and an output file.
18 | if (length(parsed$args) < 1) {
19 |     cat("Need facets output files\n")
20 |     print_help(parser)
21 |     stop()
22 | }
23 | if (is.null(opt$outFile)) {
24 |     cat("Need output file\n")
25 |     print_help(parser)
26 |     stop()
27 | }
28 | facetsFiles <- parsed$args
29 | 
30 | Df <- data.frame()
31 | for (facetsFile in facetsFiles) {
32 |     load(facetsFile)
33 |     # From the file name without its directory: tumor is the text before the
34 |     # first "_", normal the text after the last "_", each cut at the first ".".
35 |     baseName <- sub('.*/', '', facetsFile)
36 |     tumorName <- sub('\\..*', '', sub('_.*', '', baseName))
37 |     normalName <- sub('\\..*', '', sub('^.*_', '', baseName))
38 |     rowId <- paste(tumorName, normalName, sep = '_')
39 |     Df[rowId, 'tumorName'] <- tumorName
40 |     if (tumorName != normalName) {
41 |         Df[rowId, 'normalName'] <- normalName
42 |     }
43 |     Df[rowId, 'purity'] <- fit$purity
44 |     Df[rowId, 'ploidy'] <- fit$ploidy
45 |     Df[rowId, 'dipLogR'] <- fit$dipLogR
46 | }
47 | # Flag rows whose purity is missing or not above 0.3.
48 | Df <- Df %>% mutate(bad = is.na(purity) | purity <= 0.3)
49 | 
50 | write.table(Df, file = opt$outFile, sep = '\t', quote = FALSE, row.names = FALSE)
51 | 


--------------------------------------------------------------------------------
/copy_number/exomeCNVLOH.mk:
--------------------------------------------------------------------------------
 1 | # Use ExomeCNV to detect copy number variants and LOH
 2 | # vim: set ft=make :
 3 | 
 4 | include modules/Makefile.inc
 5 | 
 6 | LOGDIR = log/exomeCNVLOH.$(NOW)
 7 | EXOMECNV = modules/copy_number/exomeCNV.R
 8 | EXOMECNVLOH = modules/copy_number/exomeCNVLOH.R
 9 | CREATE_BAF = $(PERL) $(HOME)/share/usr/bin/for.loh.files.pl
10 | 
11 | .SECONDARY:
12 | .DELETE_ON_ERROR:
13 | .PHONY: all loh
14 | 
15 | all : loh
16 | 
17 | ifdef SAMPLE_PAIRS
18 | LOH += $(foreach pair,$(SAMPLE_PAIRS),exomecnv/loh/$(pair).loh.txt)
19 | # exomecnv-baf-tumor-normal-set,<tumor>,<normal>,<set>: find the 1-based positions of the normal and tumor names among the #CHROM header fields from column 10 on, pass them to CREATE_BAF with the two BAF output paths, then touch the timestamp.
20 | define exomecnv-baf-tumor-normal-set
21 | exomecnv/baf/$1_$2.baf_timestamp : vcf/$(subst $( ),_,$3).gatk_snps.target_ft.dp_ft.pass.vcf
22 | 	normal=`grep -m1 '^#CHROM' $$< | cut -f10- | tr '\t' '\n' | grep -n '^$2$$$$' | cut -f1 -d:`; \
23 | 	tumor=`grep -m1 '^#CHROM' $$< | cut -f10- | tr '\t' '\n' | grep -n '^$1$$$$' | cut -f1 -d:`; \
24 | 	$$(INIT) $$(CREATE_BAF) $$< exomecnv/baf/$1_$2.baf_1.txt exomecnv/baf/$1_$2.baf_2.txt $$$$normal $$$$tumor && touch $$@
25 | exomecnv/baf/$1_$2.baf_1.txt : exomecnv/baf/$1_$2.baf_timestamp
26 | exomecnv/baf/$1_$2.baf_2.txt : exomecnv/baf/$1_$2.baf_timestamp
27 | endef
28 | $(foreach pair,$(SAMPLE_PAIRS),\
29 | 	$(eval $(call exomecnv-baf-tumor-normal-set,$(tumor.$(pair)),$(normal.$(pair)),$(set.$(pair)))))
30 | 
31 | # exomecnv-loh-pair,<pair>: run the EXOMECNVLOH script on the pair's two BAF files (first as --tumor, second as --normal), writing under the target's directory.
32 | define exomecnv-loh-pair
33 | exomecnv/loh/$1.loh.txt : exomecnv/baf/$1.baf_1.txt exomecnv/baf/$1.baf_2.txt
34 | 	$$(call RUN,-s 4G -m 6G,"$$(RSCRIPT) $$(EXOMECNVLOH) --tumor $$< --normal $$(word 2,$$^) --outPrefix $$(@D)/$1")
35 | endef
36 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call exomecnv-loh-pair,$(pair))))
37 | 
38 | else
39 | LOH += $(foreach sample,$(SAMPLES),exomecnv/loh/$(sample).loh.txt)
40 | # NOTE(review): no rule in this file builds exomecnv/loh/<sample>.loh.txt for unpaired samples; confirm one is provided elsewhere.
41 | 
42 | endif
43 | 
44 | loh : $(LOH) 
45 | 
46 | include modules/vcf_tools/vcftools.mk
47 | include modules/variant_callers/gatk.mk
48 | 


--------------------------------------------------------------------------------
/copy_number/exomeCNVLOHHeatmap.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR = log/exome_cnv_loh_heatmap.$(NOW)
 4 | # Recipes in this file are R code: SHELL points at modules/scripts/Rshell and .ONESHELL hands each recipe to it as a whole.
 5 | SHELL = modules/scripts/Rshell
 6 | .SHELLFLAGS = -m $(MEM) -p $(PE) -n $(@F) -l $(LOGDIR) -e 
 7 | 
 8 | .ONESHELL:
 9 | .DELETE_ON_ERROR:
10 | .SECONDARY:
11 | .PHONY: all
12 | 
13 | MEM := 2G
14 | PE := 1
15 | 
16 | all : exomecnv/lohheatmap.png
17 | # Build a targets-by-sample 0/1 matrix: each target overlapping a segment takes that segment's LOH value; FALSE and NA become 0.
18 | exomecnv/loh/lohmat.Rdata : $(foreach pair,$(SAMPLE_PAIRS),exomecnv/loh/$(pair).loh.txt)
19 | 	lohFiles <- unlist(strsplit("$^", " "))
20 | 	lohNames <- sub(".*/", "", sub("\\..*", "", lohFiles))
21 | 	suppressPackageStartupMessages(library("rtracklayer"));
22 | 	targets <- import('$(TARGETS_FILE)');
23 | 	for (i in 1:length(lohFiles)) {
24 | 		lohFile <- lohFiles[i]
25 | 		lohName <- lohNames[i]
26 | 		s <- read.delim(lohFile, header = T, as.is = T)
27 | 		lohGR <- GRanges(seqnames = sub('chr', '', s[, "chr"],), ranges = IRanges(start = s[, "position.start"], end = s[, "position.end"]), loh = s[, "LOH"])
28 | 		x <- suppressWarnings(findOverlaps(targets, lohGR))
29 | 		mcols(targets)[queryHits(x), lohName] <- lohGR[subjectHits(x)]$$loh
30 | 	}
31 | 	names(targets) <- paste(seqnames(targets), start(targets), sep="_")
32 | 	lohmat <- as.matrix(mcols(targets))
33 | 	rownames(lohmat) <- names(targets)
34 | 	lohmat[lohmat] <- 1
35 | 	lohmat[which(!lohmat | is.na(lohmat))] <- 0
36 | 	dir.create('$(@D)', showWarnings = F)
37 | 	save(lohmat, file = "$@")
38 | # Draw the transposed matrix (samples as rows) as a white/red heatmap with a colour bar keyed by the chromosome part of each row name.
39 | exomecnv/lohheatmap.png : exomecnv/loh/lohmat.Rdata
40 | 	load("$<")
41 | 	suppressPackageStartupMessages(library("RColorBrewer"));
42 | 	suppressPackageStartupMessages(library("gplots"));
43 | 	cols <- c(brewer.pal(8, "Dark2"), brewer.pal(8, "Set1"), brewer.pal(8, "Set2"))
44 | 	chr <- unlist(lapply(rownames(lohmat), function(x) {strsplit(x, split="_", fixed=T)[[1]][1]}))
45 | 	dir.create('$(@D)', showWarnings = F)
46 | 	png("$@", height=600, width=1200, type="cairo")
47 | 	heatmap.2(t(lohmat), trace="none", scale = 'none', Colv = NA, col=c("white", "red"), margin=c(5,15), labCol="", ColSideColors=cols[as.integer(as.factor(chr))], cexCol=1.4, dendrogram = 'row', key = F)
48 | 	null <- dev.off()
49 | 
50 | 


--------------------------------------------------------------------------------
/copy_number/hmmCopy.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR = log/hmmcopy.$(NOW)
 4 | 
 5 | HMMCOPY_WINDOW_SIZE ?= 1000
 6 | HMMCOPY = $(RSCRIPT) modules/copy_number/hmmCopy.R
 7 | 
 8 | READ_COUNTER = $(HOME)/share/usr/bin/readCounter
 9 | MAP_COUNTER = $(HOME)/share/usr/bin/mapCounter
10 | GC_COUNTER = $(HOME)/share/usr/bin/gcCounter
11 | # The map-wig rule reads MAP_BIGWIG; default it to MAP_BW so the path below is used when nothing else sets it.
12 | MAP_BW = $(HOME)/share/references/genomes/wgEncodeCrgMapabilityAlign100mer.bigWig
13 | MAP_BIGWIG ?= $(MAP_BW)
14 | .SECONDARY:
15 | .DELETE_ON_ERROR:
16 | .PHONY : hmmcopy
17 | # One segmentation result per tumor/normal pair at the configured window size.
18 | hmmcopy : $(foreach pair,$(SAMPLE_PAIRS),hmmcopy/results.w$(HMMCOPY_WINDOW_SIZE)/$(tumor.$(pair))_$(normal.$(pair)).hmmcopy_seg.txt)
19 | # Read-count wig per BAM for the chromosomes listed in CHROMOSOMES.
20 | hmmcopy/wig/%.w$(HMMCOPY_WINDOW_SIZE).wig : bam/%.bam bam/%.bam.bai
21 | 	$(call RUN,-s 6G -m 8G,"$(READ_COUNTER) -w $(HMMCOPY_WINDOW_SIZE) -c $(subst $( ),$(,),$(strip $(CHROMOSOMES))) $< > $@")
22 | # GC wig computed from the reference FASTA.
23 | hmmcopy/wig/gc.w$(HMMCOPY_WINDOW_SIZE).wig :
24 | 	$(call RUN,-s 6G -m 8G,"$(GC_COUNTER) -w $(HMMCOPY_WINDOW_SIZE) -c $(subst $( ),$(,),$(strip $(CHROMOSOMES))) $(REF_FASTA) > $@")
25 | # Map wig computed from the MAP_BIGWIG file.
26 | hmmcopy/wig/map.w$(HMMCOPY_WINDOW_SIZE).wig :
27 | 	$(call RUN,-s 6G -m 8G,"$(MAP_COUNTER) -w $(HMMCOPY_WINDOW_SIZE) -c $(subst $( ),$(,),$(strip $(CHROMOSOMES))) $(MAP_BIGWIG) > $@")
28 | # hmmcopy-tumor-normal,<tumor>,<normal>: run the HMMCOPY script on the tumor wig with the normal, GC and map wigs.
29 | define hmmcopy-tumor-normal
30 | hmmcopy/results.w$$(HMMCOPY_WINDOW_SIZE)/$1_$2.hmmcopy_seg.txt : hmmcopy/wig/$1.w$$(HMMCOPY_WINDOW_SIZE).wig hmmcopy/wig/$2.w$$(HMMCOPY_WINDOW_SIZE).wig hmmcopy/wig/gc.w$$(HMMCOPY_WINDOW_SIZE).wig hmmcopy/wig/map.w$$(HMMCOPY_WINDOW_SIZE).wig 
31 | 	$$(call RUN,-s 8G -m 12G,"$$(HMMCOPY) --normalWig $$(<<) --gcWig $$(<<<) --mapWig $$(<<<<) --outPrefix $$(@D)/$1_$2 $$<")
32 | endef
33 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call hmmcopy-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
34 | 


--------------------------------------------------------------------------------
/copy_number/normalisedCopyNum.mk:
--------------------------------------------------------------------------------
 1 | # MSK Impact procedure
 2 | include modules/Makefile.inc
 3 | include modules/variant_callers/gatk.inc
 4 | 
 5 | LOGDIR = log/norm_copynum.$(NOW)
 6 | NORMALISE_COPYNUM = $(RSCRIPT) modules/copy_number/normaliseCopyNum.R
 7 | NORMALISE_MIN_COV ?= 50
 8 | NORMALISE_UNDO_SD ?= 2
 9 | NORMALISE_WINDOW_SIZE ?= 100
10 | NORMALISE_OUTLIER_SD_SCALE ?= 2.5
11 | NORMALISE_ALPHA ?= 0.05
12 | NORMALISE_TRIM ?= 0.05
13 | NORMALISE_COPYNUM_OPTS = --undoSD $(NORMALISE_UNDO_SD) --outlierSDscale $(NORMALISE_OUTLIER_SD_SCALE) --alpha $(NORMALISE_ALPHA) --minCov $(NORMALISE_MIN_COV) --trim $(NORMALISE_TRIM)
14 | 
15 | .SECONDARY:
16 | .DELETE_ON_ERROR:
17 | .PHONY : norm_copynum
18 | 
19 | norm_copynum : norm_copynum/seg.txt
20 | # Tile the target regions into windows of NORMALISE_WINDOW_SIZE bp with a step of window size + 1.
21 | norm_copynum/targets.bed : $(TARGETS_FILE)
22 | 	$(INIT) $(BEDTOOLS) makewindows -w $(NORMALISE_WINDOW_SIZE) -s $$(($(NORMALISE_WINDOW_SIZE) + 1)) -b $< > $@
23 | # Run bedtools nuc on a BED file against the reference FASTA.
24 | %.nuc.bed : %.bed
25 | 	$(INIT) $(BEDTOOLS) nuc -fi $(REF_FASTA) -bed $< > $@
26 | # Per-sample GATK DepthOfCoverage restricted to the windowed targets.
27 | norm_copynum/doc/%.doc : bam/%.bam norm_copynum/targets.bed
28 | 	$(call RUN,-s 9G -m 15G,"$(call GATK_MEM,7G) -T DepthOfCoverage -R $(REF_FASTA) -I $< -L $(<<) -o $@")
29 | # Run the normalisation script on every sample's <doc>.sample_interval_summary file; output goes to the target's directory.
30 | norm_copynum/seg.txt : norm_copynum/targets.nuc.bed $(foreach sample,$(SAMPLES),norm_copynum/doc/$(sample).doc)
31 | 	$(call RUN,-s 8G -m 10G,"$(NORMALISE_COPYNUM) $(NORMALISE_COPYNUM_OPTS) --sampleSetsFile $(SAMPLE_SET_FILE) --nucFile $< --centromereFile $(CENTROMERE_TABLE2) --outDir $(@D) $(addsuffix .sample_interval_summary,$(filter %.doc,$^))")
32 | 


--------------------------------------------------------------------------------
/copy_number/titan.inc:
--------------------------------------------------------------------------------
 1 | ifndef TITAN_INC
 2 | EXTRACT_ALLELE_READ_COUNTS = $(ANACONDA_PYTHON) $(HOME)/share/usr/TITANRunner-0.0.3/scripts/count.py
 3 | TITAN = $(RSCRIPT) modules/copy_number/runTitan.R
 4 | TITAN_SEG = $(PERL) $(HOME)/share/usr/TITANRunner-0.0.3/scripts/createTITANsegmentfiles.pl
 5 | SUMMARIZE_TITAN = $(RSCRIPT) modules/copy_number/summarizeTitan.R
 6 | ANNOTATE_TITAN_LOH_VCF = $(RSCRIPT) modules/copy_number/annotateTitanLOHVcf.R
 7 | # Literal list (the value `seq 1 5` produced) so no shell is forked each time the variable is expanded.
 8 | NUM_CLUSTERS ?= 1 2 3 4 5
 9 | PLOIDY_PRIORS = 2 3 4
10 | DEFAULT_PLOIDY_PRIOR ?= 2
11 | 
12 | BQ_THRESHOLD ?= 20
13 | MQ_THRESHOLD ?= 20
14 | TITAN_WINDOW_SIZE ?= 10000
15 | 
16 | TITAN_SELF_TRANSITION ?= 1e15
17 | TITAN_CLONAL_CLUSTER_TRANSITION ?= 5e5
18 | 
19 | # Rebuilt unconditionally: override := replaces any TITAN_OPTS given on the command line.
20 | override TITAN_OPTS := $(if $(UCSC_REF),--genomeStyle UCSC,--genomeStyle NCBI) $(if $(TARGETS_FILE),--targetBed $(TARGETS_FILE)) 
21 | READ_COUNTER = $(HOME)/share/usr/bin/readCounter
22 | MAP_COUNTER = $(HOME)/share/usr/bin/mapCounter
23 | GC_COUNTER = $(HOME)/share/usr/bin/gcCounter
24 | 
25 | #VCF_FIELDS += titanCN titanMinorCN titanMajorCN titanCall titanMedianRatio titanMedianLogR
26 | endif
27 | TITAN_INC = true
28 | 


--------------------------------------------------------------------------------
/db/chasm_db.yaml:
--------------------------------------------------------------------------------
1 | host: cpu-6-2
2 | db: CHASM
3 | port: 9991
4 | docker_repo: limr/chasm-db
5 | data_dir: /cbio/ski/reis-filho/home/limr/share/usr/chasm3-db
6 | user: chasm
7 | password: chasm
8 | 


--------------------------------------------------------------------------------
/db/create_mysql_docker_images.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env sh
 2 | # Create and start three detached MySQL containers. Each one mounts
 3 | # ~/downloads/<db> at /docker-entrypoint-initdb.d and ~/share/usr/<db> as the
 4 | # MySQL data directory, and publishes container port 3306 on its own host port.
 5 | 
 6 | # fathmm database on host port 9990
 7 | docker run --name fathmm-db \
 8 |     -v ~/downloads/fathmm:/docker-entrypoint-initdb.d \
 9 |     -e MYSQL_ROOT_PASSWORD=fathmm \
10 |     -e MYSQL_DATABASE=fathmm \
11 |     -e MYSQL_USER=fathmm \
12 |     -e MYSQL_PASSWORD=fathmm \
13 |     -p 9990:3306 \
14 |     -v ~/share/usr/fathmm-db:/var/lib/mysql \
15 |     -d mysql:5.7.14
16 | # CHASM database on host port 9991
17 | docker run --name chasm-db \
18 |     -v ~/downloads/chasm:/docker-entrypoint-initdb.d \
19 |     -e MYSQL_ROOT_PASSWORD=chasm \
20 |     -e MYSQL_DATABASE=CHASM \
21 |     -e MYSQL_USER=chasm \
22 |     -e MYSQL_PASSWORD=chasm \
23 |     -p 9991:3306 \
24 |     -v ~/share/usr/chasm3-db:/var/lib/mysql \
25 |     -d mysql:5.7.14
26 | # homo_sapiens_core_85_37 database on host port 9992
27 | docker run --name ensembl-hs-core-85-37-db \
28 |     -v ~/downloads/homo_sapiens_core_85_37:/docker-entrypoint-initdb.d \
29 |     -e MYSQL_ROOT_PASSWORD=embl \
30 |     -e MYSQL_DATABASE=homo_sapiens_core_85_37 \
31 |     -e MYSQL_USER=embl \
32 |     -e MYSQL_PASSWORD=embl \
33 |     -e MYSQL_ALLOW_EMPTY_PASSWORD=yes \
34 |     -p 9992:3306 \
35 |     -v ~/share/usr/ensembl-hs-core-85-37-db:/var/lib/mysql \
36 |     -d mysql:5.5
37 | 


--------------------------------------------------------------------------------
/db/ensembl-hs-core-85-37_db.yaml:
--------------------------------------------------------------------------------
1 | host: cpu-6-2
2 | db: homo_sapiens_core_85_37
3 | port: 9992
4 | docker_repo: limr/ensembl-hs-core-85-37-db
5 | data_dir: /cbio/ski/reis-filho/home/limr/share/usr/ensembl-hs-core-85-37-db/
6 | user: embl
7 | password: embl
8 | 


--------------------------------------------------------------------------------
/db/fathmm_config-cpu-6-2.ini:
--------------------------------------------------------------------------------
1 | [DATABASE]
2 | HOST   = cpu-6-2
3 | PORT   = 9990
4 | USER   = fathmm
5 | PASSWD = fathmm
6 | DB     = fathmm
7 | 


--------------------------------------------------------------------------------
/db/fathmm_config-e01.ini:
--------------------------------------------------------------------------------
1 | [DATABASE]
2 | HOST   = 10.0.200.48
3 | PORT   = 3306
4 | USER   = fathmm_user
5 | PASSWD = CSred74pop
6 | DB     = fathmm
7 | 


--------------------------------------------------------------------------------
/db/fathmm_config-ika.ini:
--------------------------------------------------------------------------------
1 | [DATABASE]
2 | HOST   = 10.0.200.44
3 | PORT   = 3306
4 | USER   = fathmm_user
5 | PASSWD = CSred74pop
6 | DB     = fathmm
7 | 


--------------------------------------------------------------------------------
/db/fathmm_config-lilac.ini:
--------------------------------------------------------------------------------
1 | [DATABASE]
2 | HOST   = 10.230.1.20
3 | PORT   = 3306
4 | USER   = fathmm_user
5 | PASSWD = CSred74pop
6 | DB     = fathmm
7 | 


--------------------------------------------------------------------------------
/db/fathmm_config-swan.ini:
--------------------------------------------------------------------------------
1 | [DATABASE]
2 | HOST   = 10.230.1.43
3 | PORT   = 3306
4 | USER   = fathmm_user
5 | PASSWD = CSred74pop
6 | DB     = fathmm
7 | 


--------------------------------------------------------------------------------
/db/fathmm_db.yaml:
--------------------------------------------------------------------------------
1 | host: cpu-6-2
2 | db: fathmm
3 | port: 9990
4 | docker_repo: limr/fathmm-db
5 | data_dir: /cbio/ski/reis-filho/home/limr/share/usr/fathmm-db
6 | user: fathmm
7 | password: fathmm
8 | 


--------------------------------------------------------------------------------
/db/run_mysql_docker_images.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 | # Start the three database containers detached, publishing host ports 9990, 9991 and 9992 to container port 3306. Example for getting an interactive docker node first: qsub -I -l walltime=999:99:99 -l nodes=1:docker
3 | docker run -d -v /cbio/ski/reis-filho/home/limr/share/usr/fathmm-db:/var/lib/mysql -p 9990:3306 limr/fathmm-db
4 | docker run -d -v /cbio/ski/reis-filho/home/limr/share/usr/chasm3-db:/var/lib/mysql -p 9991:3306 limr/chasm-db
5 | docker run -d -v /cbio/ski/reis-filho/home/limr/share/usr/ensembl-hs-core-85-37-db/:/var/lib/mysql -p 9992:3306 limr/ensembl-hs-core-85-37-db
6 | 


--------------------------------------------------------------------------------
/db/snv_box-cpu-6-2.conf:
--------------------------------------------------------------------------------
1 | ; SNV-Box configuration file
2 | ; Contains the Database specifications
3 | chasmDB=CHASM
4 | db.user=chasm
5 | db.password=chasm
6 | db.host=cpu-6-2
7 | db.port=9991
8 | ;db.unix_socket=/tmp/mysql.sock
9 | 


--------------------------------------------------------------------------------
/db/snv_box-e01.conf:
--------------------------------------------------------------------------------
1 | ; SNV-Box configuration file
2 | ; Contains the Database specifications
3 | chasmDB=CHASM
4 | db.user=chasm_user
5 | db.password=password
6 | db.host=10.0.200.71
7 | db.port=38493
8 | ;db.unix_socket=/tmp/mysql.sock
9 | 


--------------------------------------------------------------------------------
/db/snv_box-ika.conf:
--------------------------------------------------------------------------------
1 | ; SNV-Box configuration file
2 | ; Contains the Database specifications
3 | chasmDB=CHASM
4 | db.user=chasm_user
5 | db.password=password
6 | db.host=10.0.200.44
7 | db.port=38493
8 | ;db.unix_socket=/tmp/mysql.sock
9 | 


--------------------------------------------------------------------------------
/db/snv_box-lilac.conf:
--------------------------------------------------------------------------------
1 | ; SNV-Box configuration file
2 | ; Contains the Database specifications
3 | chasmDB=CHASM
4 | db.user=chasm_user
5 | db.password=password
6 | db.host=10.230.1.20
7 | db.port=38493
8 | ;db.unix_socket=/tmp/mysql.sock
9 | 


--------------------------------------------------------------------------------
/db/snv_box-swan.conf:
--------------------------------------------------------------------------------
1 | ; SNV-Box configuration file
2 | ; Contains the Database specifications
3 | chasmDB=CHASM
4 | db.user=chasm_user
5 | db.password=password
6 | db.host=10.230.1.43
7 | db.port=38493
8 | ;db.unix_socket=/tmp/mysql.sock
9 | 


--------------------------------------------------------------------------------
/default_yaml/project_config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | cluster_engine: LSF ## PBS ## SGE
 3 | 
 4 | use_cluster: true
 5 | 
 6 | ref: b37
 7 | 
 8 | aligner: bwamem ## tophat ## hisat ## bwa ## bowtie ## tmap
 9 | bam_chr1_base_recal: true
10 | bam_dup_type: markdup
11 | bam_no_filter: false
12 | bam_no_recal: false
13 | bam_no_realn: false
14 | bam_no_sort: false
15 | bam_fix_rg: false
16 | bam_phred64: false
17 | bam_reprocess: false
18 | 
19 | snv_type: mutect # or mutect_snps
20 | mutect_split_chr: true
21 | mutect_use_contest: false
22 | indel_types: varscan_indels strelka_indels scalpel_indels lancet_indels platypus_indels #pindel mutect_indels to use mutect2
23 | 
24 | vcf_post_ann_filter_expression: ExAC_AF > 0.01
25 | 
26 | # vcf annotations
27 | ann_facets: true
28 | ann_mut_taste: false
29 | ann_provean: false
30 | ann_pathogen: true
31 | 
32 | # target panels
33 | targets_file: ~/share/reference/target_panels/
34 | 
35 | # gatk options
36 | gatk_hard_filter_snps: true
37 | gatk_pool_snp_recal: false
38 | 
39 | # facets options
40 | # pre-processing crit val
41 | facets_pre_cval: 50
42 | # crit val for estimating diploid log ratio
43 | facets_cval1: 150
44 | # starting crit val for segmentation 
45 | facets_cval2: 50
46 | # min number of het snps in a segment used for bivariate t-statistic during clustering of segments
47 | facets_min_nhet: 25
48 | # union of gatk and dbsnp for snp-pileup
49 | facets_union_gatk_dbsnp: false
50 | 
51 | # slack_channel: 
52 | 
53 | qsub_priority: 0
54 | ...
55 | 


--------------------------------------------------------------------------------
/default_yaml/sample_attr.yaml:
--------------------------------------------------------------------------------
1 | #facets_diplogr:
2 | #    S18_Pt08N: 3
3 | #facets_cval1:
4 | #   S18_Pt08N: 500
5 | 


--------------------------------------------------------------------------------
/default_yaml/summary_config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Uncomment if you want to add a sample order to heatmaps
 3 | # sample_order: [AWT, AWX1, AWX2, AWX3, AWX4, AWL, AWM1, AWM2, AWX5]
 4 | # blacklisted genes, don't report these
 5 | gene_blacklist: [TTN]
 6 | # Uncomment if you want to report names of samples differently in the
 7 | # summary/plots
 8 | # sample_rename:
 9 | #    AWM1: AWM1BR
10 | #    AWM2: AWM2BO
11 | ...
12 | 


--------------------------------------------------------------------------------
/export/cbioportal.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | include modules/variant_callers/gatk.inc
 3 | SHELL = /bin/bash
 4 | 
 5 | LOGDIR = log/cbioportal.$(NOW)
 6 | 
 7 | .PHONY: mafs
 8 | .DELETE_ON_ERROR:
 9 | mafs : $(foreach pair,$(SAMPLE_PAIRS),$(foreach caller,mutect_snps mutect_indels,export/cbioportal/$(pair).$(caller).maf))
10 | # Shell command converting one VCF (first prerequisite) into a MAF (the target) with vcf2maf and VEP v84.
11 | define CBIOPORTAL_VCF_RULE
12 | unset PERL5LIB PERL_MB_OPT PERLBREW_ROOT PERL_LOCAL_LIB_ROOT PERL_MM_OPT && \
13 | source activate /ifs/e63data/reis-filho/usr/anaconda-envs/vcf2maf && \
14 | mkdir -p $(@D) && \
15 | /opt/common/CentOS_6-dev/perl/perl-5.22.0/bin/perl \
16 | 	/ifs/e63data/reis-filho/usr/vcf2maf/vcf2maf.pl \
17 | 	--vep-path /opt/common/CentOS_6/vep/v84 \
18 | 	--vep-data /opt/common/CentOS_6/vep/v84 \
19 | 	--ref-fasta $(REF_FASTA) \
20 | 	--input-vcf $< \
21 | 	--output-maf $@ \
22 | 	--tumor-id $(tumor.$(basename $*)) \
23 | 	--normal-id $(normal.$(basename $*))
24 | endef
25 | # The stem is <pair>.<caller>; basename drops the caller suffix so the lookup uses tumor.<pair> / normal.<pair> (assumed to be defined by the included Makefile.inc - TODO confirm).
26 | export/cbioportal/%.maf: vcf/%.vcf
27 | 	$(CBIOPORTAL_VCF_RULE)
28 | 
29 | include modules/vcf_tools/vcftools.mk
30 | 


--------------------------------------------------------------------------------
/external/SNVBox/README:
--------------------------------------------------------------------------------
 1 | SNV-Box v3.0
 2 | ============
 3 | Thank you for using our software. Here are a few things to take note of before you get started:
 4 | 
 5 | 
 6 | FOR INSTALLATION INSTRUCTIONS, SUPPORT AND DOWNLOADS
 7 | ----------------------------------------------------
 8 | Please visit our website at http://wiki.chasmsoftware.org
 9 | 
10 | 
11 | TO JOIN OUR MAILING LIST
12 | ------------------------
13 | Please email chasm-beta-testers@lists.johnshopkins.edu
14 | 
15 | 
16 | TO REPORT A BUG
17 | ---------------
18 | Please create an account at http://bugzilla.chasmsoftware.org/ and file a bug report.
19 | 
20 | 
21 | ABOUT SOFTWARE LICENSING
22 | ------------------------
23 | Please note that our software is licensed under the JHU Academic Software License Agreement. For more details please refer to the license in the doc folder. 
24 | 
25 | 
26 | Thank you!
27 | 
28 | From KarchinLab
29 | 


--------------------------------------------------------------------------------
/external/SNVBox/db/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/external/SNVBox/db/__init__.py


--------------------------------------------------------------------------------
/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.doc


--------------------------------------------------------------------------------
/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/external/SNVBox/doc/CHASM_VEST_SNVGet_UserManual.pdf


--------------------------------------------------------------------------------
/fastq_tools/bamtoFasta.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/bam_to_fasta.$(NOW)
 4 | PHONY += bam_to_fasta
 5 | # Goal: one FASTA of unmapped reads per sample.
 6 | bam_to_fasta : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).fasta)
 7 | # Rule template evaluated once per sample; RUN is $$-escaped so it expands when the recipe runs (with the target known), not while the template is evaluated.
 8 | define bam-to-fasta
 9 | unmapped_reads/%.fasta : unmapped_reads/%.bam
10 | 	$$(call RUN,-n 4 -s 4G -m 9G,"$(SAMTOOLS2) fasta $$< > unmapped_reads/$$*.fasta")
11 | endef
12 | $(foreach sample,$(SAMPLES),\
13 | 		$(eval $(call bam-to-fasta,$(sample))))
14 | 
15 | 
16 | .DELETE_ON_ERROR:
17 | .SECONDARY:
18 | .PHONY: $(PHONY)
19 | 


--------------------------------------------------------------------------------
/fastq_tools/blastReads.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/blast_reads.$(NOW)
 4 | PHONY += blast_reads
 5 | # Goal: one tabular blastn report per sample.
 6 | blast_reads : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).blast)
 7 | # Rule template evaluated once per sample; RUN is $$-escaped so it expands when the recipe runs (with the target known), not while the template is evaluated.
 8 | define blast-reads
 9 | unmapped_reads/%.blast : unmapped_reads/%.fasta
10 | 	$$(call RUN,-n 32 -s 3G -m 4G -w 360,"blastn -num_threads 32 -evalue 0.001 -word_size 28 -db ~/share/reference/ncbi_nt/nt -query $$< -outfmt 7 -out unmapped_reads/$$*.blast")
11 | endef
12 | $(foreach sample,$(SAMPLES),\
13 | 		$(eval $(call blast-reads,$(sample))))
14 | 
15 | 
16 | .DELETE_ON_ERROR:
17 | .SECONDARY:
18 | .PHONY: $(PHONY)
19 | 


--------------------------------------------------------------------------------
/fastq_tools/extractFastq.mk:
--------------------------------------------------------------------------------
 1 | # This module extracts paired fastq files from a bam file: the bam is name-sorted with samtools and streamed through samtools fastq, writing one gzipped file per mate.
 2 | # input: $(SAMPLES)
 3 | # Author: Fong Chun Chan <fongchunchan@gmail.com>
 4 | 
 5 | include modules/Makefile.inc
 6 | 
 7 | LOGDIR ?= log/extract_fastq.$(NOW)
 8 | 
 9 | .DELETE_ON_ERROR:
10 | .SECONDARY:
11 | .PHONY: extract_fastq
12 | # Input bams (the bare %.bam prerequisite below) are searched for in rawdata/ and unprocessed_bam/.
13 | VPATH = rawdata unprocessed_bam
14 | # Only the read-1 file is requested per sample; the pattern rule below lists both mates as its targets.
15 | extract_fastq : $(foreach sample,$(SAMPLES),fastq/$(sample).1.fastq.gz)
16 | # Name-sort the bam (-n), then pipe it into samtools fastq; each mate is gzip-compressed through a process substitution, so the recipe needs a shell that supports that syntax.
17 | fastq/%.1.fastq.gz fastq/%.2.fastq.gz : %.bam
18 | 	$(call RUN,-n 4 -s 4G -m 9G,"$(SAMTOOLS2) sort -T $(<D)/$* -O bam -n -@ 4 -m 6G $< | $(SAMTOOLS2) fastq -f 1 -1 >(gzip -c > fastq/$*.1.fastq.gz) -2 >(gzip -c > fastq/$*.2.fastq.gz) -")
19 | 


--------------------------------------------------------------------------------
/fastq_tools/extractReads.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/extract_unmapped.$(NOW)
 4 | PHONY += extract_unmapped
 5 | # Goal: one bam of unmapped reads (flag 0x04) per sample.
 6 | extract_unmapped : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).bam)
 7 | # Rule template evaluated once per sample; RUN is $$-escaped so it expands when the recipe runs (with the target known), not while the template is evaluated.
 8 | define extract-unmapped-reads
 9 | unmapped_reads/%.bam : bam/%.bam
10 | 	$$(call RUN,-n 4 -s 4G -m 9G,"$(SAMTOOLS2) view -f 0x04 -h -@ 4 -b $$< -o unmapped_reads/$$*.bam")
11 | endef
12 | $(foreach sample,$(SAMPLES),\
13 | 		$(eval $(call extract-unmapped-reads,$(sample))))
14 | 
15 | 
16 | .DELETE_ON_ERROR:
17 | .SECONDARY:
18 | .PHONY: $(PHONY)
19 | 


--------------------------------------------------------------------------------
/fastq_tools/extractunmappedpairs.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/extract_unmapped_pairs.$(NOW)
 4 | 
 5 | .DELETE_ON_ERROR:
 6 | .SECONDARY:
 7 | .PHONY: extract_unmapped_pairs
 8 | 
 9 | VPATH = bam
10 | JAVA = $(HOME)/share/usr/jdk1.8.0_121/bin/java
11 | PICARD = /lila/data/riazlab/lib/src/picard.jar
12 | 
13 | extract_unmapped_pairs : $(foreach sample,$(SAMPLES),extracted_reads/unmapped_pairs/$(sample)_1.fastq)
14 | # Rules that list the read names in unmapped_reads/<sample>.bam, filter that bam by the list with Picard FilterSamReads, and convert the result to paired fastq.
15 | define extract-unmapped-pairs
16 | extracted_reads/unmapped_pairs/%.bam : extracted_reads/unmapped_pairs/%.txt
17 | 	$$(call RUN,-c -n 4 -s 4G -m 9G,"$(JAVA) -jar $(PICARD) FilterSamReads I=unmapped_reads/$$*.bam O=extracted_reads/unmapped_pairs/$$*.bam \
18 | 		READ_LIST_FILE=extracted_reads/unmapped_pairs/$$*.txt FILTER=includeReadList")
19 | 
20 | extracted_reads/unmapped_pairs/%.txt: unmapped_reads/%.bam
21 | 	$$(call RUN,-c -n 1 -s 4G -m 9G,"$(SAMTOOLS2) view $$< | cut -f1 | sort | uniq > extracted_reads/unmapped_pairs/$$*.txt")
22 | 
23 | extracted_reads/unmapped_pairs/%_1.fastq extracted_reads/unmapped_pairs/%_2.fastq : extracted_reads/unmapped_pairs/%.bam
24 | 	$$(call RUN,-n 4 -s 4G -m 9G,"bamToFastq -i $$< -fq extracted_reads/unmapped_pairs/$$*_1.fastq -fq2 extracted_reads/unmapped_pairs/$$*_2.fastq")
25 | 
26 | endef
27 | $(foreach sample,$(SAMPLES),\
28 | 		$(eval $(call extract-unmapped-pairs,$(sample))))
29 | 


--------------------------------------------------------------------------------
/fastq_tools/fixFastqReadNames.mk:
--------------------------------------------------------------------------------
 1 | # This module is used for fixing read names of paired fastq files
 2 | # input: $(SAMPLES)
 3 | # Author: Fong Chun Chan <fongchunchan@gmail.com>
 4 | 
 5 | include modules/Makefile.inc
 6 | include modules/hg19.inc
 7 | 
 8 | FIX_FASTQ_READ_NAMES = $(PYTHON) modules/fastq_tools/fixFastqReadNames.py
 9 | 
10 | LOGDIR = fastq/logs
11 | 
12 | .DELETE_ON_ERROR:
13 | 
14 | .SECONDARY:
15 | .PHONY: all
16 | all : $(foreach sample,${SAMPLES},fastq/$(sample).1.fixed.fastq)
17 | # Fix the read names of both mates in one invocation; the script only runs if the log directory could be created.
18 | fastq/%.1.fixed.fastq fastq/%.2.fixed.fastq : fastq/%.1.fastq fastq/%.2.fastq
19 | 	SGE_RREQ="$(SGE_RREQ) $(call MEM_FREE,2G,5G)" $(MKDIR) $(LOGDIR) && \
20 | 	$(FIX_FASTQ_READ_NAMES) $(word 1,$^) $(word 2,$^) fastq/$*.1.fixed.fastq fastq/$*.2.fixed.fastq &> ${LOGDIR}/$(@F).log
21 | 


--------------------------------------------------------------------------------
/fastq_tools/mergeFastq.mk:
--------------------------------------------------------------------------------
 1 | # add in new fastq to existing bams
 2 | include modules/Makefile.inc
 3 | 
 4 | LOGDIR ?= log/merge_fastq.$(NOW)
 5 | 
 6 | .PHONY: merge_fastq
 7 | .DELETE_ON_ERROR:
 8 | .SECONDARY:
 9 | 
10 | MERGE_SAMPLE_FILE ?= merge_samples.txt
11 | ifneq ($(wildcard $(MERGE_SAMPLE_FILE)),)
12 |   MERGE_SAMPLES ?= $(shell sed '/^\#/d' $(MERGE_SAMPLE_FILE))
13 | endif
14 | 
15 | ALIGNER ?= bwamem
16 | # Per merge sample: request merged_bam/<sample>.bam when bam/<sample>.bam already exists, otherwise bam/<sample>.bam itself. The recipe hard-links each merged bam into bam/ and indexes it. NOTE(review): samtools index is given the merged_bam/ path, not the bam/ link - confirm the index lands where downstream steps expect it.
17 | merge_fastq : $(foreach sample,$(MERGE_SAMPLES),$(if $(wildcard bam/$(sample).bam),merged_bam/$(sample).bam,bam/$(sample).bam))
18 | 	$(call RUN,-s 7G -m 8G,"for bam in $(filter merged_bam/%.bam,$^); do \
19 | 		ln -f \$${bam} bam/\$$(basename \$${bam}) && \
20 | 		$(SAMTOOLS) index \$${bam}; \
21 | 		done")
22 | 
23 | include modules/aligners/$(ALIGNER)Aligner.mk
24 | # Stage the two merge inputs: .1.bam is linked from $(<M) (defined outside this file - presumably the aligner bam prerequisite; confirm), .2.bam from the existing bam/<sample>.bam.
25 | merged_bam/%.1.bam merged_bam/%.2.bam : $(ALIGNER)/bam/%.$(ALIGNER).$(BAM_SUFFIX)
26 | 	$(INIT)  ln -f $(<M) merged_bam/$*.1.bam && \
27 | 	ln -f bam/$*.bam merged_bam/$*.2.bam
28 | # Combined header: every non-@RG header line of $(<M), followed by the sorted, de-duplicated @RG lines of both staged bams.
29 | merged_bam/%.header.sam : merged_bam/%.1.bam merged_bam/%.2.bam
30 | 	$(INIT) { $(SAMTOOLS) view -H $(<M) | grep -v '^@RG'; \
31 | 	for bam in $(^); do \
32 | 	$(SAMTOOLS) view -H $$bam | grep '^@RG'; \
33 | 	done | sort | uniq; } > $@
34 | # Merge the staged bams using the combined header file (first prerequisite); only the .bam prerequisites are passed as inputs.
35 | merged_bam/%.bam : merged_bam/%.header.sam merged_bam/%.1.bam merged_bam/%.2.bam
36 | 	$(call RUN,-s 12G -m 15G,"$(SAMTOOLS) merge -f -h $< $(@) $(filter %.bam,$(^))")
37 | 
38 | 


--------------------------------------------------------------------------------
/fastq_tools/mergeSplitFastq.mk:
--------------------------------------------------------------------------------
 1 | # merge split fastqs for workflows like defuse
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR ?= log/merge_split_fastq.$(NOW)
 6 | 
 7 | .SECONDARY:
 8 | .DELETE_ON_ERROR: 
 9 | .PHONY : fastq
10 | 
11 | fastq: $(foreach sample,$(SAMPLES),fastq/$(sample).1.fastq.gz fastq/$(sample).2.fastq.gz)
12 | # Template (arg 1 = sample, arg 2 = its split names): pattern rule concatenating fastq/<split>.<N>.fastq.gz of every split into fastq/<sample>.<N>.fastq.gz.
13 | define merged-fastq
14 | fastq/$1.%.fastq.gz : $$(foreach split,$2,fastq/$$(split).%.fastq.gz)
15 | 	$$(call RUN,,"zcat $$(^) | gzip -c > $$(@)")
16 | endef
17 | $(foreach sample,$(SAMPLES),$(eval $(call merged-fastq,$(sample),$(split.$(sample)))))
18 | # Template with explicit rules: read-1 / read-2 inputs are words 1 and 2 of fq.<split> for each split. NOTE(review): these targets are also matched by the pattern rule from merged-fastq - confirm which rule is meant to apply when fq.<split> is unset.
19 | define merged-fastq2
20 | fastq/$1.1.fastq.gz : $$(foreach split,$2,$$(word 1, $$(fq.$$(split))))
21 | 	$$(call RUN,,"zcat $$(^) | gzip -c > $$(@)")
22 | fastq/$1.2.fastq.gz : $$(foreach split,$2,$$(word 2, $$(fq.$$(split))))
23 | 	$$(call RUN,,"zcat $$(^) | gzip -c > $$(@)")
24 | endef
25 | $(foreach sample,$(SAMPLES),$(eval $(call merged-fastq2,$(sample),$(split.$(sample)))))
26 | 


--------------------------------------------------------------------------------
/fastq_tools/trimFastq.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # trim fastq file
 3 | 
 4 | use strict;
 5 | use warnings;
 6 | 
 7 | use Getopt::Std;
 8 | 
 9 | my %opt;
10 | getopts('hs:e:l:', \%opt);
11 | 
12 | my $usage = <<ENDL;
13 | Usage: trimFastq.pl [-l max length] [-s n] [-e n] < in.fastq > out.fastq
14 | -h: this help message
15 | -l [integer]: max length
16 | -s [integer]: trim x bases from start of read
17 | -e [integer]: trim x bases from end of read
18 | ENDL
19 | 
20 | sub HELP_MESSAGE {
21 |     print STDERR $usage;
22 |     exit(1);
23 | }
24 | HELP_MESSAGE() if $opt{h};
25 | print STDERR "Missing read-trim length\n" and HELP_MESSAGE() unless ($opt{l} || $opt{s} || $opt{e});
26 | # Even-indexed input lines are echoed unchanged; odd-indexed lines are trimmed (assumes 4-line FASTQ records, i.e. sequence and quality lines - TODO confirm).
27 | my $i = 0;
28 | while (<STDIN>) {
29 |     chomp;
30 |     if ($i % 2 == 0) {
31 |         print;
32 |     } else {
33 |         my $ss = $_;
34 |         if ($opt{s}) {
35 |             $ss = $opt{s} < length($ss) ? substr($ss, $opt{s}) : '';    # trim longer than the line leaves it empty
36 |         }
37 |         if ($opt{e}) {
38 |             $ss = $opt{e} < length($ss) ? substr($ss, 0, length($ss) - $opt{e}) : '';
39 |         }
40 |         if ($opt{l}) {
41 |             $ss = substr($ss, 0, $opt{l});
42 |         }
43 |         print $ss;
44 |     }
45 |     print "\n";
46 |     $i++;
47 | }
48 | 


--------------------------------------------------------------------------------
/genome_inc/GRCm38.inc:
--------------------------------------------------------------------------------
 1 | # vim: set ft=make:
 2 | ifndef GRCM38_INC
 3 | REF = GRCm38
 4 | TOPHAT_REF = GRCm38
 5 | REF_DIR = $(HOME)/share/reference
 6 | 
 7 | INTEGRATE_ANN = $(REF_DIR)/mm10_annot.ucsc.txt
 8 | INTEGRATE_BWTS =$(REF_DIR)/Mus_musculus_GRCm38/bwts
 9 | 
10 | GENES_GTF = $(REF_DIR)/Mus_musculus/NCBI/$(TOPHAT_REF)/Annotation/Genes/genes.gtf
11 | 
12 | REF_FASTA := $(REF_DIR)/Mus_musculus_GRCm38/Mus_musculus.GRCm38.71.dna.chromosome.genome.fa
13 | REF_DICT := $(REF_DIR)/Mus_musculus_GRCm38/Mus_musculus.GRCm38.71.dna.chromosome.genome.dict
14 | # snpEff genome name; GRCm38 == mm10. The note sits on its own line so the value carries no trailing whitespace.
15 | SNP_EFF_GENOME = GRCm38.86
16 | DBSNP := $(REF_DIR)/mgp.v5.merged.snps_all.dbSNP142.vcf.gz
17 | MGP_SNP_DBSNP := $(DBSNP)
18 | MGP_INDEL_DBSNP := $(REF_DIR)/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz
19 | CENTROMERE_TABLE = $(REF_DIR)/centromere_mm10.txt
20 | 
21 | EXOME_BED = $(REF_DIR)/mus_musculus_known_genes_exons_GRCm38.bed
22 | EXOME_BED_NOHEADER = $(REF_DIR)/mus_musculus_known_genes_exons_GRCm38_noheader.bed
23 | 
24 | FREEC_REF := $(REF_DIR)/Mus_musculus_GRCm38/freec
25 | CHR_LEN = $(REF_DIR)/Mus_musculus_GRCm38/mm10.len
26 | 
27 | BOWTIE_REF = $(REF_DIR)/Mus_musculus_GRCm38/Mus_musculus.GRCm38.71.dna.chromosome.genome
28 | 
29 | ENSEMBL_TXDB = $(REF_DIR)/mus_musculus_ensembl_biomart.2014-04-28.sqlite
30 | 
31 | #TXDB = $(HOME)/ensmusg70.08032013.sqlite
32 | 
33 | RIBOSOMAL_INTERVALS = $(REF_DIR)/mm10_rrna_intervals.txt
34 | GENE_REF_FLAT = $(REF_DIR)/mm10_genes.refFlat.txt
35 | 
36 | EXOME ?= false
37 | ifeq ($(EXOME),true)
38 | TARGETS_FILE = $(EXOME_BED_NOHEADER)
39 | QUALIMAP_TARGETS_FILE = $(TARGETS_FILE)
40 | endif
41 | 
42 | INCLUDE_CHR_Y ?= true
43 | ifneq ($(and $(TARGETS_FILE),$(findstring false,$(EXOME))),)
44 | CHROMOSOMES := $(shell grep -v '@' $(TARGETS_FILE) | cut -f1 | sort | uniq)
45 | else
46 | CHROMOSOMES := $(shell seq 1 19) X $(if $(findstring true,$(INCLUDE_CHR_Y)),Y) MT
47 | endif
48 | 
49 | 
50 | endif
51 | GRCM38_INC = true
52 | 


--------------------------------------------------------------------------------
/genome_inc/hg18.inc:
--------------------------------------------------------------------------------
 1 | REF = hg18
 2 | REF_FASTA = ~/share/references/genomes/hg18.fa
 3 | # snpEff genome name; hg36.54 == hg18. The note sits on its own line so the value carries no trailing whitespace.
 4 | SNP_EFF_GENOME = hg36.54
 5 | DBSNP = ~/share/references/dbsnp/dbsnp_132.hg18.vcf
 6 | KNOWN_INDELS = ~/share/references/1000g/1000G_biallelic.indels.hg18.vcf
 7 | OMNI = ~/share/references/1000g/1000G_omni2.5.hg18.sites.vcf
 8 | HAPMAP = ~/share/references/hapmap3/hapmap_3.3.hg18.sites.vcf
 9 | GC_WIG = ~/share/references/gc.hg18.wig
10 | MAP_WIG = ~/share/references/map.hg18.wig


--------------------------------------------------------------------------------
/genome_inc/hg38.inc:
--------------------------------------------------------------------------------
 1 | # vim: set ft=make:
 2 | 
 3 | ifndef HG38_INC
 4 | 
 5 | REF?= hg38
 6 | REF_FASTA ?= $(REF_DIR)/hg38_gatk_bundle/Homo_sapiens_assembly38.fasta
 7 | 
 8 | DBSNP ?= $(REF_DIR)/hg38_gatk_bundle/dbsnp_146.hg38.vcf.gz
 9 | HAPMAP ?= $(REF_DIR)/hg38_gatk_bundle/hapmap_3.3_grch38_pop_stratified_af.vcf.gz
10 | KNOWN_INDELS ?= $(REF_DIR)/hg38_gatk_bundle/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz
11 | OMNI ?= $(REF_DIR)/hg38_gatk_bundle/1000G_omni2.5.hg38.vcf.gz
12 | 
13 | TOPHAT_REF = $(REF)
14 | 
15 | ANNOVAR_REF = hg38
16 | 
17 | ERICSCRIPT_DB = $(REF_DIR)/ericscript_db_hg38_84
18 | ERICSCRIPT_SPECIES = homo_sapiens
19 | 
20 | endif
21 | HG38_INC = true
22 | 


--------------------------------------------------------------------------------
/ploidy/bicseq.mk:
--------------------------------------------------------------------------------
 1 | # run bicseq for cnvs
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/bicseq.$(NOW)
 6 | 
 7 | SHELL = modules/scripts/Rshell
 8 | .SHELLFLAGS = -s -m $(MEM) -n $(@F) -l $(LOGDIR) -e 
 9 | 
10 | .ONESHELL:
11 | .DELETE_ON_ERROR:
12 | .SECONDARY:
13 | .PHONY: all
14 | 
15 | MEM = 9G
16 | 
17 | all : $(foreach pair,$(SAMPLE_PAIRS),expands/rdata/$(pair).cbs_snv.Rdata)
18 | # Bundle MuTect calls (rows judged REJECT dropped) and VarScan copy-number segments into one Rdata file holding the objects snv and cbs.
19 | # The recipe lines are R code: SHELL is modules/scripts/Rshell and .ONESHELL hands the whole recipe over as one script.
20 | # NOTE(review): $(<<) is defined outside this file - presumably the second prerequisite; confirm in Makefile.inc.
21 | expands/rdata/%.cbs_snv.Rdata : mutect/tables/%.mutect.txt varscan/segment/%.varscan2copynumber.txt
22 | 	library(expands)
23 | 	snv <- read.table("$<", header = T, sep = "\t")
24 | 	snv <- subset(snv, judgement != "REJECT")
25 | 	colnames(snv)[1:2] <- c("chr", "startpos")
26 | 	cbs <- read.table("$(<<)", header = T, sep = "\t")
27 | 	cbs <- transform(cbs, CN_estimate = 2^Segmented)
28 | 	colnames(cbs)[c(2,3,4)] <- c("chr", "startpos", "endpos")
29 | 	dir.create("$(@D)", recursive = T)
30 | 	save(snv, cbs, file = "$@")
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/ploidy/expands.mk:
--------------------------------------------------------------------------------
 1 | # run expands for determining tumor ploidy
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/expands.$(NOW)
 6 | 
 7 | SHELL = modules/scripts/Rshell
 8 | .SHELLFLAGS = -s -m $(MEM) -n $(@F) -l $(LOGDIR) -e 
 9 | 
10 | .ONESHELL:
11 | .DELETE_ON_ERROR:
12 | .SECONDARY:
13 | .PHONY: all
14 | 
15 | MEM = 20G
16 | 
17 | all : $(foreach pair,$(SAMPLE_PAIRS),expands/rdata/$(pair).cbs_snv.Rdata)
18 | 
19 | # Segment VarScan copy-number calls with CGHcall and save the resulting segment table as the object cbs.
20 | expands/rdata/%.cbs.Rdata : varscan/copycall/%.copycall
21 | 	library(CGHcall)
22 | 	cn <- read.table("$<", header=T, as.is=T)
23 | 	cn <- cn[cn[,1] %in% c(1:22, "X"),]
24 | 	cn[which(cn[,1]=="X"),1] <- 23
25 | 	cn[,1] <- as.numeric(cn[,1])
26 | 	cn <- cn[order(cn[,1], cn[,2]),]
27 | 	cn <- cbind(name = paste(cn[,1], cn[,2], sep="_"), cn[,c(1:3,7)])
28 | 	cgh <- make_cghRaw(cn)
29 | 	normalized <- normalize(cgh)
30 | 	segmented <- segmentData(normalized, relSDlong=2, undo.splits="sdundo", undo.SD=1.5)
31 | 	calls <- CGHcall(segmented, nclass=3)
32 | 	excalls <- ExpandCGHcall(calls, segmented)
33 | 	cbs <- with(fData(excalls), data.frame(chr = as.character(Chromosome[calls[[5]][,"wm"]]), startpos = Start[calls[[5]][,"wm"]], endpos = End[calls[[5]][,"wmend"]], CN_Estimate = 2^calls[[5]][, "smwh"], stringsAsFactors = F))
34 | 	cbs <- transform(cbs, segmentLength = endpos - startpos)
35 | 	dir.create("$(@D)", recursive = T, showWarnings = F)
36 | 	save(cbs, file = "$@")
37 | 
38 | # Pair MuTect SNVs with the saved segments. NOTE(review): the all goal requests %.cbs_snv.Rdata, which no rule in this file builds - confirm the intended target name.
39 | expands/rdata/%.snv.Rdata : mutect/tables/%.mutect.txt expands/rdata/%.cbs.Rdata
40 | 	library(expands)
41 | 	load("$(word 2,$^)")
42 | 	snv <- read.table("$<", header = T, sep = "\t", stringsAsFactors = F)
43 | 	snv <- subset(snv, judgement != "REJECT")
44 | 	colnames(snv)[1:2] <- c("chr", "startpos")
45 | 	snv <- subset(snv, select = c('chr', 'startpos'))
46 | 	snv$$chr <- as.integer(snv$$chr)
47 | 	snv <- as.matrix(snv[!is.na(snv$$chr), ])
48 | 	dir.create("$(@D)", recursive = T)
49 | 	dm <- assignQuantityToMutation(snv, cbs, "CN_Estimate")
50 | 	save(snv, cbs, file = "$@")


--------------------------------------------------------------------------------
/qc/bamStats.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | LOGDIR ?= log/bam_stats.$(NOW)
 3 | 
 4 | BAM_STATS_USE_REF ?= true
 5 | 
 6 | .SECONDARY:
 7 | .DELETE_ON_ERROR:
 8 | .PHONY: bam_stats
 9 | # Per-sample samtools stats followed by a plot-bamstats HTML report.
10 | bam_stats: $(foreach sample,$(SAMPLES),metrics/$(sample).bam_stats.html)
11 | # The reference (-r) is passed only when BAM_STATS_USE_REF contains "true".
12 | metrics/%.bc : bam/%.bam
13 | 	$(call RUN,-s 6G -m 8G,"samtools stats $(if $(findstring true,$(BAM_STATS_USE_REF)),-r $(REF_FASTA)) $< > $@")
14 | metrics/%.bam_stats.html : metrics/%.bc # prefix is the target minus .html, so the report plot-bamstats writes as <prefix>.html is the declared target
15 | 	$(call RUN,-s 6G -m 8G,"plot-bamstats -p $(basename $@) $<")
16 | 


--------------------------------------------------------------------------------
/qc/fastqc.mk:
--------------------------------------------------------------------------------
 1 | # vim: set ft=make :
 2 | # Run Fastqc on bam files
 3 | 
 4 | include modules/Makefile.inc
 5 | 
 6 | FASTQC_SUMMARY_PLOT = $(RSCRIPT) modules/qc/fastqcSummaryPlot.R
 7 | 
 8 | LOGDIR ?= log/fastqc.$(NOW)
 9 | 
10 | .PHONY: fastqc
11 | .SECONDARY: 
12 | .DELETE_ON_ERROR:
13 | fastqc : $(foreach sample,$(SAMPLES),fastqc/$(sample)_fastqc/summary.txt) fastqc/all_summary.txt
14 | # Run FastQC on a bam with fastqc/ as the output directory.
15 | fastqc/%_fastqc.zip : bam/%.bam
16 | 	$(call RUN,-N $*_fastqc -s 4G -m 12G,"$(FASTQC) -o fastqc $^")
17 | # Unpack the archive into fastqc/, then touch summary.txt so its timestamp is refreshed after extraction.
18 | fastqc/%_fastqc/summary.txt : fastqc/%_fastqc.zip
19 | 	$(INIT) $(UNZIP) -o -d fastqc $< &> $(LOG) && touch $@
20 | # Combine all per-sample summaries with fastqcSummaryPlot.R (output prefix fastqc/all_summary).
21 | fastqc/all_summary.txt : $(foreach sample,$(SAMPLES),fastqc/$(sample)_fastqc/summary.txt)
22 | 	$(INIT) $(FASTQC_SUMMARY_PLOT) --outPrefix fastqc/all_summary $^ &> $(LOG)
23 | 


--------------------------------------------------------------------------------
/qc/fastqcSummaryPlot.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | suppressPackageStartupMessages(library("plyr"))
 5 | suppressPackageStartupMessages(library("dplyr"))
 6 | suppressPackageStartupMessages(library("tidyr"))
 7 | suppressPackageStartupMessages(library("magrittr"))
 8 | suppressPackageStartupMessages(library("stringr"))
 9 | suppressPackageStartupMessages(library("ggplot2"))
10 | # Outside interactive sessions: suppress warnings, and on error print a traceback and quit with status 1.
11 | if (!interactive()) {
12 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
13 | }
14 | # Command-line options: output file prefix plus base width and height of the heatmap image; positional arguments are the summary files.
15 | optList <- list(
16 |                 make_option("--outPrefix", default = 'summary', type = "character", action = "store", help ="Output file prefix (default %default)"),
17 |                 make_option("--width", default = 1000, action = "store", help ="width of heatmap image (default %default)"),
18 |                 make_option("--height", default = 1000, action = "store", help ="height of heatmap image (default %default)"))
19 | parser <- OptionParser(usage = "%prog [options] summary file(s)", option_list = optList);
20 | arguments <- parse_args(parser, positional_arguments = T);
21 | opt <- arguments$options;
22 | # Read every summary file (tab-separated), stack them, name the three columns, strip '.bam' from the sample names and turn all columns into factors.
23 | summ <- arguments$args %>%
24 |     llply(read.table, sep = '\t', stringsAsFactors = F) %>%
25 |     bind_rows %>%
26 |     setNames(c("Status", "Metric", "Sample")) %>%
27 |     mutate(Sample = str_replace(Sample, '\\.bam', '')) %>%
28 |     mutate_each(funs(as.factor))
29 | # Heatmap <outPrefix>.png: one tile per sample/metric pair filled by Status; the image width grows by 10 per distinct sample.
30 | fn <- str_c(opt$outPrefix, '.png')
31 | png(fn, width = opt$width + 10 * nlevels(summ$Sample), height = opt$height, type = 'cairo-png');
32 | p <- summ %>% ggplot(aes(x = Sample, y = Metric)) +
33 |     geom_tile(aes(fill = Status)) +
34 |     theme(axis.text.x = element_text(angle = 90, hjust = 1))
35 | p
36 | dev.off();
37 | # Wide table <outPrefix>.txt: one row per sample, one column per metric, cells holding the Status.
38 | fn <- str_c(opt$outPrefix, '.txt')
39 | summ %>% spread(Metric, Status) %>% write.table(file = fn, sep = '\t', quote = F, row.names = F)
40 | 


--------------------------------------------------------------------------------
/qc/intervalBamQC.mk:
--------------------------------------------------------------------------------
 1 | # vim: set ft=make :
 2 | # amplicon qc using bams and gatk vcf results
 3 | 
 4 | include modules/Makefile.inc
 5 | 
 6 | INTERVAL_FILE ?= intervals.bed
 7 | 
 8 | VPATH ?= bam
 9 | 
10 | TEQC = modules/qc/TEQC.R
11 | INTERVAL_BAM_QC = modules/qc/intervalBamQC.R
12 | VARIANT_EVAL_REPORT = modules/qc/variantEvalGatkReport.R
13 | 
14 | LOGDIR ?= log/interval_qc.$(NOW)
15 | 
16 | .SECONDARY:
17 | .DELETE_ON_ERROR:
18 | .PHONY: rdata all
19 | 
20 | all : interval_qc/coverage/index.html # interval_qc/variant_eval/index.html
21 | rdata : $(foreach sample,$(SAMPLES),interval_qc/rdata/$(sample).Rdata)
22 | 
23 | # load each bam file into R and create R data files
24 | interval_qc/rdata/%.Rdata : %.bam
25 | 	$(call RUN,-s 8G -m 18G,"$(RSCRIPT) $(TEQC) --outFile $@ --ref $(REF) $< $(INTERVAL_FILE)")
26 | 
27 | # GATK variant eval for each sample variants.vcf file
28 | # stratified by intervals and filter
29 | interval_qc/variantEval.grp : $(foreach sample,$(SAMPLES),gatk/vcf/$(sample).variants.vcf)
30 | 	$(call RUN,-n 4 -s 1G -m 1.5G,"$(call GATK_MEM,4G) -T VariantEval -nt 4 -o $@ -R $(REF_FASTA) --stratIntervals $(INTERVAL_FILE) --dbsnp $(DBSNP) $(foreach vcf,$^, --eval:$(call strip-suffix,$(notdir $(vcf))) $(vcf)) -ST IntervalStratification -ST Filter")
31 | 
32 | # Create interval coverage plots using R script INTERVAL_BAM_QC
33 | interval_qc/coverage/index.html : $(foreach sample,$(SAMPLES),interval_qc/rdata/$(sample).Rdata)
34 | 	$(call RUN,-s 2G -m 4G,"$(RSCRIPT) $(INTERVAL_BAM_QC) --outDir $(@D) $^")
35 | 
36 | # Create variant evaluation plots using R script VARIANT_EVAL_REPORT
37 | interval_qc/variant_eval/index.html : interval_qc/variantEval.grp
38 | 	$(call RUN,-s 2G -m 4G,"$(RSCRIPT) $(VARIANT_EVAL_REPORT) --outDir $(@D) $< &> $(LOG)")
39 | 


--------------------------------------------------------------------------------
/qc/nonRefFreqFromPileup.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | use Getopt::Std;
 7 | my %opt;
 8 | getopts('hb:d:', \%opt);
 9 | 
10 | my $usage = <<ENDL;
11 | perl countNonRefPileup.pl -d [min depth] -b [bin size] < [pileup]
12 | ENDL
13 | 
14 | sub HELP_MESSAGE {
15 |    print STDERR $usage;
16 |    exit(1);
17 | }
18 | 
19 | HELP_MESSAGE if $opt{h};
20 | 
21 | my @bases = qw/A T C G/;
22 | my $freqThreshold = 0.001;
23 | my $binSize = $opt{b}? $opt{b} : 0.05;
24 | my $minDepth = $opt{d}? $opt{d} : 10;
25 | my $lastBin = 1 / $binSize;
26 | 
27 | my %nonRefFreq;
28 | while (<>) {    # one pileup line per position: tally, per reference base, the binned frequency of each base letter in column 5
29 |     chomp;
30 |     my @F = split /\t/;
31 |     next unless $F[4];
32 |     my $depth = length($F[4]);
33 |     next if $depth < $minDepth;
34 |     next if $F[4] =~ /[-+]/;
35 |     $F[4] =~ s/[-+][ATCGNatcgn]+//g;
36 |     for my $base (@bases) {
37 |         my $count = () = $F[4] =~ /$base/gi;    # /g is required: without it the list-context match yields at most one element
38 |         my $freq = $count / $depth;
39 |         $nonRefFreq{$F[2]}{$base}[int($freq / $binSize)]++ if $freq > $freqThreshold;
40 |     }
41 | }
42 | 
43 | print "Ref\tVar";
44 | for my $bin (0..$lastBin) {
45 |     print "\tBin$bin";
46 | }
47 | print "\n";
48 | for my $refBase (@bases) {
49 |     for my $varBase (@bases) {
50 |         next if $refBase eq $varBase;
51 |         print "$refBase\t$varBase";
52 |         for my $bin (0..$lastBin) {
53 |             print "\t";;
54 |             if (defined $nonRefFreq{$refBase}{$varBase}[$bin]) {
55 |                 print $nonRefFreq{$refBase}{$varBase}[$bin];
56 |             } else {
57 |                 print "0";
58 |             }
59 |         }
60 |         print "\n";
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/qc/plotHsMetrics.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | suppressPackageStartupMessages(library("hwriter"))
 5 | 
 6 | options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
 7 | 
 8 | optList <- list(
 9 |                 make_option("--outDir", default = ".", help = "Output dir"))
10 | 
11 | parser <- OptionParser(usage = "%prog [options] [hs_metrics.txt]", option_list = optList);
12 | 
13 | arguments <- parse_args(parser, positional_arguments = T);
14 | opt <- arguments$options;
15 | 
16 | if (length(arguments$args) < 1) {
17 |     cat("Need input hs metrics file\n");
18 |     print_help(parser);
19 |     stop();
20 | } else {
21 |     f <- arguments$args[1];
22 | }
23 | 
24 | hsMetrics <- read.table(file = f, header = TRUE, row.names = 1, sep = "\t")
25 | colsToPlot <- c("TOTAL_READS", "PCT_PF_UQ_READS", "PCT_PF_UQ_READS_ALIGNED", "PCT_OFF_BAIT", "MEAN_TARGET_COVERAGE", "FOLD_ENRICHMENT", "PCT_TARGET_BASES_10X", "PCT_TARGET_BASES_30X", "PCT_TARGET_BASES_50X", "AT_DROPOUT", "GC_DROPOUT")
26 | # One horizontal bar chart (PDF) per metric, with one bar per row of the metrics table.
27 | for (metric in colsToPlot) {
28 |     pdfPath <- paste0(opt$outDir, "/", tolower(metric), "_barplot.pdf")
29 |     pdf(file = pdfPath, width = 6, height = 3 + nrow(hsMetrics) / 2)
30 |     par(mar = c(5, 10, 5, 5))
31 |     barplot(hsMetrics[, metric], names.arg = rownames(hsMetrics), horiz = TRUE, las = 2, main = metric)
32 |     null <- dev.off()
33 | }
34 | 
35 | 


--------------------------------------------------------------------------------
/qc/qualimap.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | QUALIMAP_BAMQC_OPTS = -gd HUMAN 
 4 | QUALIMAP = unset DISPLAY; $(JAVA) -Xmx16G -classpath $(HOME)/share/usr/qualimap/qualimap.jar:$(HOME)/share/usr/qualimap/lib/* org.bioinfo.ngs.qc.qualimap.main.NgsSmartMain 
 5 | 
 6 | LOGDIR = log/qualimap.$(NOW)
 7 | 
 8 | ifdef QUALIMAP_TARGETS_FILE
 9 | QUALIMAP_BAMQC_OPTS += -gff $(QUALIMAP_TARGETS_FILE) -os
10 | endif
11 | 
12 | .SECONDARY:
13 | .DELETE_ON_ERROR:
14 | .PHONY : all
15 | 
16 | all : $(foreach sample,$(SAMPLES),qualimap/$(sample)_bamqc.timestamp)
17 | # Thread count (-nt) is kept equal to the -n 4 passed to RUN (presumably the requested slots - confirm) so the job does not oversubscribe its allocation.
18 | qualimap/%_bamqc.timestamp : bam/%.bam
19 | 	$(call RUN,-n 4 -s 4.5G -m 5G,"$(QUALIMAP) bamqc $(QUALIMAP_BAMQC_OPTS) -bam $< -nr 6 -nt 4 -outdir qualimap/$*_bamqc && touch $@")
20 | 
21 | 
22 | include modules/bam_tools/processBam.mk
23 | 


--------------------------------------------------------------------------------
/qc/readDepth.mk:
--------------------------------------------------------------------------------
 1 | # Run gatk depth of coverage on bam files
 2 | 
 3 | include modules/Makefile.inc
 4 | include modules/variant_callers/gatk.inc
 5 | 
 6 | LOGDIR = log/read_depth.$(NOW)
 7 | 
 8 | EXOME ?= false
 9 | # Restrict DepthOfCoverage to the exome intervals when EXOME=true.
10 | ifeq ($(EXOME),true)
11 | READ_DEPTH_ARGS += -L $(EXOME_BED)
12 | endif
13 | .DELETE_ON_ERROR:
14 | .PHONY: all
15 | 
16 | all: $(foreach sample,$(SAMPLES),gatk/read_depth/$(sample).read_depth)
17 | 
18 | #ifeq ($(SPLIT_CHR),true)
19 | #define read-depth-chr
20 | #gatk/chr_read_depth/%.$1.read_depth : %.bam
21 | #	$$(call INIT_MEM,8G,12G) $$(call GATK_MEM,7G) -T DepthOfCoverage -R $$(REF_FASTA) -L $1 -o $$@ -I $$< &> $$(LOG)
22 | #endef
23 | #$(foreach chr,$(CHROMOSOMES),$(eval $(call read-depth-chr,$(chr))))
24 | #gatk/read_depth/%.read_depth : $(foreach chr,$(CHROMOSOMES),gatk/chr_read_depth/%.$(chr).read_depth)
25 | #	$(INIT) head -1 $< > $@ && for x in $^; do sed '1d' $$x >> $@; done
26 | #else
27 | 
28 | gatk/read_depth/%.read_depth : %.bam
29 | 	$(call RUN,-s 8G -m 12G,"$(call GATK_MEM,7G) -T DepthOfCoverage -R $(REF_FASTA) $(READ_DEPTH_ARGS) -o $@ -I $<")
30 | 
31 | 


--------------------------------------------------------------------------------
/qc/rnaseqMetrics.mk:
--------------------------------------------------------------------------------
 1 | ## defaults
 2 | VPATH ?= bam
 3 | LOGDIR = log/rnaseq_metrics.$(NOW)
 4 | 
 5 | ## includes
 6 | include modules/Makefile.inc
 7 | include modules/variant_callers/gatk.inc
 8 | 
 9 | ## commands
10 | PLOT_RNASEQ_METRICS = $(RSCRIPT) modules/qc/plotRnaseqMetrics.R
11 | COLLECT_RNASEQ_METRICS = $(JAVA) -Xmx7G -jar $(JARDIR)/CollectRnaSeqMetrics.jar VALIDATION_STRINGENCY=LENIENT
12 | STRAND_SPECIFICITY ?= NONE
13 | 
14 | ## one CollectRnaSeqMetrics table per sample, plus the two tables merged from them
15 | SAMPLE_RNASEQ_METRICS = $(SAMPLES:%=metrics/%.rnaseq_metrics)
16 | MERGED_RNASEQ_METRICS = metrics/all.rnaseq_metrics metrics/all.normalized_coverage.rnaseq_metrics
17 | 
18 | .DELETE_ON_ERROR:
19 | .SECONDARY:
20 | .PHONY: all report
21 | 
22 | all : $(SAMPLE_RNASEQ_METRICS) $(MERGED_RNASEQ_METRICS) report
23 | 
24 | report : metrics/rnaseq_report/index.html
25 | 
26 | ## per-sample metrics table (a chart is written next to it as <target>.pdf)
27 | metrics/%.rnaseq_metrics : bam/%.bam
28 | 	$(call RUN,-c -s 8G -m 12G,"$(COLLECT_RNASEQ_METRICS) REF_FLAT=$(GENE_REF_FLAT) RIBOSOMAL_INTERVALS=$(RIBOSOMAL_INTERVALS) STRAND_SPECIFICITY=$(STRAND_SPECIFICITY) INPUT=$< REFERENCE_SEQUENCE=$(REF_FASTA) OUTPUT=$@ CHART_OUTPUT=$@.pdf VERBOSITY=ERROR")
29 | 
30 | ## header row from the first sample, then one value row per sample with
31 | ## field 23 overwritten by the sample name taken from the file name
32 | metrics/all.rnaseq_metrics : $(SAMPLE_RNASEQ_METRICS)
33 | 	grep '^PF_BASES' $< > $@ && \
34 | 	for i in $^; do \
35 | 		sample=`echo $$i | sed 's:.*/::; s/\..*//'`; \
36 | 		grep -A1 '^PF_BASES' $$i | tail -1 | awk -v sample=$$sample 'BEGIN { OFS = "\t" } { $$23=sample; print $$0 }'  >> $@; \
37 | 	done
38 | 
39 | ## position column from the first sample, then one coverage column per sample
40 | ## whose All_Reads header is replaced by the sample name
41 | metrics/all.normalized_coverage.rnaseq_metrics : $(SAMPLE_RNASEQ_METRICS)
42 | 	grep -A101 '^normalized_position' $< | cut -f1 > $@ && \
43 | 	for i in $^; do \
44 | 		sample=`echo $$i | sed 's:.*/::; s/\..*//'`; \
45 | 		grep -A101 '^normalized_position' $$i | cut -f2 | sed "s/All_Reads/$$sample/" | paste $@ - > $@.tmp && mv $@.tmp $@; \
46 | 	done
47 | 
48 | ## report built from both merged tables
49 | metrics/rnaseq_report/index.html : $(MERGED_RNASEQ_METRICS)
50 | 	$(PLOT_RNASEQ_METRICS) --outDir $(@D) $^
51 | 


--------------------------------------------------------------------------------
/qc/rseqc.mk:
--------------------------------------------------------------------------------
 1 | LOGDIR = log/rseqc.$(NOW)
 2 | ## includes
 3 | include modules/Makefile.inc
 4 | 
 5 | ## RSeQC scripts; set RSEQC_BIN to use a different installation
 6 | RSEQC_BIN ?= $(HOME)/share/usr/anaconda/bin
 7 | INFER_EXPT = python $(RSEQC_BIN)/infer_experiment.py
 8 | INNER_DIST = python $(RSEQC_BIN)/inner_distance.py
 9 | JUNC_ANNOT = python $(RSEQC_BIN)/junction_annotation.py
10 | # previously pointed at junction_annotation.py (copy-paste slip)
11 | BAM_STAT = python $(RSEQC_BIN)/bam_stat.py
12 | CLIP_PROFILE = python $(RSEQC_BIN)/clipping_profile.py
13 | MISMATCH_PROFILE = python $(RSEQC_BIN)/mismatch_profile.py
14 | INSERTION_PROFILE = python $(RSEQC_BIN)/insertion_profile.py
15 | DELETION_PROFILE = python $(RSEQC_BIN)/deletion_profile.py
16 | GENEBODY_COV = python $(RSEQC_BIN)/geneBody_coverage.py
17 | READ_HEXAMER = python $(RSEQC_BIN)/read_hexamer.py
18 | READ_QUALITY = python $(RSEQC_BIN)/read_quality.py
19 | READ_NVC = python $(RSEQC_BIN)/read_NVC.py
20 | READ_GC = python $(RSEQC_BIN)/read_GC.py
21 | READ_DUP = python $(RSEQC_BIN)/read_duplication.py
22 | READ_DIST = python $(RSEQC_BIN)/read_distribution.py
23 | RPKM_SAT = python $(RSEQC_BIN)/RPKM_saturation.py
24 | RPKM_COUNT = python $(RSEQC_BIN)/RPKM_count.py
25 | 
26 | 
27 | .DELETE_ON_ERROR:
28 | .SECONDARY:
29 | .PHONY: rseqc
30 | 
31 | ## three reports per sample
32 | rseqc : $(foreach sample,$(SAMPLES), \
33 | 	rseqc/infer/$(sample).infer \
34 | 	rseqc/gene_body_cov/$(sample).geneBodyCoverage.txt \
35 | 	rseqc/inner_dist/$(sample).innerDistance.txt)
36 | 
37 | ## strandedness inference; the script prints to stdout, captured as the target
38 | rseqc/infer/%.infer : bam/%.bam bam/%.bam.bai
39 | 	$(call RUN,-s 7G -m 8G,"$(INFER_EXPT) -i $< -r $(REF_HOUSEKEEPING_GENE_BED) > $@")
40 | 
41 | ## gene body coverage; -o is an output prefix, the script appends .geneBodyCoverage.txt.
42 | ## The index is a prerequisite because geneBody_coverage.py needs an indexed bam.
43 | rseqc/gene_body_cov/%.geneBodyCoverage.txt : bam/%.bam bam/%.bam.bai
44 | 	$(call RUN,-s 7G -m 8G,"$(GENEBODY_COV) -i $< -r $(REF_HOUSEKEEPING_GENE_BED) -o rseqc/gene_body_cov/$*")
45 | 
46 | ## inner distance between read pairs; -o is an output prefix
47 | rseqc/inner_dist/%.innerDistance.txt : bam/%.bam
48 | 	$(call RUN,-s 7G -m 8G,"$(INNER_DIST) -i $< -r $(REF_HOUSEKEEPING_GENE_BED) -o rseqc/inner_dist/$*")
49 | 
50 | 


--------------------------------------------------------------------------------
/qc/teqc.mk:
--------------------------------------------------------------------------------
 1 | # Run TEQC R library on bams
 2 | # vim: set ft=make :
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR ?= teqc/log
 6 | 
 7 | # one TEQC result object per sample; all of them feed the combined report
 8 | TEQC_RDATA = $(SAMPLES:%=teqc/%.Rdata)
 9 | 
10 | .PHONY: teqc
11 | .DELETE_ON_ERROR:
12 | .SECONDARY:
13 | 
14 | teqc : teqc_report/index.html
15 | 
16 | # per-sample analysis; stdout and stderr go to $(LOGDIR)/<sample>.Rdata.log
17 | teqc/%.Rdata : bam/%.bam bam/%.bam.bai
18 | 	$(call INIT_MEM,12G,14G) $(RSCRIPT) modules/qc/TEQC.R --ref=$(REF) --outFile $@ $< $(TARGETS_FILE) &> $(LOGDIR)/$(@F).log
19 | 
20 | # combined report written into the target's directory
21 | teqc_report/index.html : $(TEQC_RDATA)
22 | 	$(call INIT_MEM,12G,14G) $(MKDIR) $(@D); $(RSCRIPT) modules/qc/TEQCreport.R --outDir=$(@D) $^
23 | 
24 | 


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v1.hg19.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.hg19.vcf.gz


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v1.hg19.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.hg19.vcf.gz.tbi


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v1.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.vcf.gz


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v1.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v1.vcf.gz.tbi


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v2.hg19.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.hg19.vcf.gz


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v2.hg19.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.hg19.vcf.gz.tbi


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v2.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.vcf.gz


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v2.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v2.vcf.gz.tbi


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v3.vcf.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v3.vcf.gz


--------------------------------------------------------------------------------
/reference/hotspots/hotspot-v3.vcf.gz.tbi:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jrflab/modules/c30682a93fcbb84960ed9794c5849e8367ba1875/reference/hotspots/hotspot-v3.vcf.gz.tbi


--------------------------------------------------------------------------------
/rnaseq/immunedeconv.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR = log/immunedeconv.$(NOW)
 4 | 
 5 | # Output tables and the --option code immunedeconv.R is called with for each.
 6 | IMMUNEDECONV_METHODS = quantiseq mcpcounter cibersort
 7 | IMMUNEDECONV_OPTION.quantiseq = 1
 8 | IMMUNEDECONV_OPTION.mcpcounter = 2
 9 | IMMUNEDECONV_OPTION.cibersort = 3
10 | IMMUNEDECONV_TABLES = $(IMMUNEDECONV_METHODS:%=immunedeconv/%.txt)
11 | 
12 | immunedeconv : $(IMMUNEDECONV_TABLES)
13 | 
14 | # Static pattern rule: applies only to the three tables above; the stem ($*)
15 | # is the method name and selects the option code.
16 | $(IMMUNEDECONV_TABLES) : immunedeconv/%.txt : kallisto/tpm_by_gene.txt
17 | 	$(call RUN, -c -n 1 -s 8G -m 16G -v $(IMMUNE_ENV),"set -o pipefail && \
18 | 							   $(RSCRIPT) $(SCRIPTS_DIR)/immunedeconv.R --option $(IMMUNEDECONV_OPTION.$*) --input_file $(<) --output_file $(@)")
19 | 
20 | # Runs every time this makefile is parsed: appends the R version banner to version/immunedeconv.txt.
21 | ..DUMMY := $(shell mkdir -p version; \
22 | 	     ~/share/usr/env/r-immunedeconv-2.1.0/bin/R --version >> version/immunedeconv.txt;)
23 | .SECONDARY:
24 | .DELETE_ON_ERROR:
25 | .PHONY: immunedeconv
26 | 


--------------------------------------------------------------------------------
/rnaseq/sumreads.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR = log/sum_reads.$(NOW)
 4 | 
 5 | sumreads : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt) \
 6 | 	   $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt) \
 7 | 	   sumreads/rpkm_by_gene.txt \
 8 | 	   sumreads/rpkm_by_exon.txt \
 9 | 	   sumreads/counts_by_gene.txt \
10 | 	   sumreads/counts_by_exon.txt
11 | 
12 | SUM_READS_OPTS =
13 | REF ?= b37
14 | 
15 | # per-sample gene-level table
16 | sumreads/%.sumreads.by_gene.txt : bam/%.bam bam/%.bam.bai
17 | 	$(call RUN,-v $(SUMREADS_ENV) -s 24G -m 48G,"$(SUM_READS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<")
18 | 
19 | # per-sample exon-level table
20 | sumreads/%.sumreads.by_exon.txt : bam/%.bam bam/%.bam.bai
21 | 	$(call RUN,-v $(SUMREADS_ENV) -s 24G -m 48G,"$(SUM_EXONS_RSCRIPT) --genome $(REF) --outFile $@ $(SUM_READS_OPTS) $<")
22 | 
23 | # The four merged tables below share one shape: key column(s) from the first
24 | # sample, then one value column per sample with its header replaced by the
25 | # sample name. Steps are chained with && / || exit 1 so that a failing
26 | # cut/paste/mv fails the recipe (and .DELETE_ON_ERROR removes the target)
27 | # instead of leaving a silently incomplete table.
28 | 
29 | sumreads/rpkm_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt)
30 | 	cut -f 2 $< > $@ && \
31 | 	for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 7 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp && mv $@.tmp $@ || exit 1; done
32 | 
33 | sumreads/rpkm_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt)
34 | 	cut -f 1-2 $< > $@ && \
35 | 	for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 6 $$x | sed "s/exonRPKM/$$sample/" | paste $@ - > $@.tmp && mv $@.tmp $@ || exit 1; done
36 | 
37 | sumreads/counts_by_gene.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_gene.txt)
38 | 	cut -f 2 $< > $@ && \
39 | 	for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 3 $$x | sed "s/countsByGene/$$sample/" | paste $@ - > $@.tmp && mv $@.tmp $@ || exit 1; done
40 | 
41 | sumreads/counts_by_exon.txt : $(foreach sample,$(SAMPLES),sumreads/$(sample).sumreads.by_exon.txt)
42 | 	cut -f 1-2 $< > $@ && \
43 | 	for x in $^; do sample=`echo $$x | sed 's/.*\///; s/\..*//'`; cut -f 4 $$x | sed "s/exonCount/$$sample/" | paste $@ - > $@.tmp && mv $@.tmp $@ || exit 1; done
44 | 
45 | # Runs every time this makefile is parsed: appends the R version banner to version/sumreads.txt.
46 | ..DUMMY := $(shell mkdir -p version; \
47 | 	     $(SUMREADS_ENV)/bin/R --version >> version/sumreads.txt;)
48 | .SECONDARY:
49 | .DELETE_ON_ERROR:
50 | .PHONY: sumreads
51 | 


--------------------------------------------------------------------------------
/scripts/Rshell:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Run an R snippet (-e) with Rscript, either locally or through the qsub wrapper (-s).
 4 | #
 5 | # Options:
 6 | #   -e CODE   R code to run (required)
 7 | #   -s        submit through $QSUB instead of running locally
 8 | #   -m MEM    virtual_free/h_vmem request for the submitted job (default 1G)
 9 | #   -p N      request "-pe smp N" for the submitted job when N > 1 (default 1)
10 | #   -n NAME   base name of the job and of the .R/.log files (default R)
11 | #   -l DIR    directory for the .R/.log files (default .)
12 | #
13 | # Exits with the status of the R run (or of the submit command); 255 on usage errors.
14 | 
15 | set -o pipefail
16 | 
17 | RCMD="Rscript"
18 | NAME="R"
19 | TMPDIR="$HOME/share/tmp"
20 | QUEUE="jrf.q"
21 | LOGDIR=.
22 | MEM="1G"
23 | PARALLEL=1
24 | QSUB="perl modules/scripts/qsub.pl"
25 | 
26 | # The leading ':' selects getopts' silent mode: only then is a missing argument
27 | # reported as ':' and the offending option letter placed in $OPTARG.
28 | while getopts ":e:sm:n:l:p:" opt; do
29 |   case $opt in
30 |     l)
31 |       LOGDIR=$OPTARG
32 |       ;;
33 |     e)
34 |       R=$OPTARG
35 |       ;;
36 |     m)
37 |       MEM=$OPTARG
38 |       ;;
39 |     p)
40 |       PARALLEL=$OPTARG
41 |       ;;
42 |     n)
43 |       NAME=$OPTARG
44 |       ;;
45 |     s)
46 |       SGE=true
47 |       ;;
48 |     :)
49 |       echo "Argument missing: -$OPTARG" >&2
50 |       exit 255
51 |       ;;
52 |     \?)
53 |       echo "Invalid option: -$OPTARG" >&2
54 |       exit 255
55 |       ;;
56 |   esac
57 | done
58 | 
59 | echo "#---------------------------------";
60 | 
61 | if [[ $PARALLEL -gt 1 ]]; then
62 |     PE="-pe smp $PARALLEL"
63 | fi
64 | 
65 | umask 002
66 | mkdir -p "$LOGDIR"
67 | 
68 | if [[ -z "$R" ]]; then
69 |     echo "Undefined script" >&2
70 |     exit 255
71 | fi
72 | 
73 | # Stop here rather than writing the script to an empty path if mktemp fails.
74 | TMP=$(mktemp --tmpdir="${TMPDIR}") || exit 255
75 | trap 'rm -f "${TMP}"; exit 255' SIGINT
76 | 
77 | # The temporary script sets the umask inside R before the user code runs;
78 | # a copy of the user code alone is kept next to the log.
79 | echo "Sys.umask('002')" > "${TMP}"
80 | echo "${R}" >> "${TMP}"
81 | echo "${R}" > "$LOGDIR/$NAME.R"
82 | chmod +rx "${TMP}"
83 | source "${HOME}/.bashrc"
84 | 
85 | if [[ -n "$SGE" ]]; then
86 |     # $QSUB and $PE are deliberately unquoted: each holds several words.
87 |     echo "umask 002; ${RCMD} ${TMP}" | $QSUB -- -cwd -V -now n -q "$QUEUE" -N "X$NAME" $PE -l "virtual_free=$MEM,h_vmem=$MEM" -o "$LOGDIR/$NAME.log" -j y -b n
88 |     RET_CODE=$?
89 | else
90 |     # pipefail (set above) makes $? reflect a failing Rscript, not just tee.
91 |     $RCMD "${TMP}" | tee -a "$LOGDIR/$NAME.log"
92 |     RET_CODE=$?
93 | fi
94 | rm -f "${TMP}"
95 | exit ${RET_CODE}
96 | 


--------------------------------------------------------------------------------
/scripts/Sweave.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Weave the Rnw file named by the first command-line argument; the remaining
 4 | # arguments are stored in the global variable `arguments`.
 5 | 
 6 | suppressPackageStartupMessages(library("optparse"))
 7 | 
 8 | allArguments <- commandArgs(trailingOnly = TRUE)
 9 | 
10 | if (length(allArguments) == 0) {
11 |     cat("Need Rnw file")
12 |     stop()
13 | }
14 | 
15 | rnwFile <- allArguments[1]
16 | arguments <<- allArguments[-1]
17 | 
18 | Sweave(rnwFile)
19 | 


--------------------------------------------------------------------------------
/scripts/add_dbsnp_gmaf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # add INFO field for RARE variants, GMAF < 0.01
 3 | 
 4 | import argparse
 5 | import vcf
 6 | import re
 7 | import sys
 8 | 
 9 | parser = argparse.ArgumentParser(prog='add_dbsnp_gmaf.py',
10 |                                  description='add GMAF field to dbsnp using CAF field')
11 | parser.add_argument('vcf_infile')
12 | 
13 | args = parser.parse_args()
14 | 
15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
16 | 
17 | vcf_reader.infos['GMAF'] = vcf.parser._Info(id='GMAF', num=1, type='Float',
18 |                                             desc="global minor allele frequency from 1000g",
19 |                                             source=None, version=None)
20 | 
21 | 
22 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
23 | 
24 | for record in vcf_reader:
25 |     if 'CAF' in record.INFO:
26 |         caf_str = [re.sub(r'^\.$', '0', re.sub(r'[\[\]]', '', x)) for x in record.INFO['CAF'] if x is not None]
27 |         caf = sorted([float(x) for x in caf_str])
28 |         gmaf = caf[len(caf) - 2]
29 |         record.INFO['GMAF'] = gmaf
30 |     vcf_writer.write_record(record)
31 | 
32 | vcf_writer.close()
33 | 


--------------------------------------------------------------------------------
/scripts/classify_snv_pathogenicity_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ classify pathogenicity of vcf records
 4 | """
 5 | 
 6 | import argparse
 7 | import vcf
 8 | import sys
 9 | import classify_pathogenicity_vcf as cp
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     parser = argparse.ArgumentParser(prog='classify_snv_pathogenicity_vcf.py',
14 |                                      description='Add pathogenicity to vcf file')
15 |     parser.add_argument('vcf_infile')
16 |     args = parser.parse_args()
17 | 
18 |     vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
19 | 
20 |     assert "ANN" in vcf_reader.infos
21 |     assert "HOTSPOT" in vcf_reader.infos or "hotspot" in vcf_reader.infos
22 |     assert "FATHMM_pred" in vcf_reader.infos
23 |     assert "facetsLOH" in vcf_reader.infos or "LOH" in vcf_reader.infos
24 |     assert "MutationTaster_pred" in vcf_reader.infos
25 | 
26 |     # add necessary info headers
27 |     vcf_reader.infos['pathogenicity'] = vcf.parser._Info(id='pathogenicity', num=-1, type='String',
28 |                                                          desc="Classification of pathogenicity",
29 |                                                          source=None, version=None)
30 |     records = [x for x in vcf_reader]
31 |     for record in records:
32 |         if len(record.FILTER) == 0 and record.is_snp:
33 |             cp.classify_pathogenicity(record)
34 | 
35 |     vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
36 |     for record in records:
37 |         vcf_writer.write_record(record)
38 |     vcf_writer.close()
39 | 


--------------------------------------------------------------------------------
/scripts/convert_sample_txt2yaml.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Convert a space-space delimited samples.txt/sample_sets.txt file to samples.yaml
 4 | """
 5 | 
 6 | import sys
 7 | import yaml
 8 | import argparse
 9 | 
10 | parser = argparse.ArgumentParser(prog='convert_sample_txt2yaml.py',
11 |                                  description='Convert samples.txt/sample_sets.txt to yaml')
12 | parser.add_argument('sample_txt_file')
13 | args = parser.parse_args()
14 | 
15 | samples = []
16 | with open(args.sample_txt_file, 'r') as f:
17 |     for line in f:
18 |         split_sp = line.rstrip().split()
19 |         if (len(split_sp) == 1):
20 |             samples.append({'tumor': split_sp})
21 |         else:
22 |             samples.append({'normal': split_sp[-1], 'tumor': split_sp[:-1]})
23 | 
24 | yaml.dump(samples, sys.stdout)
25 | 


--------------------------------------------------------------------------------
/scripts/create_sample_sets.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Group sample names (one per input line) into sample sets.
 4 | # The first run of digits in a name is its id; a name ending in "N" is the
 5 | # normal for that id, any other name is a tumor. Names without digits are
 6 | # ignored. One line per id is printed: the tumors followed by the normal.
 7 | 
 8 | use strict;
 9 | use warnings;
10 | 
11 | my %tumorSamples;
12 | my %normalSamples;
13 | for my $s (<>) {
14 |     chomp $s;
15 |     my $id;
16 |     if ($s =~ /(\d+)/) {
17 |         $id = $1;
18 |         if ($s =~ m/N$/) {
19 |             print STDERR "Warning: sample $id ($s) has two normals\n" if (exists $normalSamples{$id});
20 |             $normalSamples{$id} = $s;
21 |         } else {
22 |             push @{$tumorSamples{$id}}, $s;
23 |         }
24 |     }
25 | }
26 | 
27 | # Sorted ids keep the output order stable between runs.
28 | for my $id (sort keys %normalSamples) {
29 |     my $normal = $normalSamples{$id};
30 |     # The old "next and print ... unless" skipped the id before the warning
31 |     # could ever be printed; warn first, then skip.
32 |     unless (exists $tumorSamples{$id}) {
33 |         print STDERR "Warning: no tumor samples for $id ($normal)\n";
34 |         next;
35 |     }
36 |     print join(" ", @{$tumorSamples{$id}}) . " $normal\n";
37 | }
38 | 
39 | for my $id (sort keys %tumorSamples) {
40 |     unless (exists $normalSamples{$id}) {
41 |         print STDERR "Warning: no normal sample for $id (" . join(" ", @{$tumorSamples{$id}}) . ")\n";
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/scripts/extract_signatures.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Fit COSMIC signatures to the mutect calls of one tumor sample with
 4 | # deconstructSigs and save the workspace to deconstructsigs/signatures/<sample>.RData.
 5 | 
 6 | suppressPackageStartupMessages(library("optparse"))
 7 | suppressPackageStartupMessages(library("readr"))
 8 | suppressPackageStartupMessages(library("deconstructSigs"))
 9 | suppressPackageStartupMessages(library("dplyr"))
10 | suppressPackageStartupMessages(library("magrittr"))
11 | 
12 | if (!interactive()) {
13 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
14 | }
15 | 
16 | # NOTE: the final save() stores every global variable, so the set of
17 | # top-level names in this script is part of its output.
18 | args_list <- list(make_option("--sample_name", default = NA, type = 'character', help = "tumor sample name"))
19 | 
20 | parser <- OptionParser(usage = "%prog", option_list = args_list)
21 | arguments <- parse_args(parser, positional_arguments = T)
22 | opt <- arguments$options
23 | 
24 | # mutect calls of the requested sample, chromosome names prefixed with "chr"
25 | mutation_summary = readr::read_tsv(file = "summary/tsv/mutation_summary.tsv",
26 |                                    col_types = cols(.default = col_character())) %>%
27 |     readr::type_convert() %>%
28 |     dplyr::filter(variantCaller == "mutect",
29 |                   TUMOR_SAMPLE == opt$sample_name) %>%
30 |     dplyr::mutate(CHROM = paste0("chr", CHROM)) %>%
31 |     dplyr::select(sample_id = TUMOR_SAMPLE,
32 |                   chrom = CHROM,
33 |                   pos = POS,
34 |                   ref = REF,
35 |                   alt = ALT)
36 | 
37 | signature_input = mut.to.sigs.input(mut.ref = data.frame(mutation_summary),
38 |                                     sample.id = "sample_id",
39 |                                     chr = "chrom",
40 |                                     pos = "pos",
41 |                                     ref = "ref",
42 |                                     alt = "alt")
43 | 
44 | extracted_signatures = whichSignatures(tumor.ref = signature_input,
45 |                                        signatures.ref = signatures.cosmic,
46 |                                        contexts.needed = TRUE)
47 | 
48 | save(list = ls(all = TRUE),
49 |      file = paste0("deconstructsigs/signatures/", opt$sample_name, ".RData"))
50 | 
40 | 


--------------------------------------------------------------------------------
/scripts/facets_suite.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # --option 1: stack the gene-level tables of all sample pairs
 4 | # (facets_suite/<pair>/<pair>.gene_level.txt) into facets_suite/summary.txt.
 5 | 
 6 | suppressPackageStartupMessages(library("optparse"))
 7 | suppressPackageStartupMessages(library("dplyr"))
 8 | suppressPackageStartupMessages(library("readr"))
 9 | suppressPackageStartupMessages(library("magrittr"))
10 | 
11 | if (!interactive()) {
12 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
13 | }
14 | 
15 | args_list <- list(
16 |     make_option("--option", default = NA, type = 'character', help = "type of analysis"),
17 |     make_option("--sample_pairs", default = NA, type = 'character', help = "sample pairs")
18 | )
19 | parser <- OptionParser(usage = "%prog", option_list = args_list)
20 | arguments <- parse_args(parser, positional_arguments = T)
21 | opt <- arguments$options
22 | 
23 | if (as.numeric(opt$option) == 1) {
24 |     # --sample_pairs is a single space-separated string
25 |     sample_names = unlist(strsplit(as.character(opt$sample_pairs), split = " ", fixed = TRUE))
26 |     CN = lapply(1:length(sample_names), function(i) {
27 |         gene_level_file = paste0("facets_suite/", sample_names[i], "/", sample_names[i], ".gene_level.txt")
28 |         # read everything as text first, then let readr guess the column types
29 |         readr::read_tsv(file = gene_level_file, col_names = TRUE, col_types = cols(.default = col_character())) %>%
30 |             readr::type_convert()
31 |     })
32 |     CN = do.call(rbind, CN)
33 |     readr::write_tsv(x = CN, path = "facets_suite/summary.txt", col_names = TRUE, append = FALSE)
34 | 
35 | }
36 | 
30 | 


--------------------------------------------------------------------------------
/scripts/filter_dbsnp_gmaf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # filter SNPs that are rare and have a global minor allele frequency less than 0.01
 3 | 
 4 | import argparse
 5 | import vcf
 6 | import re
 7 | import sys
 8 | 
 9 | parser = argparse.ArgumentParser(prog='filter_dbsnp_gmaf.py',
10 |                                  description='filter dbsnp gmaf (CAF info)')
11 | parser.add_argument('vcf_infile')
12 | 
13 | args = parser.parse_args()
14 | 
15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
16 | 
17 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
18 | 
19 | for record in vcf_reader:
20 |     if 'CAF' not in record.INFO:
21 |         vcf_writer.write_record(record)
22 |     else:
23 |         caf1 = float(re.sub(r'[\[\]]', '', record.INFO['CAF'][0]))
24 |         if caf1 < 0.99:
25 |             vcf_writer.write_record(record)
26 | vcf_writer.close()
27 | 


--------------------------------------------------------------------------------
/scripts/filter_sv.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Filter and relabel the body of a merged SV VCF; the records plus a column
 4 | # header line are appended to --output_file.
 5 | 
 6 | suppressPackageStartupMessages(library("optparse"))
 7 | suppressPackageStartupMessages(library("readr"))
 8 | suppressPackageStartupMessages(library("dplyr"))
 9 | suppressPackageStartupMessages(library("magrittr"))
10 | 
11 | if (!interactive()) {
12 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
13 | }
14 | 
15 | optList = list(
16 |     make_option("--input_file", default = NA, type = 'character', help = "Input VCF file"),
17 |     make_option("--output_file", default = NA, type = 'character', help = "Output VCF file")
18 | )
19 | parser = OptionParser(usage = "%prog", option_list = optList)
20 | arguments = parse_args(parser, positional_arguments = T)
21 | opt = arguments$options
22 | 
23 | # '#' lines are skipped on input, so columns arrive unnamed as X1..X12
24 | vcf = readr::read_tsv(file = as.character(opt$input_file), comment = "#", col_names = FALSE,
25 |                       col_types = cols(.default = col_character())) %>%
26 |     readr::type_convert() %>%
27 |     # drop records whose INFO contains SUPP_VEC=110
28 |     dplyr::filter(!grepl("SUPP_VEC=110", X8, fixed = TRUE)) %>%
29 |     dplyr::mutate(
30 |         # ID: 8th ':'-separated field of column 12, with '_' turned into ':'
31 |         X3 = X12,
32 |         X3 = unlist(lapply(X3, function(x) { unlist(strsplit(x, split = ":", fixed = TRUE))[8] })),
33 |         X3 = gsub(pattern = "_", replacement = ":", x = X3, fixed = TRUE),
34 |         # ALT: symbolic allele chosen from the type named in the new ID
35 |         X5 = case_when(
36 |             grepl("DUP", X3, fixed = TRUE) ~ "<DUP:TANDEM>",
37 |             grepl("DEL", X3, fixed = TRUE) ~ "<DEL>",
38 |             grepl("INV", X3, fixed = TRUE) ~ "<INV>",
39 |             TRUE ~ X5
40 |         ),
41 |         # INFO: SVTYPE=INV rewritten for records whose ID names DUP or DEL
42 |         X8 = case_when(
43 |             grepl("DUP", X3, fixed = TRUE) ~ gsub("SVTYPE=INV", "SVTYPE=DUP", X8),
44 |             grepl("DEL", X3, fixed = TRUE) ~ gsub("SVTYPE=INV", "SVTYPE=DEL", X8),
45 |             TRUE ~ X8
46 |         )
47 |     ) %>%
48 |     dplyr::rename(`#CHROM` = X1,
49 |                   POS = X2,
50 |                   ID = X3,
51 |                   REF = X4,
52 |                   ALT = X5,
53 |                   QUAL = X6,
54 |                   FILTER = X7,
55 |                   INFO = X8,
56 |                   FORMAT = X9,
57 |                   SVABA = X10,
58 |                   GRIDSS = X11,
59 |                   MANTA = X12)
60 | 
61 | # append = TRUE: existing content of the output file is kept
62 | readr::write_tsv(x = vcf, path = as.character(opt$output_file), append = TRUE, col_names = TRUE)
63 | 
64 | 


--------------------------------------------------------------------------------
/scripts/get_basecounts.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # --option 1: stack the per-sample tables gbc/<sample>.txt.gz into
 4 | # gbc/summary.txt, adding a sample_name column.
 5 | 
 6 | suppressPackageStartupMessages(library("optparse"))
 7 | suppressPackageStartupMessages(library("readr"))
 8 | suppressPackageStartupMessages(library("dplyr"))
 9 | suppressPackageStartupMessages(library("magrittr"))
10 | 
11 | if (!interactive()) {
12 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
13 | }
14 | 
15 | args_list <- list(
16 |     make_option("--option", default = NA, type = 'character', help = "Which option?"),
17 |     make_option("--sample_name", default = NA, type = 'character', help = "sample name")
18 | )
19 | parser <- OptionParser(usage = "%prog", option_list = args_list)
20 | arguments <- parse_args(parser, positional_arguments = T)
21 | opt <- arguments$options
22 | 
23 | if (as.numeric(opt$option) == 1) {
24 |     # --sample_name is a single space-separated string of sample names
25 |     sample_names = unlist(strsplit(x = as.character(opt$sample_name), split = " ", fixed = TRUE))
26 |     data = lapply(1:length(sample_names), function(i) {
27 |         # read everything as text first, then let readr guess the column types
28 |         readr::read_tsv(file = paste0("gbc/", sample_names[i], ".txt.gz"), col_names = TRUE,
29 |                         col_types = cols(.default = col_character())) %>%
30 |             readr::type_convert() %>%
31 |             dplyr::mutate(sample_name = sample_names[i])
32 |     })
33 |     data = do.call(bind_rows, data)
34 |     readr::write_tsv(x = data, path = "gbc/summary.txt", append = FALSE, col_names = TRUE)
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/scripts/init_project.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Initialise a project directory: create a Makefile that includes
 4 | # modules/Makefile and copy the default yaml configs. Existing files are
 5 | # never overwritten.
 6 | 
 7 | use strict;
 8 | use warnings;
 9 | use Cwd;
10 | use File::Copy;
11 | 
12 | my $MAKEFILE = <<ENDL;
13 | include modules/Makefile
14 | ENDL
15 | 
16 | unless (-e "Makefile") {
17 |     # Open, write and close inside the guard: the old unconditional
18 |     # "close OUT" ran on a never-opened handle whenever Makefile existed,
19 |     # and a failed open went unnoticed.
20 |     open(my $out, ">", "Makefile") or die "Unable to create Makefile: $!";
21 |     print $out $MAKEFILE;
22 |     close($out) or die "Unable to write Makefile: $!";
23 | }
24 | 
25 | unless (-e "project_config.yaml") {
26 |     copy("modules/default_yaml/project_config.yaml", "project_config.yaml") or die "Unable to create project_config.yaml: $!";
27 | }
28 | 
29 | unless (-e "summary_config.yaml") {
30 |     copy("modules/default_yaml/summary_config.yaml", "summary_config.yaml") or die "Unable to create summary_config.yaml: $!";
31 | }
32 | 
25 | 


--------------------------------------------------------------------------------
/scripts/join_eff.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | # Collapse rows of a tab-separated table that share their first two columns
 4 | # into one row: the ANN[...] columns of the collapsed rows are joined with
 5 | # "|", every other column is taken from the last row of the group.
 6 | 
 7 | use strict;
 8 | use List::MoreUtils qw(first_index indexes);
 9 | 
10 | # echo the header and locate the ANN[...] columns
11 | my $line = <>;
12 | print $line;
13 | chomp $line;
14 | my @header = split /\t/, $line;
15 | my @effIndexes = indexes { /^ANN\[/ } @header;
16 | 
17 | # group rows by first and second column; blank fields become "."
18 | my %lines;
19 | while (my $row = <>) {
20 |     chomp $row;
21 |     my @F = map { /\S/ ? $_ : "." } split(/\t/, $row, -1);
22 |     push @{$lines{$F[0]}{$F[1]}}, \@F;
23 | }
24 | 
25 | # both key levels are emitted in string sort order
26 | for my $chrom (sort keys %lines) {
27 |     for my $posn (sort keys %{$lines{$chrom}}) {
28 |         # last row first: it is the row that gets printed, and the ANN
29 |         # values are joined from the last row back to the first
30 |         my ($F, @rest) = reverse @{$lines{$chrom}{$posn}};
31 |         if (@rest) {
32 |             for my $i (@effIndexes) {
33 |                 $F->[$i] = join("|", map { $_->[$i] } $F, @rest);
34 |             }
35 |         }
36 |         print join("\t", @{$F}) . "\n";
37 |     }
38 | }
39 | 
40 | 


--------------------------------------------------------------------------------
/scripts/knit.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Usage: knit.R <input> <outPrefix> [args...]
 4 | # Knits <input> to <outPrefix>/index.md and converts that to
 5 | # <outPrefix>/index.html. Extra arguments are left in `args`.
 6 | 
 7 | suppressPackageStartupMessages(library(knitr))
 8 | suppressPackageStartupMessages(library(markdown))
 9 | 
10 | 
11 | args <- commandArgs(TRUE)
12 | 
13 | if (length(args) < 2) stop("Need input and output script")
14 | 
15 | input <- args[1]
16 | outPrefix <- args[2]
17 | args <- args[-(1:2)]
18 | 
19 | # figure and cache directories live under the output prefix
20 | figPath <- file.path(outPrefix, 'figure/')
21 | cachePath <- file.path(outPrefix, 'cache/')
22 | for (outDir in c(figPath, cachePath)) {
23 |     dir.create(outDir, showWarnings = FALSE, recursive = TRUE)
24 | }
25 | 
26 | opts_chunk$set(dev = c("png", 'pdf'), cache.path = cachePath) # , fig.path = file.path('mutsig_report/figure/'))
27 | opts_knit$set(root.dir = getwd(), base.dir = paste0(outPrefix, '/'), progress = FALSE, verbose = TRUE)
28 | 
29 | #options(warn = -1, error = quote({ traceback(2); q('no', status = 1) }))
30 | 
31 | mdFile <- paste0(outPrefix, '/index.md')
32 | htmlFile <- paste0(outPrefix, '/index.html')
33 | knit(input, mdFile)
34 | markdownToHTML(mdFile, htmlFile)
35 | 
36 | 


--------------------------------------------------------------------------------
/scripts/launcher_sql_db.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ launches the mysql server on a docker node and returns the server ip
 3 | """
 4 | 
 5 | import argparse
 6 | import MySQLdb
 7 | import yaml
 8 | from qsub_pbs import Job
 9 | import sys
10 | import time
11 | 
parser = argparse.ArgumentParser(description='launch mysql server on docker node')
parser.add_argument('server_yaml')

args = parser.parse_args()

# safe_load: the config only needs plain mappings/scalars, while yaml.load
# without an explicit Loader can construct arbitrary Python objects and is
# rejected by recent PyYAML releases. The handle is closed deterministically.
with open(args.server_yaml, 'r') as server_yaml:
    server_info = yaml.safe_load(server_yaml)
host = server_info['host']
db = server_info['db']

# Try to connect twice. On failure the docker command that would start the
# server is only printed: the job submission below is commented out.
con = None
for attempt in range(2):
    try:
        con = MySQLdb.connect(host=server_info['host'],
                              user=server_info['user'],
                              passwd=server_info['password'],
                              port=server_info['port'],
                              db=server_info['db'])
        break
    except MySQLdb.Error:
        # Only database errors are treated as "server not running"; config
        # mistakes (e.g. a missing key) now surface instead of being hidden.
        print(("Failed to connect to {} mysql server. Running docker".format(server_info['db'])))
        docker_cmd = "docker run -d -v {}:/var/lib/mysql -p {}:3306 {}".format(server_info['data_dir'],
                                                                               server_info['port'],
                                                                               server_info['docker_repo'])
        print((docker_cmd + "\n"))
        #job = Job(docker_cmd, '-I -l nodes=1:docker -l host={}'.format(server_info['host']))
        #job.run_job()
        #job.wait()
        #time.sleep(90) # wait for mysqld to start

if not con:
    print("Failed to initialize mysql server")
    sys.exit(1)
44 | 


--------------------------------------------------------------------------------
/scripts/mimsi.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | suppressPackageStartupMessages(library("dplyr"))
 5 | suppressPackageStartupMessages(library("readr"))
 6 | suppressPackageStartupMessages(library("magrittr"))
 7 | 
 8 | if (!interactive()) {
 9 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
10 | }
11 | 
12 | args_list <- list(make_option("--option", default = NA, type = 'character', help = "type of analysis"),
13 | 		  make_option("--sample_names", default = NA, type = 'character', help = "sample name"))
14 | parser <- OptionParser(usage = "%prog", option_list = args_list)
15 | arguments <- parse_args(parser, positional_arguments = T)
16 | opt <- arguments$options
17 | 
18 | if (as.numeric(opt$option)==1) {
19 | 	sample_names = unlist(strsplit(x=as.character(opt$sample_names), split=" ", fixed=TRUE))
20 | 	smry = list()
21 | 	for (i in 1:length(sample_names)) {
22 | 		smry[[i]] = readr::read_tsv(file = paste0("mimsi/", sample_names[i], "/", sample_names[i], ".txt"),
23 | 					    col_names = TRUE, col_types = cols(.default = col_character())) %>%
24 | 			    readr::type_convert()
25 | 	}
26 | 	smry = do.call(rbind, smry)
27 | 	write_tsv(smry, path="mimsi/summary.txt", append = FALSE, col_names = TRUE)
28 | 	
29 | }
30 | 


--------------------------------------------------------------------------------
/scripts/monitorMySQL.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | UP=$(pgrep -u limr mysqld | wc -l);
 4 | if [ "$UP" -ne 1 ];
 5 | then
 6 |     echo "MySQL is down.";
 7 |     mysqld --defaults-file=/home/limr/share/usr/mysql/my.cnf
 8 | else
 9 |     echo "All is well.";
10 | fi
11 | 


--------------------------------------------------------------------------------
/scripts/monitor_gfserver.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | UP=$(pgrep -u limr gfServer | wc -l);
 4 | if [ "$UP" -ne 1 ];
 5 | then
 6 |     echo "gfServer is down.";
 7 |     /home/limr/share/usr/bin/gfServer start localhost 88878 -stepSize=5 -log=/home/limr/.blatserver.log /home/limr/share/reference/GATK_bundle/2.3/human_g1k_v37.2bit
 8 | else
 9 |     echo "All is well.";
10 | fi
11 | 


--------------------------------------------------------------------------------
/scripts/mutation_taster_query.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import sys
 3 | 
 4 | 
 5 | _prediction_priority = ['disease_causing_automatic',
 6 |                         'disease_causing',
 7 |                         'polymorphism_automatic',
 8 |                         'polymorphism']
 9 | 
10 | 
11 | def query(chrom, pos, ref, alt):
12 |     pred = 'none'
13 |     score = None
14 |     url = "http://www.mutationtaster.org/cgi-bin/MutationTaster/MT_ChrPos.cgi" \
15 |         "?chromosome={}&position={}&ref={}&alt={}".format(chrom, pos, ref, alt)
16 |     sys.stderr.write("Querying: {}\n".format(url))
17 |     dfs = pd.read_html(url)
18 |     if (len(dfs) < 2):
19 |         raise Exception('query failed')
20 |     else:
21 |         # summary is the second dataframe
22 |         summary = dfs[1][1:]
23 |         summary.columns = dfs[1].iloc[0]
24 |         if 'prediction' in summary.columns:
25 |             pred_score = {}
26 |             for i, row in summary.iterrows():
27 |                 if row['prediction'] not in pred_score:
28 |                     pred_score[row['prediction']] = []
29 |                     pred_score[row['prediction']].append(float(row['probability']))
30 |                     for p in _prediction_priority:
31 |                         if p in pred_score:
32 |                             pred = p
33 |                             score = max(pred_score[p])
34 |                             break
35 |     return (pred, score)
36 | 


--------------------------------------------------------------------------------
/scripts/normalFilterVCF.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # filter tumor based on normal
 3 | # usage: normalFilterVCF.pl [tumor.vcf] [normal.vcf]
 4 | 
 5 | use strict;
 6 | 
 7 | if (@ARGV != 2) {
 8 |     print "Usage: normalFilterVCF.pl [tumor.vcf] [normal.vcf]\n" and exit(1);
 9 | }
10 | 
11 | my $tumorVCF = $ARGV[0];
12 | my $normalVCF = $ARGV[1];
13 | 
14 | my $varPosn = {};
15 | open IN, $normalVCF or die("Unable to open " . $normalVCF . "\n");
16 | while (<IN>) {
17 | 	next if /^#/;
18 | 	my @F = split /\t/;
19 | 	my $chr = $F[0];
20 | 	my $posn = $F[1];
21 |     my $alt = $F[3];
22 | 	$varPosn->{$chr} = {} unless exists $varPosn->{$chr};
23 | 	$varPosn->{$chr}{$posn} = {} unless exists $varPosn->{$chr}{$posn};
24 | 	$varPosn->{$chr}{$posn}{$alt} = 1;
25 | }
26 | close IN;
27 | 
28 | open IN, $tumorVCF or die("Unable to open " . $tumorVCF . "\n");
29 | while (<IN>) {
30 | 	print and next if /^#/;
31 | 	my @F = split /\t/;
32 | 	my $chr = $F[0];
33 | 	my $posn = $F[1];
34 |     my $alt = $F[3];
35 | 	print unless (exists $varPosn->{$chr}{$posn}{$alt});
36 | }
37 | close IN;
38 | 


--------------------------------------------------------------------------------
/scripts/posnGeneLookup.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/perl
 2 | # lookup the gene(s) at a base position (first column of the input)
 3 | 
 4 | use strict;
 5 | 
 6 | use DBI;
 7 | use Bio::EnsEMBL::DBSQL::DBAdaptor;
 8 | use Bio::EnsEMBL::Registry;
 9 | 
print STDERR "Connecting to Ensembl core...\n";
my $dbCore = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
    -user   => "anonymous",
    -dbname => "homo_sapiens_core_65_37",
    -host   => "localhost",
    -pass   => "",
    -driver => 'mysql',
    -port   => 33387
);
print STDERR "Connected.\n";

# Each input line is space-separated; the first field is "chr:pos". Output is
# "chr:pos <gene ids joined by |> <strands joined by |> <remaining fields>",
# with "NA NA" when no gene is found.
my $slice_adaptor = $dbCore->get_SliceAdaptor();
while (<>) {
    chomp;
    my @F = split / /;
    print STDERR "Looking up position $F[0]\n";
    my ($chr, $pos) = split /:/, $F[0];
    $chr =~ s/chr//;
    my $slice = $slice_adaptor->fetch_by_region('chromosome', $chr, $pos, $pos);
    my (@ids, @strands);
    if (defined $slice) {
        foreach my $gene (@{$slice->get_all_Genes()}) {
            push @ids, $gene->stable_id();
            push @strands, $gene->strand();
        }
    } else {
        # An unknown region yields no slice; report it and emit NA NA instead
        # of dying on the method call and losing the rest of the input.
        print STDERR "No slice found for $chr:$pos\n";
    }
    print "$chr:$pos " . ((@ids > 0)? join("|", @ids) . ' ' . join("|", @strands): "NA NA") . ' ' . join(" ", @F[1..$#F]) . "\n";
}
40 | 


--------------------------------------------------------------------------------
/scripts/prepareFastq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # rename fastq files and create samples.txt and possibly samples.split.txt
 3 | # find -name '*.gz' | xargs prename 's:./([A-Z]+)_([0-9]{4})/(.*)\.fastq\.gz:./$1_$2/$3$2.fastq.gz:'
 4 | mkdir -p fastq
 5 | find rawdata -name '*.gz' | xargs ln -t fastq
 6 | 
 7 | for x in fastq/*.fastq.gz; do
 8 |     if [ `grep -o "_" <<< "$x" | wc -l` -gt 1 ]; then
 9 |         prename 's/-//g; s/(.+)_([ATGCN]{6,8}|S\d+)_L([^_])+_R([12])_([0-9]+)/$1-$2$3$5.$4/; s/(.+)_[^_]+_L([^_])+_R([12])_([0-9]+)/$1_$2$4.$3/; s/_//g; s/-/_/g' $x;
10 |     fi;
11 | done
12 | prename 's/IGO[^_]*//' fastq/*.gz
13 | paste <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/[._].*//') <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/\..*//') | awk 'BEGIN { OFS = "\t" } $1 != $2 { print }' | uniq > samples.split.txt
14 | 'ls' fastq/*.fastq.gz | sed 's:.*/::; s/_.*//' | sort | uniq
15 | #if [ -z `cut -f1 samples.split.txt | uniq -d` ]; then
16 | #    prename 's/_.*//' fastq/*.fastq.gz
17 | #    rm samples.split.txt
18 | #fi
19 | #for x in `cut -f1 samples.split.txt | uniq -u`; do
20 | #prename 's/_.*//' fastq/${x}_*.fastq.gz
21 | #sed -i "/^$x\t/d" samples.split.txt
22 | #done
23 | 


--------------------------------------------------------------------------------
/scripts/prepareFastq2.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # rename fastq files and create samples.txt and possibly samples.split.txt
 3 | 
 4 | mkdir -p fastq
 5 | find rawdata -name '*.gz' | xargs ln -t fastq
 6 | 
 7 | for x in fastq/*.fastq.gz; do
 8 |     if [ `grep -o "_" <<< "$x" | wc -l` -gt 1 ]; then
 9 |         prename 's/-//g; s/(.+)_(S\d+)_L([^_])+_R([12])_([0-9]+)/$1-$2$3$5.$4/; s/(.+)_[^_]+_L([^_])+_R([12])_([0-9]+)/$1_$2$4.$3/; s/_//g; s/-/_/g' $x;
10 |     fi;
11 | done
12 | paste <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/[._].*//') <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/\..*//') | awk 'BEGIN { OFS = "\t" } $1 != $2 { print }' | uniq > samples.split.txt
13 | 'ls' fastq/*.fastq.gz | sed 's:.*/::; s/_.*//' | sort | uniq
14 | 


--------------------------------------------------------------------------------
/scripts/prepareMultirunFastq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #rename fastq files and create samples.txt and possibly samples.split.txt
 3 | 
 4 | mkdir -p fastq
 5 | cd rawdata
 6 | find -name '*.gz' | xargs prename 's:./([A-Z]+)_([0-9]{4})/(.*)\.fastq\.gz:./$1_$2/$3$2.fastq.gz:'
 7 | find -name '*.gz' | xargs ln -t ../fastq
 8 | cd ..
 9 | 
10 | for x in fastq/*.fastq.gz; do
11 |     if [ `grep -o "_" <<< "$x" | wc -l` -gt 1 ]; then
12 |         prename 's/-//g; s/(.+)_([NATGC]{6,8}|S\d+)_L([^_])+_R([12])_([0-9]+)/$1-$2$3$5.$4/; s/(.+)_[^_]+_L([^_])+_R([12])_([0-9]+)/$1_$2$4.$3/; s/_//g; s/-/_/g' $x;
13 |     fi;
14 | done
15 | prename 's/IGO[^_]*//' fastq/*.gz
16 | paste <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/[._].*//') <('ls' fastq/*.fastq.gz | sed 's:.*/::; s/\..*//') | awk 'BEGIN { OFS = "\t" } $3 != $2 { print }' | uniq > samples.split.txt
17 | 'ls' fastq/*.fastq.gz | sed 's:.*/::; s/_.*//' | sort | uniq
18 | 


--------------------------------------------------------------------------------
/scripts/provean_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ classify provean of vcf records
 4 | """
 5 | 
 6 | import vcf
 7 | import argparse
 8 | import sys
 9 | import remote_provean_query as rpq
10 | 
if __name__ == "__main__":
    # Reads a VCF, hands its indel records to RemoteProveanQuery, and writes
    # all records to stdout with three provean_* INFO fields declared in the
    # header.
    parser = argparse.ArgumentParser(prog='provean_vcf.py',
                                     description='Add provean to vcf file')
    parser.add_argument('vcf_infile')
    args = parser.parse_args()

    # The input handle is closed once every record has been written.
    with open(args.vcf_infile, 'r') as vcf_in:
        vcf_reader = vcf.Reader(vcf_in)
        vcf_reader.infos['provean_protein_id'] = vcf.parser._Info(id='provean_protein_id', num=1, type='String',
                                                                  desc="provean protein id",
                                                                  source=None, version=None)
        # Header descriptions previously said "Mutation taster" (copy/paste
        # from the MutationTaster script); these fields hold provean results.
        vcf_reader.infos['provean_pred'] = vcf.parser._Info(id='provean_pred', num=1, type='String',
                                                            desc="Provean prediction using webquery if indel",
                                                            source=None, version=None)
        vcf_reader.infos['provean_score'] = vcf.parser._Info(id='provean_score', num=1, type='Float',
                                                             desc="Provean score using webquery if indel",
                                                             source=None, version=None)
        vcf_writer = vcf.Writer(sys.stdout, vcf_reader)

        records = [record for record in vcf_reader]
        # Only indels are sent to the remote query.
        query_records = [record for record in records if record.is_indel]

        if len(query_records) > 0:
            query = rpq.RemoteProveanQuery(query_records)
            query.run_query()

        for record in records:
            vcf_writer.write_record(record)
        vcf_writer.close()
43 | 


--------------------------------------------------------------------------------
/scripts/split_bed.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import math
 5 | 
 6 | if __name__ == "__main__":
 7 |     parser = argparse.ArgumentParser(prog='split_bed.py',
 8 |                                      description='split a bed file into chunks')
 9 |     parser.add_argument('--num_chunks', '-c', type=int, default=100, help='number of chunks')
10 |     parser.add_argument('--out_prefix', '-o', required=True, help='output prefix')
11 |     parser.add_argument('bed_file', help='bed file to split')
12 |     args = parser.parse_args()
13 | 
14 |     bed = [line for line in open(args.bed_file, 'r')]
15 | 
16 |     n = int(math.ceil(len(bed) / args.num_chunks))
17 |     x = 1
18 |     for i in range(0, len(bed), n):
19 |         f = open(args.out_prefix + '{0:03d}.bed'.format(x), 'w')
20 |         if x == args.num_chunks:
21 |             # last chunk, write everything
22 |             for line in bed[i:]:
23 |                 f.write(line)
24 |             f.close()
25 |             break
26 |         else:
27 |             for line in bed[i:i + n]:
28 |                 f.write(line)
29 |             f.close()
30 |         x = x + 1
31 | 


--------------------------------------------------------------------------------
/scripts/split_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import math
 5 | import vcf
 6 | 
 7 | if __name__ == "__main__":
 8 |     parser = argparse.ArgumentParser(prog='split_vcf.py',
 9 |                                      description='split a vcf file into chunks')
10 |     parser.add_argument('--num_chunks', '-c', type=int, default=100, help='number of chunks')
11 |     parser.add_argument('--out_prefix', '-o', required=True, help='output prefix')
12 |     parser.add_argument('vcf_file', help='bed file to split')
13 |     args = parser.parse_args()
14 | 
15 |     vcf_reader = vcf.Reader(open(args.vcf_file, 'r'))
16 |     records = [record for record in vcf_reader]
17 | 
18 |     n = int(math.ceil(len(records) / args.num_chunks))
19 |     x = 1
20 |     for i in range(0, len(records), n):
21 |         vcf_writer = vcf.Writer(open(args.out_prefix + '{0:03d}.vcf'.format(x), 'w'), vcf_reader)
22 |         if x == args.num_chunks:
23 |             # last chunk, write everything
24 |             for record in records[i:]:
25 |                 vcf_writer.write_record(record)
26 |             vcf_writer.close()
27 |             break
28 |         else:
29 |             for record in records[i:i + n]:
30 |                 vcf_writer.write_record(record)
31 |             vcf_writer.close()
32 |         x = x + 1
33 | 


--------------------------------------------------------------------------------
/scripts/summarize_sleuth.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | suppressPackageStartupMessages(library("readr"))
 5 | suppressPackageStartupMessages(library("dplyr"))
 6 | suppressPackageStartupMessages(library("magrittr"))
 7 | suppressPackageStartupMessages(library("sleuth"))
 8 | 
 9 | 
if (!interactive()) {
    options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
}

# --annotation: path to the target mapping table
# --samples:    single space-separated string of sample names
optList = list(make_option('--annotation', type = 'character', default = NA, help = 'path to annotation file'),
               make_option('--samples', type = 'character', default = NA, help = 'list of samples names'))
parser = OptionParser(usage = "%prog",  option_list=optList)
arguments = parse_args(parser, positional_arguments = T)
opt = arguments$options

sample_names = unlist(strsplit(x = opt$samples, split = " ", fixed = TRUE))

# Annotation table, read with every column as character.
annotation = readr::read_tsv(file = opt$annotation,
			     col_names = TRUE,
			     col_types = cols(.default = col_character()))

# One row per sample: a constant condition and the kallisto/<sample> path.
manifest = dplyr::tibble(sample = sample_names,
			 condition = rep(1, length(sample_names)),
			 path = paste0("kallisto/", sample_names))

# Gene-level sleuth object, aggregated on the "hugo" column of the annotation.
data = sleuth::sleuth_prep(sample_to_covariates = manifest,
			   extra_bootstrap_summary = TRUE,
			   read_bootstrap_tpm = TRUE,
			   target_mapping = annotation,
			   aggregation_column = "hugo",
			   gene_mode = TRUE)

# Normalised TPM matrix, with the row names moved into a leading
# gene_symbol column.
res = as.data.frame(sleuth_to_matrix(data, "obs_norm", "tpm"))
tpm_bygene = dplyr::bind_cols(dplyr::tibble(gene_symbol = rownames(res)),
			      dplyr::as_tibble(res))
write_tsv(x=tpm_bygene, path="kallisto/tpm_by_gene.txt", append=FALSE, col_names=TRUE, quote_escape=FALSE)
35 | 


--------------------------------------------------------------------------------
/scripts/swapvcf.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | 
 5 | if (!interactive()) {
 6 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
 7 | }
 8 | 
 9 | args_list <- list(make_option("--file", default = NA, type = 'character', help = "input file name"),
10 | 				  make_option("--tumor", default = NA, type = 'character', help = "tumor sample name"),
11 | 				  make_option("--normal", default = NA, type = 'character', help = "normal sample name"))
12 | 				  
13 | parser <- OptionParser(usage = "%prog", option_list = args_list)
14 | arguments <- parse_args(parser, positional_arguments = T)
15 | opt <- arguments$options
16 | 
17 | vcf = read.table(file=opt$file, header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE)
18 | n = ncol(vcf)
19 | n1 = vcf[,n,drop=TRUE]
20 | n2 = vcf[,n-1,drop=TRUE]
21 | vcf[,n-1] = n1
22 | vcf[,n] = n2
23 | colnames(vcf) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", opt$tumor, opt$normal)
24 | system(paste0("grep '##' ", opt$file, " > ", opt$file, ".tmp"))
25 | write.table(vcf, file=paste0(opt$file, ".tmp"), append=TRUE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
26 | warnings()
27 | 


--------------------------------------------------------------------------------
/scripts/tsvToExcel.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Write tsv to excel sheet
 4 | """
 5 | import argparse
 6 | import pandas as pd
 7 | import os
 8 | from openpyxl import load_workbook
 9 | 
10 | 
def write_to_excel(tsv_file, excel_file, sheet_name, column_names, delimiter, overwrite):
    """Write a delimited text file into one sheet of an Excel workbook.

    Args:
        tsv_file: path of the delimited input file.
        excel_file: path of the workbook to create or extend.
        sheet_name: name of the sheet to write.
        column_names: list of columns to write, or None/empty for all.
        delimiter: field separator of the input file.
        overwrite: when False and excel_file already exists, the existing
            workbook is loaded so its sheets are kept; otherwise a fresh
            workbook is written.
    """
    df = pd.read_csv(tsv_file, sep=delimiter)
    if not overwrite and os.path.isfile(excel_file):
        # Append mode: reuse the existing workbook and register its sheets
        # with the writer.
        book = load_workbook(excel_file)
        writer = pd.ExcelWriter(excel_file, engine='openpyxl')
        writer.book = book
        writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
    else:
        writer = pd.ExcelWriter(excel_file)
    if column_names:
        df.to_excel(writer, sheet_name, columns=column_names, index=False)
    else:
        df.to_excel(writer, sheet_name, index=False)
    # Always close (and thereby save) the writer. It used to be closed only
    # in the append case, leaving a new or overwritten workbook without an
    # explicit save.
    writer.close()
26 | 
27 | 
def main():
    """Parse the command line and write the TSV into the requested sheet."""
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # Positional arguments, in order.
    for positional, help_text in (("tsv_file", "TSV"),
                                  ("excel_file", "Excel output"),
                                  ("sheet_name", "Sheet name")):
        parser.add_argument(positional, type=str, help=help_text)
    parser.add_argument("-c", "--column_names", type=str, default=None, help="Which columns to write (comma separated)")
    parser.add_argument("-d", "--delimiter", type=str, default="\t", help="Set delimiter")
    parser.add_argument("--overwrite", action="store_true", help="Overwrite existing excel")
    args = parser.parse_args()

    # Comma-separated column list, or None when the option is absent/empty.
    column_names = args.column_names.split(",") if args.column_names else None

    # Sheet names longer than 25 characters are cut and marked with '..'.
    sheet_name = args.sheet_name
    if len(sheet_name) > 25:
        sheet_name = sheet_name[:25] + '..'

    write_to_excel(args.tsv_file, args.excel_file, sheet_name, column_names, args.delimiter, args.overwrite)
44 | 
45 | if __name__ == "__main__":
46 |     main()
47 | 


--------------------------------------------------------------------------------
/signatures/deconstruct_sigs.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR = log/deconstruct_sigs.$(NOW)
 4 | 
 5 | deconstructsigs : $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/signatures/$(sample).RData) \
 6 | 		  $(foreach sample,$(TUMOR_SAMPLES),deconstructsigs/plots/context/$(sample).pdf)
 7 | 
 8 | define extract-signatures
 9 | deconstructsigs/signatures/%.RData : summary/tsv/mutation_summary.tsv
10 | 	$$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"set -o pipefial && \
11 | 							   $(RSCRIPT) modules/signatures/extract_signatures.R \
12 | 							   --sample_name $$()")
13 | 	
14 | deconstructsigs/plots/context/%.pdf : deconstructsigs/signatures/%.RData
15 | 	$$(call RUN,-s 4G -m 6G -v $(DECONSTRUCTSIGS_ENV),"set -o pipefail && \
16 | 							   $(RSCRIPT) modules/signatures/plot_signatures.R \
17 | 							   --sample_name $$(*)")
18 | 
19 | endef
20 | $(foreach sample,$(TUMOR_SAMPLES),\
21 | 		$(eval $(call extract-signatures,$(sample))))
22 | 
23 | ..DUMMY := $(shell mkdir -p version; \
24 | 	     $(DECONSTRUCTSIGS_ENV)/bin/R --version > version/deconstruct_sigs.txt)
25 | .SECONDARY:
26 | .DELETE_ON_ERROR:
27 | .PHONY: deconstructsigs


--------------------------------------------------------------------------------
/snp6/absolute.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | suppressPackageStartupMessages(library("ABSOLUTE"))
 5 | 
 6 | options(warn = -1, error = quote({ traceback(2); q('no', status = 1) }))
 7 | 
 8 | optList <- list(
 9 |         make_option("--disease", default = 'breastcancer', help = "disease [default %default]"),
10 |         make_option("--platform", default = "SNP_6.0", help = "platform [default %default]"),
11 |         make_option("--tumour", default = NULL, help = "tumour sample name"),
12 |         make_option("--mafFile", default = NULL, help = "MAF file"),
13 |         make_option("--minMutAF", default = NULL, help = "Minimum Mutation Allele Frequency"),
14 |         make_option("--resultsDir", default = NULL, help = "results directory"),
15 |         make_option("--outPrefix", default = NULL, help = "output prefix")
16 |         )
17 | parser <- OptionParser(usage = "%prog segDat.Rdata", option_list = optList);
18 | arguments <- parse_args(parser, positional_arguments = T);
19 | opt <- arguments$options;
20 | 
21 | if (is.null(opt$resultsDir)) {
22 |     cat("Need results dir\n");
23 |     print_help(parser);
24 |     stop();
25 | } else if (is.null(opt$tumour)) {
26 |     cat("Need tumour sample name\n");
27 |     print_help(parser);
28 |     stop();
29 | } else if (length(arguments$args) != 1) {
30 |     cat("Need hapseg data file\n");
31 |     print_help(parser);
32 |     stop();
33 | }
34 | 
35 | 
36 | fn <- arguments$args[1];
37 | RunAbsolute(seg.dat.fn = fn, output.fn.base = opt$outPrefix,
38 |     sigma.p=0, max.sigma.h=0.02,
39 |     min.ploidy=0.95, max.ploidy=10, primary.disease=opt$disease,
40 |     platform=opt$platform, sample.name=opt$tumour,
41 |     results.dir=opt$resultsDir,
42 |     maf.fn = opt$mafFile, min.mut.af=opt$minMutAF,
43 |     max.as.seg.count=1500, copy_num_type="allelic",
44 |     max.neg.genome=0, max.non.clonal=0,
45 |     verbose=TRUE)
46 | 


--------------------------------------------------------------------------------
/summary/cravat_summary.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | 
 5 | if (!interactive()) {
 6 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
 7 | }
 8 | 
 9 | args_list = list(make_option("--sample_names", default = NA, type = 'character', help = "sample name"))
10 | 				  
11 | parser = OptionParser(usage = "%prog", option_list = args_list)
12 | arguments = parse_args(parser, positional_arguments = T)
13 | opt = arguments$options
14 | 
15 | sample_names = unlist(strsplit(x=opt$sample_names, split=" ", fixed=TRUE))
16 | tsv = list()
17 | for (i in 1:length(sample_names)) {
18 | 	tsv[[i]] = read.csv(file=paste0("cravat/", sample_names[i], ".txt"), header=TRUE, sep="\t", stringsAsFactors=FALSE)
19 | }
20 | tsv = do.call(rbind, tsv)
21 | write.table(tsv, file="summary/tsv/cravat_summary.tsv", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=FALSE)
22 | 


--------------------------------------------------------------------------------
/summary/cravat_summary.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/cravat_summary.$(NOW)
 4 | PHONY += cravat summary summary/tsv
 5 | 
 6 | cravat_summary : summary/tsv/cravat_summary.tsv  summary/cravat_summary.xlsx
 7 | 
 8 | summary/tsv/cravat_summary.tsv : $(wildcard cravat/$(SAMPLES).txt)
 9 | 	$(call RUN,-c -s 24G -m 48G -w 7200,"$(RSCRIPT) modules/summary/cravat_summary.R --sample_names '$(SAMPLES)'")
10 | 	
11 | summary/cravat_summary.xlsx : summary/tsv/cravat_summary.tsv
12 | 	$(call RUN,-c -s 24G -m 48G -w 7200,"python modules/summary/cravat_summary.py")
13 | 
14 | .DELETE_ON_ERROR:
15 | .SECONDARY:
16 | .PHONY: $(PHONY)
17 | 


--------------------------------------------------------------------------------
/summary/cravat_summary.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import csv
 3 | from xlsxwriter.workbook import Workbook
 4 | 
 5 | tsv_file = 'summary/tsv/cravat_summary.tsv'
 6 | xlsx_file = 'summary/cravat_summary.xlsx'
 7 | 
 8 | workbook = Workbook(xlsx_file)
 9 | worksheet = workbook.add_worksheet()
10 | 
11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t')
12 | 
13 | for row, data in enumerate(tsv_reader):
14 |     worksheet.write_row(row, 0, data)
15 | 
16 | workbook.close()
17 | 


--------------------------------------------------------------------------------
/summary/delmh_summary.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/delmh_summary.$(NOW)
 4 | PHONY += delmh_summary
 5 | 
 6 | delmh_summary : summary/tsv/delmh_summary.tsv
 7 | 
 8 | summary/tsv/delmh_summary.tsv : summary/tsv/mutation_summary.tsv
 9 | 	$(call RUN,-n 1 -s 8G -m 8G,"set -o pipefail && \
10 | 								 $(RSCRIPT) modules/summary/delmh_summary.R --input_file $(<)")
11 | 	
12 | .DELETE_ON_ERROR:
13 | .SECONDARY:
14 | .PHONY: $(PHONY)
15 | 


--------------------------------------------------------------------------------
/summary/genome_summary_excel.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import csv
 3 | from xlsxwriter.workbook import Workbook
 4 | 
 5 | tsv_file = 'summary/tsv/genome_summary.tsv'
 6 | xlsx_file = 'summary/genome_summary.xlsx'
 7 | 
 8 | workbook = Workbook(xlsx_file)
 9 | worksheet = workbook.add_worksheet()
10 | 
11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t')
12 | 
13 | for row, data in enumerate(tsv_reader):
14 |     worksheet.write_row(row, 0, data)
15 | 
16 | workbook.close()
17 | 


--------------------------------------------------------------------------------
/summary/hotspot_summary_excel.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import csv
 3 | from xlsxwriter.workbook import Workbook
 4 | 
 5 | tsv_file = 'summary/tsv/hotspot_summary.tsv'
 6 | xlsx_file = 'summary/hotspot_summary.xlsx'
 7 | 
 8 | workbook = Workbook(xlsx_file)
 9 | worksheet = workbook.add_worksheet()
10 | 
11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t')
12 | 
13 | for row, data in enumerate(tsv_reader):
14 |     worksheet.write_row(row, 0, data)
15 | 
16 | workbook.close()
17 | 


--------------------------------------------------------------------------------
/summary/hotspotsummary.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/hotspot_summary.$(NOW)
 4 | PHONY += hotspot summary summary/tsv
 5 | 
 6 | HOTSPOT ?= $(wildcard $(foreach sample,$(SAMPLES),hotspot/$(sample).txt))
 7 | 
 8 | hotspot_summary : summary/tsv/hotspot_summary.tsv summary/hotspot_summary.xlsx
 9 | 
10 | summary/tsv/hotspot_summary.tsv : $(wildcard hotspot/$(SAMPLES).txt)
11 | 	$(call RUN,-n 1 -s 4G -m 4G,"$(RSCRIPT) modules/summary/hotspotsummary.R --in_file '$(HOTSPOT)' --out_file summary/tsv/hotspot_summary.tsv")
12 | 		
13 | summary/hotspot_summary.xlsx : summary/tsv/hotspot_summary.tsv
14 | 	$(call RUN,-n 1 -s 4G -m 4G,"python modules/summary/hotspot_summary_excel.py")
15 | 
16 | .DELETE_ON_ERROR:
17 | .SECONDARY:
18 | .PHONY: $(PHONY)
19 | 


--------------------------------------------------------------------------------
/summary/mouse_summary_excel.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import csv
 3 | from xlsxwriter.workbook import Workbook
 4 | 
 5 | tsv_file = 'summary/tsv/mouse_summary.tsv'
 6 | xlsx_file = 'summary/mouse_summary.xlsx'
 7 | 
 8 | workbook = Workbook(xlsx_file)
 9 | worksheet = workbook.add_worksheet()
10 | 
11 | tsv_reader = csv.reader(open(tsv_file, 'rb'), delimiter='\t')
12 | 
13 | for row, data in enumerate(tsv_reader):
14 |     worksheet.write_row(row, 0, data)
15 | 
16 | workbook.close()
17 | 


--------------------------------------------------------------------------------
/summary/mousesummary.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | 
 5 | if (!interactive()) {
 6 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
 7 | }
 8 | # Command-line options: --sample_names is one space-separated string of sample names, --out_file is the table to write.
 9 | args_list <- list(make_option("--sample_names", default = NA, type = 'character', help = "space-separated sample names"),
10 | 				  make_option("--out_file", default = NA, type = 'character', help = "output file name"))
11 | parser <- OptionParser(usage = "%prog", option_list = args_list)
12 | arguments <- parse_args(parser, positional_arguments = T)
13 | opt <- arguments$options
14 | 
15 | sample_names = unlist(strsplit(x=opt$sample_names, split=" ", fixed=TRUE))
16 | out_file_name = opt$out_file
17 | DP = AD = MAF = list()
18 | for (i in 1:length(sample_names)) {
19 | 	tmp = read.csv(file=paste0("sufam/", sample_names[i], ".txt"), header=TRUE, sep="\t", stringsAsFactors=FALSE)
20 | 	DP[[i]] = tmp[,"cov"]
21 | 	AD[[i]] = tmp[,"val_al_count"]
22 | 	MAF[[i]] = tmp[,"val_maf"]
23 | }
24 | DP = do.call(cbind, DP)
25 | colnames(DP) = paste0("DP_", sample_names)
26 | AD = do.call(cbind, AD)
27 | colnames(AD) = paste0("AD_", sample_names)
28 | MAF = do.call(cbind, MAF)
29 | colnames(MAF) = paste0("MAF_", sample_names)
30 | vcf = read.table(file="sufam/pdx.vcf", header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE)
31 | chr = vcf[,1]
32 | pos = vcf[,2]
33 | ref = vcf[,4]
34 | alt = vcf[,5]
35 | gene_symbol = unlist(lapply(strsplit(vcf[,7], "p.", fixed=TRUE), function(x) { x[1] }))
36 | hgvsp_short = paste0("p.", unlist(lapply(strsplit(vcf[,7], "p.", fixed=TRUE), function(x) { x[2] })))
37 | res = cbind(chr, pos, ref, alt, gene_symbol, hgvsp_short, DP, AD, MAF)
38 | colnames(res)[1:6] = c("Chromosome", "Position", "Reference_Allele", "Alternate_Allele", "Gene_Symbol", "HGVSp")
39 | write.table(res, file=out_file_name, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
40 | 


--------------------------------------------------------------------------------
/sv_callers/crest.mk:
--------------------------------------------------------------------------------
 1 | # run crest
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/crest.$(NOW)
 6 | # CREST tool settings. PERL5LIB is pinned to absolute /home/limr paths while CREST_DIR uses $(HOME). NOTE(review): --blatport 88878 is above 65535, the largest valid TCP port - confirm the intended port.
 7 | CREST_DIR = $(HOME)/share/usr/crest
 8 | CREST = PERL5LIB=/home/limr/share/usr/crest:/home/limr/share/usr/src/bioperl-live:/home/limr/share/usr/perl5/lib/perl5:/home/limr/share/usr/src/ensembl/modules:/home/limr/share/usr/src/ensembl-compara/modules:/home/limr/share/usr/src/ensembl-variation/modules:/home/limr/share/usr/src/ensembl-funcgen/modules $(PERL) $(CREST_DIR)/CREST.pl
 9 | CREST_OPTS = --blat $(BLAT) --cap3 $(CAP3) --blatclient $(GFCLIENT) --blatserver 140.163.153.48 --blatport 88878 -t $(REF_2BIT) --ref_genome $(REF_FASTA) 
10 | EXTRACT_SCLIP = PERL5LIB=$(CREST_DIR):$(PERL5LIB) $(CREST_DIR)/extractSClip.pl
11 | EXTRACT_SCLIP_OPTS = --ref_genome $(REF_FASTA)
12 | 
13 | .SECONDARY:
14 | .DELETE_ON_ERROR:
15 | 
16 | 
17 | ifdef SAMPLE_PAIRS
18 | PHONY += crestTN
19 | crestTN : $(foreach pair,$(SAMPLE_PAIRS),crest/$(pair).crest_timestamp)
20 | else
21 | PHONY += crest
22 | crest : $(foreach sample,$(SAMPLES),crest/$(sample).crest_timestamp)
23 | endif
24 | 
25 | crest/%.read_len : bam/%.bam
26 | 	$(call RUN,-s 3G -m 5G,"$(SAMTOOLS) view $< | tail -n+100000 | head -1 | awk '{ print length(\$$10) }' > $@")
27 | 
28 | crest/%.sclip.txt : bam/%.bam
29 | 	$(call RUN,-s 6G -m 8G,"$(EXTRACT_SCLIP) $(EXTRACT_SCLIP_OPTS) -p $(@D)/$* -i $<")
30 | 
31 | crest/%.crest_timestamp : bam/%.bam crest/%.sclip.txt crest/%.read_len
32 | 	$(call RUN,-s 15G -m 60G,"$(CREST) $(CREST_OPTS) -f $(<<) -d $< -p $(@D)/$* -l `cat $(<<<)` && touch $@")
33 | 
34 | define crest-tumor-normal
35 | crest/$1_$2.crest_timestamp : bam/$1.bam bam/$2.bam crest/$1.sclip.txt crest/$1.read_len
36 | 	$$(call RUN,-s 15G -m 60G,"$$(CREST) $$(CREST_OPTS) -f $$(<<<) -d $$< -g $$(<<) -p $$(@D)/$1_$2 -l `cat $$(<<<<)` && touch $$@")
37 | endef
38 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call crest-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
39 | 
40 | .PHONY: $(PHONY)
41 | 


--------------------------------------------------------------------------------
/sv_callers/delly.mk:
--------------------------------------------------------------------------------
 1 | # run delly
 2 | LOGDIR = log/delly.$(NOW)
 3 | 
 4 | include modules/Makefile.inc
 5 | 
 6 | DELLY_ENV = $(HOME)/share/usr/anaconda-envs/delly-0.7.6
 7 | 
 8 | SUAVE_BAM_TO_H5 = python $(HOME)/share/usr/delly/vis/suave/suave_bam_to_h5.py
 9 | 
10 | DELLY_TYPES = DEL DUP INV TRA INS
11 | 
12 | .SECONDARY:
13 | .DELETE_ON_ERROR:
14 | 
15 | .PHONY: delly
16 | 
17 | delly: $(foreach pair,$(SAMPLE_PAIRS),$(foreach type,$(DELLY_TYPES),delly/bcf/$(pair).delly_$(type).bcf)) \
18 | 	$(foreach sample,$(SAMPLES),delly/h5/$(sample).h5)
19 | #$(call delly-pair-type,tumor,normal,sv-type): rule running "delly call" for one SV type on a tumor/normal BAM pair (both BAM indexes required)
20 | define delly-pair-type
21 | delly/bcf/$1_$2.delly_$3.bcf : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai
22 | 	$$(call RUN,-v $$(DELLY_ENV) -w 256:00:00 -s 8G -m 12G,"delly call -t $3 -o $$@ -g $$(REF_FASTA) $$< $$(<<)")
23 | endef
24 | $(foreach pair,$(SAMPLE_PAIRS),\
25 | 	$(foreach type,$(DELLY_TYPES),\
26 | 	$(eval $(call delly-pair-type,$(tumor.$(pair)),$(normal.$(pair)),$(type)))))
27 | 
28 | delly/h5/%.h5 : bam/%.bam
29 | 	$(call RUN,-v $(DELLY_ENV) -s 8G -m 10G,"$(SUAVE_BAM_TO_H5) -s $* -c gzip -o $@ $<")
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/sv_callers/destruct.mk:
--------------------------------------------------------------------------------
 1 | # Run destruct
 2 | #
 3 | #
 4 | # Author: Fong Chun Chan <fongchunchan@gmail.com>
 5 | #
 6 | 
 7 | SHELL := /bin/bash
 8 | 
 9 | include modules/Makefile.inc
10 | 
11 | LOGDIR = destruct/log
12 | 
13 | DESTRUCT_CONFIG_FILE = $(HOME)/share/usr/destruct/destruct/config.txt
14 | #ANALYZE_DNA_BAM = $(HOME)/share/usr/nfuse-0.1.2/scripts/analyze_dna_bam.pl -c $(HOME)/usr/nfuse-0.1.2/scripts/config.txt
15 | #DESTRUCT = /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/python2.7 /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/destruct.py /scratch/sohrab_temp/amcpherson_tmp/forray/genesis_config.ini
16 | #DESTRUCT = /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/python2.7 $(HOME)/share/usr/destruct/destruct/destruct.py /scratch/sohrab_temp/amcpherson_tmp/config.ini
17 | DESTRUCT = $(PYTHON) $(HOME)/share/usr/destruct/destruct/destruct.py $(DESTRUCT_CONFIG_FILE)
18 | 
19 | VPATH = bam
20 | 
21 | SAMPLE_FILE = samples.txt
22 | 
23 | .SECONDARY:
24 | .DELETE_ON_ERROR:
25 | .PHONY : all
26 | 
27 | all : destruct/all.timestamp
28 | 
29 | ####
30 | ## Build the destruct file list
31 | ####
32 | destruct/all_file_list.txt : $(foreach sample,$(SAMPLES),$(sample).bam)
33 | 	mkdir -p $(@D); rm -f $@; for bam in $^; do \
34 | 		sample=`echo "$$bam" | sed 's/.*\///; s/\..*//;'`; \
35 | 		echo -e "$$sample\t$$bam" >> $@; \
36 | 	done
37 | 	
38 | destruct/%.timestamp : destruct/%_file_list.txt
39 | 	mkdir -p $(@D)/$*.tmp $(@D)/breakpoints $(@D)/breakreads $(LOGDIR); $(DESTRUCT) $< $(@D)/$*.tmp $(@D)/breakpoints/$*.breakpoints $(@D)/breakreads/$*.breakreads qsub -p 100 &> $(LOGDIR)/$*.log && touch $@
40 | 


--------------------------------------------------------------------------------
/sv_callers/ericScript.mk:
--------------------------------------------------------------------------------
 1 | # run EricScript
 2 | # author: Raymond Lim
 3 | 
 4 | ERICSCRIPT_ENV = $(HOME)/share/usr/anaconda-envs/ericscript-0.5.5/
 5 | ERICSCRIPT = ericscript.pl
 6 | ERICSCRIPT_OPTS ?= --refid $(ERICSCRIPT_SPECIES)  -db $(ERICSCRIPT_DB) --remove
 7 | ERICSCRIPT_TO_USV = python modules/sv_callers/ericscript2usv.py
 8 | 
 9 | .PHONY: ericscript
10 | .SECONDARY:
11 | .DELETE_ON_ERROR:
12 | 
13 | ericscript: $(foreach sample,$(SAMPLES),usv/$(sample).ericscript.tsv)
14 | 
15 | ericscript/%_ericscript.timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
16 | 	$(call RUN,-s 14G -m 14G -n 7 -N $*_ericscript,"$(ERICSCRIPT) $(ERICSCRIPT_OPTS) -p 7 -name $* -o $(@D)/$* $^ && \
17 | 		touch $@")
18 | # Convert ericscript/<sample>/<sample>.results.filtered.tsv (written by the timestamp rule) into usv/<sample>.ericscript.tsv.
19 | usv/%.ericscript.tsv : ericscript/%_ericscript.timestamp
20 | 	$(call RUN,,"$(ERICSCRIPT_TO_USV) < ericscript/$*/$*.results.filtered.tsv > $@")
21 | 


--------------------------------------------------------------------------------
/sv_callers/extractCoordsFromDefuse.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | use strict;
 4 | use warnings;
 5 | 
 6 | use Getopt::Std;
 7 | 
 8 | my %opt;
 9 | getopts('t:', \%opt);
10 | 
11 | my $usage = <<ENDL;
12 | Usage: extractCoordsFromDefuse.pl -t [tissue type] defuse_results
13 | -t [tissue type]: either EPI, HEM, MES or AVG
14 | ENDL
15 | 
16 | my $tissueType;
17 | unless ($opt{t}) {
18 |     $tissueType = "EPI";
19 | } else {
20 |     $tissueType = $opt{t};
21 | }
22 | 
23 | sub HELP_MESSAGE {
24 |     print STDERR $usage;
25 |     exit(1);
26 | }
27 | 
28 | 
29 | my @header;
30 | while (my $line = <>) {
31 |     chomp $line;
32 | 
33 |     if ($line =~ /^cluster_id/) {
34 |         @header = split /\t/, $line;
35 |         next;
36 |     }
37 | 
38 |     my @arr = split /\t/, $line;
39 |     my %F = map { $_ => shift @arr } @header;
40 | 
41 |     my $upstream = ($F{upstream_gene} eq $F{gene_name1})? 1 : 2;
42 |     my $downstream = ($F{downstream_gene} eq $F{gene_name1})? 1 : 2;
43 | 
44 |     my $upstreamChr = "chr" . $F{"gene_chromosome" . $upstream };
45 |     my $downstreamChr = "chr" . $F{"gene_chromosome" . $downstream };
46 | 
47 |     # give first/last nt lost
48 |     my $upstreamPosn = ($F{"gene_strand" . $upstream} eq "+")? $F{"genomic_break_pos" . $upstream } + 1 : $F{"genomic_break_pos" . $upstream } - 1;
49 |     my $downstreamPosn = ($F{"gene_strand" . $downstream} eq "+")? $F{"genomic_break_pos" . $downstream } - 1 : $F{"genomic_break_pos" . $downstream } + 1;
50 | 
51 |     print join("\t", ($upstreamChr, $upstreamPosn, $downstreamChr, $downstreamPosn, $tissueType)) . "\n";
52 | }
53 | 


--------------------------------------------------------------------------------
/sv_callers/filterDefuse.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | 
 3 | #use strict;
 4 | #use warnings;
 5 | 
 6 | #use Getopt::Std;
 7 | #my %opt;
 8 | #getopts('h', \%opt);
 9 | 
10 | #my $usage <<ENDL;
11 | 
12 | #sub HELP_MESSAGE {
13 | #print STDERR $usage;
14 | #exit(1);
15 | #}
16 | 
17 | #HELP_MESSAGE if $opt{h};
18 | # For each deFuse result row: append upstream_gene/downstream_gene columns and mark UTR mid-fusions in the orf column.
19 | while (my $line = <>) {
20 |     chomp $line;
21 | 
22 |     if ($line =~ /^cluster_id/) {
23 |         @header = split /\t/, $line;
24 |         push @header, "upstream_gene";
25 |         push @header, "downstream_gene";
26 |         #print join ("\t", 0..$#header), "\n";
27 |         print join("\t", @header) . "\n";
28 |         next;
29 |     }
30 | 
31 |     my @arr = split /\t/, $line;
32 |     my %F = map { $_ => shift @arr } @header;
33 | 
34 |     my $gene1 = $F{"gene_strand1"};
35 |     my $gene2 = $F{"gene_strand2"};
36 |     my $genomic1 = $F{"genomic_strand1"};
37 |     my $genomic2 = $F{"genomic_strand2"};
38 | 
39 |     if ($gene1 eq $genomic1 && $gene2 ne $genomic2) {
40 |         $F{"upstream_gene"} = $F{"gene_name1"};
41 |         $F{"downstream_gene"} = $F{"gene_name2"};
42 |     } elsif ($gene1 ne $genomic1 && $gene2 eq $genomic2) {
43 |         $F{"upstream_gene"} = $F{"gene_name2"};
44 |         $F{"downstream_gene"} = $F{"gene_name1"};
45 |     } else {
46 |         $F{"upstream_gene"} = "";
47 |         $F{"downstream_gene"} = "";
48 |     }
49 | 
50 |     if ($F{'orf'} eq "N") {
51 |         print join("\t", @F{@header}) . "\n";
52 |         next;
53 |     }
54 |     # Reset per row: these used to be undeclared globals, so a value set by an earlier row could leak into this one.
55 |     my ($p5, $p3) = ("", "");
56 |     if ($gene1 eq $genomic1) {
57 |         $p5 = $F{"gene_location1"};
58 |     } else {
59 |         $p3 = $F{"gene_location2"};
60 |     }
61 |     # NOTE(review): this second test assigns the same gene_location1/gene_location2 fields as the first - confirm intent.
62 |     if ($gene2 eq $genomic2) {
63 |         $p5 = $F{"gene_location1"};
64 |     } else {
65 |         $p3 = $F{"gene_location2"};
66 |     }
67 |     if ($p5 eq "utr3p" || $p5 eq "downstream") {
68 |         $F{"orf"} = "Y (UTR mid-fusion)";
69 |     } elsif ($p3 eq "utr5p" || $p3 eq "upstream") {
70 |         $F{"orf"} = "Y (UTR mid-fusion)";
71 |     }
72 |     print join("\t", @F{@header}) . "\n";
73 | }
74 | 


--------------------------------------------------------------------------------
/sv_callers/fusioncatcher.mk:
--------------------------------------------------------------------------------
 1 | # Run fusioncatcher
 2 | ##### DEFAULTS ######
 3 | 
 4 | LOGDIR = log/fusioncatcher.$(NOW)
 5 | 
 6 | ##### MAKE INCLUDES #####
 7 | include modules/Makefile.inc
 8 | 
 9 | FUSIONCATCHER = $(HOME)/share/usr/fusioncatcher/fusioncatcher_v0.99.2/fusioncatcher
10 | FUSIONCATCHER_OPTS = -d $(HOME)/share/usr/fusioncatcher/data/current --extract-buffer-size=35000000000
11 | 
12 | .DELETE_ON_ERROR:
13 | .SECONDARY: 
14 | .PHONY: all
15 | 
16 | all : $(foreach sample,$(SAMPLES),fusioncatcher/$(sample).fusioncatcher_timestamp)
17 | 
18 | fusioncatcher/%.fusioncatcher_timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
19 | 	$(call RUN,-n 8 -s 1G -m 4G,"$(FUSIONCATCHER) $(FUSIONCATCHER_OPTS) -p 8 -o $(@D)/$* -i $<$(,)$(<<) && touch $@")
20 | 


--------------------------------------------------------------------------------
/sv_callers/fusionfinder.mk:
--------------------------------------------------------------------------------
 1 | # Run fusionfinder
 2 | ##### DEFAULTS ######
 3 | 
 4 | LOGDIR = log/fusionfinder.$(NOW)
 5 | 
 6 | ##### MAKE INCLUDES #####
 7 | include modules/Makefile.inc
 8 | # NOTE(review): FUSIONFINDER is empty and the rules in this file still use $(FUSIONCATCHER) and fusioncatcher/ paths - this looks like an unfinished copy of fusioncatcher.mk; confirm before use.
 9 | FUSIONFINDER = 
10 | FUSIONCATCHER_OPTS = --phred33 --cref  --ncref
11 | 
12 | .DELETE_ON_ERROR:
13 | .SECONDARY: 
14 | .PHONY: all
15 | 
16 | all : $(foreach sample,$(SAMPLES),fusioncatcher/$(sample).fusioncatcher_timestamp)
17 | 
18 | fusioncatcher/%.fusioncatcher_timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
19 | 	$(call RUN,-n 8 -s 1G -m 4G,"$(FUSIONCATCHER) $(FUSIONCATCHER_OPTS) -p 8 -o $(@D)/$* -i $<$(,)$(<<) && touch $@")
20 | 


--------------------------------------------------------------------------------
/sv_callers/hydra.mk:
--------------------------------------------------------------------------------
 1 | # run hydra
 2 | 
 3 | LOGDIR = log/hydra.$(NOW)
 4 | 
 5 | HYDRA = $(HOME)/share/usr/bin/hydra
 6 | override HYDRA_OPTS ?= -mld 500 -mn 1500
 7 | BAM_TO_FASTQ = $(HOME)/share/usr/bin/bamToFastq
 8 | BAM_TO_BED = /opt/common/bedtools/bedtools-2.17.0/bin/bamToBed
 9 | DEDUP_DISCORDANTS = $(HOME)/share/usr/bin/dedupDiscordants.py
10 | PAIR_DISCORDANTS = $(HOME)/share/usr/bin/pairDiscordants.py
11 | 
12 | include modules/Makefile.inc
13 | include modules/variant_callers/gatk.inc
14 | 
15 | .SECONDARY:
16 | .DELETE_ON_ERROR:
17 | .PHONY: all
18 | 
19 | all : $(foreach sample,$(SAMPLES),hydra/breaks/$(sample).breaks)
20 | 
21 | 
22 | #hydra/disc_fastq/%.disc.1.fastq.gz hydra/disc_fastq/%.disc.2.fastq.gz : bam/%.bam
23 | #$(INIT) $(SAMTOOLS) view -uF 2 $< | $(BAM_TO_FASTQ) -bam stdin -fq1 >( gzip -c > hydra/disc_fastq/$*.disc.1.fastq.gz) -fq2  >( gzip -c > hydra/disc_fastq/$*.disc.2.fastq.gz)
24 | 
25 | hydra/bam/%.disc.bam : bam/%.bam
26 | 	$(call RUN,,"$(SAMTOOLS) view -bF 2 $< > $@")
27 | 
28 | hydra/bed/%.disc.bedpe : hydra/bam/%.disc.bam
29 | 	$(call RUN,,"$(BAM_TO_BED) -i $< -tag NM | $(PAIR_DISCORDANTS) -i stdin -m hydra -z 800 > $@")
30 | 
31 | hydra/bed/%.disc.dedup.bedpe : hydra/bed/%.disc.bedpe
32 | 	$(call RUN,,"$(DEDUP_DISCORDANTS) -i $< -s 3 > $@")
33 | 
34 | hydra/breaks/%.breaks : hydra/bed/%.disc.dedup.bedpe
35 | 	$(call RUN,,"$(HYDRA) -in $< -out $@ $(HYDRA_OPTS)")
36 | 


--------------------------------------------------------------------------------
/sv_callers/manta.inc:
--------------------------------------------------------------------------------
 1 | ifndef MANTA_INC
 2 | CONFIG_MANTA = $(HOME)/share/usr/manta-0.29.6.centos5_x86_64/bin/configManta.py
 3 | # configManta.py options are assembled below: MANTA_HIGH_SENS=true selects MANTA_HS_CONFIG, anything else MANTA_CONFIG.
 4 | MANTA_HS_CONFIG = modules/sv_callers/manta_hs_config.py.ini
 5 | MANTA_CONFIG = modules/sv_callers/manta_config.py.ini
 6 | MANTA_HIGH_SENS ?= false
 7 | CONFIG_MANTA_OPTS = --referenceFasta $(REF_FASTA) \
 8 | 		    --config $(if $(findstring true,$(MANTA_HIGH_SENS)),\
 9 | 		    				    $(MANTA_HS_CONFIG),$(MANTA_CONFIG)) \
10 | 		    $(if $(TARGETS_FILE),--exome) \
11 | 		    $(if $(MANTA_REGION),--region $(MANTA_REGION))
12 | endif
13 | MANTA_INC = true
14 | 
15 | 


--------------------------------------------------------------------------------
/sv_callers/manta.mk:
--------------------------------------------------------------------------------
 1 | # run manta on tumour-normal matched pairs
 2 | 
 3 | include modules/Makefile.inc
 4 | include modules/sv_callers/manta.inc
 5 | 
 6 | LOGDIR ?= log/manta.$(NOW)
 7 | PHONY += manta manta_vcfs
 8 | 
 9 | manta : manta_vcfs 
10 | 
11 | manta_vcfs: $(foreach sample,$(SAMPLES),vcf/$(sample).manta_sv.eff.vcf vcf/$(sample).manta_indels.eff.vcf vcf/$(sample).manta_candidate_sv.eff.vcf)
12 | 
13 | manta/%/runWorkflow.py : bam/%.bam bam/%.bam.bai
14 | 	$(INIT) $(CONFIG_MANTA) $(CONFIG_MANTA_OPTS) --tumorBam $< --runDir $(@D) 
15 | 
16 | manta/%/results/variants/tumorSV.vcf.gz manta/%/results/variants/candidateSmallIndels.vcf.gz manta/%/results/variants/candidateSV.vcf.gz : manta/%/runWorkflow.py
17 | 	$(call RUN,-n 8 -s 2G -m 2G,"python $< -m local -j 8")
18 | 
19 | vcf/%.manta_sv.vcf : manta/%/results/variants/tumorSV.vcf.gz
20 | 	$(INIT) zcat $< > $@
21 | 
22 | vcf/%.manta_indels.vcf : manta/%/results/variants/candidateSmallIndels.vcf.gz
23 | 	$(INIT) zcat $< > $@
24 | 
25 | vcf/%.manta_candidate_sv.vcf : manta/%/results/variants/candidateSV.vcf.gz
26 | 	$(INIT) zcat $< > $@
27 | 
28 | .PHONY: $(PHONY)
29 | 
30 | include modules/vcf_tools/vcftools.mk
31 | 


--------------------------------------------------------------------------------
/sv_callers/mantaRnaseq.mk:
--------------------------------------------------------------------------------
1 | # run manta on rna-seq
2 | # manta.inc assigns CONFIG_MANTA_OPTS with '=', so --rna has to be appended after the include or it is overwritten
3 | include modules/sv_callers/manta.mk
4 | 
5 | CONFIG_MANTA_OPTS += --rna
6 | 


--------------------------------------------------------------------------------
/sv_callers/manta_config.py.ini:
--------------------------------------------------------------------------------
 1 | 
 2 | #
 3 | # This section contains all configuration settings for the top-level manta workflow,
 4 | #
 5 | [manta]
 6 | 
 7 | referenceFasta = /illumina/development/Isis/Genomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa
 8 | 
 9 | # Run discovery and candidate reporting for all SVs/indels at or above this size:
10 | minCandidateVariantSize = 8
11 | 
12 | # Remove all edges from the graph unless they're supported by this many 'observations'.
13 | # Note that one supporting read pair or split read usually equals one observation, but evidence is sometimes downweighted.
14 | minEdgeObservations = 3
15 | 
16 | # Run discovery and candidate reporting for all SVs/indels with at least this
17 | # many spanning support observations
18 | minCandidateSpanningCount = 3
19 | 
20 | # After candidate identification, only score and report SVs/indels at or above this size:
21 | minScoredVariantSize = 51
22 | 
23 | # minimum VCF "QUAL" score for a variant to be included in the diploid vcf:
24 | minDiploidVariantScore = 10
25 | 
26 | # VCF "QUAL" score below which a variant is marked as filtered in the diploid vcf:
27 | minPassDiploidVariantScore = 20
28 | 
29 | # minimum genotype quality score below which single samples are filtered for a variant in the diploid vcf:
30 | minPassDiploidGTScore = 15
31 | 
32 | # somatic quality scores below this level are not included in the somatic vcf:
33 | minSomaticScore = 10
34 | 
35 | # somatic quality scores below this level are filtered in the somatic vcf:
36 | minPassSomaticScore = 30
37 | 


--------------------------------------------------------------------------------
/sv_callers/manta_hs_config.py.ini:
--------------------------------------------------------------------------------
 1 | 
 2 | #
 3 | # This section contains all configuration settings for the top-level manta workflow,
 4 | #
 5 | [manta]
 6 | 
 7 | referenceFasta = /home/limr/share/reference/GATK_bundle/2.3/human_g1k_v37.fa
 8 | 
 9 | # Run discovery and candidate reporting for all SVs/indels at or above this size:
10 | minCandidateVariantSize = 8
11 | 
12 | # Remove all edges from the graph unless they're supported by this many 'observations'.
13 | # Note that one supporting read pair or split read usually equals one observation, but evidence is sometimes downweighted.
14 | minEdgeObservations = 1
15 | 
16 | # Run discovery and candidate reporting for all SVs/indels with at least this
17 | # many spanning support observations
18 | minCandidateSpanningCount = 1
19 | 
20 | # After candidate identification, only score and report SVs/indels at or above this size:
21 | minScoredVariantSize = 51
22 | 
23 | # minimum VCF "QUAL" score for a variant to be included in the diploid vcf:
24 | minDiploidVariantScore = 10
25 | 
26 | # VCF "QUAL" score below which a variant is marked as filtered in the diploid vcf:
27 | minPassDiploidVariantScore = 20
28 | 
29 | # minimum genotype quality score below which single samples are filtered for a variant in the diploid vcf:
30 | minPassDiploidGTScore = 15
31 | 
32 | # somatic quality scores below this level are not included in the somatic vcf:
33 | minSomaticScore = 10
34 | 
35 | # somatic quality scores below this level are filtered in the somatic vcf:
36 | minPassSomaticScore = 30
37 | 


--------------------------------------------------------------------------------
/sv_callers/manta_tumor_normal.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | include modules/sv_callers/manta.inc
 3 | 
 4 | LOGDIR ?= log/manta_tumor_normal.$(NOW)
 5 | 
 6 | manta : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).manta_sv.vcf)
 7 | 
 8 | define manta-tumor-normal
 9 | manta/$1_$2/runWorkflow.py : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai
10 | 	$$(INIT) $$(CONFIG_MANTA) $$(CONFIG_MANTA_OPTS) --tumorBam $$(<) --normalBam $$(<<) --runDir $$(@D) 
11 | 
12 | manta/$1_$2.manta_timestamp : manta/$1_$2/runWorkflow.py
13 | 	$$(call RUN,-n 8 -s 2G -m 4G -w 72:00:00,"set -o pipefail && \
14 | 						  python $$(<) -m local -j 8 && touch $$(@)")
15 | 
16 | manta/$1_$2/results/variants/somaticSV.vcf.gz : manta/$1_$2.manta_timestamp
17 | 
18 | vcf/$1_$2.manta_sv.vcf : manta/$1_$2/results/variants/somaticSV.vcf.gz
19 | 	$$(INIT) zcat $$(<) > $$(@)
20 | 
21 | endef
22 | $(foreach pair,$(SAMPLE_PAIRS), \
23 | 	$(eval $(call manta-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
24 | 
25 | ..DUMMY := $(shell mkdir -p version; \
26 | 	     python --version &> version/manta_tumor_normal.txt)
27 | .SECONDARY:
28 | .DELETE_ON_ERROR:
29 | .PHONY: manta
30 | 


--------------------------------------------------------------------------------
/sv_callers/mapsplice.mk:
--------------------------------------------------------------------------------
 1 | # Run mapsplice
 2 | ##### DEFAULTS ######
 3 | 
 4 | LOGDIR = log/mapsplice.$(NOW)
 5 | 
 6 | ##### MAKE INCLUDES #####
 7 | include modules/Makefile.inc
 8 | 
 9 | MAPSPLICE_TO_USV = python modules/sv_callers/mapsplice2usv.py
10 | 
11 | MAPSPLICE_ENV = $(HOME)/share/usr/anaconda-envs/mapsplice-2.2.1
12 | MAPSPLICE = mapsplice.py
13 | MAPSPLICE_OPTS = -c $(MAPSPLICE_REF_DIR) -x $(MAPSPLICE_REF_BASENAME) --bam --gene-gtf $(GENES_GTF) --fusion
14 | 
15 | ifeq ($(BAM_PHRED64),true)
16 | MAPSPLICE_OPTS += --qual-scal phred64
17 | else
18 | MAPSPLICE_OPTS += --qual-scal phred33
19 | endif
20 | 
21 | .DELETE_ON_ERROR:
22 | .SECONDARY: 
23 | .PHONY: mapsplice
24 | # Top-level goal: usv/<sample>.mapsplice.tsv for every sample in SAMPLES.
25 | mapsplice : $(foreach sample,$(SAMPLES),usv/$(sample).mapsplice.tsv)
26 | # Decompress both FASTQ mates into temporary .fastq files, run MapSplice on them, touch the timestamp on success, then remove the temporaries.
27 | mapsplice/%_mapsplice.timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
28 | 	$(call RUN,-n 6 -s 2G -m 3G,"TMP1=`mktemp --tmpdir=$(TMPDIR)`.1.fastq; \
29 | 		TMP2=`mktemp --tmpdir=$(TMPDIR)`.2.fastq; \
30 | 		gzip -dc $< > \$$TMP1; \
31 | 		gzip -dc $(<<) > \$$TMP2; \
32 | 		mkdir -p mapsplice/$*; \
33 | 		$(MAPSPLICE) $(MAPSPLICE_OPTS) -p 6 -o mapsplice/$* -1 \$$TMP1 -2 \$$TMP2 && touch $@; \
34 | 		rm \$$TMP1 \$$TMP2")
35 | # Convert the MapSplice fusion tables to usv/<sample>.mapsplice.tsv. NOTE(review): only the first file is fed on stdin, the second is passed as an argument - confirm the converter reads both.
36 | usv/%.mapsplice.tsv : mapsplice/%_mapsplice.timestamp
37 | 	$(call RUN,,"$(MAPSPLICE_TO_USV) < mapsplice/$*/fusions_not_well_annotated.txt mapsplice/$*/fusions_well_annotated.txt > $@")
38 | 


--------------------------------------------------------------------------------
/sv_callers/nfuseDNA.mk:
--------------------------------------------------------------------------------
 1 | # Run nfuse on dna bams
 2 | # This module is defunct now. Use destruct to call rearrangements for DNA only
 3 | #
 4 | #
 5 | # Author: Raymond Lim <raylim@mm.st>
 6 | #
 7 | 
 8 | SHELL := /bin/bash
 9 | 
10 | include modules/Makefile.inc
11 | 
12 | LOGDIR = nfuse/log
13 | ANALYZE_DNA_BAM = $(HOME)/share/usr/nfuse-0.1.2/scripts/analyze_dna_bam.pl -c $(HOME)/usr/nfuse-0.1.2/scripts/config.txt
14 | 
15 | ANALYZE_DNA_BAM = /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/python2.7 /scratch/sohrab_temp/amcpherson_tmp/forray/install/bin/destruct.py /scratch/sohrab_temp/amcpherson_tmp/forray/genesis_config.ini
16 | 
17 | VPATH = bam
18 | 
19 | SAMPLE_FILE = samples.txt
20 | SAMPLES = $(shell cat $(SAMPLE_FILE))
21 | 
22 | .SECONDARY:
23 | .DELETE_ON_ERROR:
24 | .PHONY : all
25 | 
26 | all : nfuse/timestamp
27 | # Write one "<sample><TAB><bam path>" line per sample BAM; the sample name is the BAM file name up to its first dot.
28 | nfuse/file_list.txt : $(foreach sample,$(SAMPLES),$(sample).bam)
29 | 	mkdir -p $(@D); rm -f $@; for bam in $^; do \
30 | 		sample=`echo "$$bam" | sed 's/.*\///; s/\..*//;'`; \
31 | 		echo -e "$$sample\t$$bam" >> $@; \
32 | 	done
33 | 	
34 | nfuse/timestamp : nfuse/file_list.txt
35 | 	mkdir -p tmp $(LOGDIR); $(ANALYZE_DNA_BAM) $< tmp $(@D) sge -p 100 &> $(LOGDIR)/nfuse.log && touch $@
36 | 
37 | 


--------------------------------------------------------------------------------
/sv_callers/nfuseWGSSWTSS.mk:
--------------------------------------------------------------------------------
 1 | # vim: set ft=make :
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | WGSS_WTSS_PAIR_FILE ?= wgss_wtss_pairs.txt
 6 | 
 7 | WGSS_SAMPLES ?= $(shell cut -f 1 $(WGSS_WTSS_PAIR_FILE))
 8 | WTSS_SAMPLES ?= $(shell cut -f 2 $(WGSS_WTSS_PAIR_FILE))
 9 | SAMPLES ?= $(WGSS_SAMPLES) $(WTSS_SAMPLES)
10 | NSAMPLES ?= $(words $(WGSS_SAMPLES))
11 | 
12 | $(foreach i,$(shell seq 1 $(NSAMPLES)),$(eval wgss_lookup.$(word $i,$(WGSS_SAMPLES)) := $(word $i,$(WTSS_SAMPLES))))
13 | 
14 | NFUSE = $(HOME)/share/usr/nfuse-0.2.0/scripts/nfuse.pl -c $(HOME)/share/usr/nfuse-0.2.0/scripts/config.txt -s sge -p 100
15 | 
16 | #VPATH = ../WTSS/bam ../WGSS/bam
17 | 
18 | LOGDIR ?= log/nfuse.$(NOW)
19 | 
20 | .SECONDARY:
21 | .DELETE_ON_ERROR:
22 | .PHONY: all
23 | 
24 | all : $(foreach wgss_sample,$(WGSS_SAMPLES),nfuse/$(wgss_sample)_$(wgss_lookup.$(wgss_sample)).timestamp)
25 | 
26 | #all : $(foreach i,$(shell seq 1 $(NSAMPLES)),nfuse/$(word $i,$(WGSS_SAMPLES))_$(word $i,$(WTSS_SAMPLES)).timestamp)
27 | 
28 | #$(call nfuse-wgss-wtss,wgss-sample,wtss-sample): run nFuse with the first sample's fastqs as DNA and the second's as RNA; nFuse output goes to the log and the timestamp is touched only on success
29 | define nfuse-wgss-wtss
30 | nfuse/$1_$2.timestamp : fastq/$1.1.fastq.gz fastq/$1.2.fastq.gz fastq/$2.1.fastq.gz fastq/$2.2.fastq.gz
31 | 	$$(INIT) $$(NFUSE) --dnafq1 $$(word 1,$$^) --dnafq2 $$(word 2,$$^) --rnafq1 $$(word 3,$$^) --rnafq2 $$(word 4,$$^) -o nfuse/$1_$2 -n $1_$2 &> $(LOGDIR)/$1_$2.log && touch $$@
32 | endef
33 | $(foreach i,$(shell seq 1 $(NSAMPLES)),$(eval $(call nfuse-wgss-wtss,$(word $i,$(WGSS_SAMPLES)),$(word $i,$(WTSS_SAMPLES)))))
34 | 
35 | 
36 | #include modules/fastq_tools/fastq.mk
37 | 


--------------------------------------------------------------------------------
/sv_callers/oncofuse.mk:
--------------------------------------------------------------------------------
 1 | # run oncofuse
 2 | # b37 only
 3 | 
 4 | ONCOFUSE_MEM = $(JAVA7) -Xmx$1 -jar $(ONCOFUSE_JAR)
 5 | ONCOFUSE_TISSUE_TYPE ?= EPI
 6 | 
 7 | %.oncofuse.txt : %.coord.txt
 8 | 	$(call RUN,-s 8G -m 12G,"$(call ONCOFUSE_MEM,7G) $< coord $(ONCOFUSE_TISSUE_TYPE) $@")
 9 | 
10 | %.oncofuse.merged.txt : %.txt %.oncofuse.txt 
11 | 	$(INIT) head -1 $< | sed 's/^/RowID\t/' > $<.tmp && awk 'BEGIN {OFS = "\t" } NR > 1 { print NR-1, $$0 }' $< >> $<.tmp ;\
12 | 		cut -f 2- $(<<) > $(<<).tmp; \
13 | 		$(RSCRIPT) $(MERGE) -X --byColX 1 --byColY 1 -H $<.tmp $(<<).tmp > $@ && rm -f $<.tmp $(<<).tmp
14 | 
15 | 


--------------------------------------------------------------------------------
/sv_callers/prepareSoapFuse.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | # prepare soapFuse file structure using samples.txt file
 3 | # print out soapfuse samples file
 4 | 
 5 | use strict;
 6 | use warnings;
 7 | 
 8 | use File::Path qw/make_path/;
 9 | 
10 | use Getopt::Std;
11 | my %opt;
12 | getopts('h', \%opt);
13 | 
14 | my $usage = <<ENDL;
15 | Usage: ./prepareSoapFuse.pl [samples_sets.txt]
16 | ENDL
17 | 
18 | sub HELP_MESSAGE {
19 |    print STDERR $usage;
20 |    exit(1);
21 | }
22 | 
23 | HELP_MESSAGE if $opt{h};
24 | # Return the length of the first read (line 2) of a gzipped FASTQ file, not counting the trailing newline.
25 | sub getReadLength {
26 |     my $fqFile = $_[0];
27 |     open my $in, "zcat $fqFile | head |" or die "Unable to open $fqFile\n";  # lexical handle closes when the sub returns
28 |     <$in>;  # discard the header line; the next line is the sequence
29 |     chomp(my $seq = <$in>);
30 |     return length($seq);
31 | }
32 | # For each sample name read from the input: check both fastq mates exist, hard-link them under soapfuse/<sample>/<sample>/, and print a sample-list row.
33 | while (my $line = <>) {
34 |     chomp $line;
35 |     my $sample = $line;
36 |     my $fq1 = "fastq/$sample.1.fastq.gz";
37 |     my $fq2 = "fastq/$sample.2.fastq.gz";
38 |     die "Cannot find fastq files ($fq1 and $fq2)" unless (-e $fq1 && -e $fq2);
39 |     my $sampleDir = "soapfuse/$sample/$sample";
40 |     make_path($sampleDir);
41 |     system "ln -f $fq1 $sampleDir/${sample}_1.fastq.gz";
42 |     system "ln -f $fq2 $sampleDir/${sample}_2.fastq.gz";
43 |     my $readLength = &getReadLength($fq1);
44 | 
45 |     print "$sample\t$sample\t$sample\t$readLength\n";
46 | }
47 | 
48 | 
49 | 
50 | 


--------------------------------------------------------------------------------
/sv_callers/starFusion.mk:
--------------------------------------------------------------------------------
 1 | # run star fusion on fastqs
 2 | include modules/Makefile.inc
 3 | 
 4 | LOGDIR = log/star_fusion.$(NOW)
 5 | 
 6 | $(if $(STAR_CTAT_DIR),,$(error no STAR CTAT dir))
 7 | STAR_FUSION = STAR-Fusion
 8 | STAR_FUSION_ENV = $(HOME)/share/usr/anaconda-envs/star-fusion-1.0.0
 9 | STAR_FUSION_OPTS = --genome_lib_dir $(STAR_CTAT_DIR)
10 | 
11 | STAR_FUSION_TO_USV = python modules/sv_callers/starfusion2usv.py
12 | 
13 | 
14 | PHONY += star_fusion
15 | star_fusion : $(foreach sample,$(SAMPLES),usv/$(sample).star_fusion.tsv)
16 | 	
17 | star_fusion/%.star_fusion_timestamp : fastq/%.1.fastq.gz fastq/%.2.fastq.gz
18 | 	$(call RUN,-v $(STAR_FUSION_ENV) -n 8 -s 2G -m 5G,"$(STAR_FUSION) \
19 | 		--CPU 8 \
20 | 		--output_dir $(@D)/$* \
21 | 		--genome_lib_dir $(STAR_CTAT_DIR) \
22 | 		--verbose_level 2 \
23 | 		--left_fq $< --right_fq $(<<) && touch $@")
24 | 
25 | usv/%.star_fusion.tsv : star_fusion/%.star_fusion_timestamp
26 | 	$(call RUN,,"$(STAR_FUSION_TO_USV) < $(<D)/$*/star-fusion.fusion_candidates.final > $@")
27 | 
28 | .PHONY: $(PHONY)
29 | .SECONDARY: 
30 | .DELETE_ON_ERROR:
31 | 
32 | include modules/fastq_tools/mergeSplitFastq.mk
33 | 


--------------------------------------------------------------------------------
/sv_callers/svaba_tumor_normal.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR = log/svaba_tumor_normal.$(NOW)
 4 | 
 5 | SVABA_CORES ?= 8
 6 | SVABA_MEM_CORE ?= 6G
 7 | SVABA_REF ?= $(REF_FASTA)
 8 | SVABA_DBSNP ?= $(HOME)/share/lib/resource_files/svaba/dbsnp_indel.vcf
 9 | SVABA_BLACKLIST ?= $(HOME)/share/lib/resource_files/svaba/wgs_blacklist_meres.bed
10 | SVABA ?= svaba
11 | 
12 | svaba : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).svaba_sv.vcf)
13 | # Rule template, instantiated once per sample pair below: argument 1 is the tumor sample, argument 2 the normal sample.
14 | define svaba-tumor-normal
15 | svaba/$1_$2.svaba.somatic.sv.vcf : bam/$1.bam bam/$2.bam
16 | 	$$(call RUN,-c -n $(SVABA_CORES) -s 4G -m $(SVABA_MEM_CORE) -v $(SVABA_ENV) -w 72:00:00,"set -o pipefail && \
17 | 												 mkdir -p svaba && \
18 | 										 		 cd svaba && \
19 | 												 $$(SVABA) run \
20 | 												 -t ../bam/$1.bam \
21 | 												 -n ../bam/$2.bam \
22 | 												 -p $$(SVABA_CORES) \
23 | 												 -D $$(SVABA_DBSNP) \
24 | 												 -L 100000 \
25 | 												 -x 25000 \
26 | 												 -k $$(SVABA_BLACKLIST) \
27 | 												 -a $1_$2 \
28 | 												 -G $$(SVABA_REF)")
29 | # plain copy of the somatic SV calls into the vcf directory under the pair name
30 | vcf/$1_$2.svaba_sv.vcf : svaba/$1_$2.svaba.somatic.sv.vcf
31 | 	$$(INIT) cat $$< > $$@
32 | 
33 | endef
34 | $(foreach pair,$(SAMPLE_PAIRS),\
35 | 		$(eval $(call svaba-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
36 | 
37 | 
38 | ..DUMMY := $(shell mkdir -p version; \
39 | 	     $(SVABA) --help &> version/svaba_tumor_normal.txt)
40 | .SECONDARY:
41 | .DELETE_ON_ERROR:
42 | .PHONY: svaba
43 | 


--------------------------------------------------------------------------------
/sv_callers/tophatFusion.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | include modules/variant_callers/gatk.inc
 3 | 
 4 | TOPHAT := $(HOME)/share/usr/bin/tophat2
 5 | TOPHAT_OPTS := --no-coverage-search --fusion-ignore-chromosomes MT --fusion-search --keep-fasta-order
 6 | # BAM_PHRED64=true appends tophat's --solexa1.3-quals flag; the body is indented with spaces because a leading tab would mark it as recipe text
 7 | ifeq ($(BAM_PHRED64),true)
 8 |   TOPHAT_OPTS += --solexa1.3-quals
 9 | endif
10 | # special targets: keep intermediate files, and remove a target whose recipe exits with an error
11 | .SECONDARY:
12 | .DELETE_ON_ERROR:
13 | .PHONY: all
14 | 
15 | all : $(foreach sample,$(SAMPLES),tophat/$(sample)/fusions.out)
16 | 
17 | tophat/%/fusions.out : fastq/%.1.fastq.gz fastq/%.2.fastq.gz tophat/ins_size/%.insert_size.txt
18 | 	DIST_OPTS=`perl -e '$$text = do {local $$/ ; <>}; $$text =~ m/Read length: mean (\d+).*\nRead span: mean (\d+).*STD=(\d+)/; print "--mate-inner-dist " . ($$2 - $$1 * 2) . " --mate-std-dev $$3";' $(word 3,$^)`; \
19 | 	$(call RUN,-N $*_tophat -n 4 -s 6G -m 10G,"$(TOPHAT) $(TOPHAT_OPTS) -p 4 $$DIST_OPTS -o $(@D) $(BOWTIE_REF) $(<) $(word 2,$(^))")
20 | 
21 | tophat/ins_size/%.insert_size.txt : bam/%.bam
22 | 	$(call RUN,,"$(SAMTOOLS) view $< | $(GET_INSERT_SIZE) - > $@")
23 | # keep the fusion rows whose 5th column exceeds 100 and write them to the target; the awk field reference uses a doubled dollar so make hands it to the shell unexpanded
24 | tophat/fusions/%.fusions.ft.txt : tophat/%/fusions.out
25 | 	mkdir -p $(@D) && awk '$$5 > 100 { print }' $< > $@
26 | 
27 | 


--------------------------------------------------------------------------------
/variant_callers/gatk.inc:
--------------------------------------------------------------------------------
 1 | ifndef GATK_INC
 2 | # include guard: these definitions are only made the first time the file is read; the *_MEM forms take the java heap size as argument 1
 3 | DEFAULT_JAVA_MEM = 18G
 4 | GATK_MEM = $(JAVA7) -Xmx$1 -jar $(GATK_JAR) -S LENIENT
 5 | GATK_MEM2 = $(JAVA8) -Xmx$1 -jar $(GATK_JAR2) -S LENIENT
 6 | GATK = $(call GATK_MEM,$(DEFAULT_JAVA_MEM))
 7 | GATK2 = $(call GATK_MEM2,$(DEFAULT_JAVA_MEM))
 8 | endif
 9 | GATK_INC = true
10 | 


--------------------------------------------------------------------------------
/variant_callers/get_basecounts.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/get_basecount.$(NOW)
 4 | 
 5 | MAPQ := 0
 6 | BAQ := 0
 7 | COV := 0
 8 | 
 9 | getbasecount : $(foreach sample,$(SAMPLES),gbc/$(sample).txt.gz) \
10 | 	       gbc/summary.txt
11 | 
12 | define get-basecount
13 | gbc/$1.txt.gz : bam/$1.bam vcf/dataSilentNoPoleNotTertPromot.vcf
14 | 	$$(call RUN,-n 6 -s 3G -m 6G,"set -o pipefail && \
15 | 				      $(GBC) --fasta $(REF_FASTA) \
16 | 				      --bam $$(<) \
17 | 				      --vcf $$(<<) \
18 | 				      --output $$(@) \
19 | 				      --thread 6 \
20 | 				      --sort_output \
21 | 				      --compress_output \
22 | 				      --maq $(MAPQ) \
23 | 				      --baq $(BAQ) \
24 | 				      --cov $(COV) \
25 | 				      --filter_duplicate 0 \
26 | 				      --filter_improper_pair 0 \
27 | 				      --filter_qc_failed 1 \
28 | 				      --filter_indel 0 \
29 | 				      --filter_non_primary 1")
30 | 						    
31 | endef
32 | $(foreach sample,$(SAMPLES),\
33 | 		$(eval $(call get-basecount,$(sample))))
34 | 		
35 | # Summarise all per-sample count files with get_basecounts.R. No output path is passed on the command line, so the script presumably writes the target itself - TODO confirm against get_basecounts.R
36 | gbc/summary.txt : $(foreach sample,$(SAMPLES),gbc/$(sample).txt.gz)
37 | 	$(call RUN,-n 1 -s 24G -m 32G,"set -o pipefail && \
38 | 				       $(RSCRIPT) $(SCRIPTS_DIR)/get_basecounts.R \
39 | 				       --option 1 \
40 | 				       --sample_name '$(SAMPLES)'")
41 | 		
42 | 
43 | ..DUMMY := $(shell mkdir -p version; \
44 | 	     ${GBC} &> version/get_basecount.txt;)
45 | .SECONDARY:
46 | .DELETE_ON_ERROR:
47 | .PHONY: getbasecount
48 | 


--------------------------------------------------------------------------------
/variant_callers/hotspot.mk:
--------------------------------------------------------------------------------
 1 | # run unified genotyper on hotspots
 2 | 
 3 | include modules/Makefile.inc
 4 | include modules/variant_callers/gatk.inc
 5 | 
 6 | LOGDIR ?= log/hotspot.$(NOW)
 7 | PHONY += hotspot hotspot_vcfs hotspot_tables
 8 | 
 9 | .DELETE_ON_ERROR:
10 | .SECONDARY:
11 | .PHONY: $(PHONY)
12 | 
13 | HOTSPOT_GATK_OPTS = --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -R $(REF_FASTA) -stand_call_conf 0
14 | 
15 | 
16 | hotspot : hotspot_vcfs hotspot_tables
17 | hotspot_vcfs : $(if $(SAMPLE_PAIRS),\
18 | 	$(foreach pair,$(SAMPLE_PAIRS),vcf_ann/$(pair).hotspot.vcf),\
19 | 	$(foreach sample,$(SAMPLES),vcf_ann/$(sample).hotspot.vcf))
20 | hotspot_tables : $(if $(SAMPLE_PAIRS),alltables/allTN.hotspot.tab.txt)
21 | 
22 | vcf_ann/%.hotspot.vcf : vcf/%.hotspot.ac_ft.hotspot_int_ann.hotspot_ext_ann.vcf
23 | 	$(INIT) cp $< $@
24 | # Concatenate a sample's int and ext hotspot call sets. Only the .vcf.gz prerequisites reach bcftools (the .tbi files are filtered out); the result goes to a .tmp file that is handed to VERIFY_VCF together with the target name.
25 | vcf/%.hotspot.vcf : $(foreach i,int ext,hotspot/%.hotspot-$i.vcf.gz hotspot/%.hotspot-$i.vcf.gz.tbi)
26 | 	$(call RUN,-c -s 2G -m 3G,"$(BCFTOOLS2) concat -a $(filter %.vcf.gz,$^) > $@.tmp && \
27 | 		$(call VERIFY_VCF,$@.tmp,$@)")
28 | 
29 | define hotspot-vcf-tumor-normal-i
30 | hotspot/$1_$2.hotspot-$3.vcf : bam/$1.bam bam/$2.bam bam/$1.bam.bai bam/$2.bam.bai 
31 | 	$$(call RUN,-c -s 9G -m 12G,"$$(call GATK_MEM2,4G) \
32 | 		-T UnifiedGenotyper $$(HOTSPOT_GATK_OPTS) -I $$(<) -I $$(<<) \
33 | 		-alleles $$(HOTSPOT_VCF.$3) -L $$(HOTSPOT_VCF.$3) -o $$@.tmp && \
34 | 		$$(call VERIFY_VCF,$$@.tmp,$$@)")
35 | endef
36 | $(if $(SAMPLE_PAIRS),$(foreach pair,$(SAMPLE_PAIRS),\
37 | 	$(foreach i,int ext,\
38 | 		$(eval $(call hotspot-vcf-tumor-normal-i,$(tumor.$(pair)),$(normal.$(pair)),$i)))))
39 | 
40 | define hotspot-vcf-sample-i
41 | hotspot/$1.hotspot-$2.vcf : bam/$1.bam bam/$1.bam.bai
42 | 	$$(call RUN,-c -s 9G -m 12G,"$$(call GATK_MEM2,4G) \
43 | 		-T UnifiedGenotyper $$(HOTSPOT_GATK_OPTS) -I $$(<) \
44 | 		-alleles $$(HOTSPOT_VCF.$2) -L $$(HOTSPOT_VCF.$2) \
45 | 		-o $$@.tmp && $$(call VERIFY_VCF,$$@.tmp,$$@)")
46 | endef
47 | $(foreach sample,$(SAMPLES),\
48 | 	$(foreach i,int ext,\
49 | 		$(eval $(call hotspot-vcf-sample-i,$(sample),$i))))
50 | 
51 | include modules/vcf_tools/vcftools.mk
52 | 


--------------------------------------------------------------------------------
/variant_callers/qsnp.mk:
--------------------------------------------------------------------------------
 1 | # run qsnp variant caller
 2 | 
 3 | include modules/Makefile.inc
 4 | include modules/variant_callers/gatk.inc
 5 | 
 6 | .SECONDARY:
 7 | .DELETE_ON_ERROR: 
 8 | .PHONY: all
 9 | # qsnp is distributed as a jar, so it has to be launched through java's -jar switch
10 | LOGDIR = log/qsnp.$(NOW)
11 | QSNP = $(JAVA) -jar $(JARDIR)/qsnp-1.0.jar
12 | 
13 | define QSNP_CONFIG
14 | [general]\n\
15 | chrFiles=$(FREEC_REF)\n\
16 | chrLenFile=$(CHR_LEN)\n\
17 | maxThreads=$(FREEC_THREADS)\n\
18 | samtools=$(SAMTOOLS)\n\
19 | outputDir=$3\n\
20 | noisyData=$(NOISY_DATA)\n\
21 | ploidy=2\n\
22 | window=$(FREEC_WINDOW_SIZE)\n\
23 | gemMappabilityFile=$(GEM_MAP_FILE)\n\
24 | printNA=$(PRINT_NA)\n\
25 | [sample]\n\
26 | mateFile=$1\n\
27 | inputFormat=BAM\n\
28 | mateOrientation=FR\n\
29 | [control]\n\
30 | mateFile=$2\n\
31 | inputFormat=BAM\n\
32 | mateOrientation=FR\n\
33 | [BAF]\n\
34 | shiftInQuality=33\n\
35 | SNPfile=$(SNP_TXT)\n\
36 | $(FREEC_TARGET_CONFIG)
37 | endef
38 | 
39 | 


--------------------------------------------------------------------------------
/variant_callers/samtoolsHet.mk:
--------------------------------------------------------------------------------
 1 | # Run samtools to detect heterozygous positions
 2 | ##### DEFAULTS ######
 3 | include modules/Makefile.inc
 4 | include modules/variant_callers/gatk.inc
 5 | 
 6 | LOGDIR ?= log/samtools_het.$(NOW)
 7 | 
 8 | .DELETE_ON_ERROR:
 9 | .SECONDARY: 
10 | .PHONY : het_snps
11 | 
12 | het_snps : $(foreach s,$(SAMPLES),vcf/$s.het_snp.vcf)
13 | 
14 | include modules/vcf_tools/vcftools.mk
15 | 
16 | define hetsnp-chr
17 | chr_vcf/%.$1.het_snp.vcf : bam/%.bam
18 | 	$$(call RUN,-s 6G -m 8G,"$$(SAMTOOLS2) mpileup -r $1 -f $$(REF_FASTA) -g -I $$< | $$(BCFTOOLS2) call -c | $$(BCFTOOLS2) view -g het | $$(VCFUTILS) varFilter -d 10 -a 5 - > $$@")
19 | endef
20 | $(foreach chr,$(CHROMOSOMES),$(eval $(call hetsnp-chr,$(chr))))
21 | # Genome-wide vcf for a sample: the header lines of the first per-chromosome vcf, followed by the non-header lines of all of them passed through sort -V
22 | vcf/%.het_snp.vcf : $(foreach chr,$(CHROMOSOMES),chr_vcf/%.$(chr).het_snp.vcf)
23 | 	$(INIT) { \
24 | 		grep -P '^#' $<; \
25 | 		sed '/^#/d' $^ | sort -V; \
26 | 		} > $@
27 | 
28 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/crest.mk:
--------------------------------------------------------------------------------
 1 | # Run CREST on tumour-normal matched pairs (or on single samples when no pairs are defined)
 2 | # Detect structural variants from soft-clipped reads
 3 | ##### DEFAULTS ######
 4 | 
 5 | ##### MAKE INCLUDES #####
 6 | include modules/Makefile.inc
 7 | 
 8 | LOGDIR = log/crest.$(NOW)
 9 | CREST_DIR = $(HOME)/share/usr/crest
10 | CREST = PERL5LIB=$(PERL5LIB):$(CREST_DIR) $(PERL) $(CREST_DIR)/CREST.pl
11 | 
12 | EXTRACT_SCLIP = PERL5LIB=$(PERL5LIB):$(CREST_DIR) $(PERL) $(CREST_DIR)/extractSClip.pl
13 | 
14 | .SECONDARY:
15 | .DELETE_ON_ERROR:
16 | .PHONY: all
17 | 
18 | ifdef SAMPLE_PAIRS
19 | all : $(foreach pair,$(SAMPLE_PAIRS),crest/sv/$(pair).predSV.txt)
20 | else
21 | all : $(foreach sample,$(SAMPLES),crest/sv/$(sample).predSV.txt)
22 | endif
23 | 
24 | define sclip-chr
25 | crest/sclip/%.$1.sclip.txt crest/sclip/%.$1.cover : bam/%.bam
26 | 	$$(call RUN,-c -s 4G -m 6G,"$$(EXTRACT_SCLIP) -i $$< --ref_genome $$(REF_FASTA) -r $1 -o $$(@D) -p $$*")
27 | endef
28 | $(foreach chr,$(CHROMOSOMES),$(eval $(call sclip-chr,$(chr))))
29 | 
30 | crest/sclip/%.cover : $(foreach chr,$(CHROMOSOMES),crest/sclip/%.$(chr).cover)
31 | 	$(INIT) cat $^ > $@
32 | 
33 | crest/sclip/%.sclip.txt : $(foreach chr,$(CHROMOSOMES),crest/sclip/%.$(chr).sclip.txt)
34 | 	$(INIT) cat $^ > $@
35 | # Paired CREST call for one chromosome (template args: tumor, normal, chromosome). The -p prefix carries the crest/sv/ directory so that the <prefix>.predSV.txt file CREST is expected to write is this rule's target.
36 | define sv-tumor-normal-chr
37 | crest/sv/$1_$2.$3.predSV.txt : bam/$1.bam bam/$2.bam crest/sclip/$1.cover 
38 | 	$$(call RUN,-c -s 4G -m 6G,"$$(CREST) -p crest/sv/$1_$2.$3 -f $$(<<<) -d $$< -g $$(<<) --ref_genome $$(REF_FASTA) -t $$(REF_2BIT) -r $3")
39 | endef
40 | $(foreach pair,$(SAMPLE_PAIRS),\
41 | 	$(foreach chr,$(CHROMOSOMES),\
42 | 		$(eval $(call sv-tumor-normal-chr,$(tumor.$(pair)),$(normal.$(pair)),$(chr)))))
43 | # Single-sample CREST call for one chromosome (template arg: chromosome; the pattern stem is the sample). The -p prefix carries the crest/sv/ directory so that the <prefix>.predSV.txt file CREST is expected to write is this rule's target.
44 | define sv-chr
45 | crest/sv/%.$1.predSV.txt : bam/%.bam crest/sclip/%.cover 
46 | 	$$(call RUN,-c -s 4G -m 6G,"$$(CREST) -p crest/sv/$$*.$1 -f $$(<<) -d $$< --ref_genome $$(REF_FASTA) -t $$(REF_2BIT) -r $1")
47 | endef
48 | $(foreach chr,$(CHROMOSOMES),$(eval $(call sv-chr,$(chr))))
49 | 
50 | crest/sv/%.predSV.txt : $(foreach chr,$(CHROMOSOMES),crest/sv/%.$(chr).predSV.txt)
51 | 	$(INIT) cat $^ > $@
52 | 
53 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/dindelTNFilter.mk:
--------------------------------------------------------------------------------
 1 | # Create tumour-normal dindel vcf files
 2 | include modules/Makefile.inc
 3 | 
 4 | SAMPLE_PAIR_FILE ?= sample_pairs.txt
 5 | 
 6 | TUMOR_SAMPLES := $(shell cut -f 1 $(SAMPLE_PAIR_FILE))
 7 | NORMAL_SAMPLES := $(shell cut -f 2 $(SAMPLE_PAIR_FILE))
 8 | NSAMPLES = $(words $(TUMOR_SAMPLES))
 9 | 
10 | VPATH = dindel/vcf
11 | 
12 | LOGDIR = log/dindel.$(NOW)
13 | 
14 | .SECONDARY:
15 | .DELETE_ON_ERROR:
16 | .PHONY : all
17 | 
18 | VCFS = $(foreach sample,$(TUMOR_SAMPLES),dindel/vcf/$(sample).dindel.sorted.annotated.tnFiltered.vcf)
19 | # default goal: every tumour sample's filtered vcf together with its .idx companion file
20 | all : $(VCFS) $(addsuffix .idx,$(VCFS))
21 | 
22 | include modules/tnFilter.mk
23 | include modules/dindel.mk
24 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/gatkTNFilter.mk:
--------------------------------------------------------------------------------
 1 | # naive tumour-normal filter for gatk indels
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | SAMPLE_PAIR_FILE = sample_pairs.txt
 6 | 
 7 | TUMOR_SAMPLES := $(shell cut -f 1 $(SAMPLE_PAIR_FILE))
 8 | NORMAL_SAMPLES := $(shell cut -f 2 $(SAMPLE_PAIR_FILE))
 9 | NSAMPLES = $(words $(TUMOR_SAMPLES))
10 | 
11 | VPATH = gatk/vcf
12 | 
13 | .SECONDARY:
14 | .DELETE_ON_ERROR:
15 | .PHONY : all
16 | 
17 | VCFS = $(foreach sample,$(TUMOR_SAMPLES),gatk/vcf/$(sample).indels.annotated.filtered.tnFiltered.vcf)
18 | TABLES = $(foreach sample,$(TUMOR_SAMPLES),gatk/tables/$(sample).indels.annotated.filtered.tnFiltered.novel.txt)
19 | 
20 | all : $(VCFS) $(addsuffix .idx,$(VCFS)) $(TABLES)
21 | 	
22 | include modules/tnFilter.mk
23 | include modules/gatkVariantCaller.mk
24 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/gatkValidation.mk:
--------------------------------------------------------------------------------
 1 | # use GATK to gather allelic depths at somatic positions
 2 | include modules/Makefile.inc
 3 | include modules/variant_callers/gatk.inc
 4 | 
 5 | LOGDIR ?= log/gatk_validation.$(NOW)
 6 | 
 7 | SOMATIC_BED ?= somatic.bed
 8 | 
 9 | 
10 | .DELETE_ON_ERROR:
11 | .SECONDARY: 
12 | .PHONY : val_vcfs
13 | # goal: one validation vcf per tumor/normal pair
14 | val_vcfs : $(patsubst %,vcf/%.gatkval.vcf,$(SAMPLE_PAIRS))
15 | # rule template (args: tumor sample, normal sample): UnifiedGenotyper over both bams, restricted to the SOMATIC_BED intervals
16 | define gatkval-tumor-normal
17 | vcf/$(1)_$(2).gatkval.vcf : bam/$(1).bam bam/$(2).bam
18 | 	$$(call RUN,-n 4 -s 2.5G -m 3G,"$$(call GATK_MEM,8G) -T UnifiedGenotyper -glm BOTH -nt 4 -R $(REF_FASTA) --dbsnp $(DBSNP) -I $$(<) -I $$(<<) -L $(SOMATIC_BED) -o $$(@) --output_mode EMIT_ALL_SITES")
19 | endef
20 | $(foreach p,$(SAMPLE_PAIRS),$(eval $(call gatkval-tumor-normal,$(tumor.$p),$(normal.$p))))
21 | 
22 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/mimsi.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/mimsi.$(NOW)
 4 | 
 5 | mimsi: $(foreach pair,$(SAMPLE_PAIRS),mimsi/$(pair)/$(pair).txt) \
 6 |        mimsi/summary.txt
 7 | 
 8 | MICROSATELLITES_LIST = $(HOME)/share/lib/resource_files/mimsi/microsatellites_impact_only.list
 9 | MODEL = $(HOME)/share/lib/resource_files/mimsi/mi_msi_v0_4_0_200x.model
10 | 
11 | define mimsi-tumor-normal
12 | mimsi/$1_$2/$1_$2.txt : bam/$1.bam bam/$2.bam
13 | 	$$(call RUN,-c -n 8 -s 1G -m 2G -v $(MIMSI_ENV),"set -o pipefail && \
14 | 							 mkdir -p mimsi/$1_$2/ && \
15 | 							 analyze \
16 | 							 --tumor-bam $$(<) \
17 | 							 --normal-bam $$(<<) \
18 | 							 --case-id $1 \
19 | 							 --norm-case-id $2 \
20 | 							 --microsatellites-list $$(MICROSATELLITES_LIST) \
21 | 							 --save-location mimsi/$1_$2/ \
22 | 							 --model $$(MODEL) \
23 | 							 --save && \
24 | 							 mv mimsi/$1_$2/BATCH_results.txt $$(@)")
25 | 
26 | endef
27 | $(foreach pair,$(SAMPLE_PAIRS),\
28 | 	$(eval $(call mimsi-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
29 | 	
30 | mimsi/summary.txt : $(foreach pair,$(SAMPLE_PAIRS),mimsi/$(pair)/$(pair).txt)
31 | 	$(call RUN, -c -n 1 -s 8G -m 12G -v $(INNOVATION_ENV),"set -o pipefail && \
32 | 							       $(RSCRIPT) $(SCRIPTS_DIR)/mimsi.R --option 1 --sample_names '$(SAMPLE_PAIRS)'")
33 | 
34 | 
35 | .SECONDARY:
36 | .DELETE_ON_ERROR:
37 | .PHONY: mimsi
38 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/msisensor.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/msisensor.$(NOW)
 4 | 
 5 | msisensor: $(foreach pair,$(SAMPLE_PAIRS),msisensor/$(pair).msi) \
 6 | 	   msisensor/msi.tsv
 7 | 
 8 | MICROSATELLITES_LIST = $(HOME)/share/lib/resource_files/MSIsensor/microsatellites.list
 9 | MSI_REGIONS = $(HOME)/share/lib/resource_files/MSIsensor/msiregions.bed
10 | 
11 | define msisensor-tumor-normal
12 | msisensor/$1_$2.msi : bam/$1.bam bam/$2.bam
13 | 	$$(call RUN,-c -n 8 -s 1G -m 2G -v $(MSISENSOR_ENV),"set -o pipefail && \
14 | 							     msisensor msi $$(MSISENSOR_OPTS) \
15 | 							     -d $$(MICROSATELLITES_LIST) \
16 | 							     -e $$(MSI_REGIONS) \
17 | 							     -n $$(<<) \
18 | 							     -t $$(<) \
19 | 							     -b 8 \
20 | 							     -o $$(@)")
21 | endef
22 | $(foreach pair,$(SAMPLE_PAIRS),\
23 | 	$(eval $(call msisensor-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
24 | # Combined table: the first line of the first .msi file with a leading "sample" column, then every file's remaining lines prefixed by the file's basename; the final sed strips the text from the first underscore through the last "msi" on each of those lines
25 | msisensor/msi.tsv : $(foreach pair,$(SAMPLE_PAIRS),msisensor/$(pair).msi)
26 | 	$(INIT) (head -1 $< | sed 's/^/sample\t/'; for x in $^; do sed "1d; s/^/$$(basename $$x)\t/" $$x; done | sed 's/_.*msi//' ) > $@
27 | 
28 | .SECONDARY:
29 | .DELETE_ON_ERROR:
30 | .PHONY: msisensor
31 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/platypus.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | include modules/variant_callers/gatk.inc
 3 | 
 4 | .DELETE_ON_ERROR:
 5 | .SECONDARY: 
 6 | 
 7 | PLATYPUS_ENV = $(HOME)/share/usr/anaconda-envs/platypus-0.8.1
 8 | # .PHONY expands its prerequisite list as soon as the line is read, so it has to come after the PHONY append
 9 | PHONY += platypus_indels
10 | .PHONY : $(PHONY)
11 | platypus_indels: $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).platypus_indels.vcf)
12 | 
13 | define platypus-tumor-normal-chr
14 | platypus/chr_vcf/$1_$2.$3.platypus.vcf : bam/$1.bam bam/$2.bam
15 | 	$$(call RUN,-v $$(PLATYPUS_ENV) -n 4 -s 2G -m 3G,"platypus callVariants --regions=$3 \
16 | 		--bamFiles=$$(<)$$(,)$$(<<) --nCPU 4 --refFile=$$(REF_FASTA) --output=$$@ --logFileName platypus/$1_$2.$3.log")
17 | endef
18 | $(foreach chr,$(CHROMOSOMES),\
19 | 	$(foreach pair,$(SAMPLE_PAIRS),\
20 | 	$(eval $(call platypus-tumor-normal-chr,$(tumor.$(pair)),$(normal.$(pair)),$(chr)))))
21 | 
22 | INDEL_FILTER_VCF = python modules/vcf_tools/indel_filter_vcf.py
23 | SNP_FILTER_VCF = python modules/vcf_tools/snp_filter_vcf.py
24 | PLATYPUS_SOURCE_ANN_VCF = python modules/vcf_tools/annotate_source_vcf.py --source platypus
25 | 
26 | vcf/%.platypus_indels.vcf : $(foreach chr,$(CHROMOSOMES),platypus/chr_vcf/%.$(chr).platypus.vcf)
27 | 	$(call RUN,-c -s 4G -m 8G,"(grep '^#' $<; cat $^ | grep -v '^#' | \
28 | 		$(VCF_SORT) $(REF_DICT) - ) | $(INDEL_FILTER_VCF) | $(PLATYPUS_SOURCE_ANN_VCF) > $@.tmp && \
29 | 		$(call VERIFY_VCF,$@.tmp,$@)")
30 | 
31 | vcf/%.platypus_snps.vcf : $(foreach chr,$(CHROMOSOMES),platypus/chr_vcf/%.$(chr).platypus.vcf)
32 | 	$(call RUN,-c -s 4G -m 8G,"(grep '^#' $<; cat $^ | grep -v '^#' | \
33 | 		$(VCF_SORT) $(REF_DICT) - ) | $(SNP_FILTER_VCF) | $(PLATYPUS_SOURCE_ANN_VCF) > $@.tmp && \
34 | 		$(call VERIFY_VCF,$@.tmp,$@)")
35 | 
36 | include modules/vcf_tools/vcftools.mk
37 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/somaticVariants.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | LOGDIR = log/somatic_variants.$(NOW)
 3 | 
 4 | SNV_TYPE ?= mutect
 5 | INDEL_TYPE ?= somatic_indels
 6 | VARIANT_TYPES ?= $(SNV_TYPE) $(INDEL_TYPE)
 7 | 
 8 | PHONY += all_somatic
 9 | all_somatic: somatic_vcfs somatic_tables facets
10 | 
11 | CONCAT_VCF = python modules/vcf_tools/concat_vcf.py
12 | # combine the SNV_TYPE and INDEL_TYPE vcfs of a pair with concat_vcf.py and pipe the result through VCF_SORT with the reference dictionary
13 | vcf/%.somatic_variants.vcf : vcf/%.$(SNV_TYPE).vcf vcf/%.$(INDEL_TYPE).vcf
14 | 	$(call RUN,-s 9G -m 12G,"$(CONCAT_VCF) $^ | $(VCF_SORT) $(REF_DICT) - > $@")
15 | 
16 | include modules/variant_callers/somatic/mutect.mk
17 | include modules/variant_callers/somatic/somaticIndels.mk
18 | include modules/copy_number/facets.mk
19 | include modules/vcf_tools/annotateSomaticVcf.mk
20 | 
21 | .DELETE_ON_ERROR:
22 | .SECONDARY:
23 | .PHONY: $(PHONY) 
24 | 
25 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/strelkaVarscanIndels.mk:
--------------------------------------------------------------------------------
 1 | # Run VarScan and strelka on tumour-normal matched pairs for indels
 2 | #
 3 | include modules/Makefile.inc
 4 | LOGDIR = log/strelkaVarscan.$(NOW)
 5 | 
 6 | 
 7 | .PHONY : strelka_varscan_merge_vcfs strelka_varscan_merge
 8 | strelka_varscan_merge : strelka_varscan_merge_vcfs 
 9 | strelka_varscan_merge_vcfs : $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).strelka_varscan_indels.vcf)
10 | #strelka_varscan_merge_mafs : $(foreach pair,$(SAMPLE_PAIRS),maf/$(pair).strelka_varscan_indels.vcf)
11 | # header lines of the varscan vcf, followed by the varscan records that bedtools intersect finds in the strelka vcf after it has gone through PASS_FILTER_VCF; uniq collapses adjacent duplicate lines
12 | vcf/%.strelka_varscan_indels.vcf : vcf/%.varscan_indels.vcf vcf/%.strelka_indels.vcf
13 | 	$(call RUN,-s 9G -m 12G,"(grep -P '^#' $<; $(BEDTOOLS) intersect -a $< -b <($(PASS_FILTER_VCF) $(<<))) | uniq > $@")
14 | 
15 | 
16 | include modules/variant_callers/somatic/strelka.mk
17 | include modules/variant_callers/somatic/varscanTN.mk
18 | 


--------------------------------------------------------------------------------
/variant_callers/somatic/tvcTN.mk:
--------------------------------------------------------------------------------
 1 | # use torrent variant caller on ion torrent bams
 2 | 
 3 | LOGDIR ?= log/tvcTN.$(NOW)
 4 | 
 5 | include modules/Makefile.inc
 6 | include modules/variant_callers/gatk.inc
 7 | 
 8 | VPATH ?= bam
 9 | 
10 | TVC_OPTS ?= -r $(REF_FASTA) $(if $(TARGETS_FILE),-t $(TARGETS_FILE)) 
11 | 
12 | .DELETE_ON_ERROR:
13 | .SECONDARY: 
14 | 
15 | PHONY += tvc tvc_vcfs tvc_tables
16 | 
17 | tvc : tvc_vcfs tvc_tables
18 | 
19 | tvc_vcfs : $(call SOMATIC_VCFS,tvc_snps_indels)
20 | tvc_tables : $(call SOMATIC_TABLES,tvc_snps_indels)
21 | # build one ##contig header line per entry of the reference fasta index and hand them to bcftools annotate as extra header lines
22 | %.contig.vcf : %.vcf
23 | 	$(INIT) awk '{ print "##contig=<ID=" $$1 ",length=" $$2 ",assembly=$(REF)>" }' $(REF_FASTA).fai | $(BCFTOOLS2) annotate -h - $< > $@
24 | 
25 | %.vcf.gz : %.vcf
26 | 	$(call RUN,,"bgzip -c $< > $@")
27 | 
28 | define tvc-tumor
29 | vcf/$1.tvc_snps_indels.vcf : bam/$1.bam bam/$1.bam.bai
30 | 	$$(call RUN,-n 4 -s 1G -m 2G,"$$(TVC) $$(TVC_OPTS) -b $$< -o $$@ -n 4")
31 | endef
32 | $(foreach tumor,$(TUMOR_SAMPLES),$(eval $(call tvc-tumor,$(tumor))))
33 | 
34 | define tvc-normal
35 | vcf/$1.tumor_tvc_snps_indels.vcf : $$(foreach tumor,$2,vcf/$$(tumor).tvc_snps_indels.vcf.gz vcf/$$(tumor).tvc_snps_indels.vcf.gz.tbi)
36 | 	$$(INIT) $$(BCFTOOLS2) merge $$(filter %.vcf.gz,$$^) > $$@
37 | 
38 | vcf/$1.tvc_snps_indels.vcf : bam/$1.bam vcf/$1.tumor_tvc_snps_indels.vcf bam/$1.bam.bai
39 | 	$$(call RUN,-n 4 -s 1G -m 2G,"$$(TVC) $$(TVC_OPTS) -c $$(<<) -b $$< -o $$@ -n 4")
40 | endef
41 | $(foreach normal,$(NORMAL_SAMPLES),$(eval $(call tvc-normal,$(normal),$(tumor.$(normal)))))
42 | 
43 | define tvc-tumor-normal
44 | vcf/$1_$2.tvc_snps_indels.vcf : vcf/$1.tvc_snps_indels.vcf.gz vcf/$2.tvc_snps_indels.vcf.gz vcf/$1.tvc_snps_indels.vcf.gz.tbi vcf/$2.tvc_snps_indels.vcf.gz.tbi
45 | 	$$(INIT) $$(BCFTOOLS2) merge $$(filter %.vcf.gz,$$^) > $$@
46 | endef
47 | $(foreach pair,$(SAMPLE_PAIRS),$(eval $(call tvc-tumor-normal,$(tumor.$(pair)),$(normal.$(pair)))))
48 | 
49 | .PHONY: $(PHONY)
50 | 
51 | include modules/vcf_tools/vcftools.mk
52 | 


--------------------------------------------------------------------------------
/variant_callers/sufamsampleset.mk:
--------------------------------------------------------------------------------
 1 | LOGDIR = log/sufam_ss.$(NOW)
 2 | 
 3 | include modules/Makefile.inc
 4 | 
 5 | SUFAM_ENV = $(HOME)/share/usr/anaconda-envs/sufam-dev
 6 | SUFAM_OPTS = --format vcf --mpileup-parameters='-A -q 15 -Q 15 -d 15000'
 7 | SOMATIC_VCF2TSV = python modules/vcf_tools/somatic_vcf2tsv.py
 8 | 
 9 | ANNOTATE_SUFAM_GT_VCF = python modules/vcf_tools/annotate_sufam_gt_vcf.py
10 | # sufam_sampleset is a goal name rather than a file, so it is declared phony
11 | .DELETE_ON_ERROR:
12 | .SECONDARY:
13 | .PHONY: sufam_sampleset
14 | 
15 | sufam_sampleset: $(foreach set,$(SAMPLE_SETS),vcf_ann/$(set).sufam.vcf) tsv/sufam_variants.tsv
16 | 
17 | define sufam-set
18 | sufam/$1.set.vcf : $$(foreach pair,$$(pairs.$1),vcf_ann/$$(pair).somatic_variants.vcf.gz vcf_ann/$$(pair).somatic_variants.vcf.gz.tbi)
19 | 	$$(call RUN,-s 6G -m 8G,"bcftools merge -O v --force-samples $$(filter %.vcf.gz,$$^) > $$@")
20 | 
21 | vcf/$1.sufam.vcf : sufam/$1.set.vcf $$(foreach sample,$$(set.$1),bam/$$(sample).bam)
22 | 	$$(call RUN,-v $$(SUFAM_ENV) -s 2G -m 3G,"sufam --sample_name $$(set.$1) $$(SUFAM_OPTS) $$(REF_FASTA) $$^ > $$@")
23 | 
24 | vcf_ann/$1.sufam.vcf : vcf/$1.sufam.vcf $$(foreach pair,$$(pairs.$1),vcf_ann/$$(pair).somatic_variants.vcf.gz)
25 | 	$$(call RUN,-s 2G -m 3G,"$$(ANNOTATE_SUFAM_GT_VCF) $$^ > $$@")
26 | 
27 | tsv/$1.sufam.tsv : vcf_ann/$1.sufam.vcf.gz
28 | 	$$(call RUN,-s 4G -m 6G,"$$(SOMATIC_VCF2TSV) --normal $$(normal.$1) $$< > $$@")
29 | 
30 | endef
31 | $(foreach set,$(SAMPLE_SETS),$(eval $(call sufam-set,$(set))))
32 | 
33 | tsv/sufam_variants.tsv : $(foreach set,$(SAMPLE_SETS),tsv/$(set).sufam.tsv)
34 | 	$(call RUN,-s 4G -m 6G,"(sed -n 1p $<; for x in $^; do sed 1d \$$x; done) > $@")
35 | 
36 | 
37 | include modules/vcf_tools/vcftools.mk
38 | 


--------------------------------------------------------------------------------
/variant_callers/tvc.mk:
--------------------------------------------------------------------------------
 1 | # use torrent variant caller on ion torrent bams
 2 | 
 3 | LOGDIR ?= log/tvc.$(NOW)
 4 | 
 5 | include modules/Makefile.inc
 6 | 
 7 | VPATH ?= bam
 8 | 
 9 | TVC_OPTS ?= -r $(REF_FASTA) $(if $(TARGETS_FILE),-t $(TARGETS_FILE)) 
10 | 
11 | .DELETE_ON_ERROR:
12 | .SECONDARY: 
13 | 
14 | PHONY += tvc tvc_vcfs
15 | 
16 | tvc : tvc_vcfs
17 | 
18 | tvc_vcfs : $(foreach sample,$(SAMPLES),vcf/$(sample).tvc_snps_indels.vcf)
19 | # run TVC on one sample's bam; the bam index is a prerequisite but is not passed on the command line
20 | vcf/%.tvc_snps_indels.vcf : bam/%.bam bam/%.bam.bai
21 | 	$(call RUN,-n 4 -s 1G -m 2G,"$(TVC) $(TVC_OPTS) -n 4 -o $(@) -b $<")
22 | 
23 | .PHONY: $(PHONY)
24 | 
25 | 


--------------------------------------------------------------------------------
/vcf_tools/annotateExtVcf.mk:
--------------------------------------------------------------------------------
 1 | # annotate external vcfs
 2 | include modules/Makefile.inc
 3 | 
 4 | LOGDIR ?= log/ext_vcf.$(NOW)
 5 | 
 6 | EXT_NAME ?= ext
 7 | # top-level goals: the annotated vcf of every sample pair plus the combined table
 8 | ext_ann : ext_vcfs ext_tables
 9 | 
10 | ext_vcfs : $(foreach pair,$(SAMPLE_PAIRS),vcf_ann/$(pair).$(EXT_NAME).vcf)
11 | ext_tables : alltables/allTN.$(EXT_NAME).tab.txt
12 | # these goals are names, not files; they join the PHONY list that this file passes to .PHONY at its end
13 | PHONY += ext_ann ext_vcfs ext_tables
14 | 
15 | ANN_MUT_TASTE ?= false
16 | ANN_PROVEAN ?= false
17 | SOMATIC_ANN1 = fathmm chasm dbsnp hotspot_ann eff exac_nontcga cosmic clinvar cn_reg gene_ann nsfp $(ANNOVAR_REF)_multianno \
18 | 			   $(if $(findstring true,$(ANN_MUT_TASTE)),mut_taste) $(if $(findstring true,$(ANN_PROVEAN)),provean)
19 | SOMATIC_ANN2 = $(if $(findstring true,$(ANN_PATHOGEN)),snp_pathogen indel_pathogen)
20 | 
21 | # target filter
22 | 
23 | PHONY += all vcfs
24 | all : vcfs
25 | vcfs : $(foreach type,$(VARIANT_TYPES),$(foreach sample,$(SAMPLES),vcf_ann/$(sample).$(type).vcf))
26 | 
27 | MERGE_VCF = $(PYTHON) modules/vcf_tools/merge_vcf.py
28 | MERGE_SCRIPT = $(call RUN,-c -s 6G -m 7G,"$(MERGE_VCF) --ignore_filter $^ | $(VCF_SORT) $(REF_DICT) - > $@")
29 | 
30 | # first filter round
31 | # first annotation round
32 | vcf/%.$(EXT_NAME).ann.vcf : $(foreach ann,$(SOMATIC_ANN1),vcf/%.$(EXT_NAME).$(ann).vcf)
33 | 	$(MERGE_SCRIPT)
34 | # second annotation round
35 | vcf_ann/%.$(EXT_NAME).vcf : $(if $(strip $(SOMATIC_ANN2)),$(foreach ann,$(SOMATIC_ANN2),vcf/%.$(EXT_NAME).ann.$(ann).vcf),vcf/%.$(EXT_NAME).ann.vcf)
36 | 	$(MERGE_SCRIPT)
37 | 
38 | 
39 | .DELETE_ON_ERROR:
40 | .SECONDARY:
41 | .PHONY: $(PHONY) 
42 | 
43 | include modules/vcf_tools/vcftools.mk
44 | include modules/variant_callers/gatk.inc
45 | 


--------------------------------------------------------------------------------
/vcf_tools/annotateSummaryVcf.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | include modules/genome_inc/b37.inc
 3 | 
 4 | LOGDIR ?= log/annotate_smry_maf.$(NOW)
 5 | 
 6 | annotate_smry_maf : vcf2maf/mutation_summary.vcf \
 7 | 		    vcf2maf/mutation_summary.maf \
 8 | 		    vcf2maf/mutation_summary.txt
 9 | 		   
10 | vcf2maf/mutation_summary.vcf : summary/tsv/mutation_summary.tsv
11 | 	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
12 | 					  $(RSCRIPT) $(SCRIPTS_DIR)/annotateSummaryVcf.R --option 1 --input $(<) --output $(@)")
13 | 							
14 | vcf2maf/mutation_summary.maf : vcf2maf/mutation_summary.vcf
15 | 	$(call RUN, -c -n 12 -s 2G -m 3G -v $(VEP_ENV) -w 72:00:00,"set -o pipefail && \
16 | 									$(VCF2MAF) \
17 | 									--input-vcf $(<) \
18 | 									--output-maf $(@) \
19 | 									--tmp-dir $(TMPDIR) \
20 | 									--tumor-id NA \
21 | 									--normal-id NA \
22 | 									--vep-path $(VEP_ENV)/bin \
23 | 									--vep-data $(HOME)/share/reference/vep/v86/ \
24 | 									--vep-forks 12 \
25 | 									--ref-fasta $(HOME)/share/reference/vep/v86/homo_sapiens/86_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa \
26 | 									--filter-vcf $(HOME)/share/reference/vep/v86/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz \
27 | 									--species homo_sapiens \
28 | 									--ncbi-build GRCh37 \
29 | 									--maf-center MSKCC && \
30 | 									$(RM) $(TMPDIR)/mutation_summary.vep.vcf")
31 | 							
32 | vcf2maf/mutation_summary.txt : summary/tsv/mutation_summary.tsv vcf2maf/mutation_summary.maf
33 | 	$(call RUN, -c -n 1 -s 8G -m 12G,"set -o pipefail && \
34 | 					  $(RSCRIPT) $(SCRIPTS_DIR)/annotateSummaryVcf.R --option 2 --input $(<) --maf $(<<) --output $(@)")
35 | 							  
36 | ..DUMMY := $(shell mkdir -p version; \
37 | 	     source $(VCF2MAF_ENV)/bin/activate $(VCF2MAF_ENV) && $(VCF2MAF) --man >> version/annotate_smry_maf.txt)
38 | .DELETE_ON_ERROR:
39 | .SECONDARY: 
40 | .PHONY: annotate_smry_maf
41 | 


--------------------------------------------------------------------------------
/vcf_tools/annotate_source_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ add variant source (name of caller) to a vcf
 3 | """
 4 | 
 5 | import argparse
 6 | import vcf
 7 | import sys
 8 | 
 9 | if __name__ == "__main__":
10 |     parser = argparse.ArgumentParser(prog='annotate_source_vcf.py',
11 |                                      description='source annotation to add to each variant INFO')
12 |     parser.add_argument('--source', required=True, help='name of source')
13 |     parser.add_argument('--vcf_infile', required=False, type=argparse.FileType('r'), default=sys.stdin)
14 |     # the vcf is read from stdin unless --vcf_infile is given
15 |     args = parser.parse_args()
16 | 
17 |     vcf_reader = vcf.Reader(args.vcf_infile)
18 |     # declare the variantCaller INFO field on the reader so it is part of the header the writer copies
19 |     vcf_reader.infos['variantCaller'] = vcf.parser._Info(id='variantCaller', num='.', type='String',
20 |                                                          desc="variant caller(s) used to find the variant",
21 |                                                          source=None, version=None)
22 |     # for the 'lancet' source, LEN and TYPE INFO declarations are added to the header as well
23 |     if args.source == 'lancet':
24 |         vcf_reader.infos['LEN'] = vcf.parser._Info(id='LEN', num='1', type='Integer',
25 |                                                    desc="length of insertion/deletion",
26 |                                                    source=None, version=None)
27 |         vcf_reader.infos['TYPE'] = vcf.parser._Info(id='TYPE', num='1', type='String',
28 |                                                     desc="insertion or deletion",
29 |                                                     source=None, version=None)
30 |     # output goes to stdout, using the amended reader as the header template
31 |     vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
32 |     # set variantCaller to the given source name on every record, then write the record
33 |     for record in vcf_reader:
34 |         record.INFO['variantCaller'] = [args.source]
35 |         vcf_writer.write_record(record)
36 |     vcf_writer.close()
37 | 


--------------------------------------------------------------------------------
/vcf_tools/annotate_sufam_gt_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """
 3 | Annotate genotype to sufam vcf file
 4 | """
 5 | 
 6 | import argparse
 7 | import vcf
 8 | import collections
 9 | import sys
10 | 
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('sufam_vcf_file', help='multi-sample sufam file')
    parser.add_argument('vcf_files', nargs='+', help='sample pair vcf files')
    args = parser.parse_args()

    def variant_key(rec):
        """Key used to match a record between the pair VCFs and the sufam VCF."""
        return "{}:{}:{}/{}".format(rec.CHROM, rec.POS, rec.REF, rec.ALT)

    # sample name -> set of variant keys called in that sample's VCF.
    # Each input is opened in a `with` block so the handle is released as soon
    # as the file has been read (previously every handle stayed open).
    sample_variants = collections.defaultdict(set)
    for f in args.vcf_files:
        with open(f, 'r') as handle:
            for record in vcf.Reader(handle):
                # the first sample column names the sample the call belongs to
                s = record.samples[0].sample
                sample_variants[s].add(variant_key(record))

    with open(args.sufam_vcf_file, 'r') as sufam_handle:
        sufam_vcf_reader = vcf.Reader(sufam_handle)
        sufam_vcf_reader.infos['samples_called_in'] = vcf.parser._Info(id='samples_called_in', num='.',
                                                                       type='String',
                                                                       desc='samples called in',
                                                                       source=None,
                                                                       version=None)
        vcf_writer = vcf.Writer(sys.stdout, sufam_vcf_reader)
        for record in sufam_vcf_reader:
            recid = variant_key(record)
            # list every sample whose own VCF contains this exact variant
            for s, v in sample_variants.items():
                if recid in v:
                    if 'samples_called_in' not in record.INFO:
                        record.INFO['samples_called_in'] = []
                    record.INFO['samples_called_in'].append(s)
            vcf_writer.write_record(record)
        vcf_writer.close()
42 | 


--------------------------------------------------------------------------------
/vcf_tools/annotate_sv.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
 3 | LOGDIR ?= log/anotate_sv.$(NOW)
 4 | 
 5 | SV_CALLERS = svaba manta gridss merged
 6 | ANNOTATE_SV ?= $(HOME)/share/usr/env/annot_sv-3.1.3/opt/AnnotSV/bin/AnnotSV
 7 | 
# Module goal: for every sample pair and every SV caller, build the AnnotSV
# table (.tsv) and the vcf2maf output (.maf) under annotate_sv/<pair>/.
annotate_sv :  $(foreach pair,$(SAMPLE_PAIRS), \
			$(foreach caller,$(SV_CALLERS),annotate_sv/$(pair)/$(pair).$(caller)_sv.tsv)) \
	       $(foreach pair,$(SAMPLE_PAIRS), \
			$(foreach caller,$(SV_CALLERS),annotate_sv/$(pair)/$(pair).$(caller)_sv.maf))
12 | 			
# annotate-sv template: $1 = sample pair, $2 = SV caller.
#   1. run AnnotSV (genome build GRCh37) on vcf/$1.$2_sv.vcf into annotate_sv/$1/$2/
#   2. copy that table to annotate_sv/$1/$1.$2_sv.tsv
#   3. run vcf2maf (with VEP) on the same VCF to produce annotate_sv/$1/$1.$2_sv.maf
# Single-$ references (ANNOTATE_SV_ENV, VEP_ENV) are expanded when the template is
# called; $$-references are expanded when the recipe runs.
# NOTE(review): --tumor-id receives the pair name ($1), not a tumor sample name -- confirm intended.
define annotate-sv
annotate_sv/$1/$2/$1.$2_sv.tsv : vcf/$1.$2_sv.vcf
	$$(call RUN,-c -n 1 -s 4G -m 8G -v $(ANNOTATE_SV_ENV),"set -o pipefail && \
							       mkdir -p annotate_sv/$1/$2 && \
							       $$(ANNOTATE_SV) \
							       -SVinputFile $$(<) \
							       -outputFile ./annotate_sv/$1/$2/$1.$2_sv.tsv \
							       -genomeBuild GRCh37")
							       
annotate_sv/$1/$1.$2_sv.tsv : annotate_sv/$1/$2/$1.$2_sv.tsv
	$$(INIT) cat $$(<) > $$(@)
	
annotate_sv/$1/$1.$2_sv.maf : vcf/$1.$2_sv.vcf
	$$(call RUN,-c -n 12 -s 1G -m 2G -v $(VEP_ENV),"set -o pipefail && \
							$$(VCF2MAF) \
							--input-vcf $$(<) \
							--tumor-id $1 \
							--filter-vcf $$(EXAC_NONTCGA) \
							--ref-fasta $$(REF_FASTA) \
							--vep-path $$(VEP_PATH) \
							--vep-data $$(VEP_DATA) \
							--tmp-dir `mktemp -d` \
							--output-maf $$(@)")

endef
# Instantiate the template once per (pair, caller) combination.
$(foreach pair,$(SAMPLE_PAIRS),\
	$(foreach caller,$(SV_CALLERS), \
		$(eval $(call annotate-sv,$(pair),$(caller)))))
41 | 		
42 | .DELETE_ON_ERROR:
43 | .SECONDARY:
44 | .PHONY: annotate_sv
45 | 


--------------------------------------------------------------------------------
/vcf_tools/bed_annotate_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ annotate off-target and filter off-target with low depth
 3 | """
 4 | 
 5 | import argparse
 6 | import vcf
 7 | import sys
 8 | import pandas as pd
 9 | import numpy as np
10 | import intervaltree
11 | import re
12 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser(prog='bed_annotate_vcf.py',
                                     description='annotate vcf file using a bed file')
    parser.add_argument('--info_tag', help='info tag for annotation')
    parser.add_argument('interval_file')
    parser.add_argument('vcf_infile')
    args = parser.parse_args()

    # Headerless, tab-separated interval table: chromosome, start, end.
    intervals = pd.read_table(args.interval_file, header=None, dtype={0: str, 1: np.int32, 2: np.int32})
    intervals = intervals.rename(columns={0: 'chr', 1: 'start', 2: 'end'})

    # One interval tree per chromosome, keyed by the name with 'chr' removed so
    # that "chr1" and "1" refer to the same tree.
    trees = {}
    for chrom, interval in intervals.groupby('chr'):
        chrom = re.sub(r'chr', '', chrom)
        trees[chrom] = intervaltree.IntervalTree.from_tuples(list(zip(interval.start, interval.end)))

    vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
    # Declare the flag in the header so the writer emits a matching ##INFO line.
    vcf_reader.infos[args.info_tag] = vcf.parser._Info(id=args.info_tag, num='0', type='Flag',
                                                       desc='{}: overlap'.format(args.info_tag),
                                                       source=None,
                                                       version=None)
    vcf_writer = vcf.Writer(sys.stdout, vcf_reader)

    for record in vcf_reader:
        chrom = re.sub(r'chr', '', record.CHROM)
        if record.FILTER is None:
            record.FILTER = []
        if chrom in trees:
            query = trees[chrom].search(record.POS)
            # Multi-base REF (e.g. a deletion): also probe the far end of the
            # allele. This used `&`, which binds tighter than `==` and `>`, so
            # the test read `len(query) == 0 > 1` and was always False.
            if len(query) == 0 and len(record.REF) > 1:
                query = trees[chrom].search(record.POS + len(record.REF))
            if len(query) != 0:
                record.INFO[args.info_tag] = True
        vcf_writer.write_record(record)

    vcf_writer.close()
48 | 


--------------------------------------------------------------------------------
/vcf_tools/combine_vcf.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | 
 5 | if (!interactive()) {
 6 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
 7 | }
 8 | 
 9 | args_list = list(make_option("--sample_name", default = NA, type = 'character', help = "sample name"))
10 | 				  
11 | parser = OptionParser(usage = "%prog", option_list = args_list)
12 | arguments = parse_args(parser, positional_arguments = T)
13 | opt = arguments$options
14 | 
# Read one annotated GATK call set for this sample; '#' header lines are skipped.
read_calls = function(suffix) {
	read.csv(file=paste0("vcf_ann/", opt$sample_name, suffix), header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE)
}
calls = rbind(read_calls(".gatk_snps.vcf"), read_calls(".gatk_indels.vcf"))

# Numeric chromosome rank: X -> 23, Y -> 24; any other non-numeric name becomes NA and is dropped.
chrom_rank = as.character(calls[,1])
chrom_rank[chrom_rank=="X"] = 23
chrom_rank[chrom_rank=="Y"] = 24
chrom_rank = as.numeric(chrom_rank)
keep = !is.na(chrom_rank)
calls = calls[keep,,drop=FALSE]
chrom_rank = chrom_rank[keep]

# Sort by chromosome, then position; order() keeps ties in input order, so this
# matches sorting by position first and by chromosome second.
calls = calls[order(chrom_rank, as.numeric(calls[,2]), decreasing=FALSE),,drop=FALSE]

# Keep the seven fixed VCF columns and blank out INFO.
calls = calls[,1:7,drop=FALSE]
colnames(calls) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER")
calls = cbind(calls, "INFO"=rep(".", nrow(calls)))

# Drop records with a comma in REF or ALT, then keep the first record per chrom:pos.
has_comma = grepl(",", calls[,"REF"]) | grepl(",", calls[,"ALT"])
calls = calls[!has_comma,,drop=FALSE]
repeated_site = duplicated(paste0(calls[,1], ":", calls[,2]))
calls = calls[!repeated_site,,drop=FALSE]

# Write a minimal VCF: the fileformat line, then the column header and records.
out_file = paste0("cravat/", opt$sample_name, ".vcf")
cat("##fileformat=VCFv4.1\n", file=out_file, append=FALSE)
write.table(calls, file=out_file, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=TRUE)
40 | 


--------------------------------------------------------------------------------
/vcf_tools/common_filter_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # retain only novel variants
 3 | 
 4 | import argparse
 5 | import vcf
 6 | import re
 7 | import sys
 8 | 
 9 | parser = argparse.ArgumentParser(prog='common_filter_vcf.py',
10 |                                  description='filter vcf file for novel variants (no rs-id/GMAF < 0.01 or cosmic id)')
11 | parser.add_argument('vcf_infile')
12 | 
13 | args = parser.parse_args()
14 | 
15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
16 | 
17 | vcf_reader.filters['Common'] = vcf.parser._Filter(id='Common',
18 |                                                   desc='no cosmic id, or has dbsnp ID and GMAF is > 0.01')
19 | 
20 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
21 | 
22 | for record in vcf_reader:
23 |     if record.ID is not None:
24 |         # ignore entries with cosmic IDs
25 |         cosm_match = re.search(r'COSM', record.ID)
26 |         if cosm_match is None:
27 |             # filter entries with dbsnp IDs unless GMAF > 0.01
28 |             rs_match = re.search(r'rs', record.ID)
29 |             if rs_match is not None and ('GMAF' not in record.INFO or record.INFO['GMAF'] > 0.01):
30 |                 record.FILTER.append('Common')
31 |     vcf_writer.write_record(record)
32 | 
33 | vcf_writer.close()
34 | 


--------------------------------------------------------------------------------
/vcf_tools/concat_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import vcf
 5 | import sys
 6 | 
 7 | parser = argparse.ArgumentParser(prog='merge_vcf.py',
 8 |                                  description='merge vcf files')
 9 | parser.add_argument('vcf_files', nargs='+', help='vcf files to merge')
10 | parser.add_argument('--out_file', nargs='?', help='merged vcf output file', default=sys.stdout,
11 |                     type=argparse.FileType('w'))
12 | 
13 | args = parser.parse_args()
14 | 
15 | vcf_readers = [vcf.Reader(open(f, 'r')) for f in args.vcf_files]
16 | vcf_reader = vcf_readers[0]
17 | # merge header
18 | if len(vcf_readers) > 1:
19 |     for vcf_reader2 in vcf_readers[1:]:
20 |         for form in vcf_reader2.formats:
21 |             if form not in vcf_reader.formats:
22 |                 vcf_reader.formats[form] = vcf_reader2.formats[form]
23 |         for inf in vcf_reader2.infos:
24 |             if inf not in vcf_reader.infos:
25 |                 vcf_reader.infos[inf] = vcf_reader2.infos[inf]
26 |         for filt in vcf_reader2.filters:
27 |             if filt not in vcf_reader.infos:
28 |                 vcf_reader.filters[filt] = vcf_reader2.filters[filt]
29 | 
30 | vcf_writer = vcf.Writer(args.out_file, vcf_reader)
31 | 
32 | cpra = set()
33 | for vcf_reader in vcf_readers:
34 |     for record in vcf_reader:
35 |         chr_pos_ref_alt = ":".join([record.CHROM, str(record.POS), record.REF, str(record.ALT)])
36 |         if chr_pos_ref_alt not in cpra:
37 |             vcf_writer.write_record(record)
38 |             cpra.add(chr_pos_ref_alt)
39 | 
40 | vcf_writer.close()
41 | 


--------------------------------------------------------------------------------
/vcf_tools/filter_vcf.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | 
 5 | if (!interactive()) {
 6 |     options(warn = -1, error = quote({ traceback(); q('no', status = 1) }))
 7 | }
 8 | 
 9 | args_list = list(make_option("--sample_name", default = NA, type = 'character', help = "sample name"))
10 | 				  
11 | parser = OptionParser(usage = "%prog", option_list = args_list)
12 | arguments = parse_args(parser, positional_arguments = T)
13 | opt = arguments$options
14 | 
# Input and output paths for this sample, all under cravat/.
vcf_path = paste0("cravat/", opt$sample_name, ".vcf")
maf_path = paste0("cravat/", opt$sample_name, ".maf")
out_path = paste0("cravat/", opt$sample_name, ".cravat.vcf")

# Variant classes kept for the output VCF.
kept_classes = c("Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Splice_Site")

variants = read.csv(file=vcf_path, header=FALSE, sep="\t", comment.char="#", stringsAsFactors=FALSE)
annotation = read.csv(file=maf_path, header=TRUE, sep="\t", comment.char="#", stringsAsFactors=FALSE)

# VCF rows are selected by MAF row position, i.e. both tables are taken to be row-aligned.
is_kept = annotation[,"Variant_Classification"] %in% kept_classes
variants = variants[is_kept,,drop=FALSE]

# Prefix chromosome names with "chr" and restore the VCF column header.
variants[,1] = paste0("chr", variants[,1])
colnames(variants) = c("#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO")

cat("##fileformat=VCFv4.1\n", file=out_path, append=FALSE)
write.table(variants, file=out_path, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE, append=TRUE)
23 | 


--------------------------------------------------------------------------------
/vcf_tools/gemini.mk:
--------------------------------------------------------------------------------
 1 | 
 2 | # vim: set ft=make :
 3 | # sub module containing vcf related tools
 4 | 
 5 | include modules/Makefile.inc
 6 | 
 7 | LOGDIR = log/gemini.$(NOW)
 8 | GEMINI = unset PYTHONPATH; $(HOME)/share/usr/bin/gemini
 9 | GEMINI_LOAD_OPTS = -t snpEff
10 | 
11 | .DELETE_ON_ERROR:
12 | .SECONDARY: 
13 | .PHONY: gemini
14 | 
# Database file that every `gemini load` in this module writes to.
GEMINI_DB = gemini/gemini.db
# Module goal: when SAMPLE_PAIRS is set, request the mutect/strelka/varscan load stamps.
# NOTE(review): only the mutect stamp rule is defined in this file; the strelka and
# varscan stamps need rules from elsewhere -- confirm they exist.
gemini : $(if $(SAMPLE_PAIRS),gemini/mutect_gemini.timestamp gemini/strelka_gemini.timestamp gemini/varscan_gemini.timestamp)
17 | 
# Build a PED file from the sample-set file: one family per input line (family id
# is the line number). Fields are taken from last to first; the last field on a
# line gets phenotype 1, every other field phenotype 2. Parent, sex and ethnicity
# columns are filled with placeholder values (-9 / 0).
gemini/samples.ped : $(SAMPLE_SET_FILE)
	$(INIT) perl -t$$' ' -lane 'BEGIN { print "#Family_ID\tIndividual_ID\tPaternal_ID\tMaternal_ID\tSex\tPhenotype\tEthnicity"; } $$i = 0; while ($$f = pop @F) { print "$$.\t$$f\t-9\t-9\t0\t" . (($$i++ == 0)? "1" : "2") . "\t-9"; }' $< > $@
20 | 
21 | 
# Normalised, bgzipped GATK SNP and indel VCFs for every sample.
GATK_VCFS = $(foreach sample,$(SAMPLES),\
			$(foreach type,gatk_snps gatk_indels,\
			vcf_ann/$(sample).$(type).norm.vcf.gz))
# Load each GATK VCF into the gemini database, then touch the stamp file.
# The PED file is passed only when SAMPLE_PAIRS is set.
# The shell loop variable is written \$$vcf, as in the mutect rule: the command
# sits inside a double-quoted RUN argument, so an unescaped $$vcf would be
# expanded (to nothing) before the loop runs.
gemini/gatk_gemini.timestamp : $(if $(SAMPLE_PAIRS),gemini/samples.ped) $(GATK_VCFS) $(addsuffix .tbi,$(GATK_VCFS))
	$(call RUN,-n 8 -s 1G -m 2G,"for vcf in $(filter %.vcf.gz,$^); do $(GEMINI) load --cores 8 $(GEMINI_LOAD_OPTS) -v \$$vcf $(if $(SAMPLE_PAIRS),-p $(filter %.ped,$^)) $(GEMINI_DB) ; done && touch $@")
27 | 
# Paired analysis only: load the normalised mutect SNP/indel VCFs of every pair
# into the gemini database with the PED file ($<), then touch the stamp file.
ifdef SAMPLE_PAIRS
MUTECT_VCFS = $(foreach pair,$(SAMPLE_PAIRS),vcf_ann/$(pair).mutect_snps.norm.vcf.gz vcf_ann/$(pair).mutect_indels.norm.vcf.gz)
gemini/mutect_gemini.timestamp : gemini/samples.ped $(MUTECT_VCFS) $(addsuffix .tbi,$(MUTECT_VCFS))
	$(call RUN,-n 8 -s 1G -m 2G,"for vcf in $(filter %.vcf.gz,$^); do $(GEMINI) load --cores 8 $(GEMINI_LOAD_OPTS) -v \$$vcf -p $< $(GEMINI_DB); done && touch $@")
endif
33 | 
34 | 
35 | include modules/vcf_tools/vcftools.mk
36 | 


--------------------------------------------------------------------------------
/vcf_tools/indel_filter_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ output indels only
 3 | """
 4 | 
 5 | import vcf
 6 | import sys
 7 | 
 8 | if __name__ == "__main__":
 9 |     vcf_reader = vcf.Reader(sys.stdin)
10 |     vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
11 | 
12 |     for record in vcf_reader:
13 |         if record.is_indel:
14 |             vcf_writer.write_record(record)
15 | 
16 |     vcf_writer.close()
17 | 


--------------------------------------------------------------------------------
/vcf_tools/interval_filter_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import vcf
 5 | import sys
 6 | import pandas as pd
 7 | import numpy as np
 8 | import intervaltree
 9 | import re
10 | 
if __name__ == "__main__":
    cli = argparse.ArgumentParser(prog='interval_filter_vcf.py',
                                  description='filter vcf file according to a bed file')
    cli.add_argument('interval_file')
    cli.add_argument('vcf_infile')
    args = cli.parse_args()

    # Headerless, tab-separated interval table: chromosome, start, end.
    bed = pd.read_table(args.interval_file, header=None, dtype={0: str, 1: np.int32, 2: np.int32})
    bed = bed.rename(columns={0: 'chr', 1: 'start', 2: 'end'})

    # One tree per chromosome (name with 'chr' removed); every interval is
    # padded by 50 on both sides.
    trees = {
        re.sub(r'chr', '', name): intervaltree.IntervalTree.from_tuples(
            list(zip(group.start - 50, group.end + 50)))
        for name, group in bed.groupby('chr')
    }

    vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
    vcf_reader.filters['targetInterval'] = vcf.parser._Filter(id='targetInterval',
                                                              desc='no overlap with intervals')
    vcf_writer = vcf.Writer(sys.stdout, vcf_reader)

    for rec in vcf_reader:
        if rec.FILTER is None:
            rec.FILTER = []
        tree = trees.get(re.sub(r'chr', '', rec.CHROM))
        # Flag the record when its chromosome has no intervals at all or when
        # no padded interval contains its position.
        if tree is None or len(tree.search(rec.POS)) == 0:
            rec.FILTER.append('targetInterval')
        vcf_writer.write_record(rec)

    vcf_writer.close()
43 | 


--------------------------------------------------------------------------------
/vcf_tools/merge_sv.mk:
--------------------------------------------------------------------------------
 1 | include modules/Makefile.inc
 2 | 
# Log directory for this module; can be overridden by the caller.
LOGDIR ?= log/merge_sv.$(NOW)

# Callers whose per-pair VCFs (vcf/<pair>.<caller>_sv.vcf) are merged.
SV_CALLERS = svaba gridss manta
# Positional arguments handed to `SURVIVOR merge`, in the order
# MAX_DIST NUM_CALLERS TYPE STRAND 0 MIN_SIZE.
# NOTE(review): presumably max breakpoint distance, minimum supporting callers,
# use-type flag, use-strand flag and minimum SV size -- confirm against SURVIVOR's usage text.
MAX_DIST = 500
NUM_CALLERS = 2
TYPE = 0
STRAND = 0
MIN_SIZE = 30
11 | 
# Module goal: per sample pair, the SURVIVOR input list, the merged VCF, the
# filtered merged VCF, and its copy under vcf/.
merge_sv :  $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/samples.txt) \
	    $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/$(pair).merged_sv.vcf) \
	    $(foreach pair,$(SAMPLE_PAIRS),merge_sv/$(pair)/$(pair).merged_sv_ft.vcf) \
	    $(foreach pair,$(SAMPLE_PAIRS),vcf/$(pair).merged_sv.vcf)
16 | 	   
# merge-sv template: $1 = tumor sample, $2 = normal sample (pair directory is $1_$2).
#   1. samples.txt lists the per-caller VCFs, one path per line. It is written
#      with a single truncating redirect; the previous `echo ... >>` chain
#      appended to an existing file, so a rebuild duplicated every entry.
#   2. SURVIVOR merge combines the listed call sets.
#   3. filter_sv.R filters the merged VCF; the recipe first copies the '##' header lines into the target.
#   4. the filtered VCF is copied to vcf/$1_$2.merged_sv.vcf.
# Single-$ references are expanded when the template is called; $$-references
# are expanded when the recipe runs.
define merge-sv
merge_sv/$1_$2/samples.txt : $(foreach caller,$(SV_CALLERS),vcf/$1_$2.$(caller)_sv.vcf)
	mkdir -p merge_sv/$1_$2 && \
	printf '%s\n' $$(^) > $$(@)

merge_sv/$1_$2/$1_$2.merged_sv.vcf : merge_sv/$1_$2/samples.txt
	$$(call RUN,-c -n 1 -s 4G -m 8G -v $(SURVIVOR_ENV),"set -o pipefail && \
							    SURVIVOR merge $$(<) \
							    $(MAX_DIST) $(NUM_CALLERS) $(TYPE) $(STRAND) 0 $(MIN_SIZE) $$(@)")

merge_sv/$1_$2/$1_$2.merged_sv_ft.vcf : merge_sv/$1_$2/$1_$2.merged_sv.vcf
	$$(call RUN,-c -n 1 -s 4G -m 8G -v $(INNOVATION_ENV),"set -o pipefail && \
							      grep '##' $$(<) > $$(@) && \
							      $$(RSCRIPT) modules/scripts/filter_sv.R --input_file $$(<) --output_file $$(@)")


vcf/$1_$2.merged_sv.vcf : merge_sv/$1_$2/$1_$2.merged_sv_ft.vcf
	$$(INIT) cat $$(<) > $$(@)
	
endef
# Instantiate the template once per sample pair.
$(foreach pair,$(SAMPLE_PAIRS),\
		$(eval $(call merge-sv,$(tumor.$(pair)),$(normal.$(pair)))))
39 | 	
40 | .DELETE_ON_ERROR:
41 | .SECONDARY:
42 | .PHONY: merge_sv
43 | 


--------------------------------------------------------------------------------
/vcf_tools/merge_uvcf_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ add ups-coordinate to INFO of vcf file
 3 | """
 4 | 
 5 | import argparse
 6 | import vcf
 7 | import pandas as pd
 8 | import sys
 9 | import copy
10 | 
if __name__ == "__main__":
    cli = argparse.ArgumentParser(prog='merge_uvcf_vcf.py',
                                  description='add ups-coordinate to INFO of vcf file')
    cli.add_argument('uvcf_infile')
    cli.add_argument('vcf_infile')
    args = cli.parse_args()

    vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
    # The uvcf table is read with '#' lines treated as comments; its nine
    # columns are then renamed positionally.
    uvcf = pd.read_csv(args.uvcf_infile, comment='#', sep='\t')
    uvcf.columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'UPS-COORDINATE', 'INFO']

    vcf_reader.infos['UPS_Coord'] = vcf.parser._Info(id='UPS_Coord', num='.', type='String',
                                                     desc="UPS-coordinate", source=None, version=None)
    vcf_writer = vcf.Writer(sys.stdout, vcf_reader)

    def allele_key(chrom, pos, ref, alt):
        """Lookup key shared by the uvcf table and the vcf records."""
        return '{}:{}_{}/{}'.format(chrom, pos, ref, alt)

    # key -> UPS coordinate with all spaces removed
    ups_map = {
        allele_key(entry['CHROM'], entry['POS'], entry['REF'], entry['ALT']):
            entry['UPS-COORDINATE'].replace(" ", "")
        for _, entry in uvcf.iterrows()
    }

    # One UPS coordinate per ALT allele; alleles missing from the table get 'N/A[]'.
    for rec in vcf_reader:
        rec.INFO['UPS_Coord'] = [
            ups_map.get(allele_key(rec.CHROM, rec.POS, rec.REF, str(allele)), 'N/A[]')
            for allele in rec.ALT
        ]
        vcf_writer.write_record(rec)
    vcf_writer.close()
43 | 


--------------------------------------------------------------------------------
/vcf_tools/oncokb_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse
 4 | import vcf
 5 | import sys
 6 | import pandas as pd
 7 | import re
 8 | try:
 9 |     from itertools import izip as zip
10 | except ImportError:
11 |     pass
12 | 
if __name__ == "__main__":
    cli = argparse.ArgumentParser(prog='oncokb_vcf.py',
                                  description='Add oncoKB annotation to vcf')
    cli.add_argument('--oncokb', help='oncoKB annotation file')
    cli.add_argument('vcf_infile')

    args = cli.parse_args()

    vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))

    # Annotation table; the Gene, Alteration, Level and 'Cancer Type' columns are used below.
    oncokb = pd.read_table(args.oncokb)

    # Declare both INFO tags so the writer emits matching header lines.
    for tag, text in (('oncoKB_level', "OncoKB level(s)"),
                      ('oncoKB_cancer_type', "OncoKB cancer type")):
        vcf_reader.infos[tag] = vcf.parser._Info(id=tag, num='.', type='String',
                                                 desc=text, source=None, version=None)

    vcf_writer = vcf.Writer(sys.stdout, vcf_reader)

    for rec in vcf_reader:
        info = rec.INFO
        if 'SYMBOL' in info and 'HGVSp_Short' in info:
            genes = info['SYMBOL']
            changes = info['HGVSp_Short']
            assert len(genes) == len(changes)
            for gene, change in zip(genes, changes):
                # the lookup uses the protein change without its leading "p."
                alteration = re.sub(r'^p\.', r'', change)
                hits = oncokb.query(
                    'Gene == "{}" and Alteration == "{}"'.format(gene, alteration))
                if len(hits) == 0:
                    continue
                if 'oncoKB_level' not in info:
                    info['oncoKB_level'] = []
                    info['oncoKB_cancer_type'] = []
                info['oncoKB_level'].extend(hits['Level'])
                # spaces in cancer-type names are replaced with underscores
                info['oncoKB_cancer_type'].extend(
                    [re.sub(' ', '_', name) for name in hits['Cancer Type']])
        vcf_writer.write_record(rec)
    vcf_writer.close()
47 | 


--------------------------------------------------------------------------------
/vcf_tools/pass_filter_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # retain only PASS variants
 3 | 
 4 | import argparse
 5 | import vcf
 6 | import re
 7 | import sys
 8 | 
 9 | parser = argparse.ArgumentParser(prog='pass_filter_vcf.py',
10 |                                  description='filter vcf file for PASS variants')
11 | parser.add_argument('vcf_infile')
12 | 
13 | args = parser.parse_args()
14 | 
15 | vcf_reader = vcf.Reader(open(args.vcf_infile, 'r'))
16 | 
17 | vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
18 | 
19 | for record in vcf_reader:
20 |     if len(record.FILTER) == 0:
21 |         vcf_writer.write_record(record)
22 | 
23 | vcf_writer.close()
24 | 


--------------------------------------------------------------------------------
/vcf_tools/recurVcf.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("optparse"))
 4 | suppressPackageStartupMessages(library("plyr"))
 5 | suppressPackageStartupMessages(library("VariantAnnotation"))
 6 | 
 7 | #options(warn = -1, error = quote({ traceback(2); q('no', status = 1) }))
 8 | #options(error = recover)
 9 | options(error = quote(dump.frames("testdump", TRUE)))
10 | 
# Command-line options; the VCF files to compare are given as positional arguments.
optList <- list(
                make_option("--genome", default = 'b37', help = "genome build [default %default]"),
                make_option("--tumor", default = NULL, help = "tumor sample"),
                make_option("--outFile", default = NULL, help = "output file [default %default]"))

parser <- OptionParser(usage = "%prog vcf.files", option_list = optList);
arguments <- parse_args(parser, positional_arguments = T);
opt <- arguments$options;

# An output file and at least two VCF files are required.
if (is.null(opt$outFile)) {
    cat("Need output file\n");
    print_help(parser);
    stop();
} else if (length(arguments$args) <= 1) {
    cat("Need vcf files\n");
    print_help(parser);
    stop();
}

files <- arguments$args;
31 | 
# Read every VCF and keep only records where the tumor sample carries a call.
vcfs <- list()
for (f in files) {
    vcf <- readVcf(f, genome = opt$genome)
    # use the --tumor column when the file has it, otherwise the column named "TUMOR"
    tum <- ifelse(opt$tumor %in% colnames(geno(vcf)$GT), opt$tumor, "TUMOR")
    gt <- geno(vcf)$GT[, tum]
    # drop missing ("./.") and reference ("0/0", "0") genotypes
    vcf <- vcf[gt != "./." & gt != "0/0" & gt != "0", ]
    vcfs <- append(vcfs, vcf)
}

# Stack the PASS records of all files and count how often each
# (seqnames, start, end) range occurs.
all <- do.call('rbind', lapply(vcfs, function(x) as.data.frame(subset(rowRanges(x), FILTER == "PASS"))))
all <- all[, c("seqnames", "start", "end")]
cnt <- ddply(all, .(seqnames, start, end), nrow)

# Keep ranges seen more than once and write them as chr:start-end, one per line.
cnt <- subset(cnt, V1 > 1)
write(paste(cnt[,1], ":", cnt[,2], "-", cnt[,3], sep = ''), file = opt$outFile)
47 | #write.table(cnt[,c(1:3)], file = opt$outFile, sep = '\t', quote = F, row.names = F, col.names = F)
48 | 


--------------------------------------------------------------------------------
/vcf_tools/recurVcf.mk:
--------------------------------------------------------------------------------
 1 | # intersect vcf files
 2 | ##### DEFAULTS ######
 3 | 
 4 | ##### MAKE INCLUDES #####
 5 | include modules/Makefile.inc
 6 | include modules/variant_callers/gatk.inc
 7 | 
 8 | LOGDIR = log/recur_vcf.$(NOW)
 9 | 
# Filter-suffix chains of the sample-set and tumor/normal-pair VCFs that are compared.
SET_VCF_SUFFIXES = gatk_snps.dp_ft.som_ft
PAIR_VCF_SUFFIXES = som_sniper.ss_dp_ft.ss_ft.pass mutect.som_ad_ft.pass
# $(call SAMPLE_SET_PAIR_VCF,<tumor>): VCF paths for the tumor's set and pair,
# looked up through get_set.<tumor> / get_pair.<tumor> (defined outside this file).
SAMPLE_SET_PAIR_VCF = $(foreach suff,$(SET_VCF_SUFFIXES),vcf/$(get_set.$1).$(suff).vcf) $(foreach suff,$(PAIR_VCF_SUFFIXES),vcf/$(get_pair.$1).$(suff).vcf)

# Command that runs the recurrent-position script.
RECUR_VCF = $(RSCRIPT) modules/vcf_tools/recurVcf.R
15 | 
.DELETE_ON_ERROR:
.SECONDARY: 
.PHONY : all pileups

# all: one recurrent-position list per tumor sample.
all : $(foreach sample,$(TUMOR_SAMPLES),recur_pos/$(sample).recur.txt)
# NOTE(review): no rule for pileup/%.pileup is defined in this file -- it must come from an include; confirm.
pileups : $(foreach sample,$(TUMOR_SAMPLES),pileup/$(sample).pileup)

# recur-pos-tumor template ($1 = tumor sample): run recurVcf.R over the tumor's
# set and pair VCFs and write recur_pos/$1.recur.txt.
define recur-pos-tumor
recur_pos/$1.recur.txt : $$(call SAMPLE_SET_PAIR_VCF,$1)
	$$(INIT) $$(RECUR_VCF) --tumor $1 --outFile $$@ $$^
endef
$(foreach tumor,$(TUMOR_SAMPLES),$(eval $(call recur-pos-tumor,$(tumor))))
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/vcf_tools/snp_filter_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """ output snps only
 3 | """
 4 | 
 5 | import vcf
 6 | import sys
 7 | 
 8 | if __name__ == "__main__":
 9 |     vcf_reader = vcf.Reader(sys.stdin)
10 |     vcf_writer = vcf.Writer(sys.stdout, vcf_reader)
11 | 
12 |     for record in vcf_reader:
13 |         if record.is_snp:
14 |             vcf_writer.write_record(record)
15 | 
16 |     vcf_writer.close()
17 | 


--------------------------------------------------------------------------------
/vcf_tools/split_snps_indels_vcf.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | """ split_snps_indels_vcf.py
 4 | split a vcf file into snps and indels/everything else
 5 | """
 6 | 
 7 | import argparse
 8 | import vcf
 9 | 
if __name__ == '__main__':
    cli = argparse.ArgumentParser(prog='split_snps_indels_vcf.py',
                                  description='split vcf file into snps and indels')
    cli.add_argument('vcf_infile')
    cli.add_argument('--snps', '-s', nargs='?', required=True, help='snp output vcf file')
    cli.add_argument('--indels', '-i', nargs='?', required=True, help='indel/everything else output vcf file')
    args = cli.parse_args()

    reader = vcf.Reader(open(args.vcf_infile, 'r'))

    # Two writers share the input header; keyed by "is this record a SNP?".
    # The SNP file is opened first, then the indel/other file.
    writers = {
        True: vcf.Writer(open(args.snps, 'w'), reader),
        False: vcf.Writer(open(args.indels, 'w'), reader),
    }

    # SNPs go to one file, everything else to the other.
    for rec in reader:
        writers[bool(rec.is_snp)].write_record(rec)

    writers[True].close()
    writers[False].close()
31 | 


--------------------------------------------------------------------------------
/vcf_tools/vcfCompare.mk:
--------------------------------------------------------------------------------
 1 | # Compare vcf files
 2 | ##### DEFAULTS ######
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR = log/vcfComp.$(NOW)
 6 | ##### MAKE INCLUDES #####
 7 | 
 8 | .DELETE_ON_ERROR:
 9 | .SECONDARY: 
10 | .PHONY : all variant_eval gt_concordance
11 | 
# Filter-suffix chain of the VCFs used by gt_concordance.
# The base is the depth filter only. It used to be "dp_ft.target_ft", which made
# the TARGETS_FILE branch produce "dp_ft.target_ft.target_ft" and required the
# target filter even when no targets file is configured.
FILTER_SUFFIX := dp_ft
ifdef TARGETS_FILE
FILTER_SUFFIX := $(FILTER_SUFFIX).target_ft
endif
ifdef NORMAL_VCF
FILTER_SUFFIX := nft.$(FILTER_SUFFIX)
endif
#VARIANT_TYPES := gatk_snps snvmix2
# Call sets evaluated (--eval) and compared against (--comp) by VariantEval.
EVAL_TYPES ?= rnaseq_gatk_snps
COMP_TYPES ?= exonseq_museq exonseq_mutect
22 | 
# Default goal: VariantEval reports for every sample.
all : variant_eval

gt_concordance : $(foreach sample,$(SAMPLES),cmp_vcf/grp/$(sample).gt_concord.grp)

variant_eval : $(foreach sample,$(SAMPLES),cmp_vcf/grp/$(sample).variant_eval.grp)

# GATK GenotypeConcordance over every VARIANT_TYPES call set of a sample; each
# prerequisite is passed as both --eval and --comp.
# NOTE(review): VARIANT_TYPES is only set in a commented-out line of this file, so
# this rule has no prerequisites unless it is defined elsewhere -- confirm.
cmp_vcf/grp/%.gt_concord.grp : $(foreach type,$(VARIANT_TYPES),vcf/%.$(type).$(FILTER_SUFFIX).vcf)
	$(call RUN,-s 9G -m 12G,"$(call GATK_MEM,8G) -T GenotypeConcordance -R $(REF_FASTA) $(foreach i,$^,--eval:$(notdir $(i:.$(FILTER_SUFFIX).vcf=)) $i )  $(foreach i,$^,--comp:$(notdir $(i:.$(FILTER_SUFFIX).vcf=)) $i ) -o $@")
	
# GATK VariantEval: EVAL_TYPES call sets are evaluated against COMP_TYPES call sets
# and dbSNP; $* is the sample name.
cmp_vcf/grp/%.variant_eval.grp : $(foreach type,$(EVAL_TYPES),vcf/%.$(type).vcf) $(foreach type,$(COMP_TYPES),vcf/%.$(type).vcf)
	$(call RUN,-s 9G -m 12G,"$(call GATK_MEM,8G) -T VariantEval --dbsnp $(DBSNP) -R $(REF_FASTA) $(foreach i,$(EVAL_TYPES),--eval:$i vcf/$*.$i.vcf ) $(foreach i,$(COMP_TYPES),--comp:$i vcf/$*.$i.vcf ) -o $@")
34 | #$(call RUN,-s 4G -m 6G,"$(call GATK_MEM,4G) -T VariantEval --dbsnp $(DBSNP) -R $(REF_FASTA)  --eval:$(<F:.$(FILTER_SUFFIX).vcf=) $< $(foreach i,$(wordlist 2,$(words $^),$^),--comp:$(notdir $(i:.$(FILTER_SUFFIX).vcf=)) $i ) -o $@")
35 | 
36 | include modules/variant_callers/gatk.inc
37 | 


--------------------------------------------------------------------------------
/vcf_tools/vcfMerge.mk:
--------------------------------------------------------------------------------
 1 | # Merge the per-caller VCFs of each sample into one VCF.
 2 | #
 3 | # For every sample in SAMPLES the filtered call sets
 4 | #   vcf/<sample>.<type>.$(FILTER_SUFFIX).vcf   (one per entry of VARIANT_TYPES)
 5 | # are combined into
 6 | #   merged_vcf/<sample>.$(MERGE_SUFFIX).vcf
 7 | 
 8 | ##### MAKE INCLUDES #####
 9 | include modules/Makefile.inc
10 | include modules/variant_callers/gatk.inc
11 | 
12 | LOGDIR ?= log/vcfMerge.$(NOW)
13 | 
14 | .DELETE_ON_ERROR:
15 | .SECONDARY:
16 | .PHONY : all
17 | 
18 | FILTER_SUFFIX := dp_ft
19 | VARIANT_TYPES := mutect museq
20 | 
21 | # A literal single space, built locally so the substitution below does not
22 | # depend on another file defining a variable for it.
23 | merge_empty :=
24 | merge_space := $(merge_empty) $(merge_empty)
25 | # Caller names joined with '_' (mutect museq -> mutect_museq).
26 | MERGE_SUFFIX = $(subst $(merge_space),_,$(strip $(VARIANT_TYPES)))
27 | 
28 | all : $(foreach sample,$(SAMPLES),merged_vcf/$(sample).$(MERGE_SUFFIX).vcf)
29 | 
30 | # Each input is tagged with its caller name. UNIQUIFY is used because the
31 | # inputs share sample names; GATK then suffixes each sample with the tag.
32 | # NOTE(review): choose a different -genotypeMergeOptions value if a single
33 | # genotype column per sample is wanted in the merged file.
34 | merged_vcf/%.$(MERGE_SUFFIX).vcf : $(foreach type,$(VARIANT_TYPES),vcf/%.$(type).$(FILTER_SUFFIX).vcf)
35 | 	$(call RUN,-s 9G -m 12G,"$(call GATK_MEM,8G) -T CombineVariants -R $(REF_FASTA) $(foreach type,$(VARIANT_TYPES),--variant:$(type) vcf/$*.$(type).$(FILTER_SUFFIX).vcf ) -genotypeMergeOptions UNIQUIFY -o $@")
36 | 


--------------------------------------------------------------------------------
/vcf_tools/vcfPostFilters.mk:
--------------------------------------------------------------------------------
 1 | # Post-annotation VCF filters, selected by the suffix of the requested file.
 2 | 
 3 | # Expression handed to VariantFiltration --filterExpression; override as needed.
 4 | VCF_POST_ANN_FILTER_EXPRESSION ?= ExAC_AF > 0.1
 5 | # Command line prefix for the common-variant filter script.
 6 | COMMON_FILTER_VCF = $(PYTHON) modules/vcf_tools/common_filter_vcf.py
 7 | 
 8 | # vcf/X.vcf -> vcf/X.cft.vcf
 9 | # GATK VariantFiltration with filter name "customFilter". Output goes to
10 | # $@.tmp first and is handed to VERIFY_VCF together with the final name.
11 | vcf/%.cft.vcf: vcf/%.vcf
12 | 	$(call CHECK_VCF,$(call RUN,-c -s 8G -m 12G,"$(call GATK_MEM,8G) -T VariantFiltration -U LENIENT_VCF_PROCESSING -R $(REF_FASTA) -V $< -o $@.tmp \
13 | 		--filterExpression '$(VCF_POST_ANN_FILTER_EXPRESSION)' --filterName customFilter && $(call VERIFY_VCF,$@.tmp,$@)"))
14 | 
15 | # vcf/X.vcf -> vcf/X.common_ft.vcf
16 | # The script's stdout is captured in $@.tmp, then passed to VERIFY_VCF.
17 | vcf/%.common_ft.vcf: vcf/%.vcf
18 | 	$(call CHECK_VCF,$(call RUN,-c -s 4G -m 5G,"$(COMMON_FILTER_VCF) $< > $@.tmp && $(call VERIFY_VCF,$@.tmp,$@)"))
19 | 


--------------------------------------------------------------------------------
/virus/krona_classify.mk:
--------------------------------------------------------------------------------
 1 | # Classify the BLAST hits of each sample's unmapped reads and render them
 2 | # as a Krona HTML report: unmapped_reads/<sample>.blast -> <sample>.html
 3 | include modules/Makefile.inc
 4 | 
 5 | LOGDIR ?= log/krona_classify.$(NOW)
 6 | # The goal below is a name, not a file, so it must be phony.
 7 | PHONY += krona_classify
 8 | 
 9 | krona_classify : $(foreach sample,$(SAMPLES),unmapped_reads/$(sample).html)
10 | 
11 | # One ordinary pattern rule covers every sample. The former per-sample
12 | # $(eval) of this same rule never used its argument, and it expanded RUN
13 | # while the makefile was being parsed instead of when the recipe runs.
14 | # ktClassifyBLAST writes unmapped_reads/<sample>.tax as a by-product, which
15 | # ktImportTaxonomy then converts into the HTML target.
16 | unmapped_reads/%.html : unmapped_reads/%.blast
17 | 	$(call RUN,-n 1 -s 4G -m 9G,"ktClassifyBLAST -s $< -o unmapped_reads/$*.tax && ktImportTaxonomy -m 1 unmapped_reads/$*.tax -o unmapped_reads/$*.html")
18 | 
19 | .DELETE_ON_ERROR:
20 | .SECONDARY:
21 | .PHONY: $(PHONY)
22 | 


--------------------------------------------------------------------------------