├── tests ├── README.md ├── inputs │ ├── empty-for-test.txt │ ├── wf_theiaprok_illumina_pe_cromwell.json │ └── wf_theiaprok_illumina_pe.json └── config │ └── pytest_filter.yml ├── validation_files └── kleb_assembly_input.json ├── workflows ├── wf_bc_n_qc_pe_local-dev.wdl ├── wf_rasusa.wdl ├── wf_serotypefinder.wdl ├── fetch_sra_to_fastq.wdl ├── wf_gambit_query.wdl ├── wf_kraken2_se.wdl ├── wf_pmga.wdl ├── wf_mycosnp_consensus_assembly.wdl ├── wf_kraken2_pe.wdl ├── wf_mycosnp_tree.wdl ├── wf_kleborate.wdl ├── wf_amrfinderplus.wdl ├── wf_mashtree_fasta.wdl ├── wf_tbprofiler_ont.wdl ├── wf_tbprofiler_pe.wdl ├── de_novo_assembly.wdl ├── wf_ksnp3.wdl ├── ecoli_char.wdl ├── wf_read_QC_trim.wdl ├── wf_read_QC_trim_se.wdl ├── compile_ecoli_results.wdl └── wf_core_gene_snp.wdl ├── tasks ├── task_versioning.wdl ├── gene_typing │ ├── task_gamma.wdl │ ├── task_prokka.wdl │ ├── task_abricate.wdl │ ├── task_plasmidfinder.wdl │ ├── task_bakta.wdl │ └── task_resfinder.wdl ├── species_typing │ ├── task_ssuissero.wdl │ ├── task_hpsuissero.wdl │ ├── task_lissero.wdl │ ├── task_legsta.wdl │ ├── task_pasty.wdl │ ├── task_seroba.wdl │ ├── task_spatyper.wdl │ ├── task_sistr.wdl │ ├── task_serotypefinder.wdl │ ├── task_pmga.wdl │ ├── task_ectyper.wdl │ ├── task_pbptyper.wdl │ ├── task_staphopiasccmec.wdl │ ├── task_meningotype.wdl │ ├── task_seqsero2.wdl │ ├── task_hicap.wdl │ ├── task_emmtyper.wdl │ ├── task_genotyphi.wdl │ ├── task_shigatyper.wdl │ ├── task_ngmaster.wdl │ ├── task_sonneityping.wdl │ ├── task_ts_mlst.wdl │ ├── task_agrvate.wdl │ ├── task_poppunk_streppneumo.wdl │ ├── task_srst2_vibrio.wdl │ ├── task_kleborate.wdl │ └── task_shigeifinder.wdl ├── phylogenetic_inference │ ├── task_iqtree.wdl │ ├── task_mashtree.wdl │ ├── task_mycosnp_tree.wdl │ ├── task_ksnp3.wdl │ ├── task_snp_dists.wdl │ └── task_pirate.wdl ├── assembly │ ├── task_mycosnp_consensus_assembly.wdl │ └── task_shovill.wdl ├── quality_control │ ├── task_quast.wdl │ ├── task_busco.wdl │ ├── task_trimmomatic.wdl │ ├── task_bbduk.wdl │ ├── task_fastp.wdl │ ├── task_fastq_scan.wdl │ ├── task_fastqc.wdl │ ├── task_cg_pipeline.wdl │ └── task_mummer_ani.wdl ├── utilities │ ├── task_rasusa.wdl │ └── task_summarize_data.wdl └── taxon_id │ ├── task_midas.wdl │ └── task_kraken2.wdl ├── README.md ├── .github └── workflows │ ├── miniwdl-check.yml │ └── pytest-workflows.yml └── .dockstore.yml /tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing info 2 | -------------------------------------------------------------------------------- /tests/inputs/empty-for-test.txt: -------------------------------------------------------------------------------- 1 | This file is empty for test purposes. 
-------------------------------------------------------------------------------- /validation_files/kleb_assembly_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "kleborate_wf.assembly": "./validation_files/GCF_000240185.1_ASM24018v2_genomic.fna", 3 | "kleborate_wf.samplename": "Sample1" 4 | } 5 | -------------------------------------------------------------------------------- /workflows/wf_bc_n_qc_pe_local-dev.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "wf_bc_n_qc_pe.wdl" as assembly 4 | 5 | workflow bc_n_qc_local { 6 | input { 7 | Array[Pair[Array[String], Pair[File,File]]] inputSamples 8 | } 9 | 10 | scatter (sample in inputSamples) { 11 | call assembly.bc_n_qc_pe { 12 | input: 13 | samplename = sample.left[0], 14 | read1_raw = sample.right.left, 15 | read2_raw = sample.right.right 16 | } 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /tests/inputs/wf_theiaprok_illumina_pe_cromwell.json: -------------------------------------------------------------------------------- 1 | { 2 | "theiaprok_illumina_pe.samplename": "test", 3 | "theiaprok_illumina_pe.read1_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R1.fastq.gz", 4 | "theiaprok_illumina_pe.read2_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R2.fastq.gz", 5 | "theiaprok_illumina_pe.skip_screen": true, 6 | "theiaprok_illumina_pe.read_QC_trim.call_midas": false, 7 | "theiaprok_illumina_pe.genome_annotation": "prokka", 8 | "theiaprok_illumina_pe.shovill_pe.assembler": "skesa", 9 | "theiaprok_illumina_pe.merlin_magic.call_poppunk": false 10 | } 11 | -------------------------------------------------------------------------------- /tasks/task_versioning.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task version_capture { 4 | input { 5 | String? 
timezone 6 | } 7 | meta { 8 | volatile: true 9 | } 10 | command { 11 | PHBG_Version="PHBG v1.3.0" 12 | ~{default='' 'export TZ=' + timezone} 13 | date +"%Y-%m-%d" > TODAY 14 | echo "$PHBG_Version" > PHBG_VERSION 15 | } 16 | output { 17 | String date = read_string("TODAY") 18 | String phbg_version = read_string("PHBG_VERSION") 19 | } 20 | runtime { 21 | memory: "1 GB" 22 | cpu: 1 23 | docker: "quay.io/theiagen/utility:1.1" 24 | disks: "local-disk 10 HDD" 25 | dx_instance_type: "mem1_ssd1_v2_x2" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_gamma.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task gamma_one_sample { 4 | input { 5 | File assembly_fasta 6 | String samplename 7 | String docker = "quay.io/biocontainers/gamma:1.4--hdfd78af_0" 8 | File gamma_database 9 | Int disk_size = 100 10 | } 11 | String database_name = basename(gamma_database) 12 | command <<< 13 | GAMMA.py ~{assembly_fasta} ~{gamma_database} ~{samplename} 14 | 15 | mv ~{samplename}.gamma ~{samplename}_gamma_report.tsv 16 | 17 | >>> 18 | output { 19 | File gamma_results = "~{samplename}_gamma_report.tsv" 20 | String gamma_database_version = database_name 21 | String gamma_docker = docker 22 | } 23 | runtime { 24 | memory: "8 GB" 25 | cpu: 4 26 | docker: "~{docker}" 27 | disks: "local-disk " + disk_size + " SSD" 28 | disk: disk_size + " GB" 29 | maxRetries: 3 30 | } 31 | } -------------------------------------------------------------------------------- /workflows/wf_rasusa.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/utilities/task_rasusa.wdl" as rasusa 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow rasusa_workflow { 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | Float coverage 12 | String genome_size 13 | } 14 | call rasusa.rasusa as rasusa_task { 15 | input: 16 | read1 = read1, 17 | read2 = read2, 18 | samplename = samplename, 19 | genome_size = genome_size, 20 | coverage = coverage 21 | } 22 | call versioning.version_capture{ 23 | input: 24 | } 25 | output { 26 | String rasusa_wf_version = version_capture.phbg_version 27 | String rasusa_wf_analysis_date = version_capture.date 28 | 29 | String rasusa_version = rasusa_task.rasusa_version 30 | File read1_subsampled = rasusa_task.read1_subsampled 31 | File? 
read2_subsampled = rasusa_task.read2_subsampled
 32 |   }
 33 | }
--------------------------------------------------------------------------------
/workflows/wf_serotypefinder.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/task_taxon_id.wdl" as taxon_ID
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | 
 6 | workflow serotypefinder {
 7 |   input {
 8 |     String samplename
 9 |     File ecoli_assembly
10 |   }
11 |   call taxon_ID.serotypefinder_one_sample {
12 |     input:
13 |       samplename = samplename,
14 |       ecoli_assembly = ecoli_assembly
15 |   }
16 |   call versioning.version_capture{
17 |     input:
18 |   }
19 |   output {
20 |     String serotypefinder_wf_version = version_capture.phbg_version
21 |     String serotypefinder_wf_analysis_date = version_capture.date
22 | 
23 |     String serotypefinder_report = serotypefinder_one_sample.serotypefinder_report
24 |     String serotypefinder_docker = serotypefinder_one_sample.serotypefinder_docker
25 |     String serotypefinder_serotype = serotypefinder_one_sample.serotypefinder_serotype
26 |   }
27 | }
28 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_ssuissero.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task ssuissero {
 4 |   meta {
 5 |     description: "Serotype prediction of Streptococcus suis assemblies"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/biocontainers/ssuissero:1.0.1--hdfd78af_0"
11 |     Int disk_size = 100
12 |     Int? cpu = 4
13 |     String version = "1.0.1"
14 |   }
15 |   command <<<
16 |     # SsuisSero does not output a version
17 |     echo ~{version} | tee VERSION
18 |     SsuisSero.sh \
19 |       -i ~{assembly} \
20 |       -o ./ \
21 |       -s ~{samplename} \
22 |       -x fasta \
23 |       -t ~{cpu}
24 |   >>>
25 |   output {
26 |     File ssuissero_results = "~{samplename}.tsv"
27 |     String ssuissero_version = read_string("VERSION")
28 |   }
29 |   runtime {
30 |     docker: "~{docker}"
31 |     memory: "8 GB"
32 |     cpu: 4
33 |     disks: "local-disk " + disk_size + " SSD"
34 |     disk: disk_size + " GB"
35 |     maxRetries: 3
36 |     preemptible: 0
37 |   }
38 | }
39 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_hpsuissero.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task hpsuissero {
 4 |   meta {
 5 |     description: "Serotype prediction of Haemophilus parasuis assemblies"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/biocontainers/hpsuissero:1.0.1--hdfd78af_0"
11 |     Int disk_size = 100
12 |     Int?
cpu = 4 13 | String version = "1.0.1" 14 | } 15 | command <<< 16 | # Does not output a version 17 | echo ~{version} | tee VERSION 18 | HpsuisSero.sh \ 19 | -i ~{assembly} \ 20 | -o ./ \ 21 | -s ~{samplename} \ 22 | -x fasta \ 23 | -t ~{cpu} 24 | >>> 25 | output { 26 | File hpsuissero_results = "~{samplename}.tsv" 27 | String hpsuissero_version = read_string("VERSION") 28 | } 29 | runtime { 30 | docker: "~{docker}" 31 | memory: "8 GB" 32 | cpu: 4 33 | disks: "local-disk " + disk_size + " SSD" 34 | disk: disk_size + " GB" 35 | maxRetries: 3 36 | preemptible: 0 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /workflows/fetch_sra_to_fastq.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow fetch_sra_to_fastq { 4 | 5 | input { 6 | String SRR 7 | } 8 | 9 | call prefetch_fastq_dump { 10 | input: 11 | sra_id=SRR 12 | } 13 | 14 | output { 15 | File read1 =prefetch_fastq_dump.read1 16 | File read2 =prefetch_fastq_dump.read2 17 | } 18 | } 19 | 20 | task prefetch_fastq_dump { 21 | 22 | input { 23 | String sra_id 24 | } 25 | 26 | command { 27 | prefetch --version | head -1 | tee VERSION 28 | prefetch ${sra_id} 29 | fastq-dump --version | head -1 | tee VERSION 30 | fastq-dump \ 31 | --gzip \ 32 | --split-files \ 33 | ${sra_id} 34 | } 35 | 36 | output { 37 | File read1="${sra_id}_1.fastq.gz" 38 | File read2="${sra_id}_2.fastq.gz" 39 | } 40 | 41 | runtime { 42 | docker: "quay.io/staphb/sratoolkit:2.9.2" 43 | memory: "8 GB" 44 | cpu: 2 45 | disks: "local-disk 100 SSD" 46 | preemptible: 1 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /workflows/wf_gambit_query.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_gambit.wdl" as gambit 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow gambit_query { 7 | input { 8 | File assembly_fasta 9 | String samplename 10 | } 11 | call gambit.gambit { 12 | input: 13 | assembly = assembly_fasta, 14 | samplename = samplename, 15 | } 16 | call versioning.version_capture { 17 | input: 18 | } 19 | output { 20 | String gambit_query_wf_version = version_capture.phbg_version 21 | String gambit_query_wf_analysis_date = version_capture.date 22 | #Taxon ID 23 | File gambit_report = gambit.gambit_report_file 24 | File gambit_closest_genomes = gambit.gambit_closest_genomes_file 25 | String gambit_predicted_taxon = gambit.gambit_predicted_taxon 26 | String gambit_predicted_taxon_rank = gambit.gambit_predicted_taxon_rank 27 | String gambit_version = gambit.gambit_version 28 | String gambit_db_version = gambit.gambit_db_version 29 | String gambit_docker = gambit.gambit_docker 30 | } 31 | } -------------------------------------------------------------------------------- /tests/config/pytest_filter.yml: -------------------------------------------------------------------------------- 1 | wf_theiaprok_illumina_pe: 2 | - workflows/wf_theiaprok_illumina_pe.wdl 3 | - tasks/assembly/task_shovill.wdl 4 | - tasks/quality_control/task_quast.wdl 5 | - tasks/quality_control/task_cg_pipeline.wdl 6 | - tasks/quality_control/task_screen.wdl 7 | - tasks/taxon_id/task_gambit.wdl 8 | - tasks/gene_typing/task_amrfinderplus.wdl 9 | - tasks/species_typing/task_ts_mlst.wdl 10 | - tasks/task_versioning.wdl 11 | - tasks/utilities/task_broad_terra_tools.wdl 12 | - workflows/wf_read_QC_trim.wdl 13 | - 
tasks/quality_control/task_trimmomatic.wdl
14 |   - tasks/quality_control/task_bbduk.wdl
15 |   - tasks/quality_control/task_fastq_scan.wdl
16 |   - workflows/wf_merlin_magic.wdl
17 |   - tasks/species_typing/task_serotypefinder.wdl
18 |   - tasks/species_typing/task_ectyper.wdl
19 |   - tasks/species_typing/task_lissero.wdl
20 |   - tasks/species_typing/task_sistr.wdl
21 |   - tasks/species_typing/task_seqsero2.wdl
22 |   - tasks/species_typing/task_kleborate.wdl
23 |   - tasks/species_typing/task_tbprofiler.wdl
24 |   - tasks/species_typing/task_legsta.wdl
25 |   - tasks/species_typing/task_genotyphi.wdl
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | **NOTE: WORKFLOWS FROM THIS REPOSITORY HAVE BEEN MIGRATED TO THE PUBLIC HEALTH BIOINFORMATICS (PHB) REPOSITORY. FUTURE DEVELOPMENTS AND UPDATES FOR THESE WORKFLOWS WILL OCCUR IN [https://github.com/theiagen/public_health_bioinformatics](https://github.com/theiagen/public_health_bioinformatics).**
 3 | 
 4 | ----
 5 | 
 6 | # Public Health Bacterial Genomics
 7 | 
 8 | Bioinformatics workflows for genomic characterization, submission preparation, and genomic epidemiology of bacterial pathogens of concern.
 9 | 
10 | **More information about the steps undertaken in these workflows is available via the [Theiagen Public Resources Documentation](https://theiagen.notion.site/Theiagen-Public-Health-Resources-a4bd134b0c5c4fe39870e21029a30566).**
11 | 
12 | # Note to Users
13 | This repository and the workflows within are in the early stages of development. We recommend using our stable version releases, as the main and development branches are subject to routine updates. Please contact support@terrapublichealth.zendesk.com if you would like to be added to our PHBG mailing list and
14 | receive updates and announcements regarding any resource associated with this repository.
15 | 
16 | 
17 | 
--------------------------------------------------------------------------------
/workflows/wf_kraken2_se.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/taxon_id/task_kraken2.wdl" as kraken2
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | 
 6 | workflow kraken2_se_wf {
 7 |   meta {
 8 |     description: "Classify single-end reads using Kraken2"
 9 |   }
10 | 
11 |   input {
12 |     String samplename
13 |     File read1
14 |     File kraken2_db
15 |   }
16 |   call kraken2.kraken2_se {
17 |     input:
18 |       samplename = samplename,
19 |       read1 = read1,
20 |       kraken2_db = kraken2_db
21 |   }
22 |   call versioning.version_capture{
23 |     input:
24 |   }
25 |   output {
26 |     # PHBG Version Captures
27 |     String kraken2_se_wf_version = version_capture.phbg_version
28 |     String kraken2_se_wf_analysis_date = version_capture.date
29 |     # Kraken2
30 |     String kraken2_version = kraken2_se.kraken2_version
31 |     String kraken2_docker = kraken2_se.kraken2_docker
32 |     File kraken2_report = kraken2_se.kraken2_report
33 |     File kraken2_classified_report = kraken2_se.kraken2_classified_report
34 |     File kraken2_unclassified_read1 = kraken2_se.kraken2_unclassified_read1
35 |     File kraken2_classified_read1 = kraken2_se.kraken2_classified_read1
36 |   }
37 | }
38 | 
--------------------------------------------------------------------------------
/workflows/wf_pmga.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/species_typing/task_pmga.wdl" as pmga
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | 
 6 | workflow pmga_wf {
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |   }
11 |   call pmga.pmga {
12 |     input:
13 |       assembly = assembly,
14 |       samplename = samplename
15 |   }
16 |   call versioning.version_capture{
17 |     input:
18 |   }
19 |   output {
20 |     String pmga_wf_version = version_capture.phbg_version
21 |     String pmga_wf_analysis_date = version_capture.date
22 |     String pmga_version = pmga.version
23 |     String pmga_docker = pmga.pmga_docker
24 |     String pmga_speciesdb = pmga.pmga_speciesdb
25 |     String pmga_serotype = pmga.pmga_serotype
26 |     String pmga_genes = pmga.pmga_genes
27 |     String pmga_notes = pmga.pmga_notes
28 |     File pmga_results = pmga.pmga_results
29 |     File pmga_allele_matrix = pmga.pmga_allele_matrix
30 |     File pmga_blast_final = pmga.pmga_blast_final
31 |     File pmga_blast_raw = pmga.pmga_blast_raw
32 |     File pmga_loci_counts = pmga.pmga_loci_counts
33 |     File pmga_gff = pmga.pmga_gff
34 |   }
35 | }
36 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_lissero.wdl:
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task lissero { 4 | meta { 5 | description: "Serogroup typing prediction for Listeria monocytogenes" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/lissero:0.4.9--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | 14 | # Parameters 15 | # --min_id Minimum percent identity to accept a match [Default 95.0] 16 | # --min_cov Minimum coverage of the gene to accept a match [Default 95.0] 17 | Float min_id = 95.0 18 | Float min_cov = 95.0 19 | } 20 | command <<< 21 | echo $(lissero --version 2>&1) | sed 's/^.*LisSero //' | tee VERSION 22 | lissero \ 23 | ~{'--min_id ' + min_id} \ 24 | ~{'--min_cov ' + min_cov} \ 25 | ~{assembly} \ 26 | > ~{samplename}.tsv 27 | 28 | # pull out serotype 29 | tail -n+2 ~{samplename}.tsv | cut -f2 | tee SEROTYPE 30 | >>> 31 | output { 32 | File lissero_results = "~{samplename}.tsv" 33 | String lissero_version = read_string("VERSION") 34 | String lissero_serotype = read_string("SEROTYPE") 35 | } 36 | runtime { 37 | docker: "~{docker}" 38 | memory: "8 GB" 39 | cpu: 2 40 | disks: "local-disk " + disk_size + " SSD" 41 | disk: disk_size + " GB" 42 | maxRetries: 3 43 | preemptible: 0 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /workflows/wf_mycosnp_consensus_assembly.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/assembly/task_mycosnp_consensus_assembly.wdl" as mycosnp_nf 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow mycosnp_consensus_assembly { 7 | meta { 8 | description: "A WDL wrapper around the qc, processing and consensus assembly components of mycosnp-nf, for whole genome sequencing analysis of fungal organisms, including Candida auris." 9 | } 10 | input { 11 | File read1 12 | File read2 13 | String samplename 14 | } 15 | call mycosnp_nf.mycosnp { 16 | input: 17 | read1 = read1, 18 | read2 = read2, 19 | samplename = samplename 20 | } 21 | call versioning.version_capture{ 22 | input: 23 | } 24 | output { 25 | #Version Captures 26 | String mycosnp_consensus_assembly_version = version_capture.phbg_version 27 | String mycosnp_consensus_assembly_analysis_date = version_capture.date 28 | #MycoSNP QC and Assembly 29 | String mycosnp_version = mycosnp.mycosnp_version 30 | String mycosnp_docker = mycosnp.mycosnp_docker 31 | String analysis_date = mycosnp.analysis_date 32 | String reference_strain = mycosnp.reference_strain 33 | String reference_accession = mycosnp.reference_accession 34 | File assembly_fasta = mycosnp.assembly_fasta 35 | File full_results = mycosnp.full_results 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tasks/species_typing/task_legsta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task legsta { 4 | meta { 5 | description: "Typing of Legionella pneumophila assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/legsta:0.5.1--hdfd78af_2" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | } 14 | command <<< 15 | echo $(legsta --version 2>&1) | sed 's/^.*legsta //; s/ .*\$//;' | tee VERSION 16 | legsta \ 17 | ~{assembly} > ~{samplename}.tsv 18 | 19 | # parse outputs 20 | if [ ! 
-f ~{samplename}.tsv ]; then 21 | SBT="No SBT predicted" 22 | else 23 | SBT="ST$(tail -n 1 ~{samplename}.tsv | cut -f 2)" 24 | if [ "$SBT" == "ST-" ]; then 25 | SBT="No SBT predicted" 26 | else 27 | if [ "$SBT" == "ST" ]; then 28 | SBT="No SBT predicted" 29 | fi 30 | fi 31 | fi 32 | 33 | echo $SBT | tee LEGSTA_SBT 34 | 35 | >>> 36 | output { 37 | File legsta_results = "~{samplename}.tsv" 38 | String legsta_predicted_sbt = read_string("LEGSTA_SBT") 39 | String legsta_version = read_string("VERSION") 40 | } 41 | runtime { 42 | docker: "~{docker}" 43 | memory: "8 GB" 44 | cpu: 2 45 | disks: "local-disk " + disk_size + " SSD" 46 | disk: disk_size + " GB" 47 | maxRetries: 3 48 | preemptible: 0 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /workflows/wf_kraken2_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_kraken2.wdl" as kraken2 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow kraken2_pe_wf { 7 | meta { 8 | description: "Classify paired-end reads using Kraken2" 9 | } 10 | 11 | input { 12 | String samplename 13 | File read1 14 | File read2 15 | File kraken2_db 16 | } 17 | call kraken2.kraken2_pe { 18 | input: 19 | samplename = samplename, 20 | read1 = read1, 21 | read2 = read2, 22 | kraken2_db = kraken2_db 23 | } 24 | call versioning.version_capture{ 25 | input: 26 | } 27 | output { 28 | # PHBG Version Captures 29 | String kraken2_pe_wf_version = version_capture.phbg_version 30 | String kraken2_pe_wf_analysis_date = version_capture.date 31 | # Kraken2 32 | String kraken2_version = kraken2_pe.kraken2_version 33 | String kraken2_docker = kraken2_pe.kraken2_docker 34 | File kraken2_report = kraken2_pe.kraken2_report 35 | File kraken2_classified_report = kraken2_pe.kraken2_classified_report 36 | File kraken2_unclassified_read1 = kraken2_pe.kraken2_unclassified_read1 37 | File kraken2_unclassified_read2 = kraken2_pe.kraken2_unclassified_read2 38 | File kraken2_classified_read1 = kraken2_pe.kraken2_classified_read1 39 | File kraken2_classified_read2 = kraken2_pe.kraken2_classified_read2 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /workflows/wf_mycosnp_tree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_mycosnp_tree.wdl" as mycosnptree_nf 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow mycosnp_tree { 7 | meta { 8 | description: "A WDL wrapper around the phylogeny components of mycosnp-nf, for whole genome sequencing analysis of fungal organisms, including Candida auris." 
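    # A minimal inputs JSON for this wrapper might look like the following
    # (sample names and paths are hypothetical; assembly_fasta takes the
    # consensus FASTAs produced by the mycosnp_consensus_assembly workflow):
    #   {
    #     "mycosnp_tree.samplename": ["sample01", "sample02", "sample03"],
    #     "mycosnp_tree.assembly_fasta": ["sample01.fasta.gz", "sample02.fasta.gz", "sample03.fasta.gz"]
    #   }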
9 | } 10 | input { 11 | Array[String] samplename 12 | Array[File] assembly_fasta 13 | } 14 | call mycosnptree_nf.mycosnptree { 15 | input: 16 | assembly_fasta = assembly_fasta, 17 | samplename = samplename 18 | } 19 | call versioning.version_capture{ 20 | input: 21 | } 22 | output { 23 | #Version Captures 24 | String mycosnp_tree_version = version_capture.phbg_version 25 | String mycosnp_tree_analysis_date = version_capture.date 26 | #MycoSNP QC and Assembly 27 | String mycosnp_version = mycosnptree.mycosnptree_version 28 | String mycosnp_docker = mycosnptree.mycosnptree_docker 29 | String analysis_date = mycosnptree.analysis_date 30 | String reference_strain = mycosnptree.reference_strain 31 | String reference_accession = mycosnptree.reference_accession 32 | File mycosnp_tree_finaltree = mycosnptree.mycosnptree_tree 33 | File mycosnp_tree_iqtree_log = mycosnptree.mycosnptree_iqtree_log 34 | File mycosnp_tree_full_results = mycosnptree.mycosnptree_full_results 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /workflows/wf_kleborate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow kleborate_wf { 8 | input { 9 | File assembly 10 | String samplename 11 | } 12 | call taxon.kleborate_one_sample { 13 | input: 14 | assembly = assembly, 15 | samplename = samplename 16 | } 17 | call versioning.version_capture{ 18 | input: 19 | } 20 | output { 21 | String kleborate_wf_version = version_capture.phbg_version 22 | String kleborate_wf_analysis_date = version_capture.date 23 | 24 | File kleborate_report = kleborate_one_sample.kleborate_output_file 25 | String kleborate_version = kleborate_one_sample.version 26 | String kleborate_mlst_sequence_type = kleborate_one_sample.mlst_sequence_type 27 | String kleborate_virulence_score = kleborate_one_sample.virulence_score 28 | String kleborate_resistance_score = kleborate_one_sample.resistance_score 29 | String kleborate_num_resistance_genes = kleborate_one_sample.num_resistance_genes 30 | String kleborate_bla_resistance_genes = kleborate_one_sample.bla_resistance_genes 31 | String kleborate_esbl_resistance_genes = kleborate_one_sample.esbl_resistance_genes 32 | String kleborate_key_resistance_genes = kleborate_one_sample.key_resistance_genes 33 | String kleborate_resistance_mutations = kleborate_one_sample.resistance_mutations 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_iqtree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task iqtree { 4 | input { 5 | File alignment 6 | String cluster_name 7 | String iqtree_model = "GTR+I+G" # For comparison to other tools use HKY for bactopia, GTR+F+I for grandeur, GTR+G4 for nullarbor, GTR+G for dryad 8 | String iqtree_bootstraps = 1000 # Ultrafast bootstrap replicates 9 | String alrt = 1000 # SH-like approximate likelihood ratio test (SH-aLRT) replicates 10 | String? 
iqtree_opts = "" 11 | String docker = "staphb/iqtree:1.6.7" 12 | Int disk_size = 100 13 | } 14 | command <<< 15 | # date and version control 16 | date | tee DATE 17 | iqtree --version | grep version | sed 's/.*version/version/;s/ for Linux.*//' | tee VERSION 18 | 19 | numGenomes=`grep -o '>' ~{alignment} | wc -l` 20 | if [ $numGenomes -gt 3 ] 21 | then 22 | cp ~{alignment} ./msa.fasta 23 | iqtree \ 24 | -nt AUTO \ 25 | -s msa.fasta \ 26 | -m ~{iqtree_model} \ 27 | -bb ~{iqtree_bootstraps} \ 28 | -alrt ~{alrt} \ 29 | ~{iqtree_opts} 30 | 31 | cp msa.fasta.contree ~{cluster_name}_msa.tree 32 | fi 33 | >>> 34 | output { 35 | String date = read_string("DATE") 36 | String version = read_string("VERSION") 37 | File ml_tree = "~{cluster_name}_msa.tree" 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "32 GB" 42 | cpu: 4 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | preemptible: 0 46 | maxRetries: 3 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_prokka.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task prokka { 4 | input { 5 | File assembly 6 | String samplename 7 | Int cpu = 8 8 | Int memory = 16 9 | String docker = "staphb/prokka:1.14.5" 10 | Int disk_size = 100 11 | # Parameters 12 | # proteins recommended: when you have good quality reference genomes and want to ensure gene naming is consistent [false] 13 | # prodigal_tf: prodigal training file 14 | # prokka_arguments: free string to add any other additional prokka arguments 15 | Boolean proteins = false 16 | Boolean compliant = true 17 | File? prodigal_tf 18 | String? prokka_arguments 19 | } 20 | command <<< 21 | date | tee DATE 22 | prokka --version | tee PROKKA_VERSION 23 | 24 | prokka \ 25 | ~{prokka_arguments} \ 26 | --cpus 0 \ 27 | --prefix ~{samplename} \ 28 | ~{true='--compliant' false='' compliant} \ 29 | ~{true='--proteins' false='' proteins} \ 30 | ~{'--prodigaltf ' + prodigal_tf} \ 31 | ~{assembly} 32 | 33 | 34 | >>> 35 | output { 36 | File prokka_gff = "~{samplename}/~{samplename}.gff" 37 | File prokka_gbk = "~{samplename}/~{samplename}.gbk" 38 | File prokka_sqn = "~{samplename}/~{samplename}.sqn" 39 | Array[File] prokka_outs = glob("~{samplename}/~{samplename}*") 40 | String prokka_version = read_string("PROKKA_VERSION") 41 | } 42 | runtime { 43 | memory: "~{memory} GB" 44 | cpu: cpu 45 | docker: docker 46 | disks: "local-disk " + disk_size + " SSD" 47 | disk: disk_size + " GB" 48 | maxRetries: 3 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /workflows/wf_amrfinderplus.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/gene_typing/task_amrfinderplus.wdl" as amrfindertask 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow amrfinderplus_wf { 7 | input { 8 | File assembly 9 | String samplename 10 | } 11 | call amrfindertask.amrfinderplus_nuc { 12 | input: 13 | assembly = assembly, 14 | samplename = samplename 15 | } 16 | call versioning.version_capture{ 17 | input: 18 | } 19 | output { 20 | String amrfinderplus_version = amrfinderplus_nuc.amrfinderplus_version 21 | String amrfinderplus_db_version = amrfinderplus_nuc.amrfinderplus_db_version 22 | String amrfinderplus_wf_version = version_capture.phbg_version 23 | String amrfinderplus_wf_analysis_date = version_capture.date 24 | File 
amrfinderplus_all_report = amrfinderplus_nuc.amrfinderplus_all_report 25 | File amrfinderplus_amr_report = amrfinderplus_nuc.amrfinderplus_amr_report 26 | File amrfinderplus_stress_report = amrfinderplus_nuc.amrfinderplus_stress_report 27 | File amrfinderplus_virulence_report = amrfinderplus_nuc.amrfinderplus_virulence_report 28 | String amrfinderplus_amr_genes = amrfinderplus_nuc.amrfinderplus_amr_genes 29 | String amrfinderplus_stress_genes = amrfinderplus_nuc.amrfinderplus_stress_genes 30 | String amrfinderplus_virulence_genes = amrfinderplus_nuc.amrfinderplus_virulence_genes 31 | String amrfinderplus_amr_classes = amrfinderplus_nuc.amrfinderplus_amr_classes 32 | String amrfinderplus_amr_subclasses = amrfinderplus_nuc.amrfinderplus_amr_subclasses 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_mashtree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task mashtree_fasta { 4 | input { 5 | Array[File] assembly_fasta 6 | String cluster_name 7 | Int truncLength = 250 8 | String sort_order = "ABC" 9 | Int genomesize = 5000000 10 | Int mindepth = 5 11 | Int kmerlength = 21 12 | Int sketchsize = 10000 13 | Int cpu = 16 14 | Int memory = 64 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | # date and version control 19 | date | tee DATE 20 | mashtree -v | tee VERSION 21 | 22 | # organize input assemblies 23 | mkdir mash_assemblies 24 | mv ~{sep=' ' assembly_fasta} mash_assemblies 25 | #run mashtree 26 | mashtree \ 27 | ~{'--truncLength ' + truncLength} \ 28 | ~{'--sort-order ' + sort_order} \ 29 | ~{'--genomesize ' + genomesize} \ 30 | ~{'--mindepth ' + mindepth} \ 31 | ~{'--kmerlength ' + kmerlength} \ 32 | ~{'--sketch-size ' + sketchsize} \ 33 | ~{'--numcpus ' + cpu} \ 34 | ~{'--outmatrix ' + cluster_name + '.tsv'} \ 35 | ~{'--outtree ' + cluster_name + '.nwk'} \ 36 | mash_assemblies/* 37 | 38 | >>> 39 | output { 40 | String date = read_string("DATE") 41 | String version = read_string("VERSION") 42 | File mashtree_matrix = "~{cluster_name}.tsv" 43 | File mashtree_tree = "~{cluster_name}.nwk" 44 | } 45 | runtime { 46 | docker: "quay.io/staphb/mashtree:1.2.0" 47 | memory: "~{memory} GB" 48 | cpu: cpu 49 | disks: "local-disk " + disk_size + " SSD" 50 | disk: disk_size + " GB" 51 | maxRetries: 3 52 | preemptible: 0 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tasks/species_typing/task_pasty.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pasty { 4 | input { 5 | File assembly 6 | String samplename 7 | Int min_pident = 95 8 | Int min_coverage = 95 9 | String docker = "staphb/pasty:1.0.2" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # date and version control 14 | date | tee DATE 15 | pasty --version > VERSION && sed -i -e 's/pasty\, version //' VERSION 16 | pasty \ 17 | --assembly ~{assembly} \ 18 | --min_pident ~{min_pident} \ 19 | --min_coverage ~{min_coverage} \ 20 | --prefix ~{samplename} \ 21 | --outdir . 
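    # split the per-sample summary row (line 2 of the TSV) into one file per
    # field for the outputs below; the column order (serogroup, coverage,
    # fragments, comment in fields 2-5) is assumed from pasty's summary layout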
22 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f2 > SEROGROUP 23 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f3 > COVERAGE 24 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f4 > FRAGMENTS 25 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f5 > COMMENT 26 | >>> 27 | output { 28 | String pasty_serogroup = read_string("SEROGROUP") 29 | Float pasty_serogroup_coverage = read_float("COVERAGE") 30 | Int pasty_serogroup_fragments = read_int("FRAGMENTS") 31 | File pasty_summary_tsv = "~{samplename}.tsv" 32 | File pasty_blast_hits = "~{samplename}.blastn.tsv" 33 | File pasty_all_serogroups = "~{samplename}.details.tsv" 34 | String pasty_version = read_string("VERSION") 35 | String pasty_pipeline_date = read_string("DATE") 36 | String pasty_docker = docker 37 | String pasty_comment = read_string("COMMENT") 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "4 GB" 42 | cpu: 2 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | maxRetries: 3 46 | preemptible: 0 47 | } 48 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_seroba.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task seroba { 4 | input { 5 | File read1 6 | File? read2 7 | String samplename 8 | String docker = "staphb/seroba:1.0.2" 9 | Int disk_size = 100 10 | } 11 | command <<< 12 | # grab version 13 | seroba version > VERSION 14 | 15 | # database path will need to be changed if/when docker image is updated. 16 | seroba runSerotyping /seroba-1.0.2/database/ ~{read1} ~{read2} ~{samplename} 17 | 18 | # check for serotype grouping & contamination flag 19 | cut -f2 ~{samplename}/pred.tsv > SEROTYPE 20 | 21 | # check for detailed serogroup information 22 | if [ -f ~{samplename}/detailed_serogroup_info.txt ]; then 23 | grep "Serotype predicted by ariba" ~{samplename}/detailed_serogroup_info.txt | cut -f2 | sed 's/://' > ARIBA_SEROTYPE 24 | grep "assembly from ariba" ~{samplename}/detailed_serogroup_info.txt | cut -f2 | sed 's/://' > ARIBA_IDENTITY 25 | else 26 | # if the details do not exist, output blanks to ariba columns 27 | echo "" > ARIBA_SEROTYPE 28 | echo "" > ARIBA_IDENTITY 29 | fi 30 | >>> 31 | output { 32 | String seroba_version = read_string("VERSION") 33 | String seroba_docker = docker 34 | String seroba_serotype = read_string("SEROTYPE") 35 | String seroba_ariba_serotype = read_string("ARIBA_SEROTYPE") 36 | String seroba_ariba_identity = read_string("ARIBA_IDENTITY") 37 | File? 
seroba_details = "~{samplename}/detailed_serogroup_info.txt"
38 |   }
39 |   runtime {
40 |     docker: "~{docker}"
41 |     memory: "16 GB"
42 |     cpu: 8
43 |     disks: "local-disk " + disk_size + " SSD"
44 |     disk: disk_size + " GB"
45 |     maxRetries: 3
46 |   }
47 | }
--------------------------------------------------------------------------------
/tasks/assembly/task_mycosnp_consensus_assembly.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task mycosnp {
 4 |   input {
 5 |     File read1
 6 |     File read2
 7 |     String samplename
 8 |     String docker = "quay.io/theiagen/mycosnp:dev"
 9 |     String strain = "B11205"
10 |     String accession = "GCA_016772135"
11 |     Int memory = 16
12 |     Int cpu = 4
13 |     Int disk_size = 100
14 |   }
15 |   command <<<
16 |     date | tee DATE
17 |     echo $(nextflow pull rpetit3/mycosnp-nf 2>&1) | sed 's/^.*revision: //;' | tee MYCOSNP_VERSION
18 | 
19 |     # Make sample FOFN
20 |     echo "sample,fastq_1,fastq_2" > sample.csv
21 |     echo "~{samplename},~{read1},~{read2}" >> sample.csv
22 | 
23 |     # Run MycoSNP
24 |     mkdir ~{samplename}
25 |     cd ~{samplename}
26 |     if nextflow run rpetit3/mycosnp-nf --input ../sample.csv --ref_dir /reference/~{accession} --publish_dir_mode copy --skip_phylogeny; then
27 |       # Everything finished, pack up the results and clean up
28 |       rm -rf .nextflow/ work/
29 |       cd ..
30 |       tar -cf - ~{samplename}/ | gzip -n --best > ~{samplename}.tar.gz
31 |     else
32 |       # Run failed
33 |       exit 1
34 |     fi
35 |   >>>
36 |   output {
37 |     String mycosnp_version = read_string("MYCOSNP_VERSION")
38 |     String mycosnp_docker = docker
39 |     String analysis_date = read_string("DATE")
40 |     String reference_strain = strain
41 |     String reference_accession = accession
42 |     File assembly_fasta = "~{samplename}/results/combined/consensus/~{samplename}.fasta.gz"
43 |     File full_results = "~{samplename}.tar.gz"
44 |   }
45 |   runtime {
46 |     docker: "~{docker}"
47 |     memory: "~{memory} GB"
48 |     cpu: cpu
49 |     disks: "local-disk " + disk_size + " SSD"
50 |     disk: disk_size + " GB"
51 |     maxRetries: 3
52 |     preemptible: 0
53 |   }
54 | }
55 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_spatyper.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task spatyper {
 4 |   meta {
 5 |     description: "Computational method for finding spa types in Staphylococcus aureus"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/biocontainers/spatyper:0.3.3--pyhdfd78af_3"
11 |     Int disk_size = 100
12 |     Int cpu = 4
13 | 
14 |     # Parameters
15 |     # --do_enrich    Do PCR product enrichment
16 |     Boolean do_enrich = false
17 |   }
18 |   command <<<
19 |     spaTyper --version 2>&1 | sed 's/^.*spaTyper //' | tee VERSION
20 |     spaTyper \
21 |       ~{true="--do_enrich" false="" do_enrich} \
22 |       --fasta ~{assembly} \
23 |       --output ~{samplename}.tsv
24 | 
25 |     python3 <<CODE
26 | import csv
27 | 
28 | # parse the spaTyper TSV; the "Repeats" and "Type" column headers are
29 | # assumed from spaTyper's default output, and each value is written to
30 | # its own file for the read_string() outputs below
31 | with open("./~{samplename}.tsv", 'r') as tsv_file:
32 |   tsv_reader = csv.DictReader(tsv_file, delimiter="\t")
33 |   for line in tsv_reader:
34 |     with open("REPEATS", 'wt') as repeats:
35 |       repeats.write(line["Repeats"])
36 |     with open("TYPE", 'wt') as spa_type:
37 |       spa_type.write(line["Type"])
38 | CODE
39 | 
40 |     # guard against an empty spaTyper result so read_string() succeeds
41 |     if [ ! -f REPEATS ]; then echo "" > REPEATS; fi
42 |     if [ ! -f TYPE ]; then echo "" > TYPE; fi
43 | 
44 |   >>>
45 |   output {
46 |     File spatyper_tsv = "~{samplename}.tsv"
47 |     String spatyper_repeats = read_string("REPEATS")
48 |     String spatyper_type = read_string("TYPE")
49 |     String spatyper_version = read_string("VERSION")
50 |     String spatyper_docker = "~{docker}"
51 |   }
52 |   runtime {
53 |     docker: "~{docker}"
54 |     memory: "8 GB"
55 |     cpu: cpu
56 |     disks: "local-disk " + disk_size + " SSD"
57 |     disk: disk_size + " GB"
58 |     maxRetries: 3
59 |     preemptible: 0
60 |   }
61 | }
62 | 
--------------------------------------------------------------------------------
/tasks/gene_typing/task_abricate.wdl:
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task abricate { 4 | input { 5 | File assembly 6 | String samplename 7 | String database 8 | # Parameters 9 | # --minid Minimum DNA %identity [80] 10 | # --mincov Minimum DNA %coverage [80] 11 | Int? minid 12 | Int? mincov 13 | Int cpu = 2 14 | String docker = "staphb/abricate:1.0.1-abaum-plasmid" 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | date | tee DATE 19 | abricate -v | tee ABRICATE_VERSION 20 | abricate --list 21 | abricate --check 22 | 23 | abricate \ 24 | --db ~{database} \ 25 | ~{'--minid ' + minid} \ 26 | ~{'--mincov ' + mincov} \ 27 | --threads ~{cpu} \ 28 | --nopath \ 29 | ~{assembly} > ~{samplename}_abricate_hits.tsv 30 | 31 | # parse out gene names into list of strings, comma-separated, final comma at end removed by sed 32 | abricate_genes=$(awk -F '\t' '{ print $6 }' ~{samplename}_abricate_hits.tsv | tail -n+2 | tr '\n' ',' | sed 's/.$//') 33 | 34 | # if variable for list of genes is EMPTY, write string saying it is empty to float to Terra table 35 | if [ -z "${abricate_genes}" ]; then 36 | abricate_genes="No genes detected by ABRicate" 37 | fi 38 | 39 | # create final output strings 40 | echo "${abricate_genes}" > ABRICATE_GENES 41 | >>> 42 | output { 43 | File abricate_results = "~{samplename}_abricate_hits.tsv" 44 | String abricate_genes = read_string("ABRICATE_GENES") 45 | String abricate_database = database 46 | String abricate_version = read_string("ABRICATE_VERSION") 47 | String abricate_docker = docker 48 | } 49 | runtime { 50 | memory: "8 GB" 51 | cpu: cpu 52 | docker: docker 53 | disks: "local-disk " + disk_size + " SSD" 54 | disk: disk_size + " GB" 55 | maxRetries: 3 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tasks/species_typing/task_sistr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task sistr { 4 | meta { 5 | description: "Serovar prediction of Salmonella assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/sistr_cmd:1.1.1--pyh864c0ab_2" 11 | Int disk_size = 100 12 | Int? cpu = 4 13 | 14 | # Parameters 15 | # --use-full-cgmlst-db Use the full set of cgMLST alleles which can include highly similar alleles. By default the smaller "centroid" alleles or representative alleles are used for each marker. 
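    #   (the full allele set is larger, so expect longer runtimes; the
    #   default centroid set is generally sufficient for serovar calls)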
16 |     Boolean use_full_cgmlst_db = false
17 |   }
18 |   command <<<
19 |     echo $(sistr --version 2>&1) | sed 's/^.*sistr_cmd //; s/ .*\$//' | tee VERSION
20 |     sistr \
21 |       --qc \
22 |       ~{true="--use-full-cgmlst-db" false="" use_full_cgmlst_db} \
23 |       --threads ~{cpu} \
24 |       --alleles-output ~{samplename}-allele.json \
25 |       --novel-alleles ~{samplename}-allele.fasta \
26 |       --cgmlst-profiles ~{samplename}-cgmlst.csv \
27 |       --output-prediction ~{samplename} \
28 |       --output-format tab \
29 |       ~{assembly}
30 | 
31 |     mv ~{samplename}.tab ~{samplename}.tsv
32 | 
33 |     # parse the serovar prediction (column 15) out of the sistr TSV
34 |     cut -f 15 ~{samplename}.tsv | tail -n 1 | tee PREDICTED_SEROTYPE
35 | 
36 |   >>>
37 |   output {
38 |     File sistr_results = "~{samplename}.tsv"
39 |     File sistr_allele_json = "~{samplename}-allele.json"
40 |     File sistr_allele_fasta = "~{samplename}-allele.fasta"
41 |     File sistr_cgmlst = "~{samplename}-cgmlst.csv"
42 |     String sistr_version = read_string("VERSION")
43 |     String sistr_predicted_serotype = read_string("PREDICTED_SEROTYPE")
44 |   }
45 |   runtime {
46 |     docker: "~{docker}"
47 |     memory: "8 GB"
48 |     cpu: 4
49 |     disks: "local-disk " + disk_size + " SSD"
50 |     disk: disk_size + " GB"
51 |     maxRetries: 3
52 |     preemptible: 0
53 |   }
54 | }
55 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_serotypefinder.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task serotypefinder {
 4 |   input {
 5 |     File assembly
 6 |     String samplename
 7 |     String docker = "quay.io/staphb/serotypefinder:2.0.1"
 8 |     Int disk_size = 100
 9 |   }
10 |   command <<<
11 |     # capture date and version
12 |     date | tee DATE
13 | 
14 |     serotypefinder.py -i ~{assembly} -x -o .
15 |     mv results_tab.tsv ~{samplename}_results_tab.tsv
16 | 
17 |     # set H and O type based on serotypefinder outputs
18 |     python3 <<CODE
19 | import csv
20 | import re
21 | 
22 | # collect each antigen call from the results table; the "Serotype"
23 | # column header is assumed from SerotypeFinder's results_tab output
24 | antigens = []
25 | h_re = re.compile("H[0-9]*$")
26 | o_re = re.compile("O[0-9]*$")
27 | with open("~{samplename}_results_tab.tsv", 'r') as tsv_file:
28 |   tsv_reader = csv.DictReader(tsv_file, delimiter="\t")
29 |   for row in tsv_reader:
30 |     if row.get("Serotype") not in antigens:
31 |       antigens.append(row.get("Serotype"))
32 | print("Antigens: " + str(antigens))
33 | 
34 | # merge all H-antigen and O-antigen hits into slash-delimited strings
35 | h_type = "/".join(sorted(set(filter(h_re.match, antigens))))
36 | o_type = "/".join(sorted(set(filter(o_re.match, antigens))))
37 | print("H type: " + h_type)
38 | print("O type: " + o_type)
39 | 
40 | # combine into an O:H serotype; report NA when neither antigen was found
41 | serotype = "{}:{}".format(o_type, h_type)
42 | if serotype == ":":
43 |   serotype = "NA"
44 | with open("STF_SEROTYPE", 'wt') as stf_serotype:
45 |   stf_serotype.write(str(serotype))
46 | CODE
47 |   >>>
47 |   output {
48 |     File serotypefinder_report = "~{samplename}_results_tab.tsv"
49 |     String serotypefinder_docker = docker
50 |     String serotypefinder_serotype = read_string("STF_SEROTYPE")
51 |   }
52 |   runtime {
53 |     docker: "~{docker}"
54 |     memory: "8 GB"
55 |     cpu: 2
56 |     disks: "local-disk " + disk_size + " SSD"
57 |     disk: disk_size + " GB"
58 |     maxRetries: 3
59 |     preemptible: 0
60 |   }
61 | }
--------------------------------------------------------------------------------
/tasks/quality_control/task_quast.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task quast {
 4 |   input {
 5 |     File assembly
 6 |     String samplename
 7 |     String docker = "quay.io/staphb/quast:5.0.2"
 8 |     Int disk_size = 100
 9 |   }
10 |   command <<<
11 |     # capture date and version
12 |     date | tee DATE
13 |     quast.py --version | grep QUAST | tee VERSION
14 | 
15 |     quast.py ~{assembly} -o .
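    # QUAST always writes a fixed-name report.tsv in the output directory;
    # the rename below scopes it to the sample so downstream parsing and
    # delocalization are unambiguous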
16 |     mv report.tsv ~{samplename}_report.tsv
17 | 
18 |     python <<CODE
19 | import csv
20 | 
21 | # grab genome length, contig count, N50 and GC% by row label; the labels
22 | # ("Total length", "# contigs", "N50", "GC (%)") are assumed from the
23 | # layout of QUAST's report.tsv
24 | with open("~{samplename}_report.tsv", 'r') as tsv_file:
25 |   tsv_reader = csv.reader(tsv_file, delimiter="\t")
26 |   for line in tsv_reader:
27 |     if line[0] == "Total length":
28 |       with open("GENOME_LENGTH", 'wt') as genome_length:
29 |         genome_length.write(line[1])
30 |     if line[0] == "# contigs":
31 |       with open("NUMBER_CONTIGS", 'wt') as number_contigs:
32 |         number_contigs.write(line[1])
33 |     if line[0] == "N50":
34 |       with open("N50_VALUE", 'wt') as n50_value:
35 |         n50_value.write(line[1])
36 |     if line[0] == "GC (%)":
37 |       with open("GC_PERCENT", 'wt') as gc_percent:
38 |         gc_percent.write(line[1])
39 | CODE
40 |   >>>
40 |   output {
41 |     File quast_report = "~{samplename}_report.tsv"
42 |     String version = read_string("VERSION")
43 |     String pipeline_date = read_string("DATE")
44 |     Int genome_length = read_int("GENOME_LENGTH")
45 |     Int number_contigs = read_int("NUMBER_CONTIGS")
46 |     Int n50_value = read_int("N50_VALUE")
47 |     Float gc_percent = read_float("GC_PERCENT")
48 |   }
49 |   runtime {
50 |     docker: "~{docker}"
51 |     memory: "2 GB"
52 |     cpu: 2
53 |     disks: "local-disk " + disk_size + " SSD"
54 |     disk: disk_size + " GB"
55 |     maxRetries: 3
56 |     preemptible: 0
57 |   }
58 | }
--------------------------------------------------------------------------------
/tasks/species_typing/task_pmga.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task pmga {
 4 |   meta {
 5 |     description: "Serogrouping and serotyping of all Neisseria species and Haemophilus influenzae"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/staphb/pmga:3.0.2"
11 |     Int disk_size = 100
12 |     Int? cpu = 4
13 |   }
14 |   command <<<
15 |     echo $(pmga --version 2>&1) | sed 's/.*pmga //; s/ .*\$//' | tee VERSION
16 |     pmga \
17 |       ~{assembly} \
18 |       --blastdir /data/blastdbs \
19 |       --threads ~{cpu} \
20 |       --prefix ~{samplename}
21 |     # Parse pmga TSV
22 |     # https://github.com/rpetit3/pmga#pmga-output-files
23 |     cut -f 2 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_SPECIESDB
24 |     cut -f 3 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_SEROTYPE
25 |     cut -f 4 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_GENES
26 |     cut -f 5 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_NOTES
27 |   >>>
28 |   output {
29 |     String version = read_string("VERSION")
30 |     String pmga_docker = "~{docker}"
31 |     String pmga_speciesdb = read_string("PMGA_SPECIESDB")
32 |     String pmga_serotype = read_string("PMGA_SEROTYPE")
33 |     String pmga_genes = read_string("PMGA_GENES")
34 |     String pmga_notes = read_string("PMGA_NOTES")
35 |     File pmga_results = "./pmga/~{samplename}.txt"
36 |     File pmga_allele_matrix = "./pmga/~{samplename}-allele-matrix.txt"
37 |     File pmga_blast_final = "./pmga/~{samplename}-blast-final-results.json.gz"
38 |     File pmga_blast_raw = "./pmga/~{samplename}-blast-raw-results.json.gz"
39 |     File pmga_loci_counts = "./pmga/~{samplename}-loci-counts.txt"
40 |     File pmga_gff = "./pmga/~{samplename}.gff.gz"
41 |   }
42 |   runtime {
43 |     docker: "~{docker}"
44 |     memory: "8 GB"
45 |     cpu: 4
46 |     disks: "local-disk " + disk_size + " SSD"
47 |     disk: disk_size + " GB"
48 |     maxRetries: 3
49 |     preemptible: 0
50 |   }
51 | }
52 | 
--------------------------------------------------------------------------------
/workflows/wf_mashtree_fasta.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/phylogenetic_inference/task_mashtree.wdl" as mashtree
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary
 6 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
 7 | 
 8 | 
 9 | workflow mashtree_fasta {
10 |   input {
11 |     Array[File] assembly_fasta
12 |     String cluster_name
13 |     Array[String]? sample_names
14 |     String? data_summary_terra_project
15 |     String? data_summary_terra_workspace
16 |     String? data_summary_terra_table
17 |     String?
data_summary_column_names 18 | } 19 | call mashtree.mashtree_fasta as mashtree_task { 20 | input: 21 | assembly_fasta = assembly_fasta, 22 | cluster_name = cluster_name 23 | } 24 | call snp_dists.reorder_matrix { 25 | input: 26 | input_tree = mashtree_task.mashtree_tree, 27 | matrix = mashtree_task.mashtree_matrix, 28 | cluster_name = cluster_name 29 | } 30 | if (defined(data_summary_column_names)) { 31 | call data_summary.summarize_data { 32 | input: 33 | sample_names = sample_names, 34 | terra_project = data_summary_terra_project, 35 | terra_workspace = data_summary_terra_workspace, 36 | terra_table = data_summary_terra_table, 37 | column_names = data_summary_column_names, 38 | output_prefix = cluster_name 39 | } 40 | } 41 | call versioning.version_capture{ 42 | input: 43 | } 44 | output { 45 | # Versioning 46 | String mashtree_wf_version = version_capture.phbg_version 47 | String mashtree_wf_analysis_date = version_capture.date 48 | # Masthree Out 49 | File mashtree_matrix = reorder_matrix.ordered_matrix 50 | File mashtree_tree = reorder_matrix.tree 51 | String mashtree_version = mashtree_task.version 52 | # Data Summary Out 53 | File? mashtree_summarized_data = summarize_data.summarized_data 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /workflows/wf_tbprofiler_ont.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow tbprofiler_wf { 8 | input { 9 | File reads 10 | String samplename 11 | String? mapper = "minimap2" 12 | String? caller = "bcftools" 13 | Int? min_depth = 20 14 | Float? min_af = 0.1 15 | Float? min_af_pred = 0.1 16 | Int? 
cov_frac_threshold = 1 17 | } 18 | call taxon.tbprofiler_one_sample_ont { 19 | input: 20 | reads = reads, 21 | samplename = samplename, 22 | mapper = mapper, 23 | caller = caller, 24 | min_depth = min_depth, 25 | min_af = min_af, 26 | min_af_pred = min_af_pred, 27 | cov_frac_threshold = cov_frac_threshold 28 | } 29 | call versioning.version_capture{ 30 | input: 31 | } 32 | output { 33 | String tb_profiler_wf_version = version_capture.phbg_version 34 | String tb_profiler_wf_analysis_date = version_capture.date 35 | File tbprofiler_output_alignment_bam = tbprofiler_one_sample_ont.tbprofiler_output_bam 36 | File tbprofiler_output_alignment_bai = tbprofiler_one_sample_ont.tbprofiler_output_bai 37 | File tb_profiler_report_csv = tbprofiler_one_sample_ont.tbprofiler_output_csv 38 | File tb_profiler_report_tsv =tbprofiler_one_sample_ont.tbprofiler_output_tsv 39 | String tb_profiler_version = tbprofiler_one_sample_ont.version 40 | String tb_profiler_main_lineage = tbprofiler_one_sample_ont.tb_profiler_main_lineage 41 | String tb_profiler_sub_lineage = tbprofiler_one_sample_ont.tb_profiler_sub_lineage 42 | String tb_profiler_dr_type = tbprofiler_one_sample_ont.tb_profiler_dr_type 43 | String tb_profiler_num_dr_variants = tbprofiler_one_sample_ont.tb_profiler_num_dr_variants 44 | String tb_profiler_num_other_variants = tbprofiler_one_sample_ont.tb_profiler_num_other_variants 45 | String tb_profiler_resistance_genes = tbprofiler_one_sample_ont.tb_profiler_resistance_genes 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /workflows/wf_tbprofiler_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow tbprofiler_wf { 8 | input { 9 | File read1 10 | File read2 11 | String samplename 12 | String? mapper = "bwa" 13 | String? caller = "bcftools" 14 | Int? min_depth = 10 15 | Float? min_af = 0.1 16 | Float? min_af_pred = 0.1 17 | Int? 
cov_frac_threshold = 1 18 | } 19 | call taxon.tbprofiler_one_sample_pe { 20 | input: 21 | read1 = read1, 22 | read2 = read2, 23 | samplename = samplename, 24 | mapper = mapper, 25 | caller = caller, 26 | min_depth = min_depth, 27 | min_af = min_af, 28 | min_af_pred = min_af_pred, 29 | cov_frac_threshold = cov_frac_threshold 30 | } 31 | call versioning.version_capture{ 32 | input: 33 | } 34 | output { 35 | String tb_profiler_wf_version = version_capture.phbg_version 36 | String tb_profiler_wf_analysis_date = version_capture.date 37 | File tb_profiler_report_csv = tbprofiler_one_sample_pe.tbprofiler_output_csv 38 | File tb_profiler_report_tsv = tbprofiler_one_sample_pe.tbprofiler_output_tsv 39 | File tbprofiler_output_alignment_bam = tbprofiler_one_sample_pe.tbprofiler_output_bam 40 | File tbprofiler_output_alignment_bai = tbprofiler_one_sample_pe.tbprofiler_output_bai 41 | String tb_profiler_version = tbprofiler_one_sample_pe.version 42 | String tb_profiler_main_lineage = tbprofiler_one_sample_pe.tb_profiler_main_lineage 43 | String tb_profiler_sub_lineage = tbprofiler_one_sample_pe.tb_profiler_sub_lineage 44 | String tb_profiler_dr_type = tbprofiler_one_sample_pe.tb_profiler_dr_type 45 | String tb_profiler_num_dr_variants = tbprofiler_one_sample_pe.tb_profiler_num_dr_variants 46 | String tb_profiler_num_other_variants = tbprofiler_one_sample_pe.tb_profiler_num_other_variants 47 | String tb_profiler_resistance_genes = tbprofiler_one_sample_pe.tb_profiler_resistance_genes 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /.github/workflows/miniwdl-check.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow will run on Pushes and Pull Requests against the main branch. It 3 | # will only run "miniwdl check" on wdl files that have had a change in the push 4 | # or PR. 
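# The same check can be reproduced locally before pushing (assuming
# miniwdl is available, e.g. via `pip3 install miniwdl`):
#
#   miniwdl check workflows/wf_rasusa.wdl
#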
5 | # 6 | name: MiniWDL Check 7 | on: 8 | push: 9 | branches: [main] 10 | pull_request: 11 | branches: [main] 12 | 13 | jobs: 14 | changes: 15 | name: Check for changes 16 | runs-on: ubuntu-latest 17 | outputs: 18 | # Expose workflows with changes 19 | workflows: ${{ steps.filter.outputs.wf }} 20 | workflows_files: ${{ steps.filter.outputs.wf_files }} 21 | steps: 22 | # Checkout the repo 23 | - uses: actions/checkout@v3 24 | 25 | # Select wdl files with changes 26 | - uses: dorny/paths-filter@v2 27 | id: filter 28 | with: 29 | filters: | 30 | wf: 31 | - 'tasks/**' 32 | - 'workflows/**' 33 | list-files: json 34 | 35 | check: 36 | runs-on: ubuntu-20.04 37 | name: ${{ matrix.wf }} 38 | needs: changes 39 | if: ${{ needs.changes.outputs.workflows != '[]' && needs.changes.outputs.workflows != '' }} 40 | strategy: 41 | fail-fast: false 42 | matrix: 43 | wf: ${{ fromJson(needs.changes.outputs.workflows_files) }} 44 | steps: 45 | # Checkout the repo 46 | - uses: actions/checkout@v3 47 | 48 | # Install a version of Python3 49 | - name: Set up Python 50 | uses: actions/setup-python@v2 51 | with: 52 | python-version: "3.x" 53 | 54 | # Install MiniWDL (WDL syntax) and ShellCheck (shell syntax) 55 | - name: install dependencies 56 | run: | 57 | sudo apt-get update 58 | sudo apt-get -y install shellcheck 59 | pip3 -q install miniwdl 'importlib-metadata==4.13.0' 60 | 61 | # Run MiniWDL check on each of the changed WDLs 62 | - name: MiniWDL Check ${{ matrix.wf }} 63 | run: miniwdl check ${{ matrix.wf }} 64 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_plasmidfinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task plasmidfinder { 4 | input { 5 | File assembly 6 | String samplename 7 | Int cpu = 8 8 | Int memory = 16 9 | String docker = "staphb/plasmidfinder:2.1.6" 10 | Int disk_size = 100 11 | String? database 12 | String? database_path 13 | String? method_path 14 | # minimum coverage threshold 15 | Float? min_cov 16 | # minimum blast identity threshold 17 | Float? threshold 18 | 19 | } 20 | command <<< 21 | date | tee DATE 22 | 23 | if [[ ! -z "~{database}" ]]; then 24 | echo "User database identified; ~{database} will be utilized for analysis" 25 | plasmidfinder_db_version="~{database}" 26 | else 27 | plasmidfinder_db_version="unmodified from plasmidfinder docker container" 28 | fi 29 | 30 | echo ${plasmidfinder_db_version} | tee PLASMIDFINDER_DB_VERSION 31 | 32 | plasmidfinder.py \ 33 | -i ~{assembly} \ 34 | -x \ 35 | ~{'-d ' + database} \ 36 | ~{'-p ' + database_path} \ 37 | ~{'-mp ' + method_path} \ 38 | ~{'-l ' + min_cov} \ 39 | ~{'-t ' + threshold} 40 | 41 | # parse outputs 42 | if [ ! 
-f results_tab.tsv ]; then 43 | PF="No plasmids detected in database" 44 | else 45 | PF="$(tail -n +2 results_tab.tsv | cut -f 2 | sort | uniq -u | paste -s -d, - )" 46 | if [ "$PF" == "" ]; then 47 | PF="No plasmids detected in database" 48 | fi 49 | fi 50 | echo $PF | tee PLASMIDS 51 | 52 | mv results_tab.tsv ~{samplename}_results.tsv 53 | mv Hit_in_genome_seq.fsa ~{samplename}_seqs.fsa 54 | 55 | >>> 56 | output { 57 | String plasmidfinder_plasmids = read_string("PLASMIDS") 58 | File plasmidfinder_results = "~{samplename}_results.tsv" 59 | File plasmidfinder_seqs = "~{samplename}_seqs.fsa" 60 | String plasmidfinder_docker = docker 61 | String plasmidfinder_db_version = read_string("PLASMIDFINDER_DB_VERSION") 62 | } 63 | runtime { 64 | memory: "~{memory} GB" 65 | cpu: cpu 66 | docker: "~{docker}" 67 | disks: "local-disk " + disk_size + " SSD" 68 | disk: disk_size + " GB" 69 | maxRetries: 3 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /tasks/quality_control/task_busco.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task busco { 4 | meta { 5 | description: "Run BUSCO on assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "ezlabgva/busco:v5.3.2_cv1" 11 | Int disk_size = 100 12 | Boolean eukaryote = false 13 | } 14 | command <<< 15 | # get version 16 | busco --version | tee "VERSION" 17 | 18 | # run busco 19 | # -i input assembly 20 | # -m geno for genome input 21 | # -o output file tag 22 | # --auto-lineage-euk looks at only eukaryotic organisms 23 | # --auto-lineage-prok looks at only prokaryotic organisms; default 24 | busco \ 25 | -i ~{assembly} \ 26 | -m geno \ 27 | -o ~{samplename} \ 28 | ~{true='--auto-lineage-euk' false='--auto-lineage-prok' eukaryote} 29 | 30 | # check for existence of output file; otherwise display a string that says the output was not created 31 | if [ -f ~{samplename}/short_summary.specific.*.~{samplename}.txt ]; then 32 | 33 | # grab the database version and format it according to BUSCO recommendations 34 | cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "dataset is:" | cut -d' ' -f 6,9 | sed 's/,//' | sed 's/ / (/' | sed 's/$/)/' | tee DATABASE 35 | 36 | # extract the results string 37 | cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "C:" | tee BUSCO_RESULTS 38 | 39 | cp ~{samplename}/short_summary.specific.*.~{samplename}.txt ~{samplename}_busco-summary.txt 40 | else 41 | echo "BUSCO FAILED" | tee BUSCO_RESULTS 42 | echo "NA" > DATABASE 43 | fi 44 | >>> 45 | output { 46 | String busco_version = read_string("VERSION") 47 | String busco_database = read_string("DATABASE") 48 | String busco_results = read_string("BUSCO_RESULTS") 49 | File? busco_report = "~{samplename}_busco-summary.txt" 50 | } 51 | runtime { 52 | docker: "~{docker}" 53 | memory: "8 GB" 54 | cpu: 2 55 | disks: "local-disk " + disk_size + " SSD" 56 | disk: disk_size + " GB" 57 | maxRetries: 3 58 | preemptible: 0 59 | } 60 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_ectyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ectyper { 4 | meta { 5 | description: "In-silico prediction of Escherichia coli serotype" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/ectyper:1.0.0--pyhdfd78af_1" 11 | Int disk_size = 100 12 | Int? 
cpu = 4 13 | 14 | # ECTyper Parameters 15 | # --opid [integer] Percent identity required for an O antigen allele match [default: 90] 16 | # --opcov [integer] Minimum percent coverage required for an O antigen allele match [default: 90] 17 | # --hpid [integer] Percent identity required for an H antigen allele match [default: 95] 18 | # --hpcov [integer] Minimum percent coverage required for an H antigen allele match [default: 50] 19 | # --verify [boolean] Enable E. coli species verification 20 | # --print_alleles [boolean] Prints the allele sequences if enabled as the final column 21 | Int opid = 90 22 | Int hpid = 95 23 | Int opcov = 90 24 | Int hpcov = 50 25 | Boolean verify = false 26 | Boolean print_alleles = false 27 | } 28 | command <<< 29 | echo $(ectyper --version 2>&1) | sed 's/.*ectyper //; s/ .*$//' | tee VERSION 30 | ectyper \ 31 | ~{'-opid ' + opid} \ 32 | ~{'-hpid ' + hpid} \ 33 | ~{'-opcov ' + opcov} \ 34 | ~{'-hpcov ' + hpcov} \ 35 | ~{true="--verify" false="" verify} \ 36 | ~{true="-s" false="" print_alleles} \ 37 | --cores ~{cpu} \ 38 | --output ./ \ 39 | --input ~{assembly} 40 | mv output.tsv ~{samplename}.tsv 41 | # parse ECTyper TSV 42 | cut -f 5 ~{samplename}.tsv | tail -n 1 | tee PREDICTED_SEROTYPE 43 | >>> 44 | output { 45 | File ectyper_results = "~{samplename}.tsv" 46 | String ectyper_predicted_serotype = read_string("PREDICTED_SEROTYPE") 47 | String ectyper_version = read_string("VERSION") 48 | } 49 | runtime { 50 | docker: "~{docker}" 51 | memory: "8 GB" 52 | cpu: 4 53 | disks: "local-disk " + disk_size + " SSD" 54 | disk: disk_size + " GB" 55 | maxRetries: 3 56 | preemptible: 0 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /tasks/species_typing/task_pbptyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pbptyper { 4 | meta { 5 | description: "In silico Penicillin Binding Protein (PBP) typer for Streptococcus pneumoniae assemblies. https://github.com/rpetit3/pbptyper" 6 | } 7 | input { 8 | File assembly # An assembly in FASTA format (compressed with gzip, or uncompressed) to predict the PBP type on. 9 | String samplename 10 | String? db # A path to a directory containing FASTA files for 1A, 2B, and 2X proteins. In most cases using the default value will be all that is needed.
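# For quick local debugging, a single-task file like this one can be run
# directly with miniwdl (a sketch; "sample1.fasta" is a hypothetical input,
# and the threshold shown merely illustrates overriding the defaults below):
#   miniwdl run tasks/species_typing/task_pbptyper.wdl \
#     assembly=sample1.fasta samplename=sample1 min_pident=90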
11 | Int min_pident = 95 # Minimum percent identity to count a hit [default: 95] 12 | Int min_coverage = 95 # Minimum percent coverage to count a hit [default: 95] 13 | String docker = "staphb/pbptyper:1.0.4" 14 | Int disk_size = 100 15 | Int cpus = 4 16 | 17 | } 18 | command <<< 19 | # get version information 20 | pbptyper --version | sed 's/pbptyper, //' | tee VERSION 21 | 22 | # run pbptyper 23 | pbptyper \ 24 | --assembly ~{assembly} \ 25 | ~{'--db ' + db} \ 26 | ~{'--min_pident ' + min_pident} \ 27 | ~{'--min_coverage ' + min_coverage} \ 28 | --prefix "~{samplename}" \ 29 | --outdir ./ 30 | 31 | # parse output tsv for pbptype 32 | cut -f 2 ~{samplename}.tsv | tail -n 1 > pbptype.txt 33 | 34 | >>> 35 | output { 36 | String pbptyper_predicted_1A_2B_2X = read_string("pbptype.txt") 37 | File pbptyper_pbptype_predicted_tsv = "~{samplename}.tsv" # A tab-delimited file with the predicted PBP type 38 | File pbptyper_pbptype_1A_tsv = "~{samplename}-1A.tblastn.tsv" # A tab-delimited file of all blast hits against 1A 39 | File pbptyper_pbptype_2B_tsv = "~{samplename}-2B.tblastn.tsv" # A tab-delimited file of all blast hits against 2B 40 | File pbptyper_pbptype_2X_tsv = "~{samplename}-2X.tblastn.tsv" # A tab-delimited file of all blast hits against 2X 41 | String pbptyper_version = read_string("VERSION") 42 | String pbptyper_docker = docker 43 | } 44 | runtime { 45 | docker: "~{docker}" 46 | memory: "16 GB" 47 | cpu: cpus 48 | disks: "local-disk " + disk_size + " SSD" 49 | disk: disk_size + " GB" 50 | maxRetries: 3 51 | preemptible: 0 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /.dockstore.yml: -------------------------------------------------------------------------------- 1 | version: 1.2 2 | workflows: 3 | - name: kSNP3 4 | subclass: WDL 5 | primaryDescriptorPath: /workflows/wf_ksnp3.wdl 6 | testParameterFiles: 7 | - empty.json 8 | - name: Gambit_Query 9 | subclass: WDL 10 | primaryDescriptorPath: /workflows/wf_gambit_query.wdl 11 | testParameterFiles: 12 | - empty.json 13 | - name: Kleborate 14 | subclass: WDL 15 | primaryDescriptorPath: /workflows/wf_kleborate.wdl 16 | testParameterFiles: 17 | - empty.json 18 | - name: SerotypeFinder 19 | subclass: WDL 20 | primaryDescriptorPath: /workflows/wf_serotypefinder.wdl 21 | testParameterFiles: 22 | - empty.json 23 | - name: TBProfiler_Illumina_PE 24 | subclass: WDL 25 | primaryDescriptorPath: /workflows/wf_tbprofiler_pe.wdl 26 | testParameterFiles: 27 | - empty.json 28 | - name: TBProfiler_ONT 29 | subclass: WDL 30 | primaryDescriptorPath: /workflows/wf_tbprofiler_ont.wdl 31 | testParameterFiles: 32 | - empty.json 33 | - name: TheiaProk_Illumina_PE 34 | subclass: WDL 35 | primaryDescriptorPath: /workflows/wf_theiaprok_illumina_pe.wdl 36 | testParameterFiles: 37 | - empty.json 38 | - name: TheiaProk_Illumina_SE 39 | subclass: WDL 40 | primaryDescriptorPath: /workflows/wf_theiaprok_illumina_se.wdl 41 | testParameterFiles: 42 | - empty.json 43 | - name: MashTree_FASTA 44 | subclass: WDL 45 | primaryDescriptorPath: /workflows/wf_mashtree_fasta.wdl 46 | testParameterFiles: 47 | - empty.json 48 | - name: NCBI-AMRFinderPlus 49 | subclass: WDL 50 | primaryDescriptorPath: /workflows/wf_amrfinderplus.wdl 51 | testParameterFiles: 52 | - empty.json 53 | - name: Kraken2_PE 54 | subclass: WDL 55 | primaryDescriptorPath: /workflows/wf_kraken2_pe.wdl 56 | testParameterFiles: 57 | - empty.json 58 | - name: Kraken2_SE 59 | subclass: WDL 60 | primaryDescriptorPath: /workflows/wf_kraken2_se.wdl 61 | 
testParameterFiles: 62 | - empty.json 63 | - name: RASUSA 64 | subclass: WDL 65 | primaryDescriptorPath: /workflows/wf_rasusa.wdl 66 | testParameterFiles: 67 | - empty.json 68 | - name: Core_Gene_SNP 69 | subclass: WDL 70 | primaryDescriptorPath: /workflows/wf_core_gene_snp.wdl 71 | testParameterFiles: 72 | - empty.json -------------------------------------------------------------------------------- /tasks/utilities/task_rasusa.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task rasusa { 4 | meta { 5 | description: "Randomly subsample sequencing reads to a specified coverage (https://github.com/mbhall88/rasusa)" 6 | } 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | String docker = "staphb/rasusa:0.7.0" 12 | Int disk_size = 100 13 | Int cpu = 4 14 | # RASUSA Parameters 15 | # --bases [STRING] Explicitly set the number of bases required e.g., 4.3kb, 7Tb, 9000, 4.1MB. If this option is given, --coverage and --genome-size are ignored 16 | # --coverage [FLOAT] The desired coverage to sub-sample the reads to. If --bases is not provided, this option and --genome-size are required 17 | # --genome-size [STRING] Genome size to calculate coverage with respect to. e.g., 4.3kb, 7Tb, 9000, 4.1MB 18 | # --seed [INTEGER] Random seed to use 19 | # --frac [FLOAT] Subsample to a fraction of the reads - e.g., 0.5 samples half the reads 20 | # --num [INTEGER] Subsample to a specific number of reads 21 | String? bases 22 | Float coverage 23 | String genome_size 24 | Int? seed 25 | Float? frac 26 | Int? num 27 | } 28 | command <<< 29 | rasusa --version | tee VERSION 30 | # set single-end or paired-end outputs 31 | if [ -z "~{read2}" ]; then 32 | OUTPUT_FILES="~{samplename}_subsampled_R1.fastq.gz" 33 | else 34 | OUTPUT_FILES="~{samplename}_subsampled_R1.fastq.gz ~{samplename}_subsampled_R2.fastq.gz" 35 | fi 36 | # ignore coverage values if frac input provided 37 | if [ -z "~{frac}" ]; then 38 | COVERAGE="--coverage ~{coverage} --genome-size ~{genome_size}" 39 | else 40 | COVERAGE="" 41 | fi 42 | # run rasusa 43 | rasusa \ 44 | -i ~{read1} ~{read2} \ 45 | ${COVERAGE} \ 46 | ~{'--seed ' + seed} \ 47 | ~{'--bases ' + bases} \ 48 | ~{'--frac ' + frac} \ 49 | ~{'--num ' + num} \ 50 | -o ${OUTPUT_FILES} 51 | >>> 52 | output { 53 | File read1_subsampled = "~{samplename}_subsampled_R1.fastq.gz" 54 | File?
read2_subsampled = "~{samplename}_subsampled_R2.fastq.gz" 55 | String rasusa_version = read_string("VERSION") 56 | } 57 | runtime { 58 | docker: "~{docker}" 59 | memory: "8 GB" 60 | cpu: cpu 61 | disks: "local-disk " + disk_size + " SSD" 62 | disk: disk_size + " GB" 63 | maxRetries: 3 64 | preemptible: 0 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tasks/species_typing/task_staphopiasccmec.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task staphopiasccmec { 4 | meta { 5 | description: "Primer based SCCmec typing of Staphylococcus aureus genomes" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0" 11 | Int disk_size = 100 12 | Int cpu = 1 13 | } 14 | command <<< 15 | # get version 16 | staphopia-sccmec --version 2>&1 | sed 's/^.*staphopia-sccmec //' | tee VERSION 17 | 18 | # run staphopia-sccmec on input assembly; hamming option OFF; outputs are true/false 19 | staphopia-sccmec \ 20 | --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.summary.tsv 21 | 22 | # run staphopia-sccmec on input assembly; hamming option ON; outputs are the hamming distance; 0 is exact match 23 | staphopia-sccmec \ 24 | --hamming \ 25 | --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.hamming.tsv 26 | 27 | # please excuse this ugly bash code below :) 28 | 29 | # parse output summary TSV for true matches 30 | # look for columns that contain the word "True" and print the column numbers in a list to a file col_headers.txt 31 | awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "True") print i } }' ~{samplename}.staphopia-sccmec.summary.tsv | tee col_headers.txt 32 | 33 | # use column number list to print column headers (example: IV, mecA, etc.) 
to a file type.txt 34 | cat col_headers.txt | while read -r COL_NUMBER; do \ 35 | cut -f "$COL_NUMBER" ~{samplename}.staphopia-sccmec.summary.tsv | head -n 1 >>type.txt 36 | echo "," >>type.txt 37 | done 38 | 39 | # remove newlines, remove trailing comma; generate output string of comma separated values 40 | cat type.txt | tr -d '\n' | sed 's|.$||g' | tee TYPES_AND_MECA.txt 41 | 42 | >>> 43 | output { 44 | File staphopiasccmec_results_tsv = "~{samplename}.staphopia-sccmec.summary.tsv" 45 | File staphopiasccmec_hamming_distance_tsv = "~{samplename}.staphopia-sccmec.hamming.tsv" 46 | String staphopiasccmec_types_and_mecA_presence = read_string("TYPES_AND_MECA.txt") 47 | String staphopiasccmec_version = read_string("VERSION") 48 | String staphopiasccmec_docker = docker 49 | } 50 | runtime { 51 | docker: "~{docker}" 52 | memory: "4 GB" 53 | cpu: cpu 54 | disks: "local-disk " + disk_size + " SSD" 55 | disk: disk_size + " GB" 56 | maxRetries: 3 57 | preemptible: 0 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_bakta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task bakta { 4 | input { 5 | File assembly 6 | File bakta_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/bakta_db_2022-08-29.tar.gz" 7 | String samplename 8 | Int cpu = 8 9 | Int memory = 16 10 | String docker = "quay.io/biocontainers/bakta:1.5.1--pyhdfd78af_0" 11 | Int disk_size = 100 12 | # Parameters 13 | # proteins: Fasta file of trusted protein sequences for CDS annotation 14 | # prodigal_tf: Prodigal training file to use for CDS prediction 15 | # bakta_opts: any additional bakta arguments 16 | Boolean proteins = false 17 | Boolean compliant = false 18 | File? prodigal_tf 19 | String? 
bakta_opts 20 | } 21 | command <<< 22 | date | tee DATE 23 | bakta --version | tee BAKTA_VERSION 24 | 25 | # Extract Bakta DB 26 | mkdir db 27 | time tar xzvf ~{bakta_db} --strip-components=1 -C ./db 28 | 29 | # Install amrfinderplus db 30 | amrfinder_update --database db/amrfinderplus-db 31 | amrfinder --database_version | tee AMRFINDER_DATABASE_VERSION 32 | 33 | bakta \ 34 | ~{bakta_opts} \ 35 | --db db/ \ 36 | --threads ~{cpu} \ 37 | --prefix ~{samplename} \ 38 | --output ~{samplename} \ 39 | ~{true='--compliant' false='' compliant} \ 40 | ~{true='--proteins' false='' proteins} \ 41 | ~{'--prodigal-tf ' + prodigal_tf} \ 42 | ~{assembly} 43 | 44 | # rename gff3 to gff for compatibility with downstream analysis (pirate) 45 | mv "~{samplename}/~{samplename}.gff3" "~{samplename}/~{samplename}.gff" 46 | 47 | >>> 48 | output { 49 | File bakta_embl = "~{samplename}/~{samplename}.embl" 50 | File bakta_faa = "~{samplename}/~{samplename}.faa" 51 | File bakta_ffn = "~{samplename}/~{samplename}.ffn" 52 | File bakta_fna = "~{samplename}/~{samplename}.fna" 53 | File bakta_gbff = "~{samplename}/~{samplename}.gbff" 54 | File bakta_gff3 = "~{samplename}/~{samplename}.gff" 55 | File bakta_hypotheticals_faa = "~{samplename}/~{samplename}.hypotheticals.faa" 56 | File bakta_hypotheticals_tsv = "~{samplename}/~{samplename}.hypotheticals.tsv" 57 | File bakta_tsv = "~{samplename}/~{samplename}.tsv" 58 | File bakta_txt = "~{samplename}/~{samplename}.txt" 59 | String bakta_version = read_string("BAKTA_VERSION") 60 | } 61 | runtime { 62 | memory: "~{memory} GB" 63 | cpu: cpu 64 | docker: docker 65 | disks: "local-disk " + disk_size + " SSD" 66 | disk: disk_size + " GB" 67 | maxRetries: 3 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tasks/species_typing/task_meningotype.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task meningotype { 4 | meta { 5 | description: "Serotyping of Neisseria meningitidis" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/meningotype:0.8.5--pyhdfd78af_0" 11 | Int disk_size = 100 12 | Int cpu = 2 13 | } 14 | command <<< 15 | 16 | # Parameters 17 | # --finetype perform porA and fetA fine typing (default=off) 18 | # --porB perform porB sequence typing (NEIS2020) (default=off) 19 | # --bast perform Bexsero antigen sequence typing (BAST) (default=off) 20 | # --mlst perform MLST (default=off) 21 | # --all perform MLST, porA, fetA, porB, BAST typing (default=off) 22 | 23 | echo $(meningotype --version 2>&1) | sed 's/^.*meningotype v//' | tee VERSION 24 | meningotype \ 25 | --finetype \ 26 | --porB \ 27 | --bast \ 28 | --cpus ~{cpu} \ 29 | ~{assembly} \ 30 | > ~{samplename}.tsv 31 | 32 | tail -1 ~{samplename}.tsv | awk '{print $2}' | tee MENINGOTYPE_SEROTYPE 33 | tail -1 ~{samplename}.tsv | awk '{print $5}' | tee MENINGOTYPE_PORA 34 | tail -1 ~{samplename}.tsv | awk '{print $6}' | tee MENINGOTYPE_FETA 35 | tail -1 ~{samplename}.tsv | awk '{print $7}' | tee MENINGOTYPE_PORB 36 | tail -1 ~{samplename}.tsv | awk '{print $8}' | tee MENINGOTYPE_FHBP 37 | tail -1 ~{samplename}.tsv | awk '{print $9}' | tee MENINGOTYPE_NHBA 38 | tail -1 ~{samplename}.tsv | awk '{print $10}' | tee MENINGOTYPE_NADA 39 | tail -1 ~{samplename}.tsv | awk '{print $11}' | tee MENINGOTYPE_BAST 40 | 41 | >>> 42 | output { 43 | File meningotype_tsv = "~{samplename}.tsv" 44 | String meningotype_version = read_string("VERSION") 45 | String meningotype_serogroup = 
read_string("MENINGOTYPE_SEROTYPE") 46 | String meningotype_PorA = read_string("MENINGOTYPE_PORA") 47 | String meningotype_FetA = read_string("MENINGOTYPE_FETA") 48 | String meningotype_PorB = read_string("MENINGOTYPE_PORB") 49 | String meningotype_fHbp = read_string("MENINGOTYPE_FHBP") 50 | String meningotype_NHBA = read_string("MENINGOTYPE_NHBA") 51 | String meningotype_NadA = read_string("MENINGOTYPE_NADA") 52 | String meningotype_BAST = read_string("MENINGOTYPE_BAST") 53 | } 54 | runtime { 55 | docker: "~{docker}" 56 | memory: "8 GB" 57 | cpu: cpu 58 | disks: "local-disk " + disk_size + " SSD" 59 | disk: disk_size + " GB" 60 | maxRetries: 3 61 | preemptible: 0 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tests/inputs/wf_theiaprok_illumina_pe.json: -------------------------------------------------------------------------------- 1 | { 2 | "theiaprok_illumina_pe.samplename": "test", 3 | "theiaprok_illumina_pe.read1_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R1.fastq.gz", 4 | "theiaprok_illumina_pe.read2_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R2.fastq.gz", 5 | "theiaprok_illumina_pe.skip_screen": true, 6 | "theiaprok_illumina_pe.read_QC_trim.read_processing": "trimmomatic", 7 | "theiaprok_illumina_pe.read_QC_trim.call_midas": false, 8 | "theiaprok_illumina_pe.read_QC_trim.midas.midas_db" : "./tests/inputs/empty-for-test.txt", 9 | "theiaprok_illumina_pe.genome_annotation": "prokka", 10 | "theiaprok_illumina_pe.shovill_pe.assembler": "skesa", 11 | "theiaprok_illumina_pe.merlin_magic.call_poppunk": false, 12 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_dists_npy" : "./tests/inputs/empty-for-test.txt", 13 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_dists_pkl" : "./tests/inputs/empty-for-test.txt", 14 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_h5" : "./tests/inputs/empty-for-test.txt", 15 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs" : "./tests/inputs/empty-for-test.txt", 16 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_dists_npy" : "./tests/inputs/empty-for-test.txt", 17 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_dists_pkl" : "./tests/inputs/empty-for-test.txt", 18 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_h5" : "./tests/inputs/empty-for-test.txt", 19 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_clusters_csv" : "./tests/inputs/empty-for-test.txt", 20 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_fit_npz" : "./tests/inputs/empty-for-test.txt", 21 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_fit_pkl" : "./tests/inputs/empty-for-test.txt", 22 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_graph_gt" : "./tests/inputs/empty-for-test.txt", 23 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_qcreport_txt" : "./tests/inputs/empty-for-test.txt", 24 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_unword_clusters_csv" : "./tests/inputs/empty-for-test.txt", 25 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_graph_gt" : "./tests/inputs/empty-for-test.txt", 26 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_external_clusters_csv" : "./tests/inputs/empty-for-test.txt", 27 | "theiaprok_illumina_pe.bakta.bakta_db" : "./tests/inputs/empty-for-test.txt" 28 | } 29 | -------------------------------------------------------------------------------- /tasks/species_typing/task_seqsero2.wdl: 
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task seqsero2 { 4 | # Inputs 5 | input { 6 | File read1 7 | File? read2 8 | String samplename 9 | String mode ="a" 10 | String seqsero2_docker_image = "quay.io/staphb/seqsero2:1.2.1" 11 | Int disk_size = 100 12 | Boolean paired_end 13 | } 14 | 15 | command <<< 16 | # capture date and version 17 | # Print and save date 18 | date | tee DATE 19 | # Print and save version 20 | SeqSero2_package.py --version | tee VERSION 21 | # Run SeqSero2 on the input read data 22 | SeqSero2_package.py \ 23 | -p 8 \ 24 | ~{true='-t 2' false='-t 3' paired_end} \ 25 | -m ~{mode} \ 26 | -n ~{samplename} \ 27 | -d ~{samplename}_seqseqro2_output_dir \ 28 | -i ~{read1} ~{read2} 29 | # Run a python block to parse output file for terra data tables 30 | python3 <>> 53 | output { 54 | File seqsero2_report = "./~{samplename}_seqseqro2_output_dir/SeqSero_result.tsv" 55 | String seqsero2_version = read_string("VERSION") 56 | String seqsero2_predicted_antigenic_profile = read_string("PREDICTED_ANTIGENIC_PROFILE") 57 | String seqsero2_predicted_serotype = read_string("PREDICTED_SEROTYPE") 58 | String seqsero2_predicted_contamination = read_string("CONTAMINATION") 59 | } 60 | runtime { 61 | docker: "~{seqsero2_docker_image}" 62 | memory: "16 GB" 63 | cpu: 8 64 | disks: "local-disk " + disk_size + " SSD" 65 | disk: disk_size + " GB" 66 | preemptible: 0 67 | maxRetries: 3 68 | } 69 | } -------------------------------------------------------------------------------- /tasks/quality_control/task_trimmomatic.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task trimmomatic_pe { 4 | input { 5 | File read1 6 | File read2 7 | String samplename 8 | String docker = "quay.io/staphb/trimmomatic:0.39" 9 | Int? trimmomatic_window_size = 10 10 | Int? trimmomatic_quality_trim_score = 20 11 | Int? trimmomatic_minlen = 75 12 | Int? threads = 4 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | # date and version control 17 | date | tee DATE 18 | trimmomatic -version > VERSION && sed -i -e 's/^/Trimmomatic /' VERSION 19 | 20 | trimmomatic PE \ 21 | -threads ~{threads} \ 22 | ~{read1} ~{read2} \ 23 | -baseout ~{samplename}.fastq.gz \ 24 | SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \ 25 | MINLEN:~{trimmomatic_minlen} &> ~{samplename}.trim.stats.txt 26 | >>> 27 | output { 28 | File read1_trimmed = "~{samplename}_1P.fastq.gz" 29 | File read2_trimmed = "~{samplename}_2P.fastq.gz" 30 | File trimmomatic_stats = "~{samplename}.trim.stats.txt" 31 | String version = read_string("VERSION") 32 | String pipeline_date = read_string("DATE") 33 | } 34 | runtime { 35 | docker: "~{docker}" 36 | memory: "8 GB" 37 | cpu: 4 38 | disks: "local-disk " + disk_size + " SSD" 39 | disk: disk_size + " GB" 40 | maxRetries: 3 41 | preemptible: 0 42 | } 43 | } 44 | 45 | task trimmomatic_se { 46 | input { 47 | File read1 48 | String samplename 49 | String docker="quay.io/staphb/trimmomatic:0.39" 50 | Int? trimmomatic_window_size = 4 51 | Int? trimmomatic_quality_trim_score = 30 52 | Int? trimmomatic_minlen = 25 53 | Int? 
threads = 4 54 | Int disk_size = 100 55 | } 56 | command <<< 57 | # date and version control 58 | date | tee DATE 59 | trimmomatic -version > VERSION && sed -i -e 's/^/Trimmomatic /' VERSION 60 | 61 | trimmomatic SE \ 62 | -threads ~{threads} \ 63 | ~{read1} \ 64 | ~{samplename}_trimmed.fastq.gz \ 65 | SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \ 66 | MINLEN:~{trimmomatic_minlen} > ~{samplename}.trim.stats.txt 67 | >>> 68 | output { 69 | File read1_trimmed = "${samplename}_trimmed.fastq.gz" 70 | File trimmomatic_stats = "${samplename}.trim.stats.txt" 71 | String version = read_string("VERSION") 72 | String pipeline_date = read_string("DATE") 73 | } 74 | runtime { 75 | docker: "~{docker}" 76 | memory: "8 GB" 77 | cpu: 4 78 | disks: "local-disk " + disk_size + " SSD" 79 | disk: disk_size + " GB" 80 | maxRetries: 3 81 | preemptible: 0 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /tasks/species_typing/task_hicap.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task hicap { 4 | meta { 5 | description: "Identify cap locus serotype and structure in your Haemophilus influenzae assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/hicap:1.0.3--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 4 13 | 14 | # Parameters 15 | # --gene_coverage GENE_COVERAGE Minimum percentage coverage to consider a single gene complete. [default: 0.80] 16 | # --gene_identity GENE_IDENTITY Minimum percentage identity to consider a single gene complete. [default: 0.70] 17 | # --broken_gene_length BROKEN_GENE_LENGTH Minimum length to consider a broken gene. [default: 60] 18 | # --broken_gene_identity BROKEN_GENE_IDENTITY Minimum percentage identity to consider a broken gene. [default: 0.80] 19 | Float gene_coverage = 0.8 20 | Float gene_identity = 0.7 21 | Int broken_gene_length = 60 22 | Float broken_gene_identity = 0.8 23 | Boolean full_sequence = false 24 | Boolean debug = false 25 | } 26 | command <<< 27 | echo $( hicap --version 2>&1 ) | sed 's/^.*hicap //' | tee VERSION 28 | hicap \ 29 | --query_fp ~{assembly} \ 30 | ~{'--gene_coverage ' + gene_coverage} \ 31 | ~{'--gene_identity ' + gene_identity} \ 32 | ~{'--broken_gene_length ' + broken_gene_length} \ 33 | ~{'--broken_gene_identity ' + broken_gene_identity} \ 34 | ~{true="--full_sequence" false="" full_sequence} \ 35 | ~{true="--debug" false="" debug} \ 36 | --threads ~{cpu} \ 37 | -o ./ 38 | 39 | if [ !
-f ~{samplename}.tsv ]; then 40 | # No hits, make a tab-delimited file to say so for downstream merging 41 | echo -e "isolate\tpredicted_serotype\tattributes\tgenes_identified\tlocus_location\tregion_I_genes\tregion_II_genes\tregion_III_genes\tIS1016_hits" > ~{samplename}.tsv 42 | echo -e "~{samplename}\tcap_not_found\t-\t-\t-\t-\t-\t-\t-" >> ~{samplename}.tsv 43 | else 44 | sed -i 's/#isolate/isolate/' ~{samplename}.tsv 45 | fi 46 | >>> 47 | output { 48 | File hicap_results = "~{samplename}.tsv" 49 | File hicap_genbank = "~{samplename}.gbk" 50 | File hicap_image = "~{samplename}.svg" 51 | String hicap_version = read_string("VERSION") 52 | } 53 | runtime { 54 | docker: "~{docker}" 55 | memory: "8 GB" 56 | cpu: 4 57 | disks: "local-disk " + disk_size + " SSD" 58 | disk: disk_size + " GB" 59 | maxRetries: 3 60 | preemptible: 0 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_mycosnp_tree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task mycosnptree { 4 | input { 5 | Array[File] assembly_fasta 6 | Array[String] samplename 7 | String docker="quay.io/theiagen/mycosnp:dev" 8 | Int disk_size = 100 9 | String strain="B11205" 10 | String accession="GCA_016772135" 11 | } 12 | command <<< 13 | date | tee DATE 14 | echo $(nextflow pull rpetit3/mycosnp-nf 2>&1) | sed 's/^.*revision: //;' | tee MYCOSNPTREE_VERSION 15 | 16 | assembly_array=(~{sep=' ' assembly_fasta}) 17 | assembly_array_len=$(echo "${#assembly_array[@]}") 18 | samplename_array=(~{sep=' ' samplename}) 19 | samplename_array_len=$(echo "${#samplename_array[@]}") 20 | 21 | # Ensure assembly, and samplename arrays are of equal length 22 | if [ "$assembly_array_len" -ne "$samplename_array_len" ]; then 23 | echo "Assembly array (length: $assembly_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2 24 | exit 1 25 | fi 26 | 27 | # Make sample FOFN 28 | echo "sample,fasta" > samples.csv 29 | for index in ${!assembly_array[@]}; do 30 | assembly=${assembly_array[$index]} 31 | samplename=${samplename_array[$index]} 32 | echo -e "${samplename},${assembly}" >> samples.csv 33 | done 34 | 35 | # Run MycoSNP 36 | mkdir mycosnptree 37 | cd mycosnptree 38 | if nextflow run rpetit3/mycosnp-nf -entry NFCORE_MYCOSNPTREE --input ../samples.csv --fasta /reference/~{accession}/masked/reference-consensus.fa --publish_dir_mode copy --rapidnj False --fasttree False --iqtree; then 39 | # Everything finished, pack up the results and clean up 40 | find work/ -name "*.iqtree" | xargs -I {} cp {} ./ 41 | rm -rf .nextflow/ work/ 42 | cd ..
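# Archive the full results directory for export. "gzip -n" omits the input
# name and timestamp from the gzip header, so identical results compress to
# byte-identical archives across re-runs (useful for caching); "--best"
# trades CPU time for maximum compression.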
43 | tar -cf - mycosnptree/ | gzip -n --best > mycosnptree.tar.gz 44 | else 45 | # Run failed 46 | exit 1 47 | fi 48 | >>> 49 | output { 50 | String mycosnptree_version = read_string("MYCOSNPTREE_VERSION") 51 | String mycosnptree_docker = docker 52 | String analysis_date = read_string("DATE") 53 | String reference_strain = strain 54 | String reference_accession = accession 55 | File mycosnptree_tree = "mycosnptree/results/combined/phylogeny/iqtree/alignment.fasta.treefile" 56 | File mycosnptree_iqtree_log = "mycosnptree/alignment.fasta.iqtree" 57 | File mycosnptree_full_results = "mycosnptree.tar.gz" 58 | } 59 | runtime { 60 | docker: "~{docker}" 61 | memory: "32 GB" 62 | cpu: 4 63 | disks: "local-disk " + disk_size + " SSD" 64 | disk: disk_size + " GB" 65 | maxRetries: 3 66 | preemptible: 0 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /tasks/species_typing/task_emmtyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task emmtyper { 4 | meta { 5 | description: "emm-typing of Streptococcus pyogenes assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/emmtyper:0.2.0--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | 14 | # Parameters 15 | # --workflow [blast|pcr] Choose workflow [default: blast] 16 | # --cluster-distance INTEGER Distance between cluster of matches to consider as different clusters. [default: 500] 17 | # --percent-identity INTEGER [BLAST] Minimal percent identity of sequence. [default: 95] 18 | # --culling-limit INTEGER [BLAST] Total hits to return in a position. [default: 5] 19 | # --mismatch INTEGER [BLAST] Threshold for number of mismatch to allow in BLAST hit. [default: 4] 20 | # --align-diff INTEGER [BLAST] Threshold for difference between alignment length and subject length in BLAST hit. [default: 5] 21 | # --gap INTEGER [BLAST] Threshold gap to allow in BLAST hit. [default: 2] 22 | # --min-perfect INTEGER [isPcr] Minimum size of perfect match at 3' primer end. [default: 15] 23 | # --min-good INTEGER [isPcr] Minimum size where there must be 2 matches for each mismatch. [default: 15] 24 | # --max-size INTEGER [isPcr] Maximum size of PCR product. 
[default: 2000] 25 | 26 | String wf = "blast" 27 | Int cluster_distance = 500 28 | Int percid = 95 29 | Int culling_limit = 5 30 | Int mismatch = 4 31 | Int align_diff = 5 32 | Int gap = 2 33 | Int min_perfect = 15 34 | Int min_good = 15 35 | Int max_size = 2000 36 | } 37 | command <<< 38 | echo $(emmtyper --version 2>&1) | sed 's/^.*emmtyper v//' | tee VERSION 39 | emmtyper \ 40 | ~{'--workflow ' + wf} \ 41 | ~{'--cluster-distance ' + cluster_distance} \ 42 | ~{'--percent-identity ' + percid} \ 43 | ~{'--culling-limit ' + culling_limit} \ 44 | ~{'--mismatch ' + mismatch} \ 45 | ~{'--align-diff ' + align_diff} \ 46 | ~{'--gap ' + gap} \ 47 | ~{'--min-perfect ' + min_perfect} \ 48 | ~{'--min-good ' + min_good} \ 49 | ~{'--max-size ' + max_size} \ 50 | ~{assembly} \ 51 | > ~{samplename}.tsv 52 | >>> 53 | output { 54 | File emmtyper_results = "~{samplename}.tsv" 55 | String emmtyper_version = read_string("VERSION") 56 | } 57 | runtime { 58 | docker: "~{docker}" 59 | memory: "8 GB" 60 | cpu: 2 61 | disks: "local-disk " + disk_size + " SSD" 62 | disk: disk_size + " GB" 63 | maxRetries: 3 64 | preemptible: 0 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tasks/quality_control/task_bbduk.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task bbduk_pe { 4 | input { 5 | File read1_trimmed 6 | File read2_trimmed 7 | String samplename 8 | Int mem_size_gb=8 9 | String docker = "quay.io/staphb/bbtools:38.76" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # date and version control 14 | date | tee DATE 15 | 16 | repair.sh in1=~{read1_trimmed} in2=~{read2_trimmed} out1=~{samplename}.paired_1.fastq.gz out2=~{samplename}.paired_2.fastq.gz 17 | 18 | bbduk.sh in1=~{samplename}.paired_1.fastq.gz in2=~{samplename}.paired_2.fastq.gz out1=~{samplename}.rmadpt_1.fastq.gz out2=~{samplename}.rmadpt_2.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo 19 | 20 | bbduk.sh in1=~{samplename}.rmadpt_1.fastq.gz in2=~{samplename}.rmadpt_2.fastq.gz out1=~{samplename}_1.clean.fastq.gz out2=~{samplename}_2.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt 21 | 22 | >>> 23 | output { 24 | File read1_clean = "~{samplename}_1.clean.fastq.gz" 25 | File read2_clean = "~{samplename}_2.clean.fastq.gz" 26 | File adapter_stats = "~{samplename}.adapters.stats.txt" 27 | File phiX_stats = "~{samplename}.phix.stats.txt" 28 | String bbduk_docker = docker 29 | String pipeline_date = read_string("DATE") 30 | } 31 | runtime { 32 | docker: "~{docker}" 33 | memory: "~{mem_size_gb} GB" 34 | cpu: 4 35 | disks: "local-disk " + disk_size + " SSD" 36 | disk: disk_size + " GB" 37 | preemptible: 0 38 | maxRetries: 3 39 | } 40 | } 41 | 42 | task bbduk_se { 43 | input { 44 | File read1_trimmed 45 | String samplename 46 | Int mem_size_gb=8 47 | String docker="quay.io/staphb/bbtools:38.76" 48 | Int disk_size = 100 49 | } 50 | command <<< 51 | # date and version control 52 | date | tee DATE 53 | 54 | bbduk.sh in1=~{read1_trimmed} out1=~{samplename}.rmadpt_1.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo 55 | 56 | bbduk.sh in1=~{read1_trimmed} out1=~{samplename}_1.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt 57 | >>> 58 | output
{ 59 | File read1_clean = "~{samplename}_1.clean.fastq.gz" 60 | File adapter_stats = "~{samplename}.adapters.stats.txt" 61 | File phiX_stats = "~{samplename}.phix.stats.txt" 62 | String bbduk_docker = docker 63 | String pipeline_date = read_string("DATE") 64 | } 65 | runtime { 66 | docker: "~{docker}" 67 | memory: "~{mem_size_gb} GB" 68 | cpu: 4 69 | disks: "local-disk " + disk_size + " SSD" 70 | disk: disk_size + " GB" 71 | preemptible: 0 72 | maxRetries: 3 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tasks/species_typing/task_genotyphi.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task genotyphi { 4 | # Inputs 5 | input { 6 | File read1 7 | File? read2 8 | Boolean ont_data=false 9 | String samplename 10 | String genotyphi_docker_image = "staphb/mykrobe:0.11.0" 11 | Int disk_size = 100 12 | Int cpu = 4 13 | } 14 | command <<< 15 | # Print and save versions 16 | mykrobe --version | sed 's|mykrobe v||g' | tee MYKROBE_VERSION 17 | # super ugly oneliner since "python /genotyphi/genotyphi.py --version" does NOT work due to python syntax error 18 | grep '__version__ =' /genotyphi/genotyphi.py | sed "s|__version__ = '||" | sed "s|'||" | tee GENOTYPHI_VERSION 19 | 20 | # Run Mykrobe on the input read data 21 | mykrobe predict \ 22 | -t ~{cpu} \ 23 | --sample ~{samplename} \ 24 | --species typhi \ 25 | --format json \ 26 | --out ~{samplename}.mykrobe_genotyphi.json \ 27 | ~{true='--ont' false='' ont_data} \ 28 | --seq ~{read1} ~{read2} 29 | 30 | # use genotyphi script to produce TSV 31 | python /genotyphi/parse_typhi_mykrobe.py \ 32 | --jsons ~{samplename}.mykrobe_genotyphi.json \ 33 | --prefix ~{samplename}_mykrobe_genotyphi 34 | 35 | # Run a python block to parse output file for terra data tables 36 | python3 <>> 55 | output { 56 | File genotyphi_report_tsv = "./~{samplename}_mykrobe_genotyphi_predictResults.tsv" 57 | File genotyphi_mykrobe_json = "./~{samplename}.mykrobe_genotyphi.json" 58 | String genotyphi_version = read_string("GENOTYPHI_VERSION") 59 | String genotyphi_species = read_string("SPECIES") 60 | Float genotyphi_st_probes_percent_coverage = read_string("SPP_PERCENT") 61 | String genotyphi_final_genotype = read_string("FINAL_GENOTYPE") 62 | String genotyphi_genotype_confidence = read_string("CONFIDENCE") 63 | } 64 | runtime { 65 | docker: "~{genotyphi_docker_image}" 66 | memory: "8 GB" 67 | cpu: cpu 68 | disks: "local-disk " + disk_size + " SSD" 69 | disk: disk_size + " GB" 70 | preemptible: 0 71 | maxRetries: 3 72 | } 73 | } -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_ksnp3.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ksnp3 { 4 | input { 5 | Array[File] assembly_fasta 6 | Array[String] samplename 7 | String cluster_name 8 | Int kmer_size = 19 9 | String? 
ksnp3_args = "" # add -ML to calculate a maximum likelihood tree or -NJ to calculate a neighbor-joining tree 10 | String docker_image = "quay.io/staphb/ksnp3:3.1" 11 | Int memory = 8 12 | Int cpu = 4 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | assembly_array=(~{sep=' ' assembly_fasta}) 17 | assembly_array_len=$(echo "${#assembly_array[@]}") 18 | samplename_array=(~{sep=' ' samplename}) 19 | samplename_array_len=$(echo "${#samplename_array[@]}") 20 | 21 | # Ensure assembly, and samplename arrays are of equal length 22 | if [ "$assembly_array_len" -ne "$samplename_array_len" ]; then 23 | echo "Assembly array (length: $assembly_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2 24 | exit 1 25 | fi 26 | 27 | # create file of filenames for kSNP3 input 28 | touch ksnp3_input.tsv 29 | for index in ${!assembly_array[@]}; do 30 | assembly=${assembly_array[$index]} 31 | samplename=${samplename_array[$index]} 32 | echo -e "${assembly}\t${samplename}" >> ksnp3_input.tsv 33 | done 34 | # run ksnp3 on input assemblies 35 | kSNP3 -in ksnp3_input.tsv -outdir ksnp3 -k ~{kmer_size} -core -vcf ~{ksnp3_args} 36 | 37 | # rename ksnp3 outputs with cluster name 38 | mv -v ksnp3/core_SNPs_matrix.fasta ksnp3/~{cluster_name}_core_SNPs_matrix.fasta 39 | mv -v ksnp3/tree.core.tre ksnp3/~{cluster_name}_core.nwk 40 | mv -v ksnp3/VCF.*.vcf ksnp3/~{cluster_name}_core.vcf 41 | mv -v ksnp3/SNPs_all_matrix.fasta ksnp3/~{cluster_name}_pan_SNPs_matrix.fasta 42 | mv -v ksnp3/tree.parsimony.tre ksnp3/~{cluster_name}_pan_parsimony.nwk 43 | 44 | if [ -f ksnp3/tree.ML.tre ]; then 45 | mv -v ksnp3/tree.ML.tre ksnp3/~{cluster_name}_ML.nwk 46 | fi 47 | if [ -f ksnp3/tree.NJ.tre ]; then 48 | mv -v ksnp3/tree.NJ.tre ksnp3/~{cluster_name}_NJ.nwk 49 | fi 50 | 51 | >>> 52 | output { 53 | File ksnp3_core_matrix = "ksnp3/${cluster_name}_core_SNPs_matrix.fasta" 54 | File ksnp3_core_tree = "ksnp3/${cluster_name}_core.nwk" 55 | File ksnp3_core_vcf = "ksnp3/${cluster_name}_core.vcf" 56 | File ksnp3_pan_matrix = "ksnp3/~{cluster_name}_pan_SNPs_matrix.fasta" 57 | File ksnp3_pan_parsimony_tree = "ksnp3/~{cluster_name}_pan_parsimony.nwk" 58 | File? ksnp3_ml_tree = "ksnp3/~{cluster_name}_ML.nwk" 59 | File? ksnp3_nj_tree = "ksnp3/~{cluster_name}_NJ.nwk" 60 | File number_snps = "ksnp3/COUNT_SNPs" 61 | Array[File] ksnp_outs = glob("ksnp3/*") 62 | String ksnp3_docker_image = docker_image 63 | } 64 | runtime { 65 | docker: docker_image 66 | memory: "~{memory} GB" 67 | cpu: cpu 68 | disks: "local-disk " + disk_size + " SSD" 69 | disk: disk_size + " GB" 70 | preemptible: 0 71 | maxRetries: 3 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /workflows/de_novo_assembly.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow de_novo_assembly { 4 | 5 | input { 6 | String SRR 7 | File read1 8 | File read2 9 | } 10 | 11 | call seqyclean { 12 | input: 13 | samplename=SRR, 14 | read1=read1, 15 | read2=read2 16 | } 17 | 18 | call shovill { 19 | input: 20 | samplename=SRR, 21 | read1_cleaned=seqyclean.read1_cleaned, 22 | read2_cleaned=seqyclean.read2_cleaned 23 | } 24 | 25 | output { 26 | File read1_cleaned =seqyclean.read1_cleaned 27 | File read2_cleaned =seqyclean.read2_cleaned 28 | File contigs_fasta =shovill.contigs_fasta 29 | File contigs_gfa =shovill.contigs_gfa 30 | } 31 | } 32 | 33 | task seqyclean { 34 | 35 | input { 36 | File read1 37 | File read2 38 | String samplename 39 | File? 
adapters 40 | Int? seqyclean_minlen=25 41 | String? seqyclean_qual="20 20" 42 | Boolean? compress=true 43 | Boolean? seqyclean_dup=false 44 | Boolean? seqyclean_no_adapter_trim=false 45 | } 46 | 47 | command { 48 | seqyclean --version | head -1 | tee VERSION 49 | seqyclean \ 50 | ${'-minlen ' + seqyclean_minlen} \ 51 | ${'-qual ' + seqyclean_qual} \ 52 | ${'-c ' + adapters} \ 53 | ${true="-dup" false="" seqyclean_dup} \ 54 | ${true="-no_adapter_trim " false="" seqyclean_no_adapter_trim} \ 55 | ${true="-gz " false="" compress} \ 56 | ${'-1 ' + read1} \ 57 | ${'-2 ' + read2} \ 58 | ${'-o ' + samplename} 59 | } 60 | 61 | output { 62 | File read1_cleaned = "${samplename}_PE1.fastq.gz" 63 | File read2_cleaned = "${samplename}_PE2.fastq.gz" 64 | String seqyclean_version = read_string("VERSION") 65 | } 66 | 67 | runtime { 68 | docker: "quay.io/staphb/seqyclean:1.10.09" 69 | memory: "8 GB" 70 | cpu: 2 71 | disks: "local-disk 100 SSD" 72 | preemptible: 0 73 | } 74 | } 75 | 76 | task shovill { 77 | 78 | input { 79 | File read1_cleaned 80 | File read2_cleaned 81 | String samplename 82 | } 83 | 84 | command { 85 | shovill --version | head -1 | tee VERSION 86 | shovill \ 87 | --outdir out \ 88 | --R1 ${read1_cleaned} \ 89 | --R2 ${read2_cleaned} 90 | mv out/contigs.fa out/${samplename}_contigs.fasta 91 | mv out/contigs.gfa out/${samplename}_contigs.gfa 92 | } 93 | 94 | output { 95 | File contigs_fasta = "out/${samplename}_contigs.fasta" 96 | File contigs_gfa = "out/${samplename}_contigs.gfa" 97 | String shovill_version = read_string("VERSION") 98 | } 99 | 100 | runtime { 101 | docker: "quay.io/staphb/shovill:1.1.0" 102 | memory: "16 GB" 103 | cpu: 4 104 | disks: "local-disk 100 SSD" 105 | preemptible: 0 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /workflows/wf_ksnp3.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_ksnp3.wdl" as ksnp3 4 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists 5 | import "../tasks/task_versioning.wdl" as versioning 6 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary 7 | 8 | workflow ksnp3_workflow { 9 | input { 10 | Array[File] assembly_fasta 11 | Array[String] samplename 12 | String cluster_name 13 | String? data_summary_terra_project 14 | String? data_summary_terra_workspace 15 | String? data_summary_terra_table 16 | String? 
data_summary_column_names # string of comma delimited column names 17 | } 18 | call ksnp3.ksnp3 as ksnp3_task { 19 | input: 20 | assembly_fasta = assembly_fasta, 21 | samplename = samplename, 22 | cluster_name = cluster_name 23 | } 24 | call snp_dists.snp_dists as core_snp_dists { 25 | input: 26 | cluster_name = cluster_name, 27 | alignment = ksnp3_task.ksnp3_core_matrix 28 | } 29 | call snp_dists.snp_dists as pan_snp_dists { 30 | input: 31 | cluster_name = cluster_name, 32 | alignment = ksnp3_task.ksnp3_pan_matrix 33 | } 34 | call snp_dists.reorder_matrix as core_reorder_matrix { 35 | input: 36 | input_tree = ksnp3_task.ksnp3_core_tree, 37 | matrix = core_snp_dists.snp_matrix, 38 | cluster_name = cluster_name + "_core" 39 | } 40 | call snp_dists.reorder_matrix as pan_reorder_matrix { 41 | input: 42 | input_tree = ksnp3_task.ksnp3_pan_parsimony_tree, 43 | matrix = pan_snp_dists.snp_matrix, 44 | cluster_name = cluster_name + "_pan" 45 | } 46 | if (defined(data_summary_column_names)) { 47 | call data_summary.summarize_data { 48 | input: 49 | sample_names = samplename, 50 | terra_project = data_summary_terra_project, 51 | terra_workspace = data_summary_terra_workspace, 52 | terra_table = data_summary_terra_table, 53 | column_names = data_summary_column_names, 54 | output_prefix = cluster_name 55 | } 56 | } 57 | call versioning.version_capture{ 58 | input: 59 | } 60 | output { 61 | # Version Capture 62 | String ksnp3_wf_version = version_capture.phbg_version 63 | String ksnp3_wf_analysis_date = version_capture.date 64 | String ksnp3_docker = ksnp3_task.ksnp3_docker_image 65 | # ksnp3_outputs 66 | String ksnp3_snp_dists_version = pan_snp_dists.version 67 | File ksnp3_core_vcf = ksnp3_task.ksnp3_core_vcf 68 | # ordered matrixes and reordered trees 69 | File ksnp3_core_snp_matrix = core_reorder_matrix.ordered_matrix 70 | File ksnp3_core_tree = core_reorder_matrix.tree 71 | File ksnp3_pan_snp_matrix = pan_reorder_matrix.ordered_matrix 72 | File ksnp3_pan_tree = pan_reorder_matrix.tree 73 | # optional tree outputs 74 | File? ksnp3_ml_tree = ksnp3_task.ksnp3_ml_tree 75 | File? ksnp3_nj_tree = ksnp3_task.ksnp3_nj_tree 76 | # data summary output 77 | File? ksnp3_summarized_data = summarize_data.summarized_data 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /tasks/species_typing/task_shigatyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task shigatyper { 4 | meta { 5 | description: "ShigaTyper is a quick and easy tool designed to determine Shigella serotype using Illumina (single or paired-end) or Oxford Nanopore reads with low computation requirement. https://github.com/CFSAN-Biostatistics/shigatyper" 6 | } 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | String docker = "staphb/shigatyper:2.0.3" 12 | Int disk_size = 100 13 | Int cpus = 4 14 | Boolean read1_is_ont = false 15 | } 16 | command <<< 17 | # get version information 18 | shigatyper --version | sed 's/ShigaTyper //' | tee VERSION.txt 19 | 20 | # if read2 DOES NOT EXIST, ASSUME SINGLE END OR ONT 21 | if [ -z "~{read2}" ] ; then 22 | INPUT_READS="--SE ~{read1}" 23 | # if read1_is_ont is set to TRUE, then use ONT flags 24 | if [ "~{read1_is_ont}" == "true" ]; then 25 | INPUT_READS="--SE ~{read1} --ont" 26 | fi 27 | # else read2 DOES EXIST, ASSUME PAIRED END 28 | else 29 | INPUT_READS="--R1 ~{read1} --R2 ~{read2}" 30 | fi 31 | echo "INPUT_READS set to: ${INPUT_READS}" 32 | echo 33 | 34 | # run shigatyper. 
2 output files will be ~{samplename}.tsv and ~{samplename}-hits.tsv 35 | echo "Running ShigaTyper..." 36 | shigatyper \ 37 | ${INPUT_READS} \ 38 | -n ~{samplename} 39 | 40 | # rename output TSVs to be more descriptive 41 | mv -v ~{samplename}.tsv ~{samplename}_shigatyper_summary.tsv 42 | mv -v ~{samplename}-hits.tsv ~{samplename}_shigatyper_hits.tsv 43 | 44 | # parse summary tsv for prediction, ipaB absence/presence, and notes 45 | cut -f 2 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_prediction.txt 46 | cut -f 3 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_ipaB_presence_absence.txt 47 | cut -f 4 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_notes.txt 48 | 49 | # if shigatyper notes field (really the txt file) is EMPTY, write string saying it is empty to float to Terra table 50 | if [ "$(cat shigatyper_notes.txt)" == "" ]; then 51 | echo "ShigaTyper notes field was empty" > shigatyper_notes.txt 52 | fi 53 | 54 | >>> 55 | output { 56 | String shigatyper_predicted_serotype = read_string("shigatyper_prediction.txt") 57 | String shigatyper_ipaB_presence_absence = read_string("shigatyper_ipaB_presence_absence.txt") 58 | String shigatyper_notes = read_string("shigatyper_notes.txt") 59 | File shigatyper_hits_tsv = "~{samplename}_shigatyper_hits.tsv" # A tab-delimited detailed report file 60 | File shigatyper_summary_tsv = "~{samplename}_shigatyper_summary.tsv" # A tab-delimited summary report file 61 | String shigatyper_version = read_string("VERSION.txt") 62 | String shigatyper_docker = docker 63 | } 64 | runtime { 65 | docker: "~{docker}" 66 | memory: "16 GB" 67 | cpu: cpus 68 | disks: "local-disk " + disk_size + " SSD" 69 | disk: disk_size + " GB" 70 | maxRetries: 3 71 | preemptible: 0 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /tasks/quality_control/task_fastp.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task fastp { 4 | input { 5 | File read1 6 | File read2 7 | String samplename 8 | String docker = "quay.io/staphb/fastp:0.23.2" 9 | Int disk_size = 100 10 | Int fastp_window_size = 20 11 | Int fastp_quality_trim_score = 30 12 | Int fastp_minlen = 50 13 | # -g enables polyg trimming with default value of 10 14 | String fastp_args = "--detect_adapter_for_pe -g -5 20 -3 20" 15 | Int threads = 4 16 | } 17 | command <<< 18 | # date 19 | date | tee DATE 20 | 21 | fastp \ 22 | --in1 ~{read1} --in2 ~{read2} \ 23 | --out1 ~{samplename}_1P.fastq.gz --out2 ~{samplename}_2P.fastq.gz \ 24 | --unpaired1 ~{samplename}_1U.fastq.gz --unpaired2 ~{samplename}_2U.fastq.gz \ 25 | --cut_right --cut_right_window_size ~{fastp_window_size} --cut_right_mean_quality ~{fastp_quality_trim_score} \ 26 | --length_required ~{fastp_minlen} \ 27 | --thread ~{threads} \ 28 | ~{fastp_args} \ 29 | --html ~{samplename}_fastp.html --json ~{samplename}_fastp.json 30 | >>> 31 | output { 32 | File read1_trimmed = "~{samplename}_1P.fastq.gz" 33 | File read2_trimmed = "~{samplename}_2P.fastq.gz" 34 | File read1_trimmed_unpaired = "~{samplename}_1U.fastq.gz" 35 | File read2_trimmed_unpaired = "~{samplename}_2U.fastq.gz" 36 | File fastp_stats = "~{samplename}_fastp.html" 37 | String version = "~{docker}" 38 | String pipeline_date = read_string("DATE") 39 | } 40 | runtime { 41 | docker: "~{docker}" 42 | memory: "8 GB" 43 | cpu: 4 44 | disks: "local-disk " + disk_size + " SSD" 45 | disk: disk_size + " GB" 46 | preemptible: 0 47 | maxRetries: 3 48 | } 49
| } 50 | 51 | task fastp_se { 52 | input { 53 | File read1 54 | String samplename 55 | String docker = "quay.io/staphb/fastp:0.23.2" 56 | Int disk_size = 100 57 | Int fastp_window_size = 20 58 | Int fastp_quality_trim_score = 30 59 | Int fastp_minlen = 50 60 | # -g enables polyg trimming with default value of 10 61 | # --detect_adapter_for_pe argument was removed 62 | String fastp_args = "-g -5 20 -3 20" 63 | Int threads = 4 64 | } 65 | command <<< 66 | # date 67 | date | tee DATE 68 | 69 | fastp \ 70 | --in1 ~{read1} \ 71 | --out1 ~{samplename}_1P.fastq.gz \ 72 | --cut_right --cut_right_window_size ~{fastp_window_size} --cut_right_mean_quality ~{fastp_quality_trim_score} \ 73 | --length_required ~{fastp_minlen} \ 74 | --thread ~{threads} \ 75 | ~{fastp_args} \ 76 | --html ~{samplename}_fastp.html --json ~{samplename}_fastp.json 77 | >>> 78 | output { 79 | File read1_trimmed = "~{samplename}_1P.fastq.gz" 80 | File fastp_stats = "~{samplename}_fastp.html" 81 | String version = "~{docker}" 82 | String pipeline_date = read_string("DATE") 83 | } 84 | runtime { 85 | docker: "~{docker}" 86 | memory: "8 GB" 87 | cpu: 4 88 | disks: "local-disk " + disk_size + " SSD" 89 | disk: disk_size + " GB" 90 | preemptible: 0 91 | maxRetries: 3 92 | } 93 | } -------------------------------------------------------------------------------- /workflows/ecoli_char.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow ecoli_char { 4 | 5 | input { 6 | String SRR 7 | File contigs 8 | } 9 | 10 | call abricate as abricate { 11 | input: 12 | samplename=SRR, 13 | contigs=contigs, 14 | database="ncbi" 15 | } 16 | 17 | call abricate as abricate_virfinder { 18 | input: 19 | samplename=SRR, 20 | contigs=contigs, 21 | database="ecoli_vf" 22 | } 23 | 24 | call amrfinderplus { 25 | input: 26 | samplename=SRR, 27 | contigs=contigs 28 | } 29 | 30 | call serotypefinder { 31 | input: 32 | samplename=SRR, 33 | contigs=contigs 34 | } 35 | 36 | output { 37 | File abricate_results = abricate.abricate_results 38 | File abricate_virfinder_results = abricate_virfinder.abricate_results 39 | File amrfinderplus_results = amrfinderplus.amrfinder_results 40 | File serotypefinder_results = serotypefinder.serotypefinder_results 41 | } 42 | } 43 | 44 | task abricate { 45 | 46 | input { 47 | File contigs 48 | String samplename 49 | String database 50 | } 51 | 52 | command { 53 | abricate --version | head -1 | tee VERSION 54 | abricate --db ${database} ${contigs} > ${samplename + '_abricate.tsv'} 55 | } 56 | 57 | output { 58 | File abricate_results="${samplename + '_abricate.tsv'}" 59 | } 60 | 61 | runtime { 62 | docker: "quay.io/staphb/abricate:1.0.0" 63 | memory: "8 GB" 64 | cpu: 2 65 | disks: "local-disk 100 SSD" 66 | preemptible: 0 67 | } 68 | } 69 | 70 | task amrfinderplus { 71 | input { 72 | File contigs 73 | String samplename 74 | } 75 | 76 | command { 77 | amrfinder --version | head -1 | tee VERSION 78 | amrfinder \ 79 | --nucleotide ${contigs} \ 80 | -o ${samplename + '_amrfinder.tsv'} 81 | } 82 | 83 | output { 84 | File amrfinder_results="${samplename + '_amrfinder.tsv'}" 85 | } 86 | 87 | runtime { 88 | docker: "quay.io/staphb/ncbi-amrfinderplus:3.8.28" 89 | memory: "8 GB" 90 | cpu: 2 91 | disks: "local-disk 100 SSD" 92 | preemptible: 0 93 | } 94 | } 95 | 96 | task serotypefinder { 97 | 98 | input { 99 | File contigs 100 | String samplename 101 | } 102 | 103 | command { 104 | serotypefinder.pl --version | head -1 | tee VERSION 105 | serotypefinder.pl \ 106 | -i
${contigs} \ 107 | -d /serotypefinder/database \ 108 | -b /blast-2.2.26 \ 109 | -s ecoli \ 110 | -k 85.00 \ 111 | -l 0.60 \ 112 | -o ${samplename} 113 | } 114 | 115 | output { 116 | File serotypefinder_results="${samplename}/results_table.txt" 117 | } 118 | 119 | runtime { 120 | docker: "quay.io/staphb/serotypefinder:1.1" 121 | memory: "8 GB" 122 | cpu: 2 123 | disks: "local-disk 100 SSD" 124 | preemptible: 0 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_snp_dists.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task snp_dists { 4 | input { 5 | File alignment 6 | String cluster_name 7 | Int disk_size = 100 8 | } 9 | command <<< 10 | # date and version control 11 | date | tee DATE 12 | snp-dists -v | tee VERSION 13 | 14 | # create snp-dists matrix file 15 | snp-dists ~{alignment} > ~{cluster_name}_snp_distance_matrix.tsv 16 | >>> 17 | output { 18 | String date = read_string("DATE") 19 | String version = read_string("VERSION") 20 | File snp_matrix = "~{cluster_name}_snp_distance_matrix.tsv" 21 | } 22 | runtime { 23 | docker: "quay.io/staphb/snp-dists:0.8.2" 24 | memory: "2 GB" 25 | cpu: 2 26 | disks: "local-disk " + disk_size + " SSD" 27 | disk: disk_size + " GB" 28 | maxRetries: 3 29 | preemptible: 0 30 | } 31 | } 32 | 33 | task reorder_matrix { 34 | input { 35 | File input_tree 36 | File matrix 37 | String cluster_name 38 | Int disk_size = 100 39 | } 40 | command <<< 41 | # removing any "_contigs" suffixes from the tree and matrix 42 | sed 's/_contigs//g' ~{input_tree} > temporary_tree.nwk 43 | sed 's/_contigs//g' ~{matrix} > temporary_matrix.tsv 44 | 45 | python3 <>> 83 | output{ 84 | File ordered_matrix = "~{cluster_name}_snp_matrix.csv" 85 | File tree = "~{cluster_name}_tree.nwk" 86 | } 87 | runtime { 88 | docker: "staphb/mykrobe:0.12.1" # used because it contains both biopython and pandas 89 | memory: "2 GB" 90 | cpu: 2 91 | disks: "local-disk " + disk_size + " SSD" 92 | disk: disk_size + " GB" 93 | # maxRetries: 3 94 | preemptible: 0 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /tasks/quality_control/task_fastq_scan.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task fastq_scan_pe { 4 | input { 5 | File read1 6 | File read2 7 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 8 | String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") 9 | Int disk_size = 100 10 | } 11 | command <<< 12 | # capture date and version 13 | date | tee DATE 14 | fastq-scan -v | tee VERSION 15 | 16 | # set cat command based on compression 17 | if [[ "~{read1}" == *".gz" ]] ; then 18 | cat_reads="zcat" 19 | else 20 | cat_reads="cat" 21 | fi 22 | 23 | # capture forward read stats 24 | eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS) 25 | read1_seqs=$(cat READ1_SEQS) 26 | eval "${cat_reads} ~{read2}" | fastq-scan | tee ~{read2_name}_fastq-scan.json >(jq .qc_stats.read_total > READ2_SEQS) 27 | read2_seqs=$(cat READ2_SEQS) 28 | 29 | # capture number of read pairs 30 | if [ "${read1_seqs}" == "${read2_seqs}" ]; then 31 | read_pairs=${read1_seqs} 32 | else 33 | read_pairs="Uneven pairs: R1=${read1_seqs}, R2=${read2_seqs}" 34 | fi 35 | 36 | echo $read_pairs | tee READ_PAIRS 37 | >>> 38 | output { 39 | File read1_fastq_scan_report 
= "~{read1_name}_fastq-scan.json" 40 | File read2_fastq_scan_report = "~{read2_name}_fastq-scan.json" 41 | Int read1_seq = read_string("READ1_SEQS") 42 | Int read2_seq = read_string("READ2_SEQS") 43 | String read_pairs = read_string("READ_PAIRS") 44 | String version = read_string("VERSION") 45 | String pipeline_date = read_string("DATE") 46 | } 47 | runtime { 48 | docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" 49 | memory: "2 GB" 50 | cpu: 2 51 | disks: "local-disk " + disk_size + " SSD" 52 | disk: disk_size + " GB" 53 | preemptible: 0 54 | maxRetries: 3 55 | } 56 | } 57 | 58 | task fastq_scan_se { 59 | input { 60 | File read1 61 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 62 | Int disk_size = 100 63 | } 64 | command <<< 65 | # capture date and version 66 | date | tee DATE 67 | fastq-scan -v | tee VERSION 68 | 69 | # set cat command based on compression 70 | if [[ "~{read1}" == *".gz" ]] ; then 71 | cat_reads="zcat" 72 | else 73 | cat_reads="cat" 74 | fi 75 | 76 | # capture forward read stats 77 | eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS) 78 | >>> 79 | output { 80 | File fastq_scan_report = "~{read1_name}_fastq-scan.json" 81 | Int read1_seq = read_string("READ1_SEQS") 82 | String version = read_string("VERSION") 83 | String pipeline_date = read_string("DATE") 84 | } 85 | runtime { 86 | docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" 87 | memory: "2 GB" 88 | cpu: 2 89 | disks: "local-disk " + disk_size + " SSD" 90 | disk: disk_size + " GB" 91 | preemptible: 0 92 | maxRetries: 3 93 | } 94 | } -------------------------------------------------------------------------------- /tasks/quality_control/task_fastqc.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task fastqc_pe { 4 | input { 5 | File read1 6 | File read2 7 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 8 | String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") 9 | Int? 
cpus = 2 10 | String docker="quay.io/staphb/fastqc:0.11.9" 11 | Int disk_size = 100 12 | } 13 | command <<< 14 | # capture date and version 15 | date | tee DATE 16 | fastqc --version | grep FastQC | tee VERSION 17 | 18 | fastqc --outdir $PWD --threads ~{cpus} ~{read1} ~{read2} 19 | 20 | unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS 21 | unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ2_SEQS 22 | 23 | READ1_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) 24 | READ2_SEQS=$(unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) 25 | 26 | if [ $READ1_SEQS == $READ2_SEQS ]; then 27 | read_pairs=$READ1_SEQS 28 | else 29 | read_pairs="Uneven pairs: R1=$READ1_SEQS, R2=$READ2_SEQS" 30 | fi 31 | echo $read_pairs | tee READ_PAIRS 32 | >>> 33 | output { 34 | File fastqc1_html = "~{read1_name}_fastqc.html" 35 | File fastqc1_zip = "~{read1_name}_fastqc.zip" 36 | File fastqc2_html = "~{read2_name}_fastqc.html" 37 | File fastqc2_zip = "~{read2_name}_fastqc.zip" 38 | Int read1_seq = read_string("READ1_SEQS") 39 | Int read2_seq = read_string("READ2_SEQS") 40 | String read_pairs = read_string("READ_PAIRS") 41 | String version = read_string("VERSION") 42 | String pipeline_date = read_string("DATE") 43 | } 44 | runtime { 45 | docker: "~{docker}" 46 | memory: "4 GB" 47 | cpu: 2 48 | disks: "local-disk " + disk_size + " SSD" 49 | disk: disk_size + " GB" 50 | maxRetries: 3 51 | preemptible: 0 52 | } 53 | } 54 | 55 | task fastqc_se { 56 | input { 57 | File read1 58 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 59 | Int? cpus = 2 60 | String docker="quay.io/staphb/fastqc:0.11.9" 61 | Int disk_size = 100 62 | } 63 | command <<< 64 | # capture date and version 65 | date | tee DATE 66 | fastqc --version | grep FastQC | tee VERSION 67 | 68 | fastqc --outdir $PWD --threads ~{cpus} ~{read1} 69 | 70 | unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS 71 | 72 | READ_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) 73 | >>> 74 | output { 75 | File fastqc_html = "~{read1_name}_fastqc.html" 76 | File fastqc_zip = "~{read1_name}_fastqc.zip" 77 | Int number_reads = read_string("READ1_SEQS") 78 | String version = read_string("VERSION") 79 | String pipeline_date = read_string("DATE") 80 | } 81 | runtime { 82 | docker: "~{docker}" 83 | memory: "4 GB" 84 | cpu: 2 85 | disks: "local-disk " + disk_size + " SSD" 86 | disk: disk_size + " GB" 87 | maxRetries: 3 88 | preemptible: 0 89 | } 90 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_ngmaster.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ngmaster { 4 | meta { 5 | description: "Multi-antigen sequence typing for Neisseria gonorrhoeae" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "staphb/ngmaster:1.0.0" 11 | Int disk_size = 100 12 | Int cpu = 2 13 | } 14 | command <<< 15 | ngmaster --version 2>&1 | sed 's/^.*ngmaster //' | tee VERSION 16 | 17 | # run ngmaster on input assembly 18 | # unfortunately ngmaster 1.0.0 fails when either mincov or minid flags are supplied (this is with different install strategies too - bioconda & manually) 19 | # so we're forced to stick with default minid of 90 and 
mincov of 10. https://github.com/MDU-PHL/ngmaster/issues/39 20 | # ngmaster --comments also does not work 21 | ngmaster \ 22 | ~{assembly} \ 23 | > ~{samplename}.ngmaster.tsv 24 | 25 | # parse output TSV 26 | # first one is tricky since MLSTs are in the 3rd column, separated by a / 27 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $3}' | cut -d '/' -f 1 | tee NGMAST_SEQUENCE_TYPE 28 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $3}' | cut -d '/' -f 2 | tee NGSTAR_SEQUENCE_TYPE 29 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $4}' | tee NGMAST_PORB 30 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $5}' | tee NGMAST_TBPB 31 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $6}' | tee NGSTAR_PENA 32 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $7}' | tee NGSTAR_MTRR 33 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $8}' | tee NGSTAR_PORB 34 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $9}' | tee NGSTAR_PONA 35 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $10}' | tee NGSTAR_GYRA 36 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $11}' | tee NGSTAR_PARC 37 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $12}' | tee NGSTAR_23S 38 | 39 | >>> 40 | output { 41 | File ngmaster_tsv = "~{samplename}.ngmaster.tsv" 42 | String ngmaster_version = read_string("VERSION") 43 | # NG-MAST scheme's MLST and alleles (only 2 loci) 44 | String ngmaster_ngmast_sequence_type = read_string("NGMAST_SEQUENCE_TYPE") 45 | String ngmaster_ngmast_porB_allele = read_string("NGMAST_PORB") 46 | String ngmaster_ngmast_tbpB_allele = read_string("NGMAST_TBPB") 47 | # NG-STAR scheme's MLST and alleles (7 loci) 48 | String ngmaster_ngstar_sequence_type = read_string("NGSTAR_SEQUENCE_TYPE") 49 | String ngmaster_ngstar_penA_allele = read_string("NGSTAR_PENA") 50 | String ngmaster_ngstar_mtrR_allele = read_string("NGSTAR_MTRR") 51 | String ngmaster_ngstar_porB_allele = read_string("NGSTAR_PORB") 52 | String ngmaster_ngstar_ponA_allele = read_string("NGSTAR_PONA") 53 | String ngmaster_ngstar_gyrA_allele = read_string("NGSTAR_GYRA") 54 | String ngmaster_ngstar_parC_allele = read_string("NGSTAR_PARC") 55 | String ngmaster_ngstar_23S_allele = read_string("NGSTAR_23S") 56 | } 57 | runtime { 58 | docker: "~{docker}" 59 | memory: "8 GB" 60 | cpu: cpu 61 | disks: "local-disk " + disk_size + " SSD" 62 | disk: disk_size + " GB" 63 | maxRetries: 3 64 | preemptible: 0 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tasks/species_typing/task_sonneityping.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task sonneityping { 4 | # Inputs 5 | input { 6 | File read1 7 | File? read2 8 | Boolean ont_data = false 9 | String samplename 10 | String docker = "staphb/mykrobe:0.12.1" 11 | Int disk_size = 100 12 | String? 
mykrobe_opts 13 | Int cpu = 4 14 | } 15 | command <<< 16 | # Print and save versions 17 | mykrobe --version | sed 's|mykrobe v||g' | tee MYKROBE_VERSION.txt 18 | # opting to skip capturing the sonneityping version since there is no --version flag or easy way to determine version 19 | # navigate here for docker image and version information: https://github.com/StaPH-B/docker-builds/tree/master/mykrobe 20 | 21 | # Run Mykrobe on the input read data 22 | mykrobe predict \ 23 | -t ~{cpu} \ 24 | --sample ~{samplename} \ 25 | --species sonnei \ 26 | --format json_and_csv \ 27 | --out ~{samplename}.mykrobe \ 28 | ~{true='--ont' false='' ont_data} \ 29 | --seq ~{read1} ~{read2} \ 30 | ~{mykrobe_opts} 31 | 32 | # use sonneityping script to produce final TSV; alleles.txt is required input for human-readable genotype names 33 | python /sonneityping/parse_mykrobe_predict.py \ 34 | --jsons ~{samplename}.mykrobe.json --alleles /sonneityping/alleles.txt \ 35 | --prefix ~{samplename}.sonneityping 36 | 37 | # rename output TSV to something prettier 38 | mv -v ~{samplename}.sonneityping_predictResults.tsv ~{samplename}.sonneityping.tsv 39 | 40 | # Run a python block to parse output sonneityping TSV file for terra data tables 41 | python3 <>> 60 | output { 61 | File sonneityping_mykrobe_report_csv = "~{samplename}.mykrobe.csv" 62 | File sonneityping_mykrobe_report_json = "~{samplename}.mykrobe.json" 63 | File sonneityping_final_report_tsv = "~{samplename}.sonneityping.tsv" 64 | String sonneityping_mykrobe_version = read_string("MYKROBE_VERSION.txt") 65 | String sonneityping_mykrobe_docker = docker 66 | String sonneityping_species = read_string("SPECIES.txt") 67 | String sonneityping_final_genotype = read_string("FINAL_GENOTYPE.txt") 68 | String sonneityping_genotype_confidence = read_string("CONFIDENCE.txt") 69 | String sonneityping_genotype_name = read_string("GENOTYPE_NAME.txt") 70 | } 71 | runtime { 72 | docker: "~{docker}" 73 | memory: "8 GB" 74 | cpu: cpu 75 | disks: "local-disk " + disk_size + " SSD" 76 | disk: disk_size + " GB" 77 | maxRetries: 3 78 | preemptible: 0 79 | } 80 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_ts_mlst.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ts_mlst { 4 | meta { 5 | description: "Torsten Seemann's (TS) automatic MLST calling from assembled contigs" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "staphb/mlst:2.23.0" 11 | Int disk_size = 100 12 | Int cpu = 4 13 | # Parameters 14 | # --nopath Strip filename paths from FILE column (default OFF) 15 | # --scheme [X] Don't autodetect, force this scheme on all inputs (default '') 16 | # --minid [n.n] DNA %identity of full allele to consider 'similar' [~] (default '95') 17 | # --mincov [n.n] DNA %cov to report partial allele at all [?] (default '10') 18 | # --minscore [n.n] Minimum score out of 100 to match a scheme (when auto --scheme) (default '50') 19 | Boolean nopath = true 20 | String? scheme 21 | Float? minid 22 | Float? mincov 23 | Float?
minscore 24 | } 25 | command <<< 26 | echo $(mlst --version 2>&1) | sed 's/mlst //' | tee VERSION 27 | 28 | #create output header 29 | echo -e "Filename\tPubMLST_Scheme_name\tSequence_Type_(ST)\tAllele_IDs" > ~{samplename}_ts_mlst.tsv 30 | 31 | mlst \ 32 | --threads ~{cpu} \ 33 | ~{true="--nopath" false="" nopath} \ 34 | ~{'--scheme ' + scheme} \ 35 | ~{'--minid ' + minid} \ 36 | ~{'--mincov ' + mincov} \ 37 | ~{'--minscore ' + minscore} \ 38 | --novel ~{samplename}_novel_mlst_alleles.fasta \ 39 | ~{assembly} \ 40 | >> ~{samplename}_ts_mlst.tsv 41 | 42 | # parse ts mlst tsv for relevant outputs 43 | # if output TSV only contains one line (header line); no ST predicted 44 | if [ $(wc -l ~{samplename}_ts_mlst.tsv | awk '{ print $1 }') -eq 1 ]; then 45 | predicted_mlst="No ST predicted" 46 | pubmlst_scheme="NA" 47 | # else, TSV has more than one line, so parse outputs 48 | else 49 | pubmlst_scheme="$(cut -f2 ~{samplename}_ts_mlst.tsv | tail -n 1)" 50 | predicted_mlst="ST$(cut -f3 ~{samplename}_ts_mlst.tsv | tail -n 1)" 51 | # allelic_profile: take second line of output TSV; cut to take 4th column and beyond; replace tabs with commas 52 | allelic_profile="$(cut -f 4- ~{samplename}_ts_mlst.tsv | tail -n 1 | sed -e 's|\t|,|g')" 53 | if [ "$pubmlst_scheme" == "-" ]; then 54 | predicted_mlst="No ST predicted" 55 | pubmlst_scheme="NA" 56 | else 57 | if [ "$predicted_mlst" == "ST-" ]; then 58 | predicted_mlst="No ST predicted" 59 | fi 60 | fi 61 | fi 62 | 63 | echo "$predicted_mlst" | tee PREDICTED_MLST 64 | echo "$pubmlst_scheme" | tee PUBMLST_SCHEME 65 | echo "$allelic_profile" | tee ALLELIC_PROFILE.txt 66 | >>> 67 | output { 68 | File ts_mlst_results = "~{samplename}_ts_mlst.tsv" 69 | String ts_mlst_predicted_st = read_string("PREDICTED_MLST") 70 | String ts_mlst_pubmlst_scheme = read_string("PUBMLST_SCHEME") 71 | String ts_mlst_allelic_profile = read_string("ALLELIC_PROFILE.txt") 72 | File? ts_mlst_novel_alleles = "~{samplename}_novel_mlst_alleles.fasta" 73 | String ts_mlst_version = read_string("VERSION") 74 | } 75 | runtime { 76 | docker: "~{docker}" 77 | memory: "8 GB" 78 | cpu: 4 79 | disks: "local-disk " + disk_size + " SSD" 80 | disk: disk_size + " GB" 81 | maxRetries: 3 82 | preemptible: 0 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tasks/taxon_id/task_midas.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task midas { 4 | input { 5 | File read1 6 | File? read2 7 | File midas_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz" 8 | Int disk_size = 100 9 | String samplename 10 | String docker = "quay.io/fhcrc-microbiome/midas:v1.3.2--6" 11 | Int? memory = 32 12 | Int? 
cpu = 4 13 | } 14 | command <<< 15 | date | tee DATE 16 | 17 | # Decompress the Midas database 18 | mkdir db 19 | tar -C ./db/ -xzvf ~{midas_db} 20 | 21 | # Run Midas 22 | run_midas.py species ~{samplename} -1 ~{read1} ~{'-2 ' + read2} -d db/midas_db_v1.2/ -t ~{cpu} 23 | 24 | # rename output files 25 | mv ~{samplename}/species/species_profile.txt ~{samplename}/species/~{samplename}_species_profile.tsv 26 | mv ~{samplename}/species/log.txt ~{samplename}/species/~{samplename}_log.txt 27 | 28 | # Run a python block to parse output file for terra data tables 29 | # pandas is available in default docker image for python2 but not python3 30 | python2 <>> 64 | output { 65 | String midas_docker = docker 66 | String midas_analysis_date = read_string("DATE") 67 | File midas_report = "~{samplename}/species/~{samplename}_species_profile.tsv" 68 | File midas_log = "~{samplename}/species/~{samplename}_log.txt" 69 | String midas_primary_genus = read_string("PRIMARY_GENUS") 70 | String midas_secondary_genus = read_string("SECONDARY_GENUS") 71 | Float midas_secondary_genus_abundance = read_string("SECONDARY_GENUS_ABUNDANCE") 72 | } 73 | runtime { 74 | docker: "~{docker}" 75 | memory: "~{memory} GB" 76 | cpu: cpu 77 | disks: "local-disk " + disk_size + " SSD" 78 | disk: disk_size + " GB" 79 | maxRetries: 3 80 | preemptible: 0 81 | } 82 | } -------------------------------------------------------------------------------- /.github/workflows/pytest-workflows.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow will run on Pushes and Pull Requests against the main branch. It 3 | # will run pytest with MiniWDL and Cromwell for any workflows with a change to 4 | # them or associated tasks. 5 | # 6 | name: Pytest Workflows 7 | on: 8 | push: 9 | branches: [main] 10 | pull_request: 11 | branches: [main] 12 | 13 | jobs: 14 | changes: 15 | name: Check for changes 16 | runs-on: ubuntu-latest 17 | outputs: 18 | # Expose workflows with changes 19 | workflows: ${{ steps.filter.outputs.changes }} 20 | steps: 21 | # Checkout the repo 22 | - uses: actions/checkout@v3 23 | 24 | # Select workflows with changes 25 | - uses: dorny/paths-filter@v2 26 | id: filter 27 | with: 28 | filters: "tests/config/pytest_filter.yml" 29 | 30 | check: 31 | runs-on: ubuntu-20.04 32 | name: ${{ matrix.tag }} ${{ matrix.engine }} 33 | needs: changes 34 | if: ${{ needs.changes.outputs.workflows != '[]' && needs.changes.outputs.workflows != '' }} 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | # For every workflow, test it with MiniWDL and Cromwell 39 | tag: ["${{ fromJson(needs.changes.outputs.workflows) }}"] 40 | engine: ["miniwdl", "cromwell"] 41 | defaults: 42 | run: 43 | # Play nicely with miniconda 44 | shell: bash -l {0} 45 | steps: 46 | # Checkout the repo 47 | - name: Checkout theiagen/public_health_bacterial_genomics 48 | uses: actions/checkout@v3 49 | 50 | # Import test data 51 | - name: Pull Test Data from bactopia/bactopia-tests 52 | uses: actions/checkout@v3 53 | with: 54 | repository: bactopia/bactopia-tests 55 | path: bactopia-tests 56 | 57 | # Setup Miniconda3 58 | - name: Setup miniconda 59 | uses: conda-incubator/setup-miniconda@v2 60 | with: 61 | activate-environment: actions 62 | auto-activate-base: false 63 | 64 | # Depends and env info (mostly for debug) 65 | - name: Install Dependencies 66 | run: | 67 | conda install -y -c conda-forge -c bioconda cromwell miniwdl=1.5.2 'python>=3.7' pytest pytest-workflow 'importlib-metadata<=4.13.0' 68 | uname -a && env 69 | 70 
| - name: Test ${{ matrix.tag }} 71 | run: TMPDIR=~ pytest --tag ${{ matrix.tag }}_${{ matrix.engine }} --symlink --kwdof --color=yes 72 | 73 | - name: Upload logs on failure 74 | if: failure() 75 | uses: actions/upload-artifact@v3 76 | with: 77 | name: logs-${{ matrix.engine }} 78 | path: | 79 | /home/runner/pytest_workflow_*/**/stdout* 80 | /home/runner/pytest_workflow_*/**/stderr* 81 | /home/runner/pytest_workflow_*/**/script* 82 | /home/runner/pytest_workflow_*/**/rc 83 | /home/runner/pytest_workflow_*/**/command 84 | /home/runner/pytest_workflow_*/**/*.txt 85 | /home/runner/pytest_workflow_*/**/*.log 86 | /home/runner/pytest_workflow_*/**/*.out 87 | /home/runner/pytest_workflow_*/**/*.err 88 | /home/runner/pytest_workflow_*/**/DATE 89 | /home/runner/pytest_workflow_*/**/VERSION 90 | !/home/runner/pytest_workflow_*/**/*.bam* 91 | !/home/runner/pytest_workflow_*/**/*.fastq.gz 92 | -------------------------------------------------------------------------------- /workflows/wf_read_QC_trim.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic 4 | import "../tasks/quality_control/task_fastp.wdl" as fastp 5 | import "../tasks/quality_control/task_bbduk.wdl" as bbduk 6 | import "../tasks/quality_control/task_fastq_scan.wdl" as fastq_scan 7 | import "../tasks/taxon_id/task_midas.wdl" as midas 8 | 9 | workflow read_QC_trim { 10 | meta { 11 | description: "Runs basic QC (fastq_scan), trimming (Trimmomatic), and adapter removal (bbduk) on illumina PE reads" 12 | } 13 | 14 | input { 15 | String samplename 16 | File read1_raw 17 | File read2_raw 18 | Int trim_window_size = 10 19 | Int trim_quality_trim_score = 20 20 | Int trim_minlen = 75 21 | Int bbduk_mem = 8 22 | Boolean call_midas = false 23 | File? 
midas_db 24 | String read_processing = "trimmomatic" 25 | String fastp_args = "--detect_adapter_for_pe -g -5 20 -3 20" 26 | } 27 | if (read_processing == "trimmomatic"){ 28 | call trimmomatic.trimmomatic_pe { 29 | input: 30 | samplename = samplename, 31 | read1 = read1_raw, 32 | read2 = read2_raw, 33 | trimmomatic_window_size = trim_window_size, 34 | trimmomatic_quality_trim_score = trim_quality_trim_score, 35 | trimmomatic_minlen = trim_minlen 36 | } 37 | } 38 | if (read_processing == "fastp"){ 39 | call fastp.fastp { 40 | input: 41 | samplename = samplename, 42 | read1 = read1_raw, 43 | read2 = read2_raw, 44 | fastp_window_size = trim_window_size, 45 | fastp_quality_trim_score = trim_quality_trim_score, 46 | fastp_minlen = trim_minlen, 47 | fastp_args = fastp_args 48 | } 49 | } 50 | call bbduk.bbduk_pe { 51 | input: 52 | samplename = samplename, 53 | read1_trimmed = select_first([trimmomatic_pe.read1_trimmed,fastp.read1_trimmed]), 54 | read2_trimmed = select_first([trimmomatic_pe.read2_trimmed,fastp.read2_trimmed]), 55 | mem_size_gb = bbduk_mem 56 | } 57 | call fastq_scan.fastq_scan_pe as fastq_scan_raw { 58 | input: 59 | read1 = read1_raw, 60 | read2 = read2_raw, 61 | } 62 | call fastq_scan.fastq_scan_pe as fastq_scan_clean { 63 | input: 64 | read1 = bbduk_pe.read1_clean, 65 | read2 = bbduk_pe.read2_clean 66 | } 67 | if (call_midas) { 68 | call midas.midas as midas { 69 | input: 70 | samplename = samplename, 71 | read1 = read1_raw, 72 | read2 = read2_raw, 73 | midas_db = midas_db 74 | } 75 | } 76 | 77 | output { 78 | File read1_clean = bbduk_pe.read1_clean 79 | File read2_clean = bbduk_pe.read2_clean 80 | Int fastq_scan_raw1 = fastq_scan_raw.read1_seq 81 | Int fastq_scan_raw2 = fastq_scan_raw.read2_seq 82 | String fastq_scan_raw_pairs = fastq_scan_raw.read_pairs 83 | Int fastq_scan_clean1 = fastq_scan_clean.read1_seq 84 | Int fastq_scan_clean2 = fastq_scan_clean.read2_seq 85 | String fastq_scan_clean_pairs = fastq_scan_clean.read_pairs 86 | String fastq_scan_version = fastq_scan_raw.version 87 | String bbduk_docker = bbduk_pe.bbduk_docker 88 | String? trimmomatic_version = trimmomatic_pe.version 89 | String? fastp_version = fastp.version 90 | String? midas_docker = midas.midas_docker 91 | File? midas_report = midas.midas_report 92 | String? midas_primary_genus = midas.midas_primary_genus 93 | String? midas_secondary_genus = midas.midas_secondary_genus 94 | Float? midas_secondary_genus_abundance = midas.midas_secondary_genus_abundance 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_pirate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pirate { 4 | input { 5 | Array[File] gff3 6 | String cluster_name 7 | Boolean align = true # align all genes and produce core/pangenome alignments 8 | String steps = "50,60,70,80,90,95,98" # % identity thresholds to use for pangenome construction [default: 50,60,70,80,90,95,98] 9 | String features = "CDS" # features to use for pangenome construction [default: CDS] 10 | Boolean nucl = false # create a pangenome on CDS features using nucleotide identity, default: amino acid identity 11 | String? 
panopt # additional arguments to pass to pangenome_construction 12 | Int memory = 32 13 | Int cpu = 4 14 | String docker_image = "quay.io/biocontainers/pirate:1.0.5--hdfd78af_0" 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | 19 | # date and version control 20 | date | tee DATE 21 | PIRATE -v | tee VERSION 22 | 23 | # pirate requires the directory containing the gff files as input 24 | mkdir INPUT_DIR 25 | ln -s ~{sep=' ' gff3} INPUT_DIR 26 | 27 | # run pirate on input gff 28 | PIRATE \ 29 | --input INPUT_DIR \ 30 | --output PIRATE \ 31 | ~{'--steps ' + steps} \ 32 | ~{'--features ' + features} \ 33 | ~{true="--nucl" false="" nucl} \ 34 | ~{true="--align" false="" align} \ 35 | ~{'--pan-opt ' + panopt} \ 36 | ~{'--threads ' + cpu} 37 | 38 | # generate gene_presence_absence.csv 39 | PIRATE_to_roary.pl -i PIRATE/PIRATE.*.tsv -o ~{cluster_name}_gene_presence_absence.csv 40 | 41 | # rename outputs with cluster name 42 | mv PIRATE/PIRATE.pangenome_summary.txt PIRATE/~{cluster_name}_pangenome_summary.txt 43 | mv PIRATE/PIRATE.log PIRATE/~{cluster_name}.log 44 | mv PIRATE/PIRATE.gene_families.ordered.tsv PIRATE/~{cluster_name}_gene_families.ordered.tsv 45 | mv PIRATE/PIRATE.unique_alleles.tsv PIRATE/~{cluster_name}_unique_alleles.tsv 46 | mv PIRATE/binary_presence_absence.fasta PIRATE/~{cluster_name}_binary_presence_absence.fasta 47 | mv PIRATE/binary_presence_absence.nwk PIRATE/~{cluster_name}_binary_presence_absence.nwk 48 | mv PIRATE/pangenome.gfa PIRATE/~{cluster_name}_pangenome.gfa 49 | 50 | if [[ ~{align} == "true" ]]; then 51 | mv PIRATE/pangenome_alignment.fasta PIRATE/~{cluster_name}_pangenome_alignment.fasta 52 | mv PIRATE/pangenome_alignment.gff PIRATE/~{cluster_name}_pangenome_alignment.gff 53 | mv PIRATE/core_alignment.fasta PIRATE/~{cluster_name}_core_alignment.fasta 54 | mv PIRATE/core_alignment.gff PIRATE/~{cluster_name}_core_alignment.gff 55 | fi 56 | 57 | >>> 58 | output { 59 | File pirate_pangenome_summary = "PIRATE/~{cluster_name}_pangenome_summary.txt" 60 | File pirate_gene_families_ordered = "PIRATE/~{cluster_name}_gene_families.ordered.tsv" 61 | File pirate_unique_alleles = "PIRATE/~{cluster_name}_unique_alleles.tsv" 62 | File pirate_binary_fasta = "PIRATE/~{cluster_name}_binary_presence_absence.fasta" 63 | File pirate_binary_tree = "PIRATE/~{cluster_name}_binary_presence_absence.nwk" 64 | File pirate_pangenome_gfa = "PIRATE/~{cluster_name}_pangenome.gfa" 65 | File? pirate_pangenome_alignment_fasta = "PIRATE/~{cluster_name}_pangenome_alignment.fasta" 66 | File? pirate_pangenome_alignment_gff = "PIRATE/~{cluster_name}_pangenome_alignment.gff" 67 | File? pirate_core_alignment_fasta = "PIRATE/~{cluster_name}_core_alignment.fasta" 68 | File? pirate_core_alignment_gff = "PIRATE/~{cluster_name}_core_alignment.gff" 69 | File?
pirate_presence_absence_csv = "~{cluster_name}_gene_presence_absence.csv" 70 | String pirate_docker_image = docker_image 71 | } 72 | runtime { 73 | docker: "~{docker_image}" 74 | memory: "~{memory} GB" 75 | cpu: cpu 76 | disks: "local-disk " + disk_size + " SSD" 77 | disk: disk_size + " GB" 78 | maxRetries: 3 79 | preemptible: 0 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /workflows/wf_read_QC_trim_se.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic 4 | import "../tasks/quality_control/task_fastp.wdl" as fastp 5 | import "../tasks/quality_control/task_bbduk.wdl" as bbduk 6 | import "../tasks/quality_control/task_fastq_scan.wdl" as fastq_scan 7 | import "../tasks/taxon_id/task_midas.wdl" as midas 8 | 9 | workflow read_QC_trim { 10 | meta { 11 | description: "Runs basic QC (fastq_scan), trimming (Trimmomatic), and adapter removal (bbduk) on illumina SE reads" 12 | } 13 | 14 | input { 15 | String samplename 16 | File read1_raw 17 | Int trim_window_size = 4 18 | Int trim_quality_trim_score = 30 19 | Int trim_minlen = 25 20 | Int bbduk_mem = 8 21 | Boolean call_midas = false 22 | File? midas_db 23 | String read_processing = "trimmomatic" 24 | String fastp_args = "-g -5 20 -3 20" 25 | } 26 | # call read_clean.ncbi_scrub_se { 27 | # input: 28 | # samplename = samplename, 29 | # read1 = read1_raw 30 | # } 31 | if (read_processing == "trimmomatic"){ 32 | call trimmomatic.trimmomatic_se { 33 | input: 34 | samplename = samplename, 35 | read1 = read1_raw, 36 | trimmomatic_window_size = trim_window_size, 37 | trimmomatic_quality_trim_score = trim_quality_trim_score, 38 | trimmomatic_minlen = trim_minlen 39 | } 40 | } 41 | if (read_processing == "fastp"){ 42 | call fastp.fastp_se { 43 | input: 44 | samplename = samplename, 45 | read1 = read1_raw, 46 | fastp_window_size = trim_window_size, 47 | fastp_quality_trim_score = trim_quality_trim_score, 48 | fastp_minlen = trim_minlen, 49 | fastp_args = fastp_args 50 | } 51 | } 52 | call bbduk.bbduk_se { 53 | input: 54 | samplename = samplename, 55 | read1_trimmed = select_first([trimmomatic_se.read1_trimmed,fastp_se.read1_trimmed]), 56 | mem_size_gb = bbduk_mem 57 | } 58 | call fastq_scan.fastq_scan_se as fastq_scan_raw { 59 | input: 60 | read1 = read1_raw 61 | } 62 | call fastq_scan.fastq_scan_se as fastq_scan_clean { 63 | input: 64 | read1 = bbduk_se.read1_clean 65 | } 66 | if (call_midas) { 67 | call midas.midas as midas { 68 | input: 69 | samplename = samplename, 70 | read1 = read1_raw, 71 | midas_db = midas_db 72 | } 73 | } 74 | # call taxonID.kraken2 as kraken2_raw { 75 | # input: 76 | # samplename = samplename, 77 | # read1 = bbduk_se.read1_clean 78 | # } 79 | # call taxonID.kraken2 as kraken2_dehosted { 80 | # input: 81 | # samplename = samplename, 82 | # read1 = ncbi_scrub_se.read1_dehosted 83 | # } 84 | 85 | output { 86 | File read1_clean = bbduk_se.read1_clean 87 | 88 | Int fastq_scan_raw_number_reads = fastq_scan_raw.read1_seq 89 | Int fastq_scan_clean_number_reads = fastq_scan_clean.read1_seq 90 | 91 | # String kraken_version = kraken2_raw.version 92 | # Float kraken_human = kraken2_raw.percent_human 93 | # Float kraken_sc2 = kraken2_raw.percent_sc2 94 | # String kraken_report = kraken2_raw.kraken_report 95 | # Float kraken_human_dehosted = kraken2_dehosted.percent_human 96 | # Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 97 | # String 
kraken_report_dehosted = kraken2_dehosted.kraken_report 98 | 99 | String fastq_scan_version = fastq_scan_raw.version 100 | String bbduk_docker = bbduk_se.bbduk_docker 101 | String? trimmomatic_version = trimmomatic_se.version 102 | String? fastp_version = fastp_se.version 103 | String? midas_docker = midas.midas_docker 104 | File? midas_report = midas.midas_report 105 | String? midas_primary_genus = midas.midas_primary_genus 106 | String? midas_secondary_genus = midas.midas_secondary_genus 107 | Float? midas_secondary_genus_abundance = midas.midas_secondary_genus_abundance 108 | } 109 | } -------------------------------------------------------------------------------- /tasks/taxon_id/task_kraken2.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task kraken2_pe { 4 | input { 5 | File read1 6 | File read2 7 | File kraken2_db 8 | String samplename 9 | String docker = "quay.io/staphb/kraken2:2.1.2-no-db" 10 | Int disk_size = 100 11 | 12 | String? kraken2_args = "" 13 | String? classified_out = "classified#.fastq" 14 | String? unclassified_out = "unclassified#.fastq" 15 | Int? memory = 32 16 | Int? cpu = 4 17 | } 18 | command <<< 19 | echo $(kraken2 --version 2>&1) | sed 's/^.*Kraken version //;s/ .*$//' | tee VERSION 20 | date | tee DATE 21 | 22 | # Decompress the Kraken2 database 23 | mkdir db 24 | tar -C ./db/ -xzvf ~{kraken2_db} 25 | 26 | # Run Kraken2 27 | kraken2 \ 28 | --db ./db/ \ 29 | --threads ~{cpu} \ 30 | --report ~{samplename}.report.txt \ 31 | --gzip-compressed \ 32 | --unclassified-out ~{samplename}.~{unclassified_out} \ 33 | --classified-out ~{samplename}.~{classified_out} \ 34 | --output ~{samplename}.classifiedreads.txt \ 35 | --paired \ 36 | ~{kraken2_args} \ 37 | ~{read1} ~{read2} 38 | 39 | # Compress and cleanup 40 | gzip *.fastq 41 | gzip ~{samplename}.classifiedreads.txt 42 | >>> 43 | output { 44 | String kraken2_version = read_string("VERSION") 45 | String kraken2_docker = docker 46 | String analysis_date = read_string("DATE") 47 | File kraken2_report = "~{samplename}.report.txt" 48 | File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" 49 | File kraken2_unclassified_read1 = "~{samplename}.unclassified_1.fastq.gz" 50 | File kraken2_unclassified_read2 = "~{samplename}.unclassified_2.fastq.gz" 51 | File kraken2_classified_read1 = "~{samplename}.classified_1.fastq.gz" 52 | File kraken2_classified_read2 = "~{samplename}.classified_2.fastq.gz" 53 | } 54 | runtime { 55 | docker: "~{docker}" 56 | memory: "~{memory} GB" 57 | cpu: cpu 58 | disks: "local-disk " + disk_size + " SSD" 59 | disk: disk_size + " GB" 60 | maxRetries: 3 61 | preemptible: 0 62 | } 63 | } 64 | 65 | task kraken2_se { 66 | input { 67 | File read1 68 | File kraken2_db 69 | String samplename 70 | String docker = "quay.io/staphb/kraken2:2.1.2-no-db" 71 | Int disk_size = 100 72 | 73 | String? kraken2_args = "" 74 | String? classified_out = "classified.fastq" 75 | String? unclassified_out = "unclassified.fastq" 76 | Int? memory = 32 77 | Int?
cpu = 4 78 | } 79 | command <<< 80 | echo $(kraken2 --version 2>&1) | sed 's/^.*Kraken version //;s/ .*$//' | tee VERSION 81 | date | tee DATE 82 | 83 | # Decompress the Kraken2 database 84 | mkdir db 85 | tar -C ./db/ -xzvf ~{kraken2_db} 86 | 87 | # Run Kraken2 88 | kraken2 \ 89 | --db ./db/ \ 90 | --threads ~{cpu} \ 91 | --report ~{samplename}.report.txt \ 92 | --gzip-compressed \ 93 | --unclassified-out ~{samplename}.~{unclassified_out} \ 94 | --classified-out ~{samplename}.~{classified_out} \ 95 | --output ~{samplename}.classifiedreads.txt \ 96 | ~{kraken2_args} \ 97 | ~{read1} 98 | 99 | # Compress and cleanup 100 | gzip *.fastq 101 | gzip ~{samplename}.classifiedreads.txt 102 | >>> 103 | output { 104 | String kraken2_version = read_string("VERSION") 105 | String kraken2_docker = docker 106 | String analysis_date = read_string("DATE") 107 | File kraken2_report = "~{samplename}.report.txt" 108 | File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" 109 | File kraken2_unclassified_read1 = "~{samplename}.unclassified.fastq.gz" 110 | File kraken2_classified_read1 = "~{samplename}.classified.fastq.gz" 111 | } 112 | runtime { 113 | docker: "~{docker}" 114 | memory: "~{memory} GB" 115 | cpu: cpu 116 | disks: "local-disk " + disk_size + " SSD" 117 | disk: disk_size + " GB" 118 | maxRetries: 3 119 | preemptible: 0 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /tasks/species_typing/task_agrvate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task agrvate { 4 | meta { 5 | description: "Rapid identification of Staphylococcus aureus agr locus type and agr operon variants." 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/agrvate:1.0.2--hdfd78af_0" 11 | Int disk_size = 50 12 | Int cpu = 1 13 | 14 | # Parameters 15 | # --typing_only agr typing only. Skips agr operon extraction and frameshift detection 16 | Boolean typing_only = false 17 | } 18 | command <<< 19 | # get version info 20 | agrvate -v 2>&1 | sed 's/agrvate v//;' | tee VERSION 21 | 22 | # run agrvate on assembly; usearch not available in biocontainer, cannot use that option 23 | # using -m flag for mummer frameshift detection since usearch is not available 24 | agrvate \ 25 | ~{true="--typing-only" false="" typing_only} \ 26 | -i ~{assembly} \ 27 | -m 28 | 29 | # agrvate names output directory and file based on name of .fasta file, so .fasta as input results in -results/ outdir 30 | # and results in -results/-summary.tab files 31 | basename=$(basename ~{assembly}) 32 | # strip off anything after the period 33 | fasta_prefix=${basename%.*} 34 | 35 | # rename outputs summary TSV to include samplename 36 | mv -v "${fasta_prefix}-results/${fasta_prefix}-summary.tab" ~{samplename}.agrvate.tsv 37 | 38 | # parse output summary TSV 39 | cut -f 2 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_GROUP 40 | cut -f 3 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MATCH_SCORE 41 | cut -f 4 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_CANONICAL 42 | cut -f 5 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MULTIPLE 43 | cut -f 6 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_NUM_FRAMESHIFTS 44 | 45 | # edit output string AGR_CANONICAL to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results 46 | if [[ $(cat AGR_CANONICAL) == 1 ]]; then 47 | echo "1. canonical agrD" >AGR_CANONICAL 48 | elif [[ $(cat AGR_CANONICAL) == 0 ]]; then 49 | echo "0. 
non-canonical agrD" >AGR_CANONICAL 50 | elif [[ $(cat AGR_CANONICAL) == "u" ]]; then 51 | echo "u. unknown agrD" >AGR_CANNONICAL 52 | else 53 | echo "result unrecognized, please see summary agrvate TSV file" >AGR_CANONICAL 54 | fi 55 | 56 | # edit output string AGR_MULTIPLE to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results 57 | if [[ $(cat AGR_MULTIPLE) == "s" ]]; then 58 | echo "s. single agr group found" >AGR_MULTIPLE 59 | elif [[ $(cat AGR_MULTIPLE) == "m" ]]; then 60 | echo "m. multiple agr groups found" >AGR_MULTIPLE 61 | elif [[ $(cat AGR_MULTIPLE) == "u" ]]; then 62 | echo "u. unknown agr groups found" >AGR_MULTIPLE 63 | else 64 | echo "result unrecognized, please see summary agrvate TSV file" >AGR_MULTIPLE 65 | fi 66 | 67 | # if AGR_NUM_FRAMESHIFTS is unknown, edit output string AGR_NUM_FRAMESHIFTS to be more informative, otherwise keep set to a number: https://github.com/VishnuRaghuram94/AgrVATE#results 68 | if [[ $(cat AGR_NUM_FRAMESHIFTS) == "u" ]]; then 69 | echo "u or unknown; agr operon not extracted" >AGR_NUM_FRAMESHIFTS 70 | fi 71 | 72 | # create tarball of all output files 73 | tar -czvf ~{samplename}.agrvate.tar.gz "${fasta_prefix}-results/" 74 | >>> 75 | output { 76 | File agrvate_summary = "~{samplename}.agrvate.tsv" 77 | File agrvate_results = "~{samplename}.agrvate.tar.gz" 78 | String agrvate_agr_group = read_string("AGR_GROUP") 79 | String agrvate_agr_match_score = read_string("AGR_MATCH_SCORE") 80 | String agrvate_agr_canonical = read_string("AGR_CANONICAL") 81 | String agrvate_agr_multiple = read_string("AGR_MULTIPLE") 82 | String agrvate_agr_num_frameshifts = read_string("AGR_NUM_FRAMESHIFTS") 83 | String agrvate_version = read_string("VERSION") 84 | String agrvate_docker = docker 85 | } 86 | runtime { 87 | docker: "~{docker}" 88 | memory: "4 GB" 89 | cpu: cpu 90 | disks: "local-disk " + disk_size + " SSD" 91 | disk: disk_size + " GB" 92 | maxRetries: 3 93 | preemptible: 0 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /tasks/quality_control/task_cg_pipeline.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task cg_pipeline { 4 | input { 5 | File read1 6 | File? read2 7 | String samplename 8 | String docker="quay.io/staphb/lyveset:1.1.4f" 9 | Int disk_size = 100 10 | String cg_pipe_opts="--fast" 11 | Int genome_length 12 | } 13 | command <<< 14 | # date and version control 15 | date | tee DATE 16 | 17 | run_assembly_readMetrics.pl ~{cg_pipe_opts} ~{read1} ~{read2} -e ~{genome_length} > ~{samplename}_readMetrics.tsv 18 | 19 | # repeat for concatenated read file 20 | # run_assembly_readMetrics.pl extension awareness 21 | if [[ "~{read1}" == *".gz" ]] ; then 22 | extension=".gz" 23 | else 24 | extension="" 25 | fi 26 | cat ~{read1} ~{read2} > ~{samplename}_concat.fastq"${extension}" 27 | run_assembly_readMetrics.pl ~{cg_pipe_opts} ~{samplename}_concat.fastq"${extension}" -e ~{genome_length} > ~{samplename}_concat_readMetrics.tsv 28 | 29 | python3 < R2_MEAN_Q 79 | fi 80 | # same for R2_MEAN_LENGTH 81 | if [[ ! 
-f R2_MEAN_LENGTH ]] ; then 82 | echo "0.0" > R2_MEAN_LENGTH 83 | fi 84 | 85 | >>> 86 | output { 87 | File cg_pipeline_report = "${samplename}_readMetrics.tsv" 88 | String cg_pipeline_docker = docker 89 | String pipeline_date = read_string("DATE") 90 | Float r1_mean_q = read_float("R1_MEAN_Q") 91 | Float r2_mean_q = read_float("R2_MEAN_Q") 92 | Float combined_mean_q = read_float("COMBINED_MEAN_Q") 93 | Float r1_mean_readlength = read_float("R1_MEAN_LENGTH") 94 | Float r2_mean_readlength = read_float("R2_MEAN_LENGTH") 95 | Float combined_mean_readlength = read_float("COMBINED_MEAN_LENGTH") 96 | Float est_coverage = read_float("EST_COVERAGE") 97 | } 98 | runtime { 99 | docker: "~{docker}" 100 | memory: "8 GB" 101 | cpu: 4 102 | disks: "local-disk " + disk_size + " SSD" 103 | disk: disk_size + " GB" 104 | maxRetries: 3 105 | preemptible: 0 106 | } 107 | } -------------------------------------------------------------------------------- /workflows/compile_ecoli_results.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow compile_results { 4 | 5 | input { 6 | Array[String] SRR_array 7 | Array[File] serotypefinder_array 8 | Array[File] abricate_array 9 | Array[File] abricate_virfinder_array 10 | Array[File] amrfinder_array 11 | } 12 | call compile_abricate { 13 | input: 14 | array_srr=SRR_array, 15 | array_abr=abricate_array 16 | } 17 | 18 | call compile_abricate as compile_abricate_virfinder { 19 | input: 20 | array_srr=SRR_array, 21 | array_abr=abricate_virfinder_array 22 | } 23 | 24 | call compile_amrfinder { 25 | input: 26 | array_srr=SRR_array, 27 | array_afp=amrfinder_array 28 | } 29 | 30 | call compile_serotypefinder { 31 | input: 32 | array_srr=SRR_array, 33 | array_stf=serotypefinder_array 34 | } 35 | 36 | output { 37 | File compiled_serotypefinder_results=compile_serotypefinder.compiled_results 38 | File compiled_abricate_results=compile_abricate.compiled_results 39 | File compiled_abricate_virfinder_results=compile_abricate_virfinder.compiled_results 40 | File compiled_amrfinderplus_results=compile_amrfinder.compiled_results 41 | } 42 | } 43 | 44 | 45 | task compile_abricate { 46 | input { 47 | Array[String] array_srr 48 | Array[File] array_abr 49 | } 50 | 51 | command <<< 52 | touch results.txt 53 | 54 | srr_array=(~{sep=' ' array_srr}) 55 | abr_array=(~{sep=' ' array_abr}) 56 | echo "I am here" 57 | 58 | for index in ${!srr_array[@]}; do 59 | SRR=${srr_array[$index]} 60 | file=${abr_array[$index]} 61 | echo "$index" 62 | echo "$SRR" 63 | echo "$file" 64 | 65 | while IFS= read -r result 66 | do 67 | printf "%s %s\n" "$SRR $result" >> results.txt 68 | done < <(grep -E 'fasta' "$file") 69 | 70 | done 71 | >>> 72 | 73 | output { 74 | File compiled_results="results.txt" 75 | } 76 | 77 | runtime { 78 | docker: "quay.io/staphb/abricate:1.0.0" 79 | memory: "4 GB" 80 | cpu: 1 81 | disks: "local-disk 100 SSD" 82 | preemptible: 0 83 | } 84 | } 85 | 86 | task compile_amrfinder { 87 | input { 88 | Array[String] array_srr 89 | Array[File] array_afp 90 | } 91 | 92 | command <<< 93 | touch results.txt 94 | 95 | srr_array=(~{sep=' ' array_srr}) 96 | afp_array=(~{sep=' ' array_afp}) 97 | echo "I am here" 98 | 99 | for index in ${!srr_array[@]}; do 100 | SRR=${srr_array[$index]} 101 | file=${afp_array[$index]} 102 | echo "$index" 103 | echo "$SRR" 104 | echo "$file" 105 | 106 | while IFS= read -r result 107 | do 108 | printf "%s %s\n" "$SRR $result" >> results.txt 109 | done < <(grep -E 'contig' "$file") 110 | 111 | done 112 | >>> 113 | 
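  # Note on the pattern shared by the compile_* tasks in this file: the two input
  # arrays are walked in lockstep -- "${!srr_array[@]}" expands to the indices of
  # srr_array, and the same index selects the matching report file, so sample IDs
  # and report files must be supplied in the same order. Reading from process
  # substitution (`done < <(grep ...)`) keeps the while-read loop in the current
  # shell, and only report lines matching the grep pattern ('contig' here,
  # 'fasta' and 'fliC|wzy|wzx' in the sibling tasks) are kept, each prefixed
  # with its sample ID before being appended to results.txt.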
114 | output { 115 | File compiled_results="results.txt" 116 | } 117 | 118 | runtime { 119 | docker: "quay.io/staphb/ncbi-amrfinderplus:3.8.28" 120 | memory: "4 GB" 121 | cpu: 1 122 | disks: "local-disk 100 SSD" 123 | preemptible: 0 124 | } 125 | } 126 | 127 | 128 | task compile_serotypefinder { 129 | input { 130 | Array[String] array_srr 131 | Array[File] array_stf 132 | } 133 | 134 | command <<< 135 | touch results.txt 136 | 137 | srr_array=(~{sep=' ' array_srr}) 138 | stf_array=(~{sep=' ' array_stf}) 139 | echo "I am here" 140 | 141 | for index in ${!srr_array[@]}; do 142 | SRR=${srr_array[$index]} 143 | file=${stf_array[$index]} 144 | echo "$index" 145 | echo "$SRR" 146 | echo "$file" 147 | 148 | while IFS= read -r result 149 | do 150 | printf "%s %s\n" "$SRR $result" >> results.txt 151 | done < <(grep -E 'fliC|wzy|wzx' "$file") 152 | 153 | done 154 | >>> 155 | 156 | output { 157 | File compiled_results="results.txt" 158 | } 159 | 160 | runtime { 161 | docker: "quay.io/staphb/serotypefinder:1.1" 162 | memory: "4 GB" 163 | cpu: 1 164 | disks: "local-disk 100 SSD" 165 | preemptible: 0 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /tasks/utilities/task_summarize_data.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task summarize_data { 4 | input { 5 | Array[String]? sample_names 6 | String? terra_project 7 | String? terra_workspace 8 | String? terra_table 9 | String? column_names # string of comma-delimited column names 10 | String? output_prefix 11 | 12 | Int disk_size = 100 13 | File? input_table 14 | Boolean phandango_coloring = true 15 | } 16 | command <<< 17 | # when running on terra, comment out all input_table mentions 18 | python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project}" --workspace "~{terra_workspace}" --entity_type ~{terra_table} --tsv_filename ~{terra_table}-data.tsv 19 | 20 | # when running locally, use the input_table in place of downloading from Terra 21 | #cp ~{input_table} ~{terra_table}-data.tsv 22 | 23 | if ~{phandango_coloring}; then 24 | export phandango_coloring="true" 25 | else 26 | export phandango_coloring="false" 27 | fi 28 | 29 | python3 <>> 104 | output { 105 | File summarized_data = "~{output_prefix}_summarized_data.csv" 106 | } 107 | runtime { 108 | docker: "broadinstitute/terra-tools:tqdm" 109 | memory: "8 GB" 110 | cpu: 1 111 | disks: "local-disk " + disk_size + " SSD" 112 | disk: disk_size + " GB" 113 | dx_instance_type: "mem1_ssd1_v2_x2" 114 | maxRetries: 3 115 | } 116 | } -------------------------------------------------------------------------------- /tasks/quality_control/task_mummer_ani.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task animummer { 4 | input { 5 | File assembly 6 | String samplename 7 | File? ref_genome 8 | Float mash_filter = 0.9 9 | String docker="staphb/mummer:4.0.0-rgdv2" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # capture and version 14 | mummer --version | tee MUMMER_VERSION 15 | 16 | # set the reference genome 17 | # if not defined by user, then use all 43 genomes in RGDv2 18 | if [[ -z "~{ref_genome}" ]]; then 19 | # ref genome is not defined. 
default to RGDv2 20 | # BASH variable 21 | REF_GENOME="$(ls /RGDv2/*.fasta)" 22 | echo "user did not define a reference genome, defaulting to 43 genomes in RGDv2" 23 | echo "REF_GENOME is set to: ${REF_GENOME}" 24 | else 25 | echo "User specified a reference genome, will use this instead of RGDv2" 26 | REF_GENOME="~{ref_genome}" 27 | echo "REF_GENOME is set to: ${REF_GENOME}" 28 | fi 29 | 30 | # call Lee's ani-m.pl script and compare query genome against reference genome 31 | # first does a mash check on relatedness between 2 genomes. If greater than mash_filter, then run dnadiff 32 | # --symmetric flag runs ANI on query vs. ref; followed by ref vs. query 33 | ani-m.pl --symmetric \ 34 | --mash-filter ~{mash_filter} \ 35 | ~{assembly} \ 36 | ${REF_GENOME} | tee ~{samplename}.ani-mummer.out.tsv 37 | 38 | # CHECK FOR A NEARLY BLANK TSV (ONLY HEADER LINE), meaning the sample did not surpass the mash-filter and thus no ANI was run 39 | LINE_COUNT_OUTPUT_TSV=$(wc -l ~{samplename}.ani-mummer.out.tsv | cut -d ' ' -f 1) 40 | echo "Number of lines in output TSV is: ${LINE_COUNT_OUTPUT_TSV}" 41 | if [[ ${LINE_COUNT_OUTPUT_TSV} -eq 1 ]]; then 42 | echo "~{samplename} did not surpass the minimum mash genetic distance filter, thus ANI was not performed" 43 | echo "The output TSV only contains the header line" 44 | # set output variables as 0s or descriptive strings 45 | echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED 46 | echo "0.0" > ANI_HIGHEST_PERCENT 47 | echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH 48 | # if output TSV has more than 1 line, then parse for appropriate outputs 49 | else 50 | ## parse out highest percentBases aligned 51 | cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED 52 | echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" 53 | 54 | ## parse out ANI value using highest percentBases aligned value 55 | grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT 56 | echo "ANI value is: $(cat ANI_HIGHEST_PERCENT)" 57 | 58 | # have to separate out results for ani_top_species match because user-defined reference genome FASTAs will not be named as they are in RGDv2 59 | if [[ -z "~{ref_genome}" ]]; then 60 | ### ref genome is not user-defined, using RGDv2 and FASTA filenames ### 61 | # Parse out species name from reference fasta filename 62 | # use percent bases aligned to pull relevant line, cut down to query and ref fasta filenames, sed to remove your query filename, xargs to remove whitespaces & stuff 63 | # cut on periods to pull out genus_species (in future this will include lineages for Listeria and other sub-species designations) 64 | # have to create assembly_file_basename bash variable since output TSV does not include full path to assembly file, only filename 65 | assembly_file_basename=$(basename ~{assembly}) 66 | grep "$(cat ANI_HIGHEST_PERCENT)" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.'
-f 3 | tee ANI_TOP_SPECIES_MATCH 67 | echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH)" 68 | else 69 | # User specified a reference genome, use fasta filename as output string 70 | basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH 71 | echo "Reference genome used for ANI is: ${REF_GENOME}" 72 | fi 73 | fi 74 | 75 | >>> 76 | output { 77 | Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT") 78 | Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED") 79 | File ani_output_tsv = "~{samplename}.ani-mummer.out.tsv" 80 | String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH") 81 | String ani_mummer_version = read_string("MUMMER_VERSION") 82 | } 83 | runtime { 84 | docker: "~{docker}" 85 | memory: "8 GB" 86 | cpu: 4 87 | disks: "local-disk " + disk_size + " SSD" 88 | disk: disk_size + " GB" 89 | maxRetries: 3 90 | preemptible: 0 91 | } 92 | } -------------------------------------------------------------------------------- /workflows/wf_core_gene_snp.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_pirate.wdl" as pirate 4 | import "../tasks/phylogenetic_inference/task_iqtree.wdl" as iqtree 5 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists 6 | import "../tasks/task_versioning.wdl" as versioning 7 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary 8 | 9 | 10 | workflow core_gene_snp_workflow { 11 | input { 12 | Array[File] gff3 13 | String cluster_name 14 | # if align = true, the pirate task will produce core and pangenome alignments for the sample set, 15 | # otherwise, pirate will only produce a pangenome summary 16 | Boolean align = true 17 | # use core_tree = true to produce a phylogenetic tree and snp distance matrix from the core genome alignment 18 | Boolean core_tree = true 19 | # use pan_tree = true to produce a phylogenetic tree and snp distance matrix from the pangenome alignment 20 | Boolean pan_tree = false 21 | # data summary input variables 22 | Array[String]? sample_names 23 | String? data_summary_terra_project 24 | String? data_summary_terra_workspace 25 | String? data_summary_terra_table 26 | String? 
data_summary_column_names 27 | } 28 | call pirate.pirate as pirate { 29 | input: 30 | gff3 = gff3, 31 | cluster_name = cluster_name, 32 | align = align 33 | } 34 | if (align) { 35 | if (core_tree) { 36 | call iqtree.iqtree as core_iqtree { 37 | input: 38 | alignment = select_first([pirate.pirate_core_alignment_fasta]), 39 | cluster_name = cluster_name 40 | } 41 | call snp_dists.snp_dists as core_snp_dists { 42 | input: 43 | alignment = select_first([pirate.pirate_core_alignment_fasta]), 44 | cluster_name = cluster_name 45 | } 46 | call snp_dists.reorder_matrix as core_reorder_matrix { 47 | input: 48 | input_tree = core_iqtree.ml_tree, 49 | matrix = core_snp_dists.snp_matrix, 50 | cluster_name = cluster_name + "_core" 51 | } 52 | } 53 | if (pan_tree) { 54 | call iqtree.iqtree as pan_iqtree { 55 | input: 56 | alignment = select_first([pirate.pirate_pangenome_alignment_fasta]), 57 | cluster_name = cluster_name 58 | } 59 | call snp_dists.snp_dists as pan_snp_dists { 60 | input: 61 | alignment = select_first([pirate.pirate_pangenome_alignment_fasta]), 62 | cluster_name = cluster_name 63 | } 64 | call snp_dists.reorder_matrix as pan_reorder_matrix { 65 | input: 66 | input_tree = pan_iqtree.ml_tree, 67 | matrix = pan_snp_dists.snp_matrix, 68 | cluster_name = cluster_name + "_pan" 69 | } 70 | } 71 | } 72 | if (defined(data_summary_column_names)) { 73 | call data_summary.summarize_data { 74 | input: 75 | sample_names = sample_names, 76 | terra_project = data_summary_terra_project, 77 | terra_workspace = data_summary_terra_workspace, 78 | terra_table = data_summary_terra_table, 79 | column_names = data_summary_column_names, 80 | output_prefix = cluster_name 81 | } 82 | } 83 | call versioning.version_capture{ 84 | input: 85 | } 86 | output { 87 | # Version Capture 88 | String core_gene_snp_wf_version = version_capture.phbg_version 89 | String core_gene_snp_wf_analysis_date = version_capture.date 90 | # pirate_outputs 91 | File pirate_pangenome_summary = pirate.pirate_pangenome_summary 92 | File pirate_gene_families_ordered = pirate.pirate_gene_families_ordered 93 | File? pirate_core_alignment_fasta = pirate.pirate_core_alignment_fasta 94 | File? pirate_core_alignment_gff = pirate.pirate_core_alignment_gff 95 | File? pirate_pan_alignment_fasta = pirate.pirate_pangenome_alignment_fasta 96 | File? pirate_pan_alignment_gff = pirate.pirate_pangenome_alignment_gff 97 | File? pirate_presence_absence_csv = pirate.pirate_presence_absence_csv 98 | String pirate_docker_image = pirate.pirate_docker_image 99 | # snp_dists outputs 100 | String? pirate_snps_dists_version = select_first([core_snp_dists.version,pan_snp_dists.version,""]) 101 | # iqtree outputs 102 | String? pirate_iqtree_version = select_first([core_iqtree.version,pan_iqtree.version,""]) 103 | # reorder matrix outputs 104 | File? pirate_core_snp_matrix = core_reorder_matrix.ordered_matrix 105 | File? pirate_iqtree_core_tree = core_reorder_matrix.tree 106 | File? pirate_pan_snp_matrix = pan_reorder_matrix.ordered_matrix 107 | File? pirate_iqtree_pan_tree = pan_reorder_matrix.tree 108 | # Data summary outputs 109 | File? pirate_summarized_data = summarize_data.summarized_data 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_resfinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task resfinder { 4 | input { 5 | File assembly # Input fasta file 6 | String samplename 7 | String? 
organism # Species in the sample, species should be entered with their full scientific names (e.g. "escherichia coli"), using quotation marks 8 | Boolean acquired = true # Run resfinder for acquired resistance genes 9 | Float? min_cov = 0.6 # Minimum (breadth-of) coverage of ResFinder 10 | Float? threshold = 0.9 # Threshold for identity of ResFinder 11 | Boolean point = false # Run pointfinder for chromosomal mutations 12 | String docker = "staphb/resfinder:4.1.11" 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | date | tee DATE 17 | run_resfinder.py --version | tee RESFINDER_VERSION 18 | echo "unmodified from resfinder docker container" > RESFINDER_DB_VERSION 19 | 20 | # set $resfinder_organism BASH variable based on gambit_predicted_taxon or user-defined input string 21 | if [[ "~{organism}" == *"Campylobacter"*"jejuni"* ]]; then 22 | resfinder_organism="campylobacter jejuni" 23 | elif [[ "~{organism}" == *"Campylobacter"*"coli"* ]]; then 24 | resfinder_organism="campylobacter coli" 25 | elif [[ "~{organism}" == *"Campylobacter"* ]]; then 26 | resfinder_organism="campylobacter" 27 | elif [[ "~{organism}" == *"Enterococcus"*"faecalis"* ]]; then 28 | resfinder_organism="enterococcus faecalis" 29 | elif [[ "~{organism}" == *"Enterococcus"*"faecium"* ]]; then 30 | resfinder_organism="enterococcus faecium" 31 | elif [[ "~{organism}" == *"Escherichia"*"coli"* ]]; then 32 | resfinder_organism="escherichia coli" 33 | elif [[ "~{organism}" == *"Klebsiella"* ]]; then 34 | resfinder_organism="klebsiella" 35 | elif [[ "~{organism}" == *"Neisseria"*"gonorrhoeae"* ]]; then 36 | resfinder_organism="neisseria gonorrhoeae" 37 | elif [[ "~{organism}" == *"Salmonella"* ]]; then 38 | resfinder_organism="salmonella" 39 | elif [[ "~{organism}" == *"Staphylococcus"*"aureus"* ]]; then 40 | resfinder_organism="staphylococcus aureus" 41 | elif [[ "~{organism}" == *"Mycobacterium"*"tuberculosis"* ]]; then 42 | resfinder_organism="mycobacterium tuberculosis" 43 | elif [[ "~{organism}" == *"Helicobacter"*"pylori"* ]]; then 44 | resfinder_organism="helicobacter pylori" 45 | else 46 | echo "Either Gambit predicted taxon is not supported by resfinder or the user did not supply an organism as input." 47 | echo "Skipping the use of resfinder --species optional parameter." 48 | fi 49 | 50 | # if resfinder_organism variable is set, use --species flag, otherwise do not use --species flag 51 | if [[ -v resfinder_organism ]] ; then 52 | run_resfinder.py \ 53 | --inputfasta ~{assembly} \ 54 | --outputPath . \ 55 | --species "${resfinder_organism}" \ 56 | ~{true="--acquired" false="" acquired} \ 57 | ~{'--min_cov ' + min_cov} \ 58 | ~{'--threshold ' + threshold} \ 59 | ~{true="--point" false="" point} 60 | else 61 | run_resfinder.py \ 62 | --inputfasta ~{assembly} \ 63 | --outputPath . 
70 | # replace space in resfinder_organism with underscore 71 | resfinder_organism="${resfinder_organism// /_}" 72 | 73 | # rename files 74 | mv pheno_table.txt ~{samplename}_pheno_table.txt 75 | if [ -f pheno_table_${resfinder_organism}.txt ]; then 76 | mv pheno_table_${resfinder_organism}.txt ~{samplename}_pheno_table_species.txt 77 | fi 78 | mv ResFinder_Hit_in_genome_seq.fsa ~{samplename}_ResFinder_Hit_in_genome_seq.fsa 79 | mv ResFinder_Resistance_gene_seq.fsa ~{samplename}_ResFinder_Resistance_gene_seq.fsa 80 | mv ResFinder_results_tab.txt ~{samplename}_ResFinder_results_tab.txt 81 | if [ -f PointFinder_prediction.txt ]; then 82 | mv PointFinder_prediction.txt ~{samplename}_PointFinder_prediction.txt 83 | mv PointFinder_results.txt ~{samplename}_PointFinder_results.txt 84 | fi 85 | 86 | >>> 87 | output { 88 | File resfinder_pheno_table = "~{samplename}_pheno_table.txt" 89 | File? resfinder_pheno_table_species = "~{samplename}_pheno_table_species.txt" 90 | File resfinder_hit_in_genome_seq = "~{samplename}_ResFinder_Hit_in_genome_seq.fsa" 91 | File resfinder_resistance_gene_seq = "~{samplename}_ResFinder_Resistance_gene_seq.fsa" 92 | File resfinder_results_tab = "~{samplename}_ResFinder_results_tab.txt" 93 | File? pointfinder_pheno_table = "~{samplename}_PointFinder_prediction.txt" 94 | File? pointfinder_results = "~{samplename}_PointFinder_results.txt" 95 | String resfinder_docker = "~{docker}" 96 | String resfinder_version = read_string("RESFINDER_VERSION") 97 | String resfinder_db_version = read_string("RESFINDER_DB_VERSION") 98 | } 99 | runtime { 100 | memory: "8 GB" 101 | cpu: 4 102 | docker: docker 103 | disks: "local-disk " + disk_size + " SSD" 104 | disk: disk_size + " GB" 105 | maxRetries: 3 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tasks/species_typing/task_poppunk_streppneumo.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task poppunk { 4 | meta { 5 | description: "Using poppunk with GPS (Global Pneumococcal Sequencing project) database for Streptococcus pneumoniae typing" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "staphb/poppunk:2.4.0" 11 | Int disk_size = 100 12 | Int cpus = 4 13 | # database/reference files currently hosted on a public, requester-pays GCP bucket 14 | # hosting individually for speed purposes.
Unzipping one big 20GB zip archive takes longer than downloading the files individually (22GB uncompressed in total) 15 | # If future versions of the GPS database are released, we can update the links here or in Terra, and the task should remain future-proof 16 | File GPS_dists_npy = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.npy" 17 | File GPS_dists_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.pkl" 18 | File GPS_h5 = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.h5" 19 | File GPS_refs = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs" 20 | File GPS_refs_dists_npy = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.npy" 21 | File GPS_refs_dists_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.pkl" 22 | File GPS_refs_h5 = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.h5" 23 | File GPS_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_clusters.csv" 24 | File GPS_fit_npz = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.npz" 25 | File GPS_fit_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.pkl" 26 | File GPS_graph_gt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_graph.gt" 27 | File GPS_qcreport_txt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_qcreport.txt" 28 | File GPS_unword_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_unword_clusters.csv" 29 | File GPS_refs_graph_gt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6refs_graph.gt" 30 | File GPS_external_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_external_clusters.csv" 31 | } 32 | command <<< 33 | # get version information 34 | poppunk --version | sed 's/poppunk //' | tee VERSION 35 | 36 | # create input TSV 37 | echo -e "~{samplename}\t~{assembly}" > ~{samplename}_poppunk_input.tsv 38 | 39 | # determine the database name, which serves both as the prefix for every file in the database 40 | # and as the name of the directory (GPS_DB_NAME) that the database files are placed in; 41 | # it is derived from the primary h5 filename (suffix stripped) to keep the task future-proof 42 | GPS_DB_NAME=$(basename ~{GPS_h5} | sed 's|.h5||')
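# (Illustrative note: with the default GPS_h5 input above, basename yields "GPS_v6.h5" and the
#  sed expression strips the ".h5" suffix, so GPS_DB_NAME becomes "GPS_v6" and the database
#  directory assembled below is GPS_v6/)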
43 | # sending GPS_DB_NAME into text file for logging/output purposes 44 | echo "${GPS_DB_NAME}" > GPS_DB_NAME 45 | 46 | # move all database/reference files into single directory to feed into poppunk 47 | mkdir -v "${GPS_DB_NAME}" 48 | ln -vs ~{GPS_dists_npy} ~{GPS_dists_pkl} ~{GPS_h5} ~{GPS_refs} \ 49 | ~{GPS_refs_dists_npy} ~{GPS_refs_dists_pkl} ~{GPS_refs_h5} ~{GPS_clusters_csv} \ 50 | ~{GPS_fit_npz} ~{GPS_fit_pkl} ~{GPS_graph_gt} ~{GPS_qcreport_txt} \ 51 | ~{GPS_unword_clusters_csv} ~{GPS_refs_graph_gt} ~{GPS_external_clusters_csv} \ 52 | "${GPS_DB_NAME}"/ 53 | 54 | # poppunk requires the external clusters file to be passed explicitly; glob it by pattern 55 | # so the task stays compatible with future versions of the database 56 | GPS_EXTERNAL_CLUSTERS_CSV=$(ls "${GPS_DB_NAME}"/GPS_*_external_clusters.csv) 57 | 58 | # run poppunk 59 | poppunk_assign \ 60 | --threads ~{cpus} \ 61 | --db "${GPS_DB_NAME}" \ 62 | --distances "${GPS_DB_NAME}/${GPS_DB_NAME}.dists" \ 63 | --query ~{samplename}_poppunk_input.tsv \ 64 | --output ~{samplename}_poppunk \ 65 | --external-clustering "${GPS_EXTERNAL_CLUSTERS_CSV}" 66 | 67 | # parse output CSV for GPSC (Global Pneumococcal Sequence Cluster) 68 | if [ -f ~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv ]; then 69 | cut -d ',' -f 2 ~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv | tail -n 1 > GPSC.txt 70 | 71 | # if GPSC is "NA", overwrite with helpful message 72 | if [[ "$(cat GPSC.txt)" == "NA" ]]; then 73 | echo "Potential novel GPS Cluster identified. Please email globalpneumoseq@gmail.com to have novel clusters added to the database and a GPSC cluster name assigned, after you have checked for low-level contamination, which may contribute to biased accessory distances." >GPSC.txt 74 | fi 75 | else 76 | echo "poppunk failed" > GPSC.txt 77 | fi 78 | 79 | >>> 80 | output { 81 | String poppunk_gps_cluster = read_string("GPSC.txt") 82 | File? poppunk_gps_external_cluster_csv = "~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv" 83 | String poppunk_version = read_string("VERSION") 84 | String poppunk_docker = docker 85 | String poppunk_GPS_db_version = read_string("GPS_DB_NAME") 86 | } 87 | runtime { 88 | docker: "~{docker}" 89 | # poppunk with the GPS v6 db used upwards of 12GB of RAM at times 90 | memory: "16 GB" 91 | cpu: cpus 92 | disks: "local-disk " + disk_size + " SSD" 93 | disk: disk_size + " GB" 94 | maxRetries: 3 95 | preemptible: 0 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /tasks/species_typing/task_srst2_vibrio.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task srst2_vibrio { 4 | meta { 5 | description: "Use of SRST2 to identify sequences of interest from a database of curated Vibrio sequences" 6 | } 7 | input { 8 | File reads1 9 | File? reads2 10 | String samplename 11 | Int srst2_min_cov 12 | Int srst2_max_divergence 13 | Int srst2_min_depth 14 | Int srst2_min_edge_depth 15 | Int srst2_gene_max_mismatch 16 | String docker = "quay.io/staphb/srst2:0.2.0-vcholerae" 17 | Int disk_size = 100 18 | Int cpu = 4 19 | } 20 | command <<< 21 | if [ -z "~{reads2}" ] ; then 22 | INPUT_READS="--input_se ~{reads1}" 23 | else 24 | # This task requires that paired-end input FASTQ files end in "_1.clean.fastq.gz" and "_2.clean.fastq.gz", 25 | # the naming produced by the TheiaProk read-cleaning tasks 26 | INPUT_READS="--input_pe ~{reads1} ~{reads2} --forward _1.clean --reverse _2.clean" 27 | fi 28 |
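# For example (filenames are placeholders, not task inputs): for a paired-end run the assignment
# above expands to
#   INPUT_READS="--input_pe sample_1.clean.fastq.gz sample_2.clean.fastq.gz --forward _1.clean --reverse _2.clean"
# which lets SRST2 recover the shared sample name by stripping the _1.clean/_2.clean suffixes
# from the read filenames.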
29 | srst2 --version 2>&1 | tee VERSION 30 | srst2 \ 31 | ${INPUT_READS} \ 32 | --gene_db /vibrio-cholerae-db/vibrio_230224.fasta \ 33 | --output ~{samplename} \ 34 | --min_coverage ~{srst2_min_cov} \ 35 | --max_divergence ~{srst2_max_divergence} \ 36 | --min_depth ~{srst2_min_depth} \ 37 | --min_edge_depth ~{srst2_min_edge_depth} \ 38 | --gene_max_mismatch ~{srst2_gene_max_mismatch} 39 | 40 | # capture output TSV 41 | mv ~{samplename}__genes__*__results.txt ~{samplename}.tsv 42 | 43 | # capture detailed output TSV - not available if no results are output 44 | mv ~{samplename}__fullgenes__*__results.txt ~{samplename}.detailed.tsv || echo "No results" > ~{samplename}.detailed.tsv 45 | 46 | # parsing block to account for when output columns do not exist 47 | python <<CODE 48 | import csv 49 | 50 | # NOTE: these helper implementations are reconstructed from their call sites below - 51 | # a best-effort sketch rather than the original code 52 | def tsv_to_dict(filename): 53 | # read the single SRST2 results row into a {column: value} dict 54 | with open(filename, 'r') as tsv_file: 55 | reader = csv.DictReader(tsv_file, delimiter='\t') 56 | return next(reader, {}) 57 | 58 | def conv(value): 59 | # normalize a missing column or empty cell to SRST2's "absent" marker 60 | if value is None or value == '': 61 | return '-' 62 | return value 63 | 64 | def translate_chars(value): 65 | # SRST2 marks a low-depth hit with '?' and an imperfect (mismatched) hit with '*'; 66 | # '-' means the gene was not detected at all 67 | translation = [] 68 | if value == '-': 69 | translation.append('not detected') 70 | if '?' in value: 71 | translation.append('low depth') 72 | if '*' in value: 73 | translation.append('imperfect match') 74 | 75 | if len(translation) > 0: 76 | return '(' + ';'.join(translation) + ')' 77 | return "" 78 | 79 | # load output TSV as dict 80 | row = tsv_to_dict('~{samplename}.tsv') 81 | 82 | # presence or absence genes - ctxA, ompW and toxR 83 | with open("ctxA", "wb") as ctxA_fh: 84 | value = row.get("ctxA") 85 | presence = translate_chars(conv(value)) 86 | if presence == "(not detected)": 87 | ctxA_fh.write(presence) 88 | else: 89 | result = "present" + ' ' + presence 90 | ctxA_fh.write(result.strip()) 91 | 92 | with open("ompW", "wb") as ompW_fh: 93 | value = row.get("ompW") 94 | presence = translate_chars(conv(value)) 95 | if presence == "(not detected)": 96 | ompW_fh.write(presence) 97 | else: 98 | result = "present" + ' ' + presence 99 | ompW_fh.write(result.strip()) 100 | 101 | with open("toxR", "wb") as toxR_fh: 102 | value = row.get("toxR") 103 | presence = translate_chars(conv(value)) 104 | if presence == "(not detected)": 105 | toxR_fh.write(presence) 106 | else: 107 | result = "present" + ' ' + presence 108 | toxR_fh.write(result.strip()) 109 | 110 | # biotype - tcpA classical or tcpA ElTor 111 | with open("BIOTYPE", "wb") as biotype_fh: 112 | value_ElTor = translate_chars(conv(row.get("tcpA_ElTor"))) 113 | value_classical = translate_chars(conv(row.get("tcpA_classical"))) 114 | 115 | if value_ElTor == "(not detected)" and value_classical == "(not detected)": 116 | biotype_fh.write("(not detected)") 117 | else: 118 | if value_ElTor == "(not detected)": 119 | result = "tcpA_Classical" + ' ' + value_classical 120 | biotype_fh.write(result.strip()) 121 | else: 122 | result = "tcpA_ElTor" + ' ' + value_ElTor 123 | biotype_fh.write(result.strip()) 124 | 125 | # serogroup - O1 or O139 126 | with open("SEROGROUP", "wb") as serotype_fh: 127 | value_O1 = translate_chars(conv(row.get("wbeN_O1"))) 128 | value_O139 = translate_chars(conv(row.get("wbfR_O139"))) 129 | 130 | if value_O1 == "(not detected)" and value_O139 == "(not detected)": 131 | serotype_fh.write("(not detected)") 132 | else: 133 | if value_O1 == "(not detected)": 134 | result = "O139" + ' ' + value_O139 135 | serotype_fh.write(result.strip()) 136 | else:
137 | result = "O1" + ' ' + value_O1 138 | serotype_fh.write(result.strip()) 139 | CODE 140 | >>> 141 | output { 142 | File srst2_detailed_tsv = "~{samplename}.detailed.tsv" 143 | String srst2_version = read_string("VERSION") 144 | String srst2_vibrio_ctxA = read_string("ctxA") 145 | String srst2_vibrio_ompW = read_string("ompW") 146 | String srst2_vibrio_toxR = read_string("toxR") 147 | String srst2_vibrio_biotype = read_string("BIOTYPE") 148 | String srst2_vibrio_serogroup = read_string("SEROGROUP") 149 | } 150 | runtime { 151 | docker: "~{docker}" 152 | memory: "8 GB" 153 | cpu: cpu 154 | disks: "local-disk " + disk_size + " SSD" 155 | disk: disk_size + " GB" 156 | maxRetries: 3 157 | preemptible: 0 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /tasks/assembly/task_shovill.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task shovill_pe { 4 | input { 5 | File read1_cleaned 6 | File read2_cleaned 7 | String samplename 8 | String docker = "quay.io/staphb/shovill:1.1.0" 9 | Int disk_size = 100 10 | 11 | ## SHOVILL optional parameters 12 | ## --depth [INT] Sub-sample --R1/--R2 to this depth. Disable with --depth 0 (default: 150) 13 | ## --gsize [STRING] Estimated genome size eg. 3.2M (default: '') 14 | ## --minlen [INT] Minimum contig length <0=AUTO> (default: 0) 15 | ## --mincov [FLOAT] Minimum contig coverage <0=AUTO> (default: 2) 16 | ## --assembler [STRING] Assembler: skesa velvet megahit spades (default: 'spades') 17 | ## --opts [STRING] Extra assembler options in quotes eg. spades: "--untrusted-contigs locus.fna" ... (default: '') 18 | ## --kmers [STRING] K-mers to use (default: '') 19 | ## --trim [BOOLEAN] Enable adaptor trimming (default: OFF) 20 | ## --noreadcorr [BOOLEAN] Disable read error correction (default: OFF) 21 | ## --nostitch [BOOLEAN] Disable read stitching (default: OFF) 22 | ## --nocorr [BOOLEAN] Disable post-assembly correction (default: OFF) 23 | 24 | 25 | Int? depth 26 | String? genome_size 27 | Int min_contig_length = 200 28 | Float? min_coverage 29 | String assembler = "skesa" 30 | String? assembler_options 31 | String? kmers 32 | Boolean trim = false 33 | Boolean noreadcorr = false 34 | Boolean nostitch = false 35 | Boolean nocorr = false 36 | } 37 | command <<< 38 | shovill --version | head -1 | tee VERSION 39 | shovill \ 40 | --outdir out \ 41 | --R1 ~{read1_cleaned} \ 42 | --R2 ~{read2_cleaned} \ 43 | --minlen ~{min_contig_length} \ 44 | ~{'--depth ' + depth} \ 45 | ~{'--gsize ' + genome_size} \ 46 | ~{'--mincov ' + min_coverage} \ 47 | ~{'--assembler ' + assembler} \ 48 | ~{'--opts ' + assembler_options} \ 49 | ~{'--kmers ' + kmers} \ 50 | ~{true='--trim' false='' trim} \ 51 | ~{true='--noreadcorr' false='' noreadcorr} \ 52 | ~{true='--nostitch' false='' nostitch} \ 53 | ~{true='--nocorr' false='' nocorr} 54 | 55 | mv out/contigs.fa out/~{samplename}_contigs.fasta 56 | 57 | if [ "~{assembler}" == "spades" ] ; then 58 | mv out/contigs.gfa out/~{samplename}_contigs.gfa 59 | elif [ "~{assembler}" == "megahit" ] ; then 60 | mv out/contigs.fastg out/~{samplename}_contigs.fastg 61 | elif [ "~{assembler}" == "velvet" ] ; then 62 | mv out/contigs.LastGraph out/~{samplename}_contigs.LastGraph 63 | fi 64 | 65 | >>> 66 | output { 67 | File assembly_fasta = "out/~{samplename}_contigs.fasta" 68 | File? contigs_gfa = "out/~{samplename}_contigs.gfa" 69 | File? contigs_fastg = "out/~{samplename}_contigs.fastg" 70 | File? 
contigs_lastgraph = "out/~{samplename}_contigs.LastGraph" 71 | String shovill_version = read_string("VERSION") 72 | } 73 | runtime { 74 | docker: "~{docker}" 75 | memory: "16 GB" 76 | cpu: 4 77 | disks: "local-disk " + disk_size + " SSD" 78 | disk: disk_size + " GB" 79 | maxRetries: 3 80 | preemptible: 0 81 | } 82 | } 83 | 84 | task shovill_se { 85 | input { 86 | File read1_cleaned 87 | String samplename 88 | String docker = "quay.io/staphb/shovill-se:1.1.0" 89 | Int disk_size = 100 90 | 91 | ## SHOVILL optional parameters 92 | ## --depth [INT] Sub-sample reads to this depth. Disable with --depth 0 (default: 150) 93 | ## --gsize [STRING] Estimated genome size eg. 3.2M (default: '') 94 | ## --minlen [INT] Minimum contig length <0=AUTO> (default: 0) 95 | ## --mincov [FLOAT] Minimum contig coverage <0=AUTO> (default: 2) 96 | ## --assembler [STRING] Assembler: skesa velvet megahit spades (default: 'spades') 97 | ## --opts [STRING] Extra assembler options in quotes eg. spades: "--untrusted-contigs locus.fna" ... (default: '') 98 | ## --kmers [STRING] K-mers to use (default: '') 99 | ## --trim [BOOLEAN] Enable adaptor trimming (default: OFF) 100 | ## --noreadcorr [BOOLEAN] Disable read error correction (default: OFF) 101 | ## --nocorr [BOOLEAN] Disable post-assembly correction (default: OFF) 102 | 103 | Int? depth 104 | String? genome_size 105 | Int min_contig_length = 200 106 | Float? min_coverage 107 | String assembler = "spades" 108 | String? assembler_options 109 | String? kmers 110 | Boolean trim = false 111 | Boolean noreadcorr = false 112 | Boolean nocorr = false 113 | } 114 | command <<< 115 | shovill-se --version | head -1 | tee VERSION 116 | shovill-se \ 117 | --outdir out \ 118 | --se ~{read1_cleaned} \ 119 | --minlen ~{min_contig_length} \ 120 | ~{'--depth ' + depth} \ 121 | ~{'--gsize ' + genome_size} \ 122 | ~{'--mincov ' + min_coverage} \ 123 | ~{'--assembler ' + assembler} \ 124 | ~{'--opts ' + assembler_options} \ 125 | ~{'--kmers ' + kmers} \ 126 | ~{true='--trim' false='' trim} \ 127 | ~{true='--noreadcorr' false='' noreadcorr} \ 128 | ~{true='--nocorr' false='' nocorr} 129 |
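# For reference (an illustrative rendering; sample.fastq.gz is a placeholder path): with the
# default inputs above, the placeholders for unset optional inputs and false Booleans render
# as empty strings, so the command collapses to
#   shovill-se --outdir out --se sample.fastq.gz --minlen 200 --assembler spades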
130 | mv out/contigs.fa out/~{samplename}_contigs.fasta 131 | 132 | if [ "~{assembler}" == "spades" ] ; then 133 | mv out/contigs.gfa out/~{samplename}_contigs.gfa 134 | elif [ "~{assembler}" == "megahit" ] ; then 135 | mv out/contigs.fastg out/~{samplename}_contigs.fastg 136 | elif [ "~{assembler}" == "velvet" ] ; then 137 | mv out/contigs.LastGraph out/~{samplename}_contigs.LastGraph 138 | fi 139 | >>> 140 | output { 141 | File assembly_fasta = "out/~{samplename}_contigs.fasta" 142 | File? contigs_gfa = "out/~{samplename}_contigs.gfa" 143 | File? contigs_fastg = "out/~{samplename}_contigs.fastg" 144 | File? contigs_lastgraph = "out/~{samplename}_contigs.LastGraph" 145 | String shovill_version = read_string("VERSION") 146 | } 147 | runtime { 148 | docker: "~{docker}" 149 | memory: "16 GB" 150 | cpu: 4 151 | disks: "local-disk " + disk_size + " SSD" 152 | disk: disk_size + " GB" 153 | maxRetries: 3 154 | preemptible: 0 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /tasks/species_typing/task_kleborate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task kleborate { 4 | # Inputs 5 | input { 6 | File assembly 7 | String samplename 8 | String kleborate_docker_image = "quay.io/staphb/kleborate:2.2.0" 9 | Int disk_size = 100 10 | 11 | # Parameters 12 | # --resistance Turn on resistance genes screening (default: no resistance gene screening) 13 | # --kaptive Equivalent to --kaptive_k --kaptive_o 14 | # --min_identity MIN_IDENTITY Minimum alignment percent identity for main results (default: 90.0) 15 | # --min_coverage MIN_COVERAGE Minimum alignment percent coverage for main results (default: 80.0) 16 | # --min_spurious_identity MIN_SPURIOUS_IDENTITY Minimum alignment percent identity for spurious results (default: 80.0) 17 | # --min_spurious_coverage MIN_SPURIOUS_COVERAGE Minimum alignment percent coverage for spurious results (default: 40.0) 18 | # --min_kaptive_confidence {None,Low,Good,High,Very_high,Perfect} Minimum Kaptive confidence to call K/O loci - confidence levels below this will be reported as unknown (default: Good) 19 | Boolean skip_resistance = false 20 | Boolean skip_kaptive = false 21 | Float min_identity = 90.0 22 | Float min_coverage = 80.0 23 | Float min_spurious_identity = 80.0 24 | Float min_spurious_coverage = 40.0 25 | String min_kaptive_confidence = "Good" 26 | } 27 | command <<< 28 | # capture date and version 29 | # Print and save date 30 | date | tee DATE 31 | # Print and save version 32 | kleborate --version | tee VERSION 33 | # Run Kleborate on the input assembly and write output with the samplename prefix; resistance and Kaptive screening run unless the skip_* inputs are set (the --all flag is not used because it is equivalent to --resistance --kaptive and would override the skip_* inputs) 34 | kleborate \ 35 | ~{true="" false="--resistance" skip_resistance} \ 36 | ~{true="" false="--kaptive" skip_kaptive} \ 37 | ~{'--min_identity ' + min_identity} \ 38 | ~{'--min_coverage ' + min_coverage} \ 39 | ~{'--min_spurious_identity ' + min_spurious_identity} \ 40 | ~{'--min_spurious_coverage ' + min_spurious_coverage} \ 41 | ~{'--min_kaptive_confidence ' + min_kaptive_confidence} \ 42 | --outfile ~{samplename}_kleborate_out.tsv \ 43 | --assemblies ~{assembly} 44 | 45 | # parse outputs 46 | python3 <>> 119 | output { 120 | File kleborate_output_file = "~{samplename}_kleborate_out.tsv" 121 | String kleborate_version = read_string("VERSION") 122 | String kleborate_docker = kleborate_docker_image 123 | String kleborate_mlst_sequence_type = read_string("MLST_SEQUENCE_TYPE") 124 | String kleborate_virulence_score = read_string("VIRULENCE_SCORE") 125 | String kleborate_resistance_score = read_string("RESISTANCE_SCORE") 126 | String kleborate_num_resistance_genes = read_string("NUM_RESISTANCE_GENES") 127 | String kleborate_bla_resistance_genes = read_string("BLA_RESISTANCE_GENES") 128 | String kleborate_esbl_resistance_genes = read_string("ESBL_RESISTANCE_GENES") 129 | String kleborate_key_resistance_genes = read_string("KEY_RESISTANCE_GENES") 130 | String kleborate_genomic_resistance_mutations = read_string("GENOMIC_RESISTANCE_MUTATIONS") 131 | String kleborate_klocus = read_string("K_LOCUS") 132 | String kleborate_ktype =
read_string("K_TYPE") 133 | String kleborate_otype = read_string("O_TYPE") 134 | String kleborate_olocus = read_string("O_LOCUS") 135 | String kleborate_klocus_confidence = read_string("K_LOCUS_CONFIDENCE") 136 | String kleborate_olocus_confidence = read_string("O_LOCUS_CONFIDENCE") 137 | } 138 | runtime { 139 | docker: "~{kleborate_docker_image}" 140 | memory: "16 GB" 141 | cpu: 8 142 | disks: "local-disk " + disk_size + " SSD" 143 | disk: disk_size + " GB" 144 | maxRetries: 3 145 | } 146 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_shigeifinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task shigeifinder { 4 | input { 5 | File assembly 6 | String samplename 7 | String docker = "staphb/shigeifinder:1.3.3" 8 | Int disk_size = 100 9 | Int cpu = 2 10 | } 11 | command <<< 12 | # capture date 13 | date | tee DATE 14 | # shigeifinder does not have a --version flag, relying upon the docker image tag for the version - which StaPH-B/Curtis maintains 15 | echo "~{docker}" | sed 's|staphb/shigeifinder:||' | tee VERSION.txt 16 | 17 | # ShigEiFinder checks that all dependencies are installed before running 18 | echo "checking for shigeifinder dependencies and running ShigEiFinder..." 19 | # run shigeifinder on assembly; default is 4cpus, so turning down to 2 since it's already very fast 20 | shigeifinder -i ~{assembly} \ 21 | -t ~{cpu} \ 22 | --hits \ 23 | --output ~{samplename}_shigeifinder.tsv 24 | 25 | # parse output TSV 26 | echo "Parsing ShigEiFinder output TSV..." 27 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 2 >shigeifinder_ipaH_presence_absence.txt 28 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 3 >shigeifinder_num_virulence_plasmid_genes.txt 29 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 4 >shigeifinder_cluster.txt 30 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 5 >shigeifinder_serotype.txt 31 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 6 >shigeifinder_O_antigen.txt 32 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 7 >shigeifinder_H_antigen.txt 33 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 8 >shigeifinder_notes.txt 34 | 35 | # set helpful output strings if field in TSV is blank by overwriting output TXT files 36 | if [ "$(cat shigeifinder_ipaH_presence_absence.txt)" == "" ]; then 37 | echo "ShigEiFinder ipaH field was empty" > shigeifinder_ipaH_presence_absence.txt 38 | fi 39 | if [ "$(cat shigeifinder_num_virulence_plasmid_genes.txt)" == "" ]; then 40 | echo "ShigEiFinder number of virulence plasmid genes field was empty" > shigeifinder_num_virulence_plasmid_genes.txt 41 | fi 42 | if [ "$(cat shigeifinder_cluster.txt)" == "" ]; then 43 | echo "ShigEiFinder cluster field was empty" > shigeifinder_cluster.txt 44 | fi 45 | if [ "$(cat shigeifinder_serotype.txt)" == "" ]; then 46 | echo "ShigEiFinder serotype field was empty" > shigeifinder_serotype.txt 47 | fi 48 | if [ "$(cat shigeifinder_O_antigen.txt)" == "" ]; then 49 | echo "ShigEiFinder O antigen field was empty" > shigeifinder_O_antigen.txt 50 | fi 51 | if [ "$(cat shigeifinder_H_antigen.txt)" == "" ]; then 52 | echo "ShigEiFinder H antigen field was empty" > shigeifinder_H_antigen.txt 53 | fi 54 | if [ "$(cat shigeifinder_notes.txt)" == "" ]; then 55 | echo "ShigEiFinder notes field was empty" > shigeifinder_notes.txt 56 | fi 57 | 58 | >>> 59 | output { 60 | File 
shigeifinder_report = "~{samplename}_shigeifinder.tsv" 61 | String shigeifinder_docker = docker 62 | String shigeifinder_version = read_string("VERSION.txt") 63 | String shigeifinder_ipaH_presence_absence = read_string("shigeifinder_ipaH_presence_absence.txt") 64 | String shigeifinder_num_virulence_plasmid_genes = read_string("shigeifinder_num_virulence_plasmid_genes.txt") 65 | String shigeifinder_cluster = read_string("shigeifinder_cluster.txt") 66 | String shigeifinder_serotype = read_string("shigeifinder_serotype.txt") 67 | String shigeifinder_O_antigen = read_string("shigeifinder_O_antigen.txt") 68 | String shigeifinder_H_antigen = read_string("shigeifinder_H_antigen.txt") 69 | String shigeifinder_notes = read_string("shigeifinder_notes.txt") 70 | } 71 | runtime { 72 | docker: "~{docker}" 73 | memory: "8 GB" 74 | cpu: cpu 75 | disks: "local-disk " + disk_size + " SSD" 76 | disk: disk_size + " GB" 77 | preemptible: 0 78 | maxRetries: 3 79 | } 80 | } 81 | task shigeifinder_reads { 82 | input { 83 | File read1 84 | File? read2 85 | String samplename 86 | String docker = "staphb/shigeifinder:1.3.3" 87 | Int disk_size = 100 88 | Int cpu = 4 89 | Boolean paired_end = true 90 | } 91 | command <<< 92 | # capture date 93 | date | tee DATE 94 | # shigeifinder does not have a --version flag, so we rely on the docker image tag for the version (maintained by StaPH-B/Curtis) 95 | echo "~{docker}" | sed 's|staphb/shigeifinder:||' | tee VERSION.txt 96 | 97 | # ShigEiFinder checks that all dependencies are installed before running 98 | echo "checking for shigeifinder dependencies and running ShigEiFinder..." 99 | # run shigeifinder on reads; the default of 4 CPUs is kept here since this mode performs read alignment 100 | shigeifinder -r -i ~{read1} ~{read2} \ 101 | ~{true='' false='--single_end' paired_end} \ 102 | -t ~{cpu} \ 103 | --hits \ 104 | --output ~{samplename}_shigeifinder.tsv 105 | 106 | # parse output TSV 107 | echo "Parsing ShigEiFinder output TSV..."
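# (Each field below is extracted with the same idiom: head -n 2 keeps the header plus the first
# and only result row, tail -n 1 drops the header, and cut -f N selects the Nth tab-delimited column.)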
108 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 2 >shigeifinder_ipaH_presence_absence.txt 109 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 3 >shigeifinder_num_virulence_plasmid_genes.txt 110 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 4 >shigeifinder_cluster.txt 111 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 5 >shigeifinder_serotype.txt 112 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 6 >shigeifinder_O_antigen.txt 113 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 7 >shigeifinder_H_antigen.txt 114 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 8 >shigeifinder_notes.txt 115 | 116 | # set helpful output strings if field in TSV is blank by overwriting output TXT files 117 | if [ "$(cat shigeifinder_ipaH_presence_absence.txt)" == "" ]; then 118 | echo "ShigEiFinder ipaH field was empty" > shigeifinder_ipaH_presence_absence.txt 119 | fi 120 | if [ "$(cat shigeifinder_num_virulence_plasmid_genes.txt)" == "" ]; then 121 | echo "ShigEiFinder number of virulence plasmid genes field was empty" > shigeifinder_num_virulence_plasmid_genes.txt 122 | fi 123 | if [ "$(cat shigeifinder_cluster.txt)" == "" ]; then 124 | echo "ShigEiFinder cluster field was empty" > shigeifinder_cluster.txt 125 | fi 126 | if [ "$(cat shigeifinder_serotype.txt)" == "" ]; then 127 | echo "ShigEiFinder serotype field was empty" > shigeifinder_serotype.txt 128 | fi 129 | if [ "$(cat shigeifinder_O_antigen.txt)" == "" ]; then 130 | echo "ShigEiFinder O antigen field was empty" > shigeifinder_O_antigen.txt 131 | fi 132 | if [ "$(cat shigeifinder_H_antigen.txt)" == "" ]; then 133 | echo "ShigEiFinder H antigen field was empty" > shigeifinder_H_antigen.txt 134 | fi 135 | if [ "$(cat shigeifinder_notes.txt)" == "" ]; then 136 | echo "ShigEiFinder notes field was empty" > shigeifinder_notes.txt 137 | fi 138 | 139 | >>> 140 | output { 141 | File shigeifinder_report = "~{samplename}_shigeifinder.tsv" 142 | String shigeifinder_docker = docker 143 | String shigeifinder_version = read_string("VERSION.txt") 144 | String shigeifinder_ipaH_presence_absence = read_string("shigeifinder_ipaH_presence_absence.txt") 145 | String shigeifinder_num_virulence_plasmid_genes = read_string("shigeifinder_num_virulence_plasmid_genes.txt") 146 | String shigeifinder_cluster = read_string("shigeifinder_cluster.txt") 147 | String shigeifinder_serotype = read_string("shigeifinder_serotype.txt") 148 | String shigeifinder_O_antigen = read_string("shigeifinder_O_antigen.txt") 149 | String shigeifinder_H_antigen = read_string("shigeifinder_H_antigen.txt") 150 | String shigeifinder_notes = read_string("shigeifinder_notes.txt") 151 | } 152 | runtime { 153 | docker: "~{docker}" 154 | memory: "8 GB" 155 | cpu: cpu 156 | disks: "local-disk " + disk_size + " SSD" 157 | disk: disk_size + " GB" 158 | preemptible: 0 159 | maxRetries: 3 160 | } 161 | } --------------------------------------------------------------------------------