├── tests ├── README.md ├── inputs │ ├── empty-for-test.txt │ ├── wf_theiaprok_illumina_pe_cromwell.json │ └── wf_theiaprok_illumina_pe.json └── config │ └── pytest_filter.yml ├── validation_files └── kleb_assembly_input.json ├── workflows ├── wf_bc_n_qc_pe_local-dev.wdl ├── wf_rasusa.wdl ├── wf_serotypefinder.wdl ├── fetch_sra_to_fastq.wdl ├── wf_gambit_query.wdl ├── wf_kraken2_se.wdl ├── wf_pmga.wdl ├── wf_mycosnp_consensus_assembly.wdl ├── wf_kraken2_pe.wdl ├── wf_mycosnp_tree.wdl ├── wf_kleborate.wdl ├── wf_amrfinderplus.wdl ├── wf_mashtree_fasta.wdl ├── wf_tbprofiler_ont.wdl ├── wf_tbprofiler_pe.wdl ├── de_novo_assembly.wdl ├── wf_ksnp3.wdl ├── ecoli_char.wdl ├── wf_read_QC_trim.wdl ├── wf_read_QC_trim_se.wdl ├── compile_ecoli_results.wdl └── wf_core_gene_snp.wdl ├── tasks ├── task_versioning.wdl ├── gene_typing │ ├── task_gamma.wdl │ ├── task_prokka.wdl │ ├── task_abricate.wdl │ ├── task_plasmidfinder.wdl │ ├── task_bakta.wdl │ └── task_resfinder.wdl ├── species_typing │ ├── task_ssuissero.wdl │ ├── task_hpsuissero.wdl │ ├── task_lissero.wdl │ ├── task_legsta.wdl │ ├── task_pasty.wdl │ ├── task_seroba.wdl │ ├── task_spatyper.wdl │ ├── task_sistr.wdl │ ├── task_serotypefinder.wdl │ ├── task_pmga.wdl │ ├── task_ectyper.wdl │ ├── task_pbptyper.wdl │ ├── task_staphopiasccmec.wdl │ ├── task_meningotype.wdl │ ├── task_seqsero2.wdl │ ├── task_hicap.wdl │ ├── task_emmtyper.wdl │ ├── task_genotyphi.wdl │ ├── task_shigatyper.wdl │ ├── task_ngmaster.wdl │ ├── task_sonneityping.wdl │ ├── task_ts_mlst.wdl │ ├── task_agrvate.wdl │ ├── task_poppunk_streppneumo.wdl │ ├── task_srst2_vibrio.wdl │ ├── task_kleborate.wdl │ └── task_shigeifinder.wdl ├── phylogenetic_inference │ ├── task_iqtree.wdl │ ├── task_mashtree.wdl │ ├── task_mycosnp_tree.wdl │ ├── task_ksnp3.wdl │ ├── task_snp_dists.wdl │ └── task_pirate.wdl ├── assembly │ ├── task_mycosnp_consensus_assembly.wdl │ └── task_shovill.wdl ├── quality_control │ ├── task_quast.wdl │ ├── task_busco.wdl │ ├── task_trimmomatic.wdl │ ├── task_bbduk.wdl │ ├── task_fastp.wdl │ ├── task_fastq_scan.wdl │ ├── task_fastqc.wdl │ ├── task_cg_pipeline.wdl │ └── task_mummer_ani.wdl ├── utilities │ ├── task_rasusa.wdl │ └── task_summarize_data.wdl └── taxon_id │ ├── task_midas.wdl │ └── task_kraken2.wdl ├── README.md ├── .github └── workflows │ ├── miniwdl-check.yml │ └── pytest-workflows.yml └── .dockstore.yml /tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing info 2 | -------------------------------------------------------------------------------- /tests/inputs/empty-for-test.txt: -------------------------------------------------------------------------------- 1 | This file is empty for test purposes. 
-------------------------------------------------------------------------------- /validation_files/kleb_assembly_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "kleborate_wf.assembly": "./validation_files/GCF_000240185.1_ASM24018v2_genomic.fna", 3 | "kleborate_wf.samplename": "Sample1" 4 | } 5 | -------------------------------------------------------------------------------- /workflows/wf_bc_n_qc_pe_local-dev.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "wf_bc_n_qc_pe.wdl" as assembly 4 | 5 | workflow bc_n_qc_local { 6 | input { 7 | Array[Pair[Array[String], Pair[File,File]]] inputSamples 8 | } 9 | 10 | scatter (sample in inputSamples) { 11 | call assembly.bc_n_qc_pe { 12 | input: 13 | samplename = sample.left[0], 14 | read1_raw = sample.right.left, 15 | read2_raw = sample.right.right 16 | } 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /tests/inputs/wf_theiaprok_illumina_pe_cromwell.json: -------------------------------------------------------------------------------- 1 | { 2 | "theiaprok_illumina_pe.samplename": "test", 3 | "theiaprok_illumina_pe.read1_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R1.fastq.gz", 4 | "theiaprok_illumina_pe.read2_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R2.fastq.gz", 5 | "theiaprok_illumina_pe.skip_screen": true, 6 | "theiaprok_illumina_pe.read_QC_trim.call_midas": false, 7 | "theiaprok_illumina_pe.genome_annotation": "prokka", 8 | "theiaprok_illumina_pe.shovill_pe.assembler": "skesa", 9 | "theiaprok_illumina_pe.merlin_magic.call_poppunk": false 10 | } 11 | -------------------------------------------------------------------------------- /tasks/task_versioning.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task version_capture { 4 | input { 5 | String? 
timezone 6 | } 7 | meta { 8 | volatile: true 9 | } 10 | command { 11 | PHBG_Version="PHBG v1.3.0" 12 | ~{default='' 'export TZ=' + timezone} 13 | date +"%Y-%m-%d" > TODAY 14 | echo "$PHBG_Version" > PHBG_VERSION 15 | } 16 | output { 17 | String date = read_string("TODAY") 18 | String phbg_version = read_string("PHBG_VERSION") 19 | } 20 | runtime { 21 | memory: "1 GB" 22 | cpu: 1 23 | docker: "quay.io/theiagen/utility:1.1" 24 | disks: "local-disk 10 HDD" 25 | dx_instance_type: "mem1_ssd1_v2_x2" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_gamma.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task gamma_one_sample { 4 | input { 5 | File assembly_fasta 6 | String samplename 7 | String docker = "quay.io/biocontainers/gamma:1.4--hdfd78af_0" 8 | File gamma_database 9 | Int disk_size = 100 10 | } 11 | String database_name = basename(gamma_database) 12 | command <<< 13 | GAMMA.py ~{assembly_fasta} ~{gamma_database} ~{samplename} 14 | 15 | mv ~{samplename}.gamma ~{samplename}_gamma_report.tsv 16 | 17 | >>> 18 | output { 19 | File gamma_results = "~{samplename}_gamma_report.tsv" 20 | String gamma_database_version = database_name 21 | String gamma_docker = docker 22 | } 23 | runtime { 24 | memory: "8 GB" 25 | cpu: 4 26 | docker: "~{docker}" 27 | disks: "local-disk " + disk_size + " SSD" 28 | disk: disk_size + " GB" 29 | maxRetries: 3 30 | } 31 | } -------------------------------------------------------------------------------- /workflows/wf_rasusa.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/utilities/task_rasusa.wdl" as rasusa 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow rasusa_workflow { 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | Float coverage 12 | String genome_size 13 | } 14 | call rasusa.rasusa as rasusa_task { 15 | input: 16 | read1 = read1, 17 | read2 = read2, 18 | samplename = samplename, 19 | genome_size = genome_size, 20 | coverage = coverage 21 | } 22 | call versioning.version_capture{ 23 | input: 24 | } 25 | output { 26 | String rasusa_wf_version = version_capture.phbg_version 27 | String rasusa_wf_analysis_date = version_capture.date 28 | 29 | String rasusa_version = rasusa_task.rasusa_version 30 | File read1_subsampled = rasusa_task.read1_subsampled 31 | File? 
read2_subsampled = rasusa_task.read2_subsampled 32 | } 33 | } -------------------------------------------------------------------------------- /workflows/wf_serotypefinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/task_taxon_id.wdl" as taxon_ID 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow serotypefinder { 7 | input { 8 | String samplename 9 | File ecoli_assembly 10 | } 11 | call taxon_ID.serotypefinder_one_sample { 12 | input: 13 | samplename = samplename, 14 | ecoli_assembly = ecoli_assembly 15 | } 16 | call versioning.version_capture{ 17 | input: 18 | } 19 | output { 20 | String serotypefinder_wf_version = version_capture.phbg_version 21 | String serotypefinder_wf_analysis_date = version_capture.date 22 | 23 | String serotypefinder_report = serotypefinder_one_sample.serotypefinder_report 24 | String serotypefinder_docker = serotypefinder_one_sample.serotypefinder_docker 25 | String serotypefinder_serotype = serotypefinder_one_sample.serotypefinder_serotype 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tasks/species_typing/task_ssuissero.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ssuissero { 4 | meta { 5 | description: "Serotype prediction of Streptococcus suis assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/ssuissero:1.0.1--hdfd78af_0" 11 | Int disk_size = 100 12 | Int? cpu = 4 13 | String version = "1.0.1" 14 | } 15 | command <<< 16 | # Does not output a version 17 | echo ~{version} | tee VERSION 18 | SsuisSero.sh \ 19 | -i ~{assembly} \ 20 | -o ./ \ 21 | -s ~{samplename} \ 22 | -x fasta \ 23 | -t ~{cpu} 24 | >>> 25 | output { 26 | File ssuissero_results = "~{samplename}.tsv" 27 | String ssuissero_version = read_string("VERSION") 28 | } 29 | runtime { 30 | docker: "~{docker}" 31 | memory: "8 GB" 32 | cpu: 4 33 | disks: "local-disk " + disk_size + " SSD" 34 | disk: disk_size + " GB" 35 | maxRetries: 3 36 | preemptible: 0 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tasks/species_typing/task_hpsuissero.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task hpsuissero { 4 | meta { 5 | description: "Serotype prediction of Haemophilus parasuis assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/hpsuissero:1.0.1--hdfd78af_0" 11 | Int disk_size = 100 12 | Int?
cpu = 4 13 | String version = "1.0.1" 14 | } 15 | command <<< 16 | # Does not output a version 17 | echo ~{version} | tee VERSION 18 | HpsuisSero.sh \ 19 | -i ~{assembly} \ 20 | -o ./ \ 21 | -s ~{samplename} \ 22 | -x fasta \ 23 | -t ~{cpu} 24 | >>> 25 | output { 26 | File hpsuissero_results = "~{samplename}.tsv" 27 | String hpsuissero_version = read_string("VERSION") 28 | } 29 | runtime { 30 | docker: "~{docker}" 31 | memory: "8 GB" 32 | cpu: 4 33 | disks: "local-disk " + disk_size + " SSD" 34 | disk: disk_size + " GB" 35 | maxRetries: 3 36 | preemptible: 0 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /workflows/fetch_sra_to_fastq.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow fetch_sra_to_fastq { 4 | 5 | input { 6 | String SRR 7 | } 8 | 9 | call prefetch_fastq_dump { 10 | input: 11 | sra_id=SRR 12 | } 13 | 14 | output { 15 | File read1 =prefetch_fastq_dump.read1 16 | File read2 =prefetch_fastq_dump.read2 17 | } 18 | } 19 | 20 | task prefetch_fastq_dump { 21 | 22 | input { 23 | String sra_id 24 | } 25 | 26 | command { 27 | prefetch --version | head -1 | tee VERSION 28 | prefetch ${sra_id} 29 | fastq-dump --version | head -1 | tee VERSION 30 | fastq-dump \ 31 | --gzip \ 32 | --split-files \ 33 | ${sra_id} 34 | } 35 | 36 | output { 37 | File read1="${sra_id}_1.fastq.gz" 38 | File read2="${sra_id}_2.fastq.gz" 39 | } 40 | 41 | runtime { 42 | docker: "quay.io/staphb/sratoolkit:2.9.2" 43 | memory: "8 GB" 44 | cpu: 2 45 | disks: "local-disk 100 SSD" 46 | preemptible: 1 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /workflows/wf_gambit_query.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_gambit.wdl" as gambit 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow gambit_query { 7 | input { 8 | File assembly_fasta 9 | String samplename 10 | } 11 | call gambit.gambit { 12 | input: 13 | assembly = assembly_fasta, 14 | samplename = samplename, 15 | } 16 | call versioning.version_capture { 17 | input: 18 | } 19 | output { 20 | String gambit_query_wf_version = version_capture.phbg_version 21 | String gambit_query_wf_analysis_date = version_capture.date 22 | #Taxon ID 23 | File gambit_report = gambit.gambit_report_file 24 | File gambit_closest_genomes = gambit.gambit_closest_genomes_file 25 | String gambit_predicted_taxon = gambit.gambit_predicted_taxon 26 | String gambit_predicted_taxon_rank = gambit.gambit_predicted_taxon_rank 27 | String gambit_version = gambit.gambit_version 28 | String gambit_db_version = gambit.gambit_db_version 29 | String gambit_docker = gambit.gambit_docker 30 | } 31 | } -------------------------------------------------------------------------------- /tests/config/pytest_filter.yml: -------------------------------------------------------------------------------- 1 | wf_theiaprok_illumina_pe: 2 | - workflows/wf_theiaprok_illumina_pe.wdl 3 | - tasks/assembly/task_shovill.wdl 4 | - tasks/quality_control/task_quast.wdl 5 | - tasks/quality_control/task_cg_pipeline.wdl 6 | - tasks/quality_control/task_screen.wdl 7 | - tasks/taxon_id/task_gambit.wdl 8 | - tasks/gene_typing/task_amrfinderplus.wdl 9 | - tasks/species_typing/task_ts_mlst.wdl 10 | - tasks/task_versioning.wdl 11 | - tasks/utilities/task_broad_terra_tools.wdl 12 | - workflows/wf_read_QC_trim.wdl 13 | - 
tasks/quality_control/task_trimmomatic.wdl 14 | - tasks/quality_control/task_bbduk.wdl 15 | - tasks/quality_control/task_fastq_scan.wdl 16 | - workflows/wf_merlin_magic.wdl 17 | - tasks/species_typing/task_serotypefinder.wdl 18 | - tasks/species_typing/task_ectyper.wdl 19 | - tasks/species_typing/task_lissero.wdl 20 | - tasks/species_typing/task_sistr.wdl 21 | - tasks/species_typing/task_seqsero2.wdl 22 | - tasks/species_typing/task_kleborate.wdl 23 | - tasks/species_typing/task_tbprofiler.wdl 24 | - tasks/species_typing/task_legsta.wdl 25 | - tasks/species_typing/task_genotyphi.wdl 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | **
NOTE: WORKFLOWS FROM THIS REPOSITORY HAVE BEEN MIGRATED TO THE PUBLIC HEALTH BIOINFORMATICS (PHB) REPOSITORY. FUTURE DEVELOPMENTS AND UPDATES FOR THESE WORKFLOWS WILL OCCUR IN [https://github.com/theiagen/public_health_bioinformatics](https://github.com/theiagen/public_health_bioinformatics).
** 3 | 4 | ---- 5 | 6 | # Public Health Bacterial Genomics 7 | 8 | Bioinformatics workflows for genomic characterization, submission preparation, and genomic epidemiology of bacterial pathogens of concern. 9 | 10 | **More information about the steps undertaken in these workflows is available via the [Theiagen Public Resources Documentation](https://theiagen.notion.site/Theiagen-Public-Health-Resources-a4bd134b0c5c4fe39870e21029a30566).** 11 | 12 | # Note to Users 13 | This repository and the workflows within are in the early stages of development. We recommend using our stable version releases, as the main and development branches are subject to routine updates. Please contact support@terrapublichealth.zendesk.com if you would like to be added to our PHBG mailing list and 14 | receive updates and announcements regarding any resource associated with this repository. 15 | 16 | 17 | -------------------------------------------------------------------------------- /workflows/wf_kraken2_se.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_kraken2.wdl" as kraken2 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow kraken2_se_wf { 7 | meta { 8 | description: "Classify single-end reads using Kraken2" 9 | } 10 | 11 | input { 12 | String samplename 13 | File read1 14 | File kraken2_db 15 | } 16 | call kraken2.kraken2_se { 17 | input: 18 | samplename = samplename, 19 | read1 = read1, 20 | kraken2_db = kraken2_db 21 | } 22 | call versioning.version_capture{ 23 | input: 24 | } 25 | output { 26 | # PHBG Version Captures 27 | String kraken2_se_wf_version = version_capture.phbg_version 28 | String kraken2_se_wf_analysis_date = version_capture.date 29 | # Kraken2 30 | String kraken2_version = kraken2_se.kraken2_version 31 | String kraken2_docker = kraken2_se.kraken2_docker 32 | File kraken2_report = kraken2_se.kraken2_report 33 | File kraken2_classified_report = kraken2_se.kraken2_classified_report 34 | File kraken2_unclassified_read1 = kraken2_se.kraken2_unclassified_read1 35 | File kraken2_classified_read1 = kraken2_se.kraken2_classified_read1 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /workflows/wf_pmga.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/species_typing/task_pmga.wdl" as pmga 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow pmga_wf { 7 | input { 8 | File assembly 9 | String samplename 10 | } 11 | call pmga.pmga { 12 | input: 13 | assembly = assembly, 14 | samplename = samplename 15 | } 16 | call versioning.version_capture{ 17 | input: 18 | } 19 | output { 20 | String pmga_wf_version = version_capture.phbg_version 21 | String pmga_wf_analysis_date = version_capture.date 22 | String pmga_version = pmga.version 23 | String pmga_docker = pmga.pmga_docker 24 | String pmga_speciesdb = pmga.pmga_speciesdb 25 | String pmga_serotype = pmga.pmga_serotype 26 | String pmga_genes = pmga.pmga_genes 27 | String pmga_notes = pmga.pmga_notes 28 | File pmga_results = pmga.pmga_results 29 | File pmga_allele_matrix = pmga.pmga_allele_matrix 30 | File pmga_blast_final = pmga.pmga_blast_final 31 | File pmga_blast_raw = pmga.pmga_blast_raw 32 | File pmga_loci_counts = pmga.pmga_loci_counts 33 | File pmga_gff = pmga.pmga_gff 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tasks/species_typing/task_lissero.wdl:
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task lissero { 4 | meta { 5 | description: "Serogroup typing prediction for Listeria monocytogenes" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/lissero:0.4.9--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | 14 | # Parameters 15 | # --min_id Minimum percent identity to accept a match [Default 95.0] 16 | # --min_cov Minimum coverage of the gene to accept a match [Default 95.0] 17 | Float min_id = 95.0 18 | Float min_cov = 95.0 19 | } 20 | command <<< 21 | echo $(lissero --version 2>&1) | sed 's/^.*LisSero //' | tee VERSION 22 | lissero \ 23 | ~{'--min_id ' + min_id} \ 24 | ~{'--min_cov ' + min_cov} \ 25 | ~{assembly} \ 26 | > ~{samplename}.tsv 27 | 28 | # pull out serotype 29 | tail -n+2 ~{samplename}.tsv | cut -f2 | tee SEROTYPE 30 | >>> 31 | output { 32 | File lissero_results = "~{samplename}.tsv" 33 | String lissero_version = read_string("VERSION") 34 | String lissero_serotype = read_string("SEROTYPE") 35 | } 36 | runtime { 37 | docker: "~{docker}" 38 | memory: "8 GB" 39 | cpu: 2 40 | disks: "local-disk " + disk_size + " SSD" 41 | disk: disk_size + " GB" 42 | maxRetries: 3 43 | preemptible: 0 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /workflows/wf_mycosnp_consensus_assembly.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/assembly/task_mycosnp_consensus_assembly.wdl" as mycosnp_nf 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow mycosnp_consensus_assembly { 7 | meta { 8 | description: "A WDL wrapper around the qc, processing and consensus assembly components of mycosnp-nf, for whole genome sequencing analysis of fungal organisms, including Candida auris." 9 | } 10 | input { 11 | File read1 12 | File read2 13 | String samplename 14 | } 15 | call mycosnp_nf.mycosnp { 16 | input: 17 | read1 = read1, 18 | read2 = read2, 19 | samplename = samplename 20 | } 21 | call versioning.version_capture{ 22 | input: 23 | } 24 | output { 25 | #Version Captures 26 | String mycosnp_consensus_assembly_version = version_capture.phbg_version 27 | String mycosnp_consensus_assembly_analysis_date = version_capture.date 28 | #MycoSNP QC and Assembly 29 | String mycosnp_version = mycosnp.mycosnp_version 30 | String mycosnp_docker = mycosnp.mycosnp_docker 31 | String analysis_date = mycosnp.analysis_date 32 | String reference_strain = mycosnp.reference_strain 33 | String reference_accession = mycosnp.reference_accession 34 | File assembly_fasta = mycosnp.assembly_fasta 35 | File full_results = mycosnp.full_results 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tasks/species_typing/task_legsta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task legsta { 4 | meta { 5 | description: "Typing of Legionella pneumophila assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/legsta:0.5.1--hdfd78af_2" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | } 14 | command <<< 15 | echo $(legsta --version 2>&1) | sed 's/^.*legsta //; s/ .*\$//;' | tee VERSION 16 | legsta \ 17 | ~{assembly} > ~{samplename}.tsv 18 | 19 | # parse outputs 20 | if [ ! 
-f ~{samplename}.tsv ]; then 21 | SBT="No SBT predicted" 22 | else 23 | SBT="ST$(tail -n 1 ~{samplename}.tsv | cut -f 2)" 24 | if [ "$SBT" == "ST-" ]; then 25 | SBT="No SBT predicted" 26 | else 27 | if [ "$SBT" == "ST" ]; then 28 | SBT="No SBT predicted" 29 | fi 30 | fi 31 | fi 32 | 33 | echo $SBT | tee LEGSTA_SBT 34 | 35 | >>> 36 | output { 37 | File legsta_results = "~{samplename}.tsv" 38 | String legsta_predicted_sbt = read_string("LEGSTA_SBT") 39 | String legsta_version = read_string("VERSION") 40 | } 41 | runtime { 42 | docker: "~{docker}" 43 | memory: "8 GB" 44 | cpu: 2 45 | disks: "local-disk " + disk_size + " SSD" 46 | disk: disk_size + " GB" 47 | maxRetries: 3 48 | preemptible: 0 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /workflows/wf_kraken2_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_kraken2.wdl" as kraken2 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow kraken2_pe_wf { 7 | meta { 8 | description: "Classify paired-end reads using Kraken2" 9 | } 10 | 11 | input { 12 | String samplename 13 | File read1 14 | File read2 15 | File kraken2_db 16 | } 17 | call kraken2.kraken2_pe { 18 | input: 19 | samplename = samplename, 20 | read1 = read1, 21 | read2 = read2, 22 | kraken2_db = kraken2_db 23 | } 24 | call versioning.version_capture{ 25 | input: 26 | } 27 | output { 28 | # PHBG Version Captures 29 | String kraken2_pe_wf_version = version_capture.phbg_version 30 | String kraken2_pe_wf_analysis_date = version_capture.date 31 | # Kraken2 32 | String kraken2_version = kraken2_pe.kraken2_version 33 | String kraken2_docker = kraken2_pe.kraken2_docker 34 | File kraken2_report = kraken2_pe.kraken2_report 35 | File kraken2_classified_report = kraken2_pe.kraken2_classified_report 36 | File kraken2_unclassified_read1 = kraken2_pe.kraken2_unclassified_read1 37 | File kraken2_unclassified_read2 = kraken2_pe.kraken2_unclassified_read2 38 | File kraken2_classified_read1 = kraken2_pe.kraken2_classified_read1 39 | File kraken2_classified_read2 = kraken2_pe.kraken2_classified_read2 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /workflows/wf_mycosnp_tree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_mycosnp_tree.wdl" as mycosnptree_nf 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow mycosnp_tree { 7 | meta { 8 | description: "A WDL wrapper around the phylogeny components of mycosnp-nf, for whole genome sequencing analysis of fungal organisms, including Candida auris." 
9 | } 10 | input { 11 | Array[String] samplename 12 | Array[File] assembly_fasta 13 | } 14 | call mycosnptree_nf.mycosnptree { 15 | input: 16 | assembly_fasta = assembly_fasta, 17 | samplename = samplename 18 | } 19 | call versioning.version_capture{ 20 | input: 21 | } 22 | output { 23 | #Version Captures 24 | String mycosnp_tree_version = version_capture.phbg_version 25 | String mycosnp_tree_analysis_date = version_capture.date 26 | #MycoSNP QC and Assembly 27 | String mycosnp_version = mycosnptree.mycosnptree_version 28 | String mycosnp_docker = mycosnptree.mycosnptree_docker 29 | String analysis_date = mycosnptree.analysis_date 30 | String reference_strain = mycosnptree.reference_strain 31 | String reference_accession = mycosnptree.reference_accession 32 | File mycosnp_tree_finaltree = mycosnptree.mycosnptree_tree 33 | File mycosnp_tree_iqtree_log = mycosnptree.mycosnptree_iqtree_log 34 | File mycosnp_tree_full_results = mycosnptree.mycosnptree_full_results 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /workflows/wf_kleborate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow kleborate_wf { 8 | input { 9 | File assembly 10 | String samplename 11 | } 12 | call taxon.kleborate_one_sample { 13 | input: 14 | assembly = assembly, 15 | samplename = samplename 16 | } 17 | call versioning.version_capture{ 18 | input: 19 | } 20 | output { 21 | String kleborate_wf_version = version_capture.phbg_version 22 | String kleborate_wf_analysis_date = version_capture.date 23 | 24 | File kleborate_report = kleborate_one_sample.kleborate_output_file 25 | String kleborate_version = kleborate_one_sample.version 26 | String kleborate_mlst_sequence_type = kleborate_one_sample.mlst_sequence_type 27 | String kleborate_virulence_score = kleborate_one_sample.virulence_score 28 | String kleborate_resistance_score = kleborate_one_sample.resistance_score 29 | String kleborate_num_resistance_genes = kleborate_one_sample.num_resistance_genes 30 | String kleborate_bla_resistance_genes = kleborate_one_sample.bla_resistance_genes 31 | String kleborate_esbl_resistance_genes = kleborate_one_sample.esbl_resistance_genes 32 | String kleborate_key_resistance_genes = kleborate_one_sample.key_resistance_genes 33 | String kleborate_resistance_mutations = kleborate_one_sample.resistance_mutations 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_iqtree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task iqtree { 4 | input { 5 | File alignment 6 | String cluster_name 7 | String iqtree_model = "GTR+I+G" # For comparison to other tools use HKY for bactopia, GTR+F+I for grandeur, GTR+G4 for nullarbor, GTR+G for dryad 8 | String iqtree_bootstraps = 1000 # Ultrafast bootstrap replicates 9 | String alrt = 1000 # SH-like approximate likelihood ratio test (SH-aLRT) replicates 10 | String? 
iqtree_opts = "" 11 | String docker = "staphb/iqtree:1.6.7" 12 | Int disk_size = 100 13 | } 14 | command <<< 15 | # date and version control 16 | date | tee DATE 17 | iqtree --version | grep version | sed 's/.*version/version/;s/ for Linux.*//' | tee VERSION 18 | 19 | numGenomes=`grep -o '>' ~{alignment} | wc -l` 20 | if [ $numGenomes -gt 3 ] 21 | then 22 | cp ~{alignment} ./msa.fasta 23 | iqtree \ 24 | -nt AUTO \ 25 | -s msa.fasta \ 26 | -m ~{iqtree_model} \ 27 | -bb ~{iqtree_bootstraps} \ 28 | -alrt ~{alrt} \ 29 | ~{iqtree_opts} 30 | 31 | cp msa.fasta.contree ~{cluster_name}_msa.tree 32 | fi 33 | >>> 34 | output { 35 | String date = read_string("DATE") 36 | String version = read_string("VERSION") 37 | File ml_tree = "~{cluster_name}_msa.tree" 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "32 GB" 42 | cpu: 4 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | preemptible: 0 46 | maxRetries: 3 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_prokka.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task prokka { 4 | input { 5 | File assembly 6 | String samplename 7 | Int cpu = 8 8 | Int memory = 16 9 | String docker = "staphb/prokka:1.14.5" 10 | Int disk_size = 100 11 | # Parameters 12 | # proteins recommended: when you have good quality reference genomes and want to ensure gene naming is consistent [false] 13 | # prodigal_tf: prodigal training file 14 | # prokka_arguments: free string to add any other additional prokka arguments 15 | Boolean proteins = false 16 | Boolean compliant = true 17 | File? prodigal_tf 18 | String? prokka_arguments 19 | } 20 | command <<< 21 | date | tee DATE 22 | prokka --version | tee PROKKA_VERSION 23 | 24 | prokka \ 25 | ~{prokka_arguments} \ 26 | --cpus 0 \ 27 | --prefix ~{samplename} \ 28 | ~{true='--compliant' false='' compliant} \ 29 | ~{true='--proteins' false='' proteins} \ 30 | ~{'--prodigaltf ' + prodigal_tf} \ 31 | ~{assembly} 32 | 33 | 34 | >>> 35 | output { 36 | File prokka_gff = "~{samplename}/~{samplename}.gff" 37 | File prokka_gbk = "~{samplename}/~{samplename}.gbk" 38 | File prokka_sqn = "~{samplename}/~{samplename}.sqn" 39 | Array[File] prokka_outs = glob("~{samplename}/~{samplename}*") 40 | String prokka_version = read_string("PROKKA_VERSION") 41 | } 42 | runtime { 43 | memory: "~{memory} GB" 44 | cpu: cpu 45 | docker: docker 46 | disks: "local-disk " + disk_size + " SSD" 47 | disk: disk_size + " GB" 48 | maxRetries: 3 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /workflows/wf_amrfinderplus.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/gene_typing/task_amrfinderplus.wdl" as amrfindertask 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow amrfinderplus_wf { 7 | input { 8 | File assembly 9 | String samplename 10 | } 11 | call amrfindertask.amrfinderplus_nuc { 12 | input: 13 | assembly = assembly, 14 | samplename = samplename 15 | } 16 | call versioning.version_capture{ 17 | input: 18 | } 19 | output { 20 | String amrfinderplus_version = amrfinderplus_nuc.amrfinderplus_version 21 | String amrfinderplus_db_version = amrfinderplus_nuc.amrfinderplus_db_version 22 | String amrfinderplus_wf_version = version_capture.phbg_version 23 | String amrfinderplus_wf_analysis_date = version_capture.date 24 | File 
amrfinderplus_all_report = amrfinderplus_nuc.amrfinderplus_all_report 25 | File amrfinderplus_amr_report = amrfinderplus_nuc.amrfinderplus_amr_report 26 | File amrfinderplus_stress_report = amrfinderplus_nuc.amrfinderplus_stress_report 27 | File amrfinderplus_virulence_report = amrfinderplus_nuc.amrfinderplus_virulence_report 28 | String amrfinderplus_amr_genes = amrfinderplus_nuc.amrfinderplus_amr_genes 29 | String amrfinderplus_stress_genes = amrfinderplus_nuc.amrfinderplus_stress_genes 30 | String amrfinderplus_virulence_genes = amrfinderplus_nuc.amrfinderplus_virulence_genes 31 | String amrfinderplus_amr_classes = amrfinderplus_nuc.amrfinderplus_amr_classes 32 | String amrfinderplus_amr_subclasses = amrfinderplus_nuc.amrfinderplus_amr_subclasses 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_mashtree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task mashtree_fasta { 4 | input { 5 | Array[File] assembly_fasta 6 | String cluster_name 7 | Int truncLength = 250 8 | String sort_order = "ABC" 9 | Int genomesize = 5000000 10 | Int mindepth = 5 11 | Int kmerlength = 21 12 | Int sketchsize = 10000 13 | Int cpu = 16 14 | Int memory = 64 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | # date and version control 19 | date | tee DATE 20 | mashtree -v | tee VERSION 21 | 22 | # organize input assemblies 23 | mkdir mash_assemblies 24 | mv ~{sep=' ' assembly_fasta} mash_assemblies 25 | #run mashtree 26 | mashtree \ 27 | ~{'--truncLength ' + truncLength} \ 28 | ~{'--sort-order ' + sort_order} \ 29 | ~{'--genomesize ' + genomesize} \ 30 | ~{'--mindepth ' + mindepth} \ 31 | ~{'--kmerlength ' + kmerlength} \ 32 | ~{'--sketch-size ' + sketchsize} \ 33 | ~{'--numcpus ' + cpu} \ 34 | ~{'--outmatrix ' + cluster_name + '.tsv'} \ 35 | ~{'--outtree ' + cluster_name + '.nwk'} \ 36 | mash_assemblies/* 37 | 38 | >>> 39 | output { 40 | String date = read_string("DATE") 41 | String version = read_string("VERSION") 42 | File mashtree_matrix = "~{cluster_name}.tsv" 43 | File mashtree_tree = "~{cluster_name}.nwk" 44 | } 45 | runtime { 46 | docker: "quay.io/staphb/mashtree:1.2.0" 47 | memory: "~{memory} GB" 48 | cpu: cpu 49 | disks: "local-disk " + disk_size + " SSD" 50 | disk: disk_size + " GB" 51 | maxRetries: 3 52 | preemptible: 0 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tasks/species_typing/task_pasty.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pasty { 4 | input { 5 | File assembly 6 | String samplename 7 | Int min_pident = 95 8 | Int min_coverage = 95 9 | String docker = "staphb/pasty:1.0.2" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # date and version control 14 | date | tee DATE 15 | pasty --version > VERSION && sed -i -e 's/pasty\, version //' VERSION 16 | pasty \ 17 | --assembly ~{assembly} \ 18 | --min_pident ~{min_pident} \ 19 | --min_coverage ~{min_coverage} \ 20 | --prefix ~{samplename} \ 21 | --outdir . 
22 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f2 > SEROGROUP 23 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f3 > COVERAGE 24 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f4 > FRAGMENTS 25 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f5 > COMMENT 26 | >>> 27 | output { 28 | String pasty_serogroup = read_string("SEROGROUP") 29 | Float pasty_serogroup_coverage = read_float("COVERAGE") 30 | Int pasty_serogroup_fragments = read_int("FRAGMENTS") 31 | File pasty_summary_tsv = "~{samplename}.tsv" 32 | File pasty_blast_hits = "~{samplename}.blastn.tsv" 33 | File pasty_all_serogroups = "~{samplename}.details.tsv" 34 | String pasty_version = read_string("VERSION") 35 | String pasty_pipeline_date = read_string("DATE") 36 | String pasty_docker = docker 37 | String pasty_comment = read_string("COMMENT") 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "4 GB" 42 | cpu: 2 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | maxRetries: 3 46 | preemptible: 0 47 | } 48 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_seroba.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task seroba { 4 | input { 5 | File read1 6 | File? read2 7 | String samplename 8 | String docker = "staphb/seroba:1.0.2" 9 | Int disk_size = 100 10 | } 11 | command <<< 12 | # grab version 13 | seroba version > VERSION 14 | 15 | # database path will need to be changed if/when docker image is updated. 16 | seroba runSerotyping /seroba-1.0.2/database/ ~{read1} ~{read2} ~{samplename} 17 | 18 | # check for serotype grouping & contamination flag 19 | cut -f2 ~{samplename}/pred.tsv > SEROTYPE 20 | 21 | # check for detailed serogroup information 22 | if [ -f ~{samplename}/detailed_serogroup_info.txt ]; then 23 | grep "Serotype predicted by ariba" ~{samplename}/detailed_serogroup_info.txt | cut -f2 | sed 's/://' > ARIBA_SEROTYPE 24 | grep "assembly from ariba" ~{samplename}/detailed_serogroup_info.txt | cut -f2 | sed 's/://' > ARIBA_IDENTITY 25 | else 26 | # if the details do not exist, output blanks to ariba columns 27 | echo "" > ARIBA_SEROTYPE 28 | echo "" > ARIBA_IDENTITY 29 | fi 30 | >>> 31 | output { 32 | String seroba_version = read_string("VERSION") 33 | String seroba_docker = docker 34 | String seroba_serotype = read_string("SEROTYPE") 35 | String seroba_ariba_serotype = read_string("ARIBA_SEROTYPE") 36 | String seroba_ariba_identity = read_string("ARIBA_IDENTITY") 37 | File? 
seroba_details = "~{samplename}/detailed_serogroup_info.txt" 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "16 GB" 42 | cpu: 8 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | maxRetries: 3 46 | } 47 | } -------------------------------------------------------------------------------- /tasks/assembly/task_mycosnp_consensus_assembly.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task mycosnp { 4 | input { 5 | File read1 6 | File read2 7 | String samplename 8 | String docker = "quay.io/theiagen/mycosnp:dev" 9 | String strain = "B11205" 10 | String accession = "GCA_016772135" 11 | Int memory = 16 12 | Int cpu = 4 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | date | tee DATE 17 | echo $(nextflow pull rpetit3/mycosnp-nf 2>&1) | sed 's/^.*revision: //;' | tee MYCOSNP_VERSION 18 | 19 | # Make sample FOFN 20 | echo "sample,fastq_1,fastq_2" > sample.csv 21 | echo "~{samplename},~{read1},~{read2}" >> sample.csv 22 | 23 | # Run MycoSNP 24 | mkdir ~{samplename} 25 | cd ~{samplename} 26 | if nextflow run rpetit3/mycosnp-nf --input ../sample.csv --ref_dir /reference/~{accession} --publish_dir_mode copy --skip_phylogeny; then 27 | # Everything finished, pack up the results and clean up 28 | rm -rf .nextflow/ work/ 29 | cd .. 30 | tar -cf - ~{samplename}/ | gzip -n --best > ~{samplename}.tar.gz 31 | else 32 | # Run failed 33 | exit 1 34 | fi 35 | >>> 36 | output { 37 | String mycosnp_version = read_string("MYCOSNP_VERSION") 38 | String mycosnp_docker = docker 39 | String analysis_date = read_string("DATE") 40 | String reference_strain = strain 41 | String reference_accession = accession 42 | File assembly_fasta = "~{samplename}/results/combined/consensus/~{samplename}.fasta.gz" 43 | File full_results = "~{samplename}.tar.gz" 44 | } 45 | runtime { 46 | docker: "~{docker}" 47 | memory: "~{memory} GB" 48 | cpu: cpu 49 | disks: "local-disk " + disk_size + " SSD" 50 | disk: disk_size + " GB" 51 | maxRetries: 3 52 | preemptible: 0 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tasks/species_typing/task_spatyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task spatyper { 4 | meta { 5 | description: "Computational method for finding spa types in Staphylococcus aureus" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/spatyper:0.3.3--pyhdfd78af_3" 11 | Int disk_size = 100 12 | Int cpu = 4 13 | 14 | # Parameters 15 | # --do_enrich Do PCR product enrichment 16 | Boolean do_enrich = false 17 | } 18 | command <<< 19 | spaTyper --version 2>&1 | sed 's/^.*spaTyper //' | tee VERSION 20 | spaTyper \ 21 | ~{true="--do_enrich" false="" do_enrich} \ 22 | --fasta ~{assembly} \ 23 | --output ~{samplename}.tsv 24 | 25 | python3 <>>
45 | output {
46 | File spatyper_tsv = "~{samplename}.tsv"
47 | String spatyper_repeats = read_string("REPEATS")
48 | String spatyper_type = read_string("TYPE")
49 | String spatyper_version = read_string("VERSION")
50 | String spatyper_docker = "~{docker}"
51 | }
52 | runtime {
53 | docker: "~{docker}"
54 | memory: "8 GB"
55 | cpu: cpu
56 | disks: "local-disk " + disk_size + " SSD"
57 | disk: disk_size + " GB"
58 | maxRetries: 3
59 | preemptible: 0
60 | }
61 | }
62 |
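Most typing tasks in this repository are paired with a thin single-task workflow (wf_kleborate.wdl, wf_pmga.wdl, and similar); spaTyper has no such wrapper here. A minimal sketch of one, following the house pattern (hypothetical, not a file in this repo):

version 1.0

import "../tasks/species_typing/task_spatyper.wdl" as spatyper_task
import "../tasks/task_versioning.wdl" as versioning

workflow spatyper_wf {
  input {
    File assembly
    String samplename
  }
  call spatyper_task.spatyper {
    input:
      assembly = assembly,
      samplename = samplename
  }
  call versioning.version_capture{
    input:
  }
  output {
    String spatyper_wf_version = version_capture.phbg_version
    String spatyper_wf_analysis_date = version_capture.date
    File spatyper_tsv = spatyper.spatyper_tsv
    String spatyper_repeats = spatyper.spatyper_repeats
    String spatyper_type = spatyper.spatyper_type
  }
}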
--------------------------------------------------------------------------------
/tasks/gene_typing/task_abricate.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task abricate {
4 | input {
5 | File assembly
6 | String samplename
7 | String database
8 | # Parameters
9 | # --minid Minimum DNA %identity [80]
10 | # --mincov Minimum DNA %coverage [80]
11 | Int? minid
12 | Int? mincov
13 | Int cpu = 2
14 | String docker = "staphb/abricate:1.0.1-abaum-plasmid"
15 | Int disk_size = 100
16 | }
17 | command <<<
18 | date | tee DATE
19 | abricate -v | tee ABRICATE_VERSION
20 | abricate --list
21 | abricate --check
22 |
23 | abricate \
24 | --db ~{database} \
25 | ~{'--minid ' + minid} \
26 | ~{'--mincov ' + mincov} \
27 | --threads ~{cpu} \
28 | --nopath \
29 | ~{assembly} > ~{samplename}_abricate_hits.tsv
30 |
31 | # parse out gene names into list of strings, comma-separated, final comma at end removed by sed
32 | abricate_genes=$(awk -F '\t' '{ print $6 }' ~{samplename}_abricate_hits.tsv | tail -n+2 | tr '\n' ',' | sed 's/.$//')
33 |
34 | # if variable for list of genes is EMPTY, write string saying it is empty to float to Terra table
35 | if [ -z "${abricate_genes}" ]; then
36 | abricate_genes="No genes detected by ABRicate"
37 | fi
38 |
39 | # create final output strings
40 | echo "${abricate_genes}" > ABRICATE_GENES
41 | >>>
42 | output {
43 | File abricate_results = "~{samplename}_abricate_hits.tsv"
44 | String abricate_genes = read_string("ABRICATE_GENES")
45 | String abricate_database = database
46 | String abricate_version = read_string("ABRICATE_VERSION")
47 | String abricate_docker = docker
48 | }
49 | runtime {
50 | memory: "8 GB"
51 | cpu: cpu
52 | docker: docker
53 | disks: "local-disk " + disk_size + " SSD"
54 | disk: disk_size + " GB"
55 | maxRetries: 3
56 | }
57 | }
58 |
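Because minid and mincov are optional, the ~{'--minid ' + minid} placeholders render as empty strings unless values are wired in. A hypothetical caller (not a file in this repo) could pin both thresholds; "vfdb" is an assumed database name, so substitute any entry printed by `abricate --list`:

version 1.0

import "../tasks/gene_typing/task_abricate.wdl" as gene_typing

workflow abricate_example {
  input {
    File assembly
    String samplename
  }
  call gene_typing.abricate {
    input:
      assembly = assembly,
      samplename = samplename,
      database = "vfdb", # assumed database name
      minid = 90, # omit to keep ABRicate's default of 80
      mincov = 80
  }
  output {
    File abricate_results = abricate.abricate_results
    String abricate_genes = abricate.abricate_genes
  }
}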
--------------------------------------------------------------------------------
/tasks/species_typing/task_sistr.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task sistr {
4 | meta {
5 | description: "Serovar prediction of Salmonella assemblies"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/sistr_cmd:1.1.1--pyh864c0ab_2"
11 | Int disk_size = 100
12 | Int? cpu = 4
13 |
14 | # Parameters
15 | # --use-full-cgmlst-db Use the full set of cgMLST alleles which can include highly similar alleles. By default the smaller "centroid" alleles or representative alleles are used for each marker.
16 | Boolean use_full_cgmlst_db = false
17 | }
18 | command <<<
19 | echo $(sistr --version 2>&1) | sed 's/^.*sistr_cmd //; s/ .*\$//' | tee VERSION
20 | sistr \
21 | --qc \
22 | ~{true="--use-full-cgmlst-db" false="" use_full_cgmlst_db} \
23 | --threads ~{cpu} \
24 | --alleles-output ~{samplename}-allele.json \
25 | --novel-alleles ~{samplename}-allele.fasta \
26 | --cgmlst-profiles ~{samplename}-cgmlst.csv \
27 | --output-prediction ~{samplename} \
28 | --output-format tab \
29 | ~{assembly}
30 |
31 | mv ~{samplename}.tab ~{samplename}.tsv
32 |
33 | # parse sistr TSV
34 | cut -f 15 ~{samplename}.tsv | tail -n 1 | tee PREDICTED_SEROTYPE
35 |
36 | >>>
37 | output {
38 | File sistr_results = "~{samplename}.tsv"
39 | File sistr_allele_json = "~{samplename}-allele.json"
40 | File sistr_allele_fasta = "~{samplename}-allele.fasta"
41 | File sistr_cgmlst = "~{samplename}-cgmlst.csv"
42 | String sistr_version = read_string("VERSION")
43 | String sistr_predicted_serotype = read_string("PREDICTED_SEROTYPE")
44 | }
45 | runtime {
46 | docker: "~{docker}"
47 | memory: "8 GB"
48 | cpu: 4
49 | disks: "local-disk " + disk_size + " SSD"
50 | disk: disk_size + " GB"
51 | maxRetries: 3
52 | preemptible: 0
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/tasks/species_typing/task_serotypefinder.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task serotypefinder {
4 | input {
5 | File assembly
6 | String samplename
7 | String docker = "quay.io/staphb/serotypefinder:2.0.1"
8 | Int disk_size = 100
9 | }
10 | command <<<
11 | # capture date and version
12 | date | tee DATE
13 |
14 | serotypefinder.py -i ~{assembly} -x -o .
15 | mv results_tab.tsv ~{samplename}_results_tab.tsv
16 |
17 | # set H and O type based on serotypefinder outputs
18 | python3 <>>
47 | output {
48 | File serotypefinder_report = "~{samplename}_results_tab.tsv"
49 | String serotypefinder_docker = docker
50 | String serotypefinder_serotype = read_string("STF_SEROTYPE")
51 | }
52 | runtime {
53 | docker: "~{docker}"
54 | memory: "8 GB"
55 | cpu: 2
56 | disks: "local-disk " + disk_size + " SSD"
57 | disk: disk_size + " GB"
58 | maxRetries: 3
59 | preemptible: 0
60 | }
61 | }
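For comparison, the wf_serotypefinder.wdl wrapper earlier in this repo needs only a sample name and an assembly; a minimal inputs JSON in the style of validation_files/kleb_assembly_input.json (the .fna path is a placeholder) would be:

{
  "serotypefinder.samplename": "Sample1",
  "serotypefinder.ecoli_assembly": "./validation_files/ecoli_assembly.fna"
}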
--------------------------------------------------------------------------------
/tasks/quality_control/task_quast.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task quast {
4 | input {
5 | File assembly
6 | String samplename
7 | String docker="quay.io/staphb/quast:5.0.2"
8 | Int disk_size = 100
9 | }
10 | command <<<
11 | # capture date and version
12 | date | tee DATE
13 | quast.py --version | grep QUAST | tee VERSION
14 |
15 | quast.py ~{assembly} -o .
16 | mv report.tsv ~{samplename}_report.tsv
17 |
18 | python <>>
40 | output {
41 | File quast_report = "~{samplename}_report.tsv"
42 | String version = read_string("VERSION")
43 | String pipeline_date = read_string("DATE")
44 | Int genome_length = read_int("GENOME_LENGTH")
45 | Int number_contigs = read_int("NUMBER_CONTIGS")
46 | Int n50_value = read_int("N50_VALUE")
47 | Float gc_percent = read_float("GC_PERCENT")
48 | }
49 | runtime {
50 | docker: "~{docker}"
51 | memory: "2 GB"
52 | cpu: 2
53 | disks: "local-disk " + disk_size + " SSD"
54 | disk: disk_size + " GB"
55 | maxRetries: 3
56 | preemptible: 0
57 | }
58 | }
--------------------------------------------------------------------------------
/tasks/species_typing/task_pmga.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task pmga {
4 | meta {
5 | description: "Serogrouping and serotyping of all Neisseria species and Haemophilus influenzae"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/staphb/pmga:3.0.2"
11 | Int disk_size = 100
12 | Int? cpu = 4
13 | }
14 | command <<<
15 | echo $(pmga --version 2>&1) | sed 's/.*pmga //; s/ .*\$//' | tee VERSION
16 | pmga \
17 | ~{assembly} \
18 | --blastdir /data/blastdbs \
19 | --threads ~{cpu} \
20 | --prefix ~{samplename}
21 | # Parse pmga TSV
22 | # https://github.com/rpetit3/pmga#pmga-output-files
23 | cut -f 2 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_SPECIESDB
24 | cut -f 3 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_SEROTYPE
25 | cut -f 4 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_GENES
26 | cut -f 5 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_NOTES
27 | >>>
28 | output {
29 | String version = read_string("VERSION")
30 | String pmga_docker = "~{docker}"
31 | String pmga_speciesdb = read_string("PMGA_SPECIESDB")
32 | String pmga_serotype = read_string("PMGA_SEROTYPE")
33 | String pmga_genes = read_string("PMGA_GENES")
34 | String pmga_notes = read_string("PMGA_NOTES")
35 | File pmga_results = "./pmga/~{samplename}.txt"
36 | File pmga_allele_matrix = "./pmga/~{samplename}-allele-matrix.txt"
37 | File pmga_blast_final = "./pmga/~{samplename}-blast-final-results.json.gz"
38 | File pmga_blast_raw = "./pmga/~{samplename}-blast-raw-results.json.gz"
39 | File pmga_loci_counts = "./pmga/~{samplename}-loci-counts.txt"
40 | File pmga_gff = "./pmga/~{samplename}.gff.gz"
41 | }
42 | runtime {
43 | docker: "~{docker}"
44 | memory: "8 GB"
45 | cpu: 4
46 | disks: "local-disk " + disk_size + " SSD"
47 | disk: disk_size + " GB"
48 | maxRetries: 3
49 | preemptible: 0
50 | }
51 | }
52 |
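The wf_pmga.wdl wrapper shown earlier drives this task with just two inputs; a minimal inputs JSON (the assembly path is a placeholder) looks like:

{
  "pmga_wf.assembly": "./assemblies/Sample1.fasta",
  "pmga_wf.samplename": "Sample1"
}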
--------------------------------------------------------------------------------
/workflows/wf_mashtree_fasta.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | import "../tasks/phylogenetic_inference/task_mashtree.wdl" as mashtree
4 | import "../tasks/task_versioning.wdl" as versioning
5 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary
6 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
7 |
8 |
9 | workflow mashtree_fasta {
10 | input {
11 | Array[File] assembly_fasta
12 | String cluster_name
13 | Array[String]? sample_names
14 | String? data_summary_terra_project
15 | String? data_summary_terra_workspace
16 | String? data_summary_terra_table
17 | String? data_summary_column_names
18 | }
19 | call mashtree.mashtree_fasta as mashtree_task {
20 | input:
21 | assembly_fasta = assembly_fasta,
22 | cluster_name = cluster_name
23 | }
24 | call snp_dists.reorder_matrix {
25 | input:
26 | input_tree = mashtree_task.mashtree_tree,
27 | matrix = mashtree_task.mashtree_matrix,
28 | cluster_name = cluster_name
29 | }
30 | if (defined(data_summary_column_names)) {
31 | call data_summary.summarize_data {
32 | input:
33 | sample_names = sample_names,
34 | terra_project = data_summary_terra_project,
35 | terra_workspace = data_summary_terra_workspace,
36 | terra_table = data_summary_terra_table,
37 | column_names = data_summary_column_names,
38 | output_prefix = cluster_name
39 | }
40 | }
41 | call versioning.version_capture{
42 | input:
43 | }
44 | output {
45 | # Versioning
46 | String mashtree_wf_version = version_capture.phbg_version
47 | String mashtree_wf_analysis_date = version_capture.date
48 | # Mashtree Out
49 | File mashtree_matrix = reorder_matrix.ordered_matrix
50 | File mashtree_tree = reorder_matrix.tree
51 | String mashtree_version = mashtree_task.version
52 | # Data Summary Out
53 | File? mashtree_summarized_data = summarize_data.summarized_data
54 | }
55 | }
56 |
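Only assembly_fasta and cluster_name are required here; the Terra data-summary inputs are optional, and summarize_data is skipped whenever data_summary_column_names is unset. A minimal inputs JSON (paths are placeholders):

{
  "mashtree_fasta.assembly_fasta": [
    "./assemblies/Sample1.fasta",
    "./assemblies/Sample2.fasta",
    "./assemblies/Sample3.fasta"
  ],
  "mashtree_fasta.cluster_name": "cluster1"
}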
--------------------------------------------------------------------------------
/workflows/wf_tbprofiler_ont.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 |
4 | import "../tasks/task_taxon_id.wdl" as taxon
5 | import "../tasks/task_versioning.wdl" as versioning
6 |
7 | workflow tbprofiler_wf {
8 | input {
9 | File reads
10 | String samplename
11 | String? mapper = "minimap2"
12 | String? caller = "bcftools"
13 | Int? min_depth = 20
14 | Float? min_af = 0.1
15 | Float? min_af_pred = 0.1
16 | Int? cov_frac_threshold = 1
17 | }
18 | call taxon.tbprofiler_one_sample_ont {
19 | input:
20 | reads = reads,
21 | samplename = samplename,
22 | mapper = mapper,
23 | caller = caller,
24 | min_depth = min_depth,
25 | min_af = min_af,
26 | min_af_pred = min_af_pred,
27 | cov_frac_threshold = cov_frac_threshold
28 | }
29 | call versioning.version_capture{
30 | input:
31 | }
32 | output {
33 | String tb_profiler_wf_version = version_capture.phbg_version
34 | String tb_profiler_wf_analysis_date = version_capture.date
35 | File tbprofiler_output_alignment_bam = tbprofiler_one_sample_ont.tbprofiler_output_bam
36 | File tbprofiler_output_alignment_bai = tbprofiler_one_sample_ont.tbprofiler_output_bai
37 | File tb_profiler_report_csv = tbprofiler_one_sample_ont.tbprofiler_output_csv
38 | File tb_profiler_report_tsv = tbprofiler_one_sample_ont.tbprofiler_output_tsv
39 | String tb_profiler_version = tbprofiler_one_sample_ont.version
40 | String tb_profiler_main_lineage = tbprofiler_one_sample_ont.tb_profiler_main_lineage
41 | String tb_profiler_sub_lineage = tbprofiler_one_sample_ont.tb_profiler_sub_lineage
42 | String tb_profiler_dr_type = tbprofiler_one_sample_ont.tb_profiler_dr_type
43 | String tb_profiler_num_dr_variants = tbprofiler_one_sample_ont.tb_profiler_num_dr_variants
44 | String tb_profiler_num_other_variants = tbprofiler_one_sample_ont.tb_profiler_num_other_variants
45 | String tb_profiler_resistance_genes = tbprofiler_one_sample_ont.tb_profiler_resistance_genes
46 | }
47 | }
48 |
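The tuning inputs above carry ONT-appropriate defaults (minimap2, min_depth 20), so a run only has to supply the reads; a minimal inputs JSON (the read path is a placeholder), overriding one default for illustration:

{
  "tbprofiler_wf.reads": "./reads/Sample1.fastq.gz",
  "tbprofiler_wf.samplename": "Sample1",
  "tbprofiler_wf.min_depth": 30
}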
--------------------------------------------------------------------------------
/workflows/wf_tbprofiler_pe.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 |
4 | import "../tasks/task_taxon_id.wdl" as taxon
5 | import "../tasks/task_versioning.wdl" as versioning
6 |
7 | workflow tbprofiler_wf {
8 | input {
9 | File read1
10 | File read2
11 | String samplename
12 | String? mapper = "bwa"
13 | String? caller = "bcftools"
14 | Int? min_depth = 10
15 | Float? min_af = 0.1
16 | Float? min_af_pred = 0.1
17 | Int? cov_frac_threshold = 1
18 | }
19 | call taxon.tbprofiler_one_sample_pe {
20 | input:
21 | read1 = read1,
22 | read2 = read2,
23 | samplename = samplename,
24 | mapper = mapper,
25 | caller = caller,
26 | min_depth = min_depth,
27 | min_af = min_af,
28 | min_af_pred = min_af_pred,
29 | cov_frac_threshold = cov_frac_threshold
30 | }
31 | call versioning.version_capture{
32 | input:
33 | }
34 | output {
35 | String tb_profiler_wf_version = version_capture.phbg_version
36 | String tb_profiler_wf_analysis_date = version_capture.date
37 | File tb_profiler_report_csv = tbprofiler_one_sample_pe.tbprofiler_output_csv
38 | File tb_profiler_report_tsv = tbprofiler_one_sample_pe.tbprofiler_output_tsv
39 | File tbprofiler_output_alignment_bam = tbprofiler_one_sample_pe.tbprofiler_output_bam
40 | File tbprofiler_output_alignment_bai = tbprofiler_one_sample_pe.tbprofiler_output_bai
41 | String tb_profiler_version = tbprofiler_one_sample_pe.version
42 | String tb_profiler_main_lineage = tbprofiler_one_sample_pe.tb_profiler_main_lineage
43 | String tb_profiler_sub_lineage = tbprofiler_one_sample_pe.tb_profiler_sub_lineage
44 | String tb_profiler_dr_type = tbprofiler_one_sample_pe.tb_profiler_dr_type
45 | String tb_profiler_num_dr_variants = tbprofiler_one_sample_pe.tb_profiler_num_dr_variants
46 | String tb_profiler_num_other_variants = tbprofiler_one_sample_pe.tb_profiler_num_other_variants
47 | String tb_profiler_resistance_genes = tbprofiler_one_sample_pe.tb_profiler_resistance_genes
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/.github/workflows/miniwdl-check.yml:
--------------------------------------------------------------------------------
1 | #
2 | # This workflow will run on Pushes and Pull Requests against the main branch. It
3 | # will only run "miniwdl check" on wdl files that have had a change in the push
4 | # or PR.
5 | #
6 | name: MiniWDL Check
7 | on:
8 | push:
9 | branches: [main]
10 | pull_request:
11 | branches: [main]
12 |
13 | jobs:
14 | changes:
15 | name: Check for changes
16 | runs-on: ubuntu-latest
17 | outputs:
18 | # Expose workflows with changes
19 | workflows: ${{ steps.filter.outputs.wf }}
20 | workflows_files: ${{ steps.filter.outputs.wf_files }}
21 | steps:
22 | # Checkout the repo
23 | - uses: actions/checkout@v3
24 |
25 | # Select wdl files with changes
26 | - uses: dorny/paths-filter@v2
27 | id: filter
28 | with:
29 | filters: |
30 | wf:
31 | - 'tasks/**'
32 | - 'workflows/**'
33 | list-files: json
34 |
35 | check:
36 | runs-on: ubuntu-20.04
37 | name: ${{ matrix.wf }}
38 | needs: changes
39 | if: ${{ needs.changes.outputs.workflows != '[]' && needs.changes.outputs.workflows != '' }}
40 | strategy:
41 | fail-fast: false
42 | matrix:
43 | wf: ${{ fromJson(needs.changes.outputs.workflows_files) }}
44 | steps:
45 | # Checkout the repo
46 | - uses: actions/checkout@v3
47 |
48 | # Install a version of Python3
49 | - name: Set up Python
50 | uses: actions/setup-python@v2
51 | with:
52 | python-version: "3.x"
53 |
54 | # Install MiniWDL (WDL syntax) and ShellCheck (shell syntax)
55 | - name: install dependencies
56 | run: |
57 | sudo apt-get update
58 | sudo apt-get -y install shellcheck
59 | pip3 -q install miniwdl 'importlib-metadata==4.13.0'
60 |
61 | # Run MiniWDL check on each of the changed WDLs
62 | - name: MiniWDL Check ${{ matrix.wf }}
63 | run: miniwdl check ${{ matrix.wf }}
64 |
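The same check is straightforward to reproduce locally before pushing; a sketch of the equivalent commands, assuming Python 3 and apt are available as on the runner:

# install the same linters the job installs
sudo apt-get update && sudo apt-get -y install shellcheck
pip3 -q install miniwdl 'importlib-metadata==4.13.0'

# lint a single workflow, as each matrix job does
miniwdl check workflows/wf_kleborate.wdl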
--------------------------------------------------------------------------------
/tasks/gene_typing/task_plasmidfinder.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task plasmidfinder {
4 | input {
5 | File assembly
6 | String samplename
7 | Int cpu = 8
8 | Int memory = 16
9 | String docker = "staphb/plasmidfinder:2.1.6"
10 | Int disk_size = 100
11 | String? database
12 | String? database_path
13 | String? method_path
14 | # minimum coverage threshold
15 | Float? min_cov
16 | # minimum blast identity threshold
17 | Float? threshold
18 |
19 | }
20 | command <<<
21 | date | tee DATE
22 |
23 | if [[ ! -z "~{database}" ]]; then
24 | echo "User database identified; ~{database} will be utilized for analysis"
25 | plasmidfinder_db_version="~{database}"
26 | else
27 | plasmidfinder_db_version="unmodified from plasmidfinder docker container"
28 | fi
29 |
30 | echo ${plasmidfinder_db_version} | tee PLASMIDFINDER_DB_VERSION
31 |
32 | plasmidfinder.py \
33 | -i ~{assembly} \
34 | -x \
35 | ~{'-d ' + database} \
36 | ~{'-p ' + database_path} \
37 | ~{'-mp ' + method_path} \
38 | ~{'-l ' + min_cov} \
39 | ~{'-t ' + threshold}
40 |
41 | # parse outputs
42 | if [ ! -f results_tab.tsv ]; then
43 | PF="No plasmids detected in database"
44 | touch results_tab.tsv Hit_in_genome_seq.fsa # create empty outputs so the renames below (and the File outputs) succeed
45 | else # sort -u keeps one entry per plasmid; uniq -u would drop any plasmid reported more than once
46 | PF="$(tail -n +2 results_tab.tsv | cut -f 2 | sort -u | paste -s -d, - )"
47 | if [ "$PF" == "" ]; then
48 | PF="No plasmids detected in database"
49 | fi
50 | fi
51 | echo "$PF" | tee PLASMIDS
52 | mv results_tab.tsv ~{samplename}_results.tsv
53 | mv Hit_in_genome_seq.fsa ~{samplename}_seqs.fsa
54 |
55 | >>>
56 | output {
57 | String plasmidfinder_plasmids = read_string("PLASMIDS")
58 | File plasmidfinder_results = "~{samplename}_results.tsv"
59 | File plasmidfinder_seqs = "~{samplename}_seqs.fsa"
60 | String plasmidfinder_docker = docker
61 | String plasmidfinder_db_version = read_string("PLASMIDFINDER_DB_VERSION")
62 | }
63 | runtime {
64 | memory: "~{memory} GB"
65 | cpu: cpu
66 | docker: "~{docker}"
67 | disks: "local-disk " + disk_size + " SSD"
68 | disk: disk_size + " GB"
69 | maxRetries: 3
70 | }
71 | }
72 |
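A minimal, hypothetical caller sketch for the task above (the workflow name and file path are illustrative, not part of the repository); min_cov and threshold pass through to plasmidfinder.py's -l and -t options:

    version 1.0
    import "../tasks/gene_typing/task_plasmidfinder.wdl" as plasmidfinder

    workflow plasmidfinder_example {
      call plasmidfinder.plasmidfinder {
        input:
          assembly = "sample1_contigs.fasta", # hypothetical assembly
          samplename = "sample1",
          min_cov = 0.60,   # passed as '-l 0.6' (minimum coverage)
          threshold = 0.90  # passed as '-t 0.9' (minimum identity)
      }
    }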
--------------------------------------------------------------------------------
/tasks/quality_control/task_busco.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task busco {
4 | meta {
5 | description: "Run BUSCO on assemblies"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "ezlabgva/busco:v5.3.2_cv1"
11 | Int disk_size = 100
12 | Boolean eukaryote = false
13 | }
14 | command <<<
15 | # get version
16 | busco --version | tee "VERSION"
17 |
18 | # run busco
19 | # -i input assembly
20 | # -m geno for genome input
21 | # -o output file tag
22 | # --auto-lineage-euk looks at only eukaryotic organisms
23 | # --auto-lineage-prok looks at only prokaryotic organisms; default
24 | busco \
25 | -i ~{assembly} \
26 | -m geno \
27 | -o ~{samplename} \
28 | ~{true='--auto-lineage-euk' false='--auto-lineage-prok' eukaryote}
29 |
30 | # check for existence of output file; otherwise display a string that says the output was not created
31 | if [ -f ~{samplename}/short_summary.specific.*.~{samplename}.txt ]; then
32 |
33 | # grab the database version and format it according to BUSCO recommendations
34 | cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "dataset is:" | cut -d' ' -f 6,9 | sed 's/,//' | sed 's/ / (/' | sed 's/$/)/' | tee DATABASE
35 |
36 | # extract the results string
37 | cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "C:" | tee BUSCO_RESULTS
38 |
39 | cp ~{samplename}/short_summary.specific.*.~{samplename}.txt ~{samplename}_busco-summary.txt
40 | else
41 | echo "BUSCO FAILED" | tee BUSCO_RESULTS
42 | echo "NA" > DATABASE
43 | fi
44 | >>>
45 | output {
46 | String busco_version = read_string("VERSION")
47 | String busco_database = read_string("DATABASE")
48 | String busco_results = read_string("BUSCO_RESULTS")
49 | File? busco_report = "~{samplename}_busco-summary.txt"
50 | }
51 | runtime {
52 | docker: "~{docker}"
53 | memory: "8 GB"
54 | cpu: 2
55 | disks: "local-disk " + disk_size + " SSD"
56 | disk: disk_size + " GB"
57 | maxRetries: 3
58 | preemptible: 0
59 | }
60 | }
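Because the eukaryote Boolean flips the command between --auto-lineage-prok (the default) and --auto-lineage-euk, a caller only needs to set one input for fungal or other eukaryotic assemblies. A hypothetical sketch (names illustrative):

    version 1.0
    import "../tasks/quality_control/task_busco.wdl" as busco

    workflow busco_example {
      call busco.busco {
        input:
          assembly = "fungal_contigs.fasta", # hypothetical assembly
          samplename = "fungal_sample",
          eukaryote = true # selects --auto-lineage-euk instead of the prokaryote default
      }
    }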
--------------------------------------------------------------------------------
/tasks/species_typing/task_ectyper.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task ectyper {
4 | meta {
5 | description: "In-silico prediction of Escherichia coli serotype"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/ectyper:1.0.0--pyhdfd78af_1"
11 | Int disk_size = 100
12 | Int cpu = 4
13 |
14 | # ECTyper Parameters
15 | # --opid [integer] Percent identity required for an O antigen allele match [default: 90]
16 | # --opcov [integer] Minimum percent coverage required for an O antigen allele match [default: 90]
17 | # --hpid [integer] Percent identity required for an H antigen allele match [default: 95]
18 | # --hpcov [integer] Minimum percent coverage required for an H antigen allele match [default: 50]
19 | # --verify [boolean] Enable E. coli species verification
20 | # --print_alleles [boolean] Prints the allele sequences if enabled as the final column
21 | Int opid = 90
22 | Int hpid = 95
23 | Int opcov = 90
24 | Int hpcov = 50
25 | Boolean verify = false
26 | Boolean print_alleles = false
27 | }
28 | command <<<
29 | echo $(ectyper --version 2>&1) | sed 's/.*ectyper //; s/ .*$//' | tee VERSION
30 | ectyper \
31 | ~{'-opid ' + opid} \
32 | ~{'-hpid ' + hpid} \
33 | ~{'-opcov ' + opcov} \
34 | ~{'-hpcov ' + hpcov} \
35 | ~{true="--verify" false="" verify} \
36 | ~{true="-s" false="" print_alleles} \
37 | --cores ~{cpu} \
38 | --output ./ \
39 | --input ~{assembly}
40 | mv output.tsv ~{samplename}.tsv
41 | # parse ECTyper TSV
42 | cut -f 5 ~{samplename}.tsv | tail -n 1 | tee PREDICTED_SEROTYPE
43 | >>>
44 | output {
45 | File ectyper_results = "~{samplename}.tsv"
46 | String ectyper_predicted_serotype = read_string("PREDICTED_SEROTYPE")
47 | String ectyper_version = read_string("VERSION")
48 | }
49 | runtime {
50 | docker: "~{docker}"
51 | memory: "8 GB"
52 | cpu: cpu
53 | disks: "local-disk " + disk_size + " SSD"
54 | disk: disk_size + " GB"
55 | maxRetries: 3
56 | preemptible: 0
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/tasks/species_typing/task_pbptyper.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task pbptyper {
4 | meta {
5 | description: " In silico Penicillin Binding Protein (PBP) typer for Streptococcus pneumoniae assemblies. https://github.com/rpetit3/pbptyper"
6 | }
7 | input {
8 | File assembly # An assembly in FASTA format (compressed with gzip, or uncompressed) to predict the PBP type on.
9 | String samplename
10 | String? db # A path to a directory containing FASTA files for 1A, 2B, and 2X proteins. In most cases using the default value will be all that is needed.
11 | Int min_pident = 95 # Minimum percent identity to count a hit [default: 95]
12 | Int min_coverage = 95 # Minimum percent coverage to count a hit [default: 95]
13 | String docker = "staphb/pbptyper:1.0.4"
14 | Int disk_size = 100
15 | Int cpus = 4
16 |
17 | }
18 | command <<<
19 | # get version information
20 | pbptyper --version | sed 's/pbptyper, //' | tee VERSION
21 |
22 | # run pbptyper
23 | pbptyper \
24 | --assembly ~{assembly} \
25 | ~{'--db ' + db} \
26 | ~{'--min_pident ' + min_pident} \
27 | ~{'--min_coverage ' + min_coverage} \
28 | --prefix "~{samplename}" \
29 | --outdir ./
30 |
31 | # parse output tsv for pbptype
32 | cut -f 2 ~{samplename}.tsv | tail -n 1 > pbptype.txt
33 |
34 | >>>
35 | output {
36 | String pbptyper_predicted_1A_2B_2X = read_string("pbptype.txt")
37 | File pbptyper_pbptype_predicted_tsv = "~{samplename}.tsv" # A tab-delimited file with the predicted PBP type
38 | File pbptyper_pbptype_1A_tsv = "~{samplename}-1A.tblastn.tsv" # A tab-delimited file of all blast hits against 1A
39 | File pbptyper_pbptype_2B_tsv = "~{samplename}-2B.tblastn.tsv" # A tab-delimited file of all blast hits against 2B
40 | File pbptyper_pbptype_2X_tsv = "~{samplename}-2X.tblastn.tsv" # A tab-delimited file of all blast hits against 2X
41 | String pbptyper_version = read_string("VERSION")
42 | String pbptyper_docker = docker
43 | }
44 | runtime {
45 | docker: "~{docker}"
46 | memory: "16 GB"
47 | cpu: cpus
48 | disks: "local-disk " + disk_size + " SSD"
49 | disk: disk_size + " GB"
50 | maxRetries: 3
51 | preemptible: 0
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/.dockstore.yml:
--------------------------------------------------------------------------------
1 | version: 1.2
2 | workflows:
3 | - name: kSNP3
4 | subclass: WDL
5 | primaryDescriptorPath: /workflows/wf_ksnp3.wdl
6 | testParameterFiles:
7 | - empty.json
8 | - name: Gambit_Query
9 | subclass: WDL
10 | primaryDescriptorPath: /workflows/wf_gambit_query.wdl
11 | testParameterFiles:
12 | - empty.json
13 | - name: Kleborate
14 | subclass: WDL
15 | primaryDescriptorPath: /workflows/wf_kleborate.wdl
16 | testParameterFiles:
17 | - empty.json
18 | - name: SerotypeFinder
19 | subclass: WDL
20 | primaryDescriptorPath: /workflows/wf_serotypefinder.wdl
21 | testParameterFiles:
22 | - empty.json
23 | - name: TBProfiler_Illumina_PE
24 | subclass: WDL
25 | primaryDescriptorPath: /workflows/wf_tbprofiler_pe.wdl
26 | testParameterFiles:
27 | - empty.json
28 | - name: TBProfiler_ONT
29 | subclass: WDL
30 | primaryDescriptorPath: /workflows/wf_tbprofiler_ont.wdl
31 | testParameterFiles:
32 | - empty.json
33 | - name: TheiaProk_Illumina_PE
34 | subclass: WDL
35 | primaryDescriptorPath: /workflows/wf_theiaprok_illumina_pe.wdl
36 | testParameterFiles:
37 | - empty.json
38 | - name: TheiaProk_Illumina_SE
39 | subclass: WDL
40 | primaryDescriptorPath: /workflows/wf_theiaprok_illumina_se.wdl
41 | testParameterFiles:
42 | - empty.json
43 | - name: MashTree_FASTA
44 | subclass: WDL
45 | primaryDescriptorPath: /workflows/wf_mashtree_fasta.wdl
46 | testParameterFiles:
47 | - empty.json
48 | - name: NCBI-AMRFinderPlus
49 | subclass: WDL
50 | primaryDescriptorPath: /workflows/wf_amrfinderplus.wdl
51 | testParameterFiles:
52 | - empty.json
53 | - name: Kraken2_PE
54 | subclass: WDL
55 | primaryDescriptorPath: /workflows/wf_kraken2_pe.wdl
56 | testParameterFiles:
57 | - empty.json
58 | - name: Kraken2_SE
59 | subclass: WDL
60 | primaryDescriptorPath: /workflows/wf_kraken2_se.wdl
61 | testParameterFiles:
62 | - empty.json
63 | - name: RASUSA
64 | subclass: WDL
65 | primaryDescriptorPath: /workflows/wf_rasusa.wdl
66 | testParameterFiles:
67 | - empty.json
68 | - name: Core_Gene_SNP
69 | subclass: WDL
70 | primaryDescriptorPath: /workflows/wf_core_gene_snp.wdl
71 | testParameterFiles:
72 | - empty.json
--------------------------------------------------------------------------------
/tasks/utilities/task_rasusa.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task rasusa {
4 | meta {
5 | description: "Randomly subsample sequencing reads to a specified coverage (https://github.com/mbhall88/rasusa)"
6 | }
7 | input {
8 | File read1
9 | File? read2
10 | String samplename
11 | String docker = "staphb/rasusa:0.7.0"
12 | Int disk_size = 100
13 | Int cpu = 4
14 | # RASUSA Parameters
15 | # --bases [STRING] Explicitly set the number of bases required e.g., 4.3kb, 7Tb, 9000, 4.1MB. If this option is given, --coverage and --genome-size are ignored
16 | # --coverage [FLOAT] The desired coverage to sub-sample the reads to. If --bases is not provided, this option and --genome-size are required
17 | # --genome-size [STRING] Genome size to calculate coverage with respect to. e.g., 4.3kb, 7Tb, 9000, 4.1MB
18 | # --seed [INTEGER] Random seed to use
19 | # --frac [FLOAT] Subsample to a fraction of the reads - e.g., 0.5 samples half the reads
20 | # --num [INTEGER] Subsample to a specific number of reads
21 | String? bases
22 | Float coverage
23 | String genome_size
24 | Int? seed
25 | Float? frac
26 | Int? num
27 | }
28 | command <<<
29 | rasusa --version | tee VERSION
30 | # set single-end or paired-end outputs
31 | if [ -z "~{read2}" ]; then
32 | OUTPUT_FILES="~{samplename}_subsampled_R1.fastq.gz"
33 | else
34 | OUTPUT_FILES="~{samplename}_subsampled_R1.fastq.gz ~{samplename}_subsampled_R2.fastq.gz"
35 | fi
36 | # ignore coverage values if frac input provided
37 | if [ -z "~{frac}" ]; then
38 | COVERAGE="--coverage ~{coverage} --genome-size ~{genome_size}"
39 | else
40 | COVERAGE=""
41 | fi
42 | # run rasusa
43 | rasusa \
44 | -i ~{read1} ~{read2} \
45 | ${COVERAGE} \
46 | ~{'--seed ' + seed} \
47 | ~{'--bases ' + bases} \
48 | ~{'--frac ' + frac} \
49 | ~{'--num ' + num} \
50 | -o ${OUTPUT_FILES}
51 | >>>
52 | output {
53 | File read1_subsampled = "~{samplename}_subsampled_R1.fastq.gz"
54 | File? read2_subsampled = "~{samplename}_subsampled_R2.fastq.gz"
55 | String rasusa_version = read_string("VERSION")
56 | }
57 | runtime {
58 | docker: "~{docker}"
59 | memory: "8 GB"
60 | cpu: cpu
61 | disks: "local-disk " + disk_size + " SSD"
62 | disk: disk_size + " GB"
63 | maxRetries: 3
64 | preemptible: 0
65 | }
66 | }
67 |
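Since coverage and genome_size are required inputs but are dropped from the command whenever frac is supplied, a reproducible coverage-based subsample is the common case. A hypothetical caller sketch (names and values illustrative):

    version 1.0
    import "../tasks/utilities/task_rasusa.wdl" as rasusa

    workflow rasusa_example {
      call rasusa.rasusa {
        input:
          read1 = "sample1_R1.fastq.gz", # hypothetical reads
          read2 = "sample1_R2.fastq.gz",
          samplename = "sample1",
          coverage = 40.0,      # subsample to ~40x depth
          genome_size = "5mb",  # genome size used to compute that depth
          seed = 42             # fixed seed for reproducibility
      }
    }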
--------------------------------------------------------------------------------
/tasks/species_typing/task_staphopiasccmec.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task staphopiasccmec {
4 | meta {
5 | description: "Primer based SCCmec typing of Staphylococcus aureus genomes"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0"
11 | Int disk_size = 100
12 | Int cpu = 1
13 | }
14 | command <<<
15 | # get version
16 | staphopia-sccmec --version 2>&1 | sed 's/^.*staphopia-sccmec //' | tee VERSION
17 |
18 | # run staphopia-sccmec on input assembly; hamming option OFF; outputs are true/false
19 | staphopia-sccmec \
20 | --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.summary.tsv
21 |
22 | # run staphopia-sccmec on input assembly; hamming option ON; outputs are the hamming distance; 0 is exact match
23 | staphopia-sccmec \
24 | --hamming \
25 | --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.hamming.tsv
26 |
27 | # please excuse this ugly bash code below :)
28 |
29 | # parse output summary TSV for true matches
30 | # look for columns that contain the word "True" and print the column numbers in a list to a file col_headers.txt
31 | awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "True") print i } }' ~{samplename}.staphopia-sccmec.summary.tsv | tee col_headers.txt
32 |
33 | # use column number list to print column headers (example: IV, mecA, etc.) to a file type.txt
34 | cat col_headers.txt | while read -r COL_NUMBER; do \
35 | cut -f "$COL_NUMBER" ~{samplename}.staphopia-sccmec.summary.tsv | head -n 1 >>type.txt
36 | echo "," >>type.txt
37 | done
38 |
39 | # remove newlines, remove trailing comma; generate output string of comma separated values
40 | cat type.txt | tr -d '\n' | sed 's|.$||g' | tee TYPES_AND_MECA.txt
41 |
42 | >>>
43 | output {
44 | File staphopiasccmec_results_tsv = "~{samplename}.staphopia-sccmec.summary.tsv"
45 | File staphopiasccmec_hamming_distance_tsv = "~{samplename}.staphopia-sccmec.hamming.tsv"
46 | String staphopiasccmec_types_and_mecA_presence = read_string("TYPES_AND_MECA.txt")
47 | String staphopiasccmec_version = read_string("VERSION")
48 | String staphopiasccmec_docker = docker
49 | }
50 | runtime {
51 | docker: "~{docker}"
52 | memory: "4 GB"
53 | cpu: cpu
54 | disks: "local-disk " + disk_size + " SSD"
55 | disk: disk_size + " GB"
56 | maxRetries: 3
57 | preemptible: 0
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/tasks/gene_typing/task_bakta.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task bakta {
4 | input {
5 | File assembly
6 | File bakta_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/bakta_db_2022-08-29.tar.gz"
7 | String samplename
8 | Int cpu = 8
9 | Int memory = 16
10 | String docker = "quay.io/biocontainers/bakta:1.5.1--pyhdfd78af_0"
11 | Int disk_size = 100
12 | # Parameters
13 | # proteins: Fasta file of trusted protein sequences for CDS annotation
14 | # prodigal_tf: Prodigal training file to use for CDS prediction
15 | # bakta_opts: any additional bakta arguments
16 | File? proteins
17 | Boolean compliant = false
18 | File? prodigal_tf
19 | String? bakta_opts
20 | }
21 | command <<<
22 | date | tee DATE
23 | bakta --version | tee BAKTA_VERSION
24 |
25 | # Extract Bakta DB
26 | mkdir db
27 | time tar xzvf ~{bakta_db} --strip-components=1 -C ./db
28 |
29 | # Install amrfinderplus db
30 | amrfinder_update --database db/amrfinderplus-db
31 | amrfinder --database_version | tee AMRFINDER_DATABASE_VERSION
32 |
33 | bakta \
34 | ~{bakta_opts} \
35 | --db db/ \
36 | --threads ~{cpu} \
37 | --prefix ~{samplename} \
38 | --output ~{samplename} \
39 | ~{true='--compliant' false='' compliant} \
40 | ~{'--proteins ' + proteins} \
41 | ~{'--prodigal-tf ' + prodigal_tf} \
42 | ~{assembly}
43 |
44 | # rename gff3 to gff for compatibility with downstream analysis (pirate)
45 | mv "~{samplename}/~{samplename}.gff3" "~{samplename}/~{samplename}.gff"
46 |
47 | >>>
48 | output {
49 | File bakta_embl = "~{samplename}/~{samplename}.embl"
50 | File bakta_faa = "~{samplename}/~{samplename}.faa"
51 | File bakta_ffn = "~{samplename}/~{samplename}.ffn"
52 | File bakta_fna = "~{samplename}/~{samplename}.fna"
53 | File bakta_gbff = "~{samplename}/~{samplename}.gbff"
54 | File bakta_gff3 = "~{samplename}/~{samplename}.gff"
55 | File bakta_hypotheticals_faa = "~{samplename}/~{samplename}.hypotheticals.faa"
56 | File bakta_hypotheticals_tsv = "~{samplename}/~{samplename}.hypotheticals.tsv"
57 | File bakta_tsv = "~{samplename}/~{samplename}.tsv"
58 | File bakta_txt = "~{samplename}/~{samplename}.txt"
59 | String bakta_version = read_string("BAKTA_VERSION")
60 | }
61 | runtime {
62 | memory: "~{memory} GB"
63 | cpu: cpu
64 | docker: docker
65 | disks: "local-disk " + disk_size + " SSD"
66 | disk: disk_size + " GB"
67 | maxRetries: 3
68 | }
69 | }
70 |
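The bakta_db input is a gzipped tarball that the task unpacks into ./db before annotation, so pointing the task at a custom database build is just an input override. A hypothetical caller sketch (bucket path and names illustrative):

    version 1.0
    import "../tasks/gene_typing/task_bakta.wdl" as bakta

    workflow bakta_example {
      call bakta.bakta {
        input:
          assembly = "sample1_contigs.fasta", # hypothetical assembly
          samplename = "sample1",
          bakta_db = "gs://my-bucket/bakta_db_custom.tar.gz", # hypothetical tarball, unpacked into ./db
          compliant = true # adds --compliant for INSDC-compliant output
      }
    }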
--------------------------------------------------------------------------------
/tasks/species_typing/task_meningotype.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task meningotype {
4 | meta {
5 | description: "Serotyping of Neisseria meningitidis"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/meningotype:0.8.5--pyhdfd78af_0"
11 | Int disk_size = 100
12 | Int cpu = 2
13 | }
14 | command <<<
15 |
16 | # Parameters
17 | # --finetype perform porA and fetA fine typing (default=off)
18 | # --porB perform porB sequence typing (NEIS2020) (default=off)
19 | # --bast perform Bexsero antigen sequence typing (BAST) (default=off)
20 | # --mlst perform MLST (default=off)
21 | # --all perform MLST, porA, fetA, porB, BAST typing (default=off)
22 |
23 | echo $(meningotype --version 2>&1) | sed 's/^.*meningotype v//' | tee VERSION
24 | meningotype \
25 | --finetype \
26 | --porB \
27 | --bast \
28 | --cpus ~{cpu} \
29 | ~{assembly} \
30 | > ~{samplename}.tsv
31 |
32 | tail -1 ~{samplename}.tsv | awk '{print $2}' | tee MENINGOTYPE_SEROTYPE
33 | tail -1 ~{samplename}.tsv | awk '{print $5}' | tee MENINGOTYPE_PORA
34 | tail -1 ~{samplename}.tsv | awk '{print $6}' | tee MENINGOTYPE_FETA
35 | tail -1 ~{samplename}.tsv | awk '{print $7}' | tee MENINGOTYPE_PORB
36 | tail -1 ~{samplename}.tsv | awk '{print $8}' | tee MENINGOTYPE_FHBP
37 | tail -1 ~{samplename}.tsv | awk '{print $9}' | tee MENINGOTYPE_NHBA
38 | tail -1 ~{samplename}.tsv | awk '{print $10}' | tee MENINGOTYPE_NADA
39 | tail -1 ~{samplename}.tsv | awk '{print $11}' | tee MENINGOTYPE_BAST
40 |
41 | >>>
42 | output {
43 | File meningotype_tsv = "~{samplename}.tsv"
44 | String meningotype_version = read_string("VERSION")
45 | String meningotype_serogroup = read_string("MENINGOTYPE_SEROTYPE")
46 | String meningotype_PorA = read_string("MENINGOTYPE_PORA")
47 | String meningotype_FetA = read_string("MENINGOTYPE_FETA")
48 | String meningotype_PorB = read_string("MENINGOTYPE_PORB")
49 | String meningotype_fHbp = read_string("MENINGOTYPE_FHBP")
50 | String meningotype_NHBA = read_string("MENINGOTYPE_NHBA")
51 | String meningotype_NadA = read_string("MENINGOTYPE_NADA")
52 | String meningotype_BAST = read_string("MENINGOTYPE_BAST")
53 | }
54 | runtime {
55 | docker: "~{docker}"
56 | memory: "8 GB"
57 | cpu: cpu
58 | disks: "local-disk " + disk_size + " SSD"
59 | disk: disk_size + " GB"
60 | maxRetries: 3
61 | preemptible: 0
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/tests/inputs/wf_theiaprok_illumina_pe.json:
--------------------------------------------------------------------------------
1 | {
2 | "theiaprok_illumina_pe.samplename": "test",
3 | "theiaprok_illumina_pe.read1_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R1.fastq.gz",
4 | "theiaprok_illumina_pe.read2_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R2.fastq.gz",
5 | "theiaprok_illumina_pe.skip_screen": true,
6 | "theiaprok_illumina_pe.read_QC_trim.read_processing": "trimmomatic",
7 | "theiaprok_illumina_pe.read_QC_trim.call_midas": false,
8 | "theiaprok_illumina_pe.read_QC_trim.midas.midas_db" : "./tests/inputs/empty-for-test.txt",
9 | "theiaprok_illumina_pe.genome_annotation": "prokka",
10 | "theiaprok_illumina_pe.shovill_pe.assembler": "skesa",
11 | "theiaprok_illumina_pe.merlin_magic.call_poppunk": false,
12 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_dists_npy" : "./tests/inputs/empty-for-test.txt",
13 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_dists_pkl" : "./tests/inputs/empty-for-test.txt",
14 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_h5" : "./tests/inputs/empty-for-test.txt",
15 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs" : "./tests/inputs/empty-for-test.txt",
16 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_dists_npy" : "./tests/inputs/empty-for-test.txt",
17 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_dists_pkl" : "./tests/inputs/empty-for-test.txt",
18 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_h5" : "./tests/inputs/empty-for-test.txt",
19 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_clusters_csv" : "./tests/inputs/empty-for-test.txt",
20 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_fit_npz" : "./tests/inputs/empty-for-test.txt",
21 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_fit_pkl" : "./tests/inputs/empty-for-test.txt",
22 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_graph_gt" : "./tests/inputs/empty-for-test.txt",
23 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_qcreport_txt" : "./tests/inputs/empty-for-test.txt",
24 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_unword_clusters_csv" : "./tests/inputs/empty-for-test.txt",
25 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_graph_gt" : "./tests/inputs/empty-for-test.txt",
26 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_external_clusters_csv" : "./tests/inputs/empty-for-test.txt",
27 | "theiaprok_illumina_pe.bakta.bakta_db" : "./tests/inputs/empty-for-test.txt"
28 | }
29 |
--------------------------------------------------------------------------------
/tasks/species_typing/task_seqsero2.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task seqsero2 {
4 | # Inputs
5 | input {
6 | File read1
7 | File? read2
8 | String samplename
9 | String mode ="a"
10 | String seqsero2_docker_image = "quay.io/staphb/seqsero2:1.2.1"
11 | Int disk_size = 100
12 | Boolean paired_end
13 | }
14 |
15 | command <<<
16 | # capture date and version
17 | # Print and save date
18 | date | tee DATE
19 | # Print and save version
20 | SeqSero2_package.py --version | tee VERSION
21 | # Run SeqSero2 on the input read data
22 | SeqSero2_package.py \
23 | -p 8 \
24 | ~{true='-t 2' false='-t 3' paired_end} \
25 | -m ~{mode} \
26 | -n ~{samplename} \
27 | -d ~{samplename}_seqsero2_output_dir \
28 | -i ~{read1} ~{read2}
29 | # Run a python block to parse output file for terra data tables
30 | # NOTE: the heredoc below is a reconstruction of the original parsing
31 | # block; the keys are the column names from the SeqSero2 result TSV header
32 | python3 <<CODE
33 | import csv
34 | with open("./~{samplename}_seqsero2_output_dir/SeqSero_result.tsv", 'r') as tsv_file:
35 |   tsv_reader = csv.reader(tsv_file, delimiter="\t")
36 |   tsv_data = list(tsv_reader)
37 |   # map the header row onto the single data row
38 |   tsv_dict = dict(zip(tsv_data[0], tsv_data[1]))
39 |
40 | # write the predicted antigenic profile (O:H1:H2) for read_string()
41 | with open("PREDICTED_ANTIGENIC_PROFILE", 'wt') as profile_file:
42 |   profile_file.write(tsv_dict.get("Predicted antigenic profile", "None"))
43 |
44 | # write the predicted serotype for read_string()
45 | with open("PREDICTED_SEROTYPE", 'wt') as serotype_file:
46 |   serotype_file.write(tsv_dict.get("Predicted serotype", "None"))
47 |
48 | # write the potential inter-serotype contamination call
49 | with open("CONTAMINATION", 'wt') as contamination_file:
50 |   contamination_file.write(tsv_dict.get("Potential inter-serotype contamination", "None"))
51 | CODE
52 | >>>
53 | output {
54 | File seqsero2_report = "./~{samplename}_seqsero2_output_dir/SeqSero_result.tsv"
55 | String seqsero2_version = read_string("VERSION")
56 | String seqsero2_predicted_antigenic_profile = read_string("PREDICTED_ANTIGENIC_PROFILE")
57 | String seqsero2_predicted_serotype = read_string("PREDICTED_SEROTYPE")
58 | String seqsero2_predicted_contamination = read_string("CONTAMINATION")
59 | }
60 | runtime {
61 | docker: "~{seqsero2_docker_image}"
62 | memory: "16 GB"
63 | cpu: 8
64 | disks: "local-disk " + disk_size + " SSD"
65 | disk: disk_size + " GB"
66 | preemptible: 0
67 | maxRetries: 3
68 | }
69 | }
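The paired_end Boolean selects SeqSero2's input type flag: true emits -t 2 (separate paired-end files) and false emits -t 3 (single-end). A hypothetical paired-end caller sketch (names illustrative):

    version 1.0
    import "../tasks/species_typing/task_seqsero2.wdl" as seqsero2

    workflow seqsero2_example {
      call seqsero2.seqsero2 {
        input:
          read1 = "sample1_R1.fastq.gz", # hypothetical reads
          read2 = "sample1_R2.fastq.gz",
          samplename = "sample1",
          paired_end = true # -t 2; set false (and omit read2) for -t 3 single-end mode
      }
    }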
--------------------------------------------------------------------------------
/tasks/quality_control/task_trimmomatic.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task trimmomatic_pe {
4 | input {
5 | File read1
6 | File read2
7 | String samplename
8 | String docker = "quay.io/staphb/trimmomatic:0.39"
9 | Int? trimmomatic_window_size = 10
10 | Int? trimmomatic_quality_trim_score = 20
11 | Int? trimmomatic_minlen = 75
12 | Int? threads = 4
13 | Int disk_size = 100
14 | }
15 | command <<<
16 | # date and version control
17 | date | tee DATE
18 | trimmomatic -version > VERSION && sed -i -e 's/^/Trimmomatic /' VERSION
19 |
20 | trimmomatic PE \
21 | -threads ~{threads} \
22 | ~{read1} ~{read2} \
23 | -baseout ~{samplename}.fastq.gz \
24 | SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \
25 | MINLEN:~{trimmomatic_minlen} &> ~{samplename}.trim.stats.txt
26 | >>>
27 | output {
28 | File read1_trimmed = "~{samplename}_1P.fastq.gz"
29 | File read2_trimmed = "~{samplename}_2P.fastq.gz"
30 | File trimmomatic_stats = "~{samplename}.trim.stats.txt"
31 | String version = read_string("VERSION")
32 | String pipeline_date = read_string("DATE")
33 | }
34 | runtime {
35 | docker: "~{docker}"
36 | memory: "8 GB"
37 | cpu: 4
38 | disks: "local-disk " + disk_size + " SSD"
39 | disk: disk_size + " GB"
40 | maxRetries: 3
41 | preemptible: 0
42 | }
43 | }
44 |
45 | task trimmomatic_se {
46 | input {
47 | File read1
48 | String samplename
49 | String docker="quay.io/staphb/trimmomatic:0.39"
50 | Int? trimmomatic_window_size = 4
51 | Int? trimmomatic_quality_trim_score = 30
52 | Int? trimmomatic_minlen = 25
53 | Int? threads = 4
54 | Int disk_size = 100
55 | }
56 | command <<<
57 | # date and version control
58 | date | tee DATE
59 | trimmomatic -version > VERSION && sed -i -e 's/^/Trimmomatic /' VERSION
60 |
61 | trimmomatic SE \
62 | -threads ~{threads} \
63 | ~{read1} \
64 | ~{samplename}_trimmed.fastq.gz \
65 | SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \
66 | MINLEN:~{trimmomatic_minlen} &> ~{samplename}.trim.stats.txt
67 | >>>
68 | output {
69 | File read1_trimmed = "~{samplename}_trimmed.fastq.gz"
70 | File trimmomatic_stats = "~{samplename}.trim.stats.txt"
71 | String version = read_string("VERSION")
72 | String pipeline_date = read_string("DATE")
73 | }
74 | runtime {
75 | docker: "~{docker}"
76 | memory: "8 GB"
77 | cpu: 4
78 | disks: "local-disk " + disk_size + " SSD"
79 | disk: disk_size + " GB"
80 | maxRetries: 3
81 | preemptible: 0
82 | }
83 | }
84 |
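SLIDINGWINDOW:<window>:<score> clips a read once the mean quality across the window falls below the score, and MINLEN then discards reads trimmed shorter than the minimum, so tightening quality is just an input override. A hypothetical caller sketch (names and values illustrative):

    version 1.0
    import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic

    workflow trimmomatic_example {
      call trimmomatic.trimmomatic_pe {
        input:
          read1 = "sample1_R1.fastq.gz", # hypothetical reads
          read2 = "sample1_R2.fastq.gz",
          samplename = "sample1",
          trimmomatic_window_size = 4,         # average quality over 4-base windows
          trimmomatic_quality_trim_score = 30, # clip once the window mean drops below Q30
          trimmomatic_minlen = 50              # discard reads trimmed below 50 bp
      }
    }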
--------------------------------------------------------------------------------
/tasks/species_typing/task_hicap.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task hicap {
4 | meta {
5 | description: "Identify cap locus serotype and structure in your Haemophilus influenzae assemblies"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/hicap:1.0.3--py_0"
11 | Int disk_size = 100
12 | Int cpu = 4
13 |
14 | # Parameters
15 | # --gene_coverage GENE_COVERAGE Minimum percentage coverage to consider a single gene complete. [default: 0.80]
16 | # --gene_identity GENE_IDENTITY Minimum percentage identity to consider a single gene complete. [default: 0.70]
17 | # --broken_gene_length BROKEN_GENE_LENGTH Minimum length to consider a broken gene. [default: 60]
18 | # --broken_gene_identity BROKEN_GENE_IDENTITY Minimum percentage identity to consider a broken gene. [default: 0.80]
19 | Float gene_coverage = 0.8
20 | Float gene_identity = 0.7
21 | Int broken_gene_length = 60
22 | Float broken_gene_identity = 0.8
23 | Boolean full_sequence = false
24 | Boolean debug = false
25 | }
26 | command <<<
27 | echo $( hicap --version 2>&1 ) | sed 's/^.*hicap //' | tee VERSION
28 | hicap \
29 | --query_fp ~{assembly} \
30 | ~{'--gene_coverage ' + gene_coverage} \
31 | ~{'--gene_identity ' + gene_identity} \
32 | ~{'--broken_gene_length ' + broken_gene_length} \
33 | ~{'--broken_gene_identity ' + broken_gene_identity} \
34 | ~{true="--full_sequence" false="" full_sequence} \
35 | ~{true="--debug" false="" debug} \
36 | --threads ~{cpu} \
37 | -o ./
38 |
39 | if [ ! -f ~{samplename}.tsv ]; then
40 | # No hits, make a file to say so for downstream merging
41 | printf "isolate\tpredicted_serotype\tattributes\tgenes_identified\tlocus_location\tregion_I_genes\tregion_II_genes\tregion_III_genes\tIS1016_hits\n" > ~{samplename}.tsv
42 | printf "~{samplename}\tcap_not_found\t-\t-\t-\t-\t-\t-\t-\n" >> ~{samplename}.tsv
43 | else
44 | sed -i 's/#isolate/isolate/' ~{samplename}.tsv
45 | fi
46 | >>>
47 | output {
48 | File hicap_results = "~{samplename}.tsv"
49 | File? hicap_genbank = "~{samplename}.gbk" # absent when no cap locus is found
50 | File? hicap_image = "~{samplename}.svg" # absent when no cap locus is found
51 | String hicap_version = read_string("VERSION")
52 | }
53 | runtime {
54 | docker: "~{docker}"
55 | memory: "8 GB"
56 | cpu: cpu
57 | disks: "local-disk " + disk_size + " SSD"
58 | disk: disk_size + " GB"
59 | maxRetries: 3
60 | preemptible: 0
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/tasks/phylogenetic_inference/task_mycosnp_tree.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task mycosnptree {
4 | input {
5 | Array[File] assembly_fasta
6 | Array[String] samplename
7 | String docker="quay.io/theiagen/mycosnp:dev"
8 | Int disk_size = 100
9 | String strain="B11205"
10 | String accession="GCA_016772135"
11 | }
12 | command <<<
13 | date | tee DATE
14 | echo $(nextflow pull rpetit3/mycosnp-nf 2>&1) | sed 's/^.*revision: //;' | tee MYCOSNPTREE_VERSION
15 |
16 | assembly_array=(~{sep=' ' assembly_fasta})
17 | assembly_array_len=$(echo "${#assembly_array[@]}")
18 | samplename_array=(~{sep=' ' samplename})
19 | samplename_array_len=$(echo "${#samplename_array[@]}")
20 |
21 | # Ensure assembly, and samplename arrays are of equal length
22 | if [ "$assembly_array_len" -ne "$samplename_array_len" ]; then
23 | echo "Assembly array (length: $assembly_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2
24 | exit 1
25 | fi
26 |
27 | # Make sample FOFN
28 | echo "sample,fasta" > samples.csv
29 | for index in ${!assembly_array[@]}; do
30 | assembly=${assembly_array[$index]}
31 | samplename=${samplename_array[$index]}
32 | echo -e "${samplename},${assembly}" >> samples.csv
33 | done
34 |
35 | # Run MycoSNP
36 | mkdir mycosnptree
37 | cd mycosnptree
38 | if nextflow run rpetit3/mycosnp-nf -entry NFCORE_MYCOSNPTREE --input ../samples.csv --fasta /reference/~{accession}/masked/reference-consensus.fa --publish_dir_mode copy --rapidnj false --fasttree false --iqtree; then
39 | # Everything finished, pack up the results and clean up
40 | find work/ -name "*.iqtree" | xargs -I {} cp {} ./
41 | rm -rf .nextflow/ work/
42 | cd ..
43 | tar -cf - mycosnptree/ | gzip -n --best > mycosnptree.tar.gz
44 | else
45 | # Run failed
46 | exit 1
47 | fi
48 | >>>
49 | output {
50 | String mycosnptree_version = read_string("MYCOSNPTREE_VERSION")
51 | String mycosnptree_docker = docker
52 | String analysis_date = read_string("DATE")
53 | String reference_strain = strain
54 | String reference_accession = accession
55 | File mycosnptree_tree = "mycosnptree/results/combined/phylogeny/iqtree/alignment.fasta.treefile"
56 | File mycosnptree_iqtree_log = "mycosnptree/alignment.fasta.iqtree"
57 | File mycosnptree_full_results = "mycosnptree.tar.gz"
58 | }
59 | runtime {
60 | docker: "~{docker}"
61 | memory: "32 GB"
62 | cpu: 4
63 | disks: "local-disk " + disk_size + " SSD"
64 | disk: disk_size + " GB"
65 | maxRetries: 3
66 | preemptible: 0
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/tasks/species_typing/task_emmtyper.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task emmtyper {
4 | meta {
5 | description: "emm-typing of Streptococcus pyogenes assemblies"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/emmtyper:0.2.0--py_0"
11 | Int disk_size = 100
12 | Int cpu = 2
13 |
14 | # Parameters
15 | # --workflow [blast|pcr] Choose workflow [default: blast]
16 | # --cluster-distance INTEGER Distance between cluster of matches to consider as different clusters. [default: 500]
17 | # --percent-identity INTEGER [BLAST] Minimal percent identity of sequence. [default: 95]
18 | # --culling-limit INTEGER [BLAST] Total hits to return in a position. [default: 5]
19 | # --mismatch INTEGER [BLAST] Threshold for number of mismatch to allow in BLAST hit. [default: 4]
20 | # --align-diff INTEGER [BLAST] Threshold for difference between alignment length and subject length in BLAST hit. [default: 5]
21 | # --gap INTEGER [BLAST] Threshold gap to allow in BLAST hit. [default: 2]
22 | # --min-perfect INTEGER [isPcr] Minimum size of perfect match at 3' primer end. [default: 15]
23 | # --min-good INTEGER [isPcr] Minimum size where there must be 2 matches for each mismatch. [default: 15]
24 | # --max-size INTEGER [isPcr] Maximum size of PCR product. [default: 2000]
25 |
26 | String wf = "blast"
27 | Int cluster_distance = 500
28 | Int percid = 95
29 | Int culling_limit = 5
30 | Int mismatch = 4
31 | Int align_diff = 5
32 | Int gap = 2
33 | Int min_perfect = 15
34 | Int min_good = 15
35 | Int max_size = 2000
36 | }
37 | command <<<
38 | echo $(emmtyper --version 2>&1) | sed 's/^.*emmtyper v//' | tee VERSION
39 | emmtyper \
40 | ~{'--workflow ' + wf} \
41 | ~{'--cluster-distance ' + cluster_distance} \
42 | ~{'--percent-identity ' + percid} \
43 | ~{'--culling-limit ' + culling_limit} \
44 | ~{'--mismatch ' + mismatch} \
45 | ~{'--align-diff ' + align_diff} \
46 | ~{'--gap ' + gap} \
47 | ~{'--min-perfect ' + min_perfect} \
48 | ~{'--min-good ' + min_good} \
49 | ~{'--max-size ' + max_size} \
50 | ~{assembly} \
51 | > ~{samplename}.tsv
52 | >>>
53 | output {
54 | File emmtyper_results = "~{samplename}.tsv"
55 | String emmtyper_version = read_string("VERSION")
56 | }
57 | runtime {
58 | docker: "~{docker}"
59 | memory: "8 GB"
60 | cpu: cpu
61 | disks: "local-disk " + disk_size + " SSD"
62 | disk: disk_size + " GB"
63 | maxRetries: 3
64 | preemptible: 0
65 | }
66 | }
67 |
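Every emmtyper option is surfaced as a task input, so switching from the default BLAST workflow to the in-silico PCR workflow (the two modes named in the comments above) is a one-line override, after which the isPcr-specific thresholds take effect. A hypothetical caller sketch:

    version 1.0
    import "../tasks/species_typing/task_emmtyper.wdl" as emmtyper

    workflow emmtyper_example {
      call emmtyper.emmtyper {
        input:
          assembly = "gas_contigs.fasta", # hypothetical S. pyogenes assembly
          samplename = "gas_sample",
          wf = "pcr",      # use the isPcr workflow instead of blast
          max_size = 2000  # isPcr: maximum PCR product size
      }
    }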
--------------------------------------------------------------------------------
/tasks/quality_control/task_bbduk.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task bbduk_pe {
4 | input {
5 | File read1_trimmed
6 | File read2_trimmed
7 | String samplename
8 | Int mem_size_gb=8
9 | String docker = "quay.io/staphb/bbtools:38.76"
10 | Int disk_size = 100
11 | }
12 | command <<<
13 | # date and version control
14 | date | tee DATE
15 |
16 | repair.sh in1=~{read1_trimmed} in2=~{read2_trimmed} out1=~{samplename}.paired_1.fastq.gz out2=~{samplename}.paired_2.fastq.gz
17 |
18 | bbduk.sh in1=~{samplename}.paired_1.fastq.gz in2=~{samplename}.paired_2.fastq.gz out1=~{samplename}.rmadpt_1.fastq.gz out2=~{samplename}.rmadpt_2.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo
19 |
20 | bbduk.sh in1=~{samplename}.rmadpt_1.fastq.gz in2=~{samplename}.rmadpt_2.fastq.gz out1=~{samplename}_1.clean.fastq.gz out2=~{samplename}_2.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt
21 |
22 | >>>
23 | output {
24 | File read1_clean = "~{samplename}_1.clean.fastq.gz"
25 | File read2_clean = "~{samplename}_2.clean.fastq.gz"
26 | File adapter_stats = "~{samplename}.adapters.stats.txt"
27 | File phiX_stats = "~{samplename}.phix.stats.txt"
28 | String bbduk_docker = docker
29 | String pipeline_date = read_string("DATE")
30 | }
31 | runtime {
32 | docker: "~{docker}"
33 | memory: "~{mem_size_gb} GB"
34 | cpu: 4
35 | disks: "local-disk " + disk_size + " SSD"
36 | disk: disk_size + " GB"
37 | preemptible: 0
38 | maxRetries: 3
39 | }
40 | }
41 |
42 | task bbduk_se {
43 | input {
44 | File read1_trimmed
45 | String samplename
46 | Int mem_size_gb=8
47 | String docker="quay.io/staphb/bbtools:38.76"
48 | Int disk_size = 100
49 | }
50 | command <<<
51 | # date and version control
52 | date | tee DATE
53 |
54 | bbduk.sh in1=~{read1_trimmed} out1=~{samplename}.rmadpt_1.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo
55 |
56 | bbduk.sh in1=~{samplename}.rmadpt_1.fastq.gz out1=~{samplename}_1.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt
57 | >>>
58 | output {
59 | File read1_clean = "~{samplename}_1.clean.fastq.gz"
60 | File adapter_stats = "~{samplename}.adapters.stats.txt"
61 | File phiX_stats = "~{samplename}.phix.stats.txt"
62 | String bbduk_docker = docker
63 | String pipeline_date = read_string("DATE")
64 | }
65 | runtime {
66 | docker: "~{docker}"
67 | memory: "~{mem_size_gb} GB"
68 | cpu: 4
69 | disks: "local-disk " + disk_size + " SSD"
70 | disk: disk_size + " GB"
71 | preemptible: 0
72 | maxRetries: 3
73 | }
74 | }
75 |
--------------------------------------------------------------------------------
/tasks/species_typing/task_genotyphi.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task genotyphi {
4 | # Inputs
5 | input {
6 | File read1
7 | File? read2
8 | Boolean ont_data=false
9 | String samplename
10 | String genotyphi_docker_image = "staphb/mykrobe:0.11.0"
11 | Int disk_size = 100
12 | Int cpu = 4
13 | }
14 | command <<<
15 | # Print and save versions
16 | mykrobe --version | sed 's|mykrobe v||g' | tee MYKROBE_VERSION
17 | # super ugly oneliner since "python /genotyphi/genotyphi.py --version" does NOT work due to python syntax error
18 | grep '__version__ =' /genotyphi/genotyphi.py | sed "s|__version__ = '||" | sed "s|'||" | tee GENOTYPHI_VERSION
19 |
20 | # Run Mykrobe on the input read data
21 | mykrobe predict \
22 | -t ~{cpu} \
23 | --sample ~{samplename} \
24 | --species typhi \
25 | --format json \
26 | --out ~{samplename}.mykrobe_genotyphi.json \
27 | ~{true='--ont' false='' ont_data} \
28 | --seq ~{read1} ~{read2}
29 |
30 | # use genotyphi script to produce TSV
31 | python /genotyphi/parse_typhi_mykrobe.py \
32 | --jsons ~{samplename}.mykrobe_genotyphi.json \
33 | --prefix ~{samplename}_mykrobe_genotyphi
34 |
35 | # Run a python block to parse output file for terra data tables
36 | python3 <<CODE
37 | import csv
38 | # NOTE: reconstructed parsing block; the column names below are assumed
39 | # from the parse_typhi_mykrobe.py predictResults TSV header
40 | with open("~{samplename}_mykrobe_genotyphi_predictResults.tsv", 'r') as tsv_file:
41 |   tsv_reader = csv.reader(tsv_file, delimiter="\t")
42 |   tsv_data = list(tsv_reader)
43 |   # map the header row onto the single data row
44 |   tsv_dict = dict(zip(tsv_data[0], tsv_data[1]))
45 | with open("SPECIES", 'wt') as species_file:
46 |   species_file.write(tsv_dict.get("species", "NA"))
47 | with open("SPP_PERCENT", 'wt') as spp_percent_file:
48 |   spp_percent_file.write(tsv_dict.get("spp_percent", "0"))
49 | with open("FINAL_GENOTYPE", 'wt') as genotype_file:
50 |   genotype_file.write(tsv_dict.get("final genotype", "NA"))
51 | with open("CONFIDENCE", 'wt') as confidence_file:
52 |   confidence_file.write(tsv_dict.get("confidence", "NA"))
53 | CODE
54 | >>>
55 | output {
56 | File genotyphi_report_tsv = "./~{samplename}_mykrobe_genotyphi_predictResults.tsv"
57 | File genotyphi_mykrobe_json = "./~{samplename}.mykrobe_genotyphi.json"
58 | String genotyphi_version = read_string("GENOTYPHI_VERSION")
59 | String genotyphi_species = read_string("SPECIES")
60 | Float genotyphi_st_probes_percent_coverage = read_float("SPP_PERCENT")
61 | String genotyphi_final_genotype = read_string("FINAL_GENOTYPE")
62 | String genotyphi_genotype_confidence = read_string("CONFIDENCE")
63 | }
64 | runtime {
65 | docker: "~{genotyphi_docker_image}"
66 | memory: "8 GB"
67 | cpu: cpu
68 | disks: "local-disk " + disk_size + " SSD"
69 | disk: disk_size + " GB"
70 | preemptible: 0
71 | maxRetries: 3
72 | }
73 | }
--------------------------------------------------------------------------------
/tasks/phylogenetic_inference/task_ksnp3.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task ksnp3 {
4 | input {
5 | Array[File] assembly_fasta
6 | Array[String] samplename
7 | String cluster_name
8 | Int kmer_size = 19
9 | String ksnp3_args = "" # add -ML to calculate a maximum likelihood tree or -NJ to calculate a neighbor-joining tree
10 | String docker_image = "quay.io/staphb/ksnp3:3.1"
11 | Int memory = 8
12 | Int cpu = 4
13 | Int disk_size = 100
14 | }
15 | command <<<
16 | assembly_array=(~{sep=' ' assembly_fasta})
17 | assembly_array_len=$(echo "${#assembly_array[@]}")
18 | samplename_array=(~{sep=' ' samplename})
19 | samplename_array_len=$(echo "${#samplename_array[@]}")
20 |
21 | # Ensure assembly, and samplename arrays are of equal length
22 | if [ "$assembly_array_len" -ne "$samplename_array_len" ]; then
23 | echo "Assembly array (length: $assembly_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2
24 | exit 1
25 | fi
26 |
27 | # create file of filenames for kSNP3 input
28 | touch ksnp3_input.tsv
29 | for index in ${!assembly_array[@]}; do
30 | assembly=${assembly_array[$index]}
31 | samplename=${samplename_array[$index]}
32 | echo -e "${assembly}\t${samplename}" >> ksnp3_input.tsv
33 | done
34 | # run ksnp3 on input assemblies
35 | kSNP3 -in ksnp3_input.tsv -outdir ksnp3 -k ~{kmer_size} -core -vcf ~{ksnp3_args}
36 |
37 | # rename ksnp3 outputs with cluster name
38 | mv -v ksnp3/core_SNPs_matrix.fasta ksnp3/~{cluster_name}_core_SNPs_matrix.fasta
39 | mv -v ksnp3/tree.core.tre ksnp3/~{cluster_name}_core.nwk
40 | mv -v ksnp3/VCF.*.vcf ksnp3/~{cluster_name}_core.vcf
41 | mv -v ksnp3/SNPs_all_matrix.fasta ksnp3/~{cluster_name}_pan_SNPs_matrix.fasta
42 | mv -v ksnp3/tree.parsimony.tre ksnp3/~{cluster_name}_pan_parsimony.nwk
43 |
44 | if [ -f ksnp3/tree.ML.tre ]; then
45 | mv -v ksnp3/tree.ML.tre ksnp3/~{cluster_name}_ML.nwk
46 | fi
47 | if [ -f ksnp3/tree.NJ.tre ]; then
48 | mv -v ksnp3/tree.NJ.tre ksnp3/~{cluster_name}_NJ.nwk
49 | fi
50 |
51 | >>>
52 | output {
53 | File ksnp3_core_matrix = "ksnp3/~{cluster_name}_core_SNPs_matrix.fasta"
54 | File ksnp3_core_tree = "ksnp3/~{cluster_name}_core.nwk"
55 | File ksnp3_core_vcf = "ksnp3/~{cluster_name}_core.vcf"
56 | File ksnp3_pan_matrix = "ksnp3/~{cluster_name}_pan_SNPs_matrix.fasta"
57 | File ksnp3_pan_parsimony_tree = "ksnp3/~{cluster_name}_pan_parsimony.nwk"
58 | File? ksnp3_ml_tree = "ksnp3/~{cluster_name}_ML.nwk"
59 | File? ksnp3_nj_tree = "ksnp3/~{cluster_name}_NJ.nwk"
60 | File number_snps = "ksnp3/COUNT_SNPs"
61 | Array[File] ksnp_outs = glob("ksnp3/*")
62 | String ksnp3_docker_image = docker_image
63 | }
64 | runtime {
65 | docker: docker_image
66 | memory: "~{memory} GB"
67 | cpu: cpu
68 | disks: "local-disk " + disk_size + " SSD"
69 | disk: disk_size + " GB"
70 | preemptible: 0
71 | maxRetries: 3
72 | }
73 | }
74 |
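The optional ML and NJ trees only exist when the corresponding flag is passed through ksnp3_args, which is why ksnp3_ml_tree and ksnp3_nj_tree are declared File?. A hypothetical caller requesting a maximum-likelihood tree (names illustrative):

    version 1.0
    import "../tasks/phylogenetic_inference/task_ksnp3.wdl" as ksnp3

    workflow ksnp3_example {
      call ksnp3.ksnp3 {
        input:
          assembly_fasta = ["s1.fasta", "s2.fasta", "s3.fasta"], # hypothetical assemblies
          samplename = ["s1", "s2", "s3"],
          cluster_name = "outbreak_A",
          ksnp3_args = "-ML" # additionally produces ksnp3/outbreak_A_ML.nwk
      }
    }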
--------------------------------------------------------------------------------
/workflows/de_novo_assembly.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | workflow de_novo_assembly {
4 |
5 | input {
6 | String SRR
7 | File read1
8 | File read2
9 | }
10 |
11 | call seqyclean {
12 | input:
13 | samplename=SRR,
14 | read1=read1,
15 | read2=read2
16 | }
17 |
18 | call shovill {
19 | input:
20 | samplename=SRR,
21 | read1_cleaned=seqyclean.read1_cleaned,
22 | read2_cleaned=seqyclean.read2_cleaned
23 | }
24 |
25 | output {
26 | File read1_cleaned = seqyclean.read1_cleaned
27 | File read2_cleaned = seqyclean.read2_cleaned
28 | File contigs_fasta = shovill.contigs_fasta
29 | File contigs_gfa = shovill.contigs_gfa
30 | }
31 | }
32 |
33 | task seqyclean {
34 |
35 | input {
36 | File read1
37 | File read2
38 | String samplename
39 | File? adapters
40 | Int? seqyclean_minlen=25
41 | String? seqyclean_qual="20 20"
42 | Boolean? compress=true
43 | Boolean? seqyclean_dup=false
44 | Boolean? seqyclean_no_adapter_trim=false
45 | }
46 |
47 | command {
48 | seqyclean --version | head -1 | tee VERSION
49 | seqyclean \
50 | ${'-minlen ' + seqyclean_minlen} \
51 | ${'-qual ' + seqyclean_qual} \
52 | ${'-c ' + adapters} \
53 | ${true="-dup" false="" seqyclean_dup} \
54 | ${true="-no_adapter_trim " false="" seqyclean_no_adapter_trim} \
55 | ${true="-gz " false="" compress} \
56 | ${'-1 ' + read1} \
57 | ${'-2 ' + read2} \
58 | ${'-o ' + samplename}
59 | }
60 |
61 | output {
62 | File read1_cleaned = "${samplename}_PE1.fastq.gz"
63 | File read2_cleaned = "${samplename}_PE2.fastq.gz"
64 | String seqyclean_version = read_string("VERSION")
65 | }
66 |
67 | runtime {
68 | docker: "quay.io/staphb/seqyclean:1.10.09"
69 | memory: "8 GB"
70 | cpu: 2
71 | disks: "local-disk 100 SSD"
72 | preemptible: 0
73 | }
74 | }
75 |
76 | task shovill {
77 |
78 | input {
79 | File read1_cleaned
80 | File read2_cleaned
81 | String samplename
82 | }
83 |
84 | command {
85 | shovill --version | head -1 | tee VERSION
86 | shovill \
87 | --outdir out \
88 | --R1 ${read1_cleaned} \
89 | --R2 ${read2_cleaned}
90 | mv out/contigs.fa out/${samplename}_contigs.fasta
91 | mv out/contigs.gfa out/${samplename}_contigs.gfa
92 | }
93 |
94 | output {
95 | File contigs_fasta = "out/${samplename}_contigs.fasta"
96 | File contigs_gfa = "out/${samplename}_contigs.gfa"
97 | String shovill_version = read_string("VERSION")
98 | }
99 |
100 | runtime {
101 | docker: "quay.io/staphb/shovill:1.1.0"
102 | memory: "16 GB"
103 | cpu: 4
104 | disks: "local-disk 100 SSD"
105 | preemptible: 0
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/workflows/wf_ksnp3.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | import "../tasks/phylogenetic_inference/task_ksnp3.wdl" as ksnp3
4 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
5 | import "../tasks/task_versioning.wdl" as versioning
6 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary
7 |
8 | workflow ksnp3_workflow {
9 | input {
10 | Array[File] assembly_fasta
11 | Array[String] samplename
12 | String cluster_name
13 | String? data_summary_terra_project
14 | String? data_summary_terra_workspace
15 | String? data_summary_terra_table
16 | String? data_summary_column_names # string of comma delimited column names
17 | }
18 | call ksnp3.ksnp3 as ksnp3_task {
19 | input:
20 | assembly_fasta = assembly_fasta,
21 | samplename = samplename,
22 | cluster_name = cluster_name
23 | }
24 | call snp_dists.snp_dists as core_snp_dists {
25 | input:
26 | cluster_name = cluster_name,
27 | alignment = ksnp3_task.ksnp3_core_matrix
28 | }
29 | call snp_dists.snp_dists as pan_snp_dists {
30 | input:
31 | cluster_name = cluster_name,
32 | alignment = ksnp3_task.ksnp3_pan_matrix
33 | }
34 | call snp_dists.reorder_matrix as core_reorder_matrix {
35 | input:
36 | input_tree = ksnp3_task.ksnp3_core_tree,
37 | matrix = core_snp_dists.snp_matrix,
38 | cluster_name = cluster_name + "_core"
39 | }
40 | call snp_dists.reorder_matrix as pan_reorder_matrix {
41 | input:
42 | input_tree = ksnp3_task.ksnp3_pan_parsimony_tree,
43 | matrix = pan_snp_dists.snp_matrix,
44 | cluster_name = cluster_name + "_pan"
45 | }
46 | if (defined(data_summary_column_names)) {
47 | call data_summary.summarize_data {
48 | input:
49 | sample_names = samplename,
50 | terra_project = data_summary_terra_project,
51 | terra_workspace = data_summary_terra_workspace,
52 | terra_table = data_summary_terra_table,
53 | column_names = data_summary_column_names,
54 | output_prefix = cluster_name
55 | }
56 | }
57 | call versioning.version_capture{
58 | input:
59 | }
60 | output {
61 | # Version Capture
62 | String ksnp3_wf_version = version_capture.phbg_version
63 | String ksnp3_wf_analysis_date = version_capture.date
64 | String ksnp3_docker = ksnp3_task.ksnp3_docker_image
65 | # ksnp3_outputs
66 | String ksnp3_snp_dists_version = pan_snp_dists.version
67 | File ksnp3_core_vcf = ksnp3_task.ksnp3_core_vcf
68 | # ordered matrices and reordered trees
69 | File ksnp3_core_snp_matrix = core_reorder_matrix.ordered_matrix
70 | File ksnp3_core_tree = core_reorder_matrix.tree
71 | File ksnp3_pan_snp_matrix = pan_reorder_matrix.ordered_matrix
72 | File ksnp3_pan_tree = pan_reorder_matrix.tree
73 | # optional tree outputs
74 | File? ksnp3_ml_tree = ksnp3_task.ksnp3_ml_tree
75 | File? ksnp3_nj_tree = ksnp3_task.ksnp3_nj_tree
76 | # data summary output
77 | File? ksnp3_summarized_data = summarize_data.summarized_data
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/tasks/species_typing/task_shigatyper.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task shigatyper {
4 | meta {
5 | description: "ShigaTyper is a quick and easy tool designed to determine Shigella serotype using Illumina (single or paired-end) or Oxford Nanopore reads with low computation requirement. https://github.com/CFSAN-Biostatistics/shigatyper"
6 | }
7 | input {
8 | File read1
9 | File? read2
10 | String samplename
11 | String docker = "staphb/shigatyper:2.0.3"
12 | Int disk_size = 100
13 | Int cpus = 4
14 | Boolean read1_is_ont = false
15 | }
16 | command <<<
17 | # get version information
18 | shigatyper --version | sed 's/ShigaTyper //' | tee VERSION.txt
19 |
20 | # if read2 DOES NOT EXIST, ASSUME SINGLE END OR ONT
21 | if [ -z "~{read2}" ] ; then
22 | INPUT_READS="--SE ~{read1}"
23 | # if read1_is_ont is set to TRUE, then use ONT flags
24 | if [ "~{read1_is_ont}" == "true" ]; then
25 | INPUT_READS="--SE ~{read1} --ont"
26 | fi
27 | # else read2 DOES EXIST, ASSUME PAIRED END
28 | else
29 | INPUT_READS="--R1 ~{read1} --R2 ~{read2}"
30 | fi
31 | echo "INPUT_READS set to: ${INPUT_READS}"
32 | echo
33 |
34 | # run shigatyper. 2 output files will be ~{samplename}.tsv and ~{samplename}-hits.tsv
35 | echo "Running ShigaTyper..."
36 | shigatyper \
37 | ${INPUT_READS} \
38 | -n ~{samplename}
39 |
40 | # rename output TSVs to be more descriptive
41 | mv -v ~{samplename}.tsv ~{samplename}_shigatyper_summary.tsv
42 | mv -v ~{samplename}-hits.tsv ~{samplename}_shigatyper_hits.tsv
43 |
44 | # parse summary tsv for prediction, ipaB absence/presence, and notes
45 | cut -f 2 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_prediction.txt
46 | cut -f 3 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_ipaB_presence_absence.txt
47 | cut -f 4 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_notes.txt
48 |
49 | # if shigatyper notes field (really the txt file) is EMPTY, write string saying it is empty to float to Terra table
50 | if [ "$(cat shigatyper_notes.txt)" == "" ]; then
51 | echo "ShigaTyper notes field was empty" > shigatyper_notes.txt
52 | fi
53 |
54 | >>>
55 | output {
56 | String shigatyper_predicted_serotype = read_string("shigatyper_prediction.txt")
57 | String shigatyper_ipaB_presence_absence = read_string("shigatyper_ipaB_presence_absence.txt")
58 | String shigatyper_notes = read_string("shigatyper_notes.txt")
59 | File shigatyper_hits_tsv = "~{samplename}_shigatyper_hits.tsv" # A tab-delimited detailed report file
60 | File shigatyper_summary_tsv = "~{samplename}_shigatyper_summary.tsv" # A tab-delimited summary report file
61 | String shigatyper_version = read_string("VERSION.txt")
62 | String shigatyper_docker = docker
63 | }
64 | runtime {
65 | docker: "~{docker}"
66 | memory: "16 GB"
67 | cpu: cpus
68 | disks: "local-disk " + disk_size + " SSD"
69 | disk: disk_size + " GB"
70 | maxRetries: 3
71 | preemptible: 0
72 | }
73 | }
74 |
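The read-mode logic above keys entirely off whether read2 is supplied, with read1_is_ont adding --ont for Nanopore data; paired Illumina callers therefore just pass both reads. A hypothetical single-end ONT caller sketch (names illustrative):

    version 1.0
    import "../tasks/species_typing/task_shigatyper.wdl" as shigatyper

    workflow shigatyper_example {
      call shigatyper.shigatyper {
        input:
          read1 = "sample1_ont.fastq.gz", # hypothetical reads; no read2, so --SE is used
          samplename = "sample1",
          read1_is_ont = true # appends --ont to the single-end invocation
      }
    }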
--------------------------------------------------------------------------------
/tasks/quality_control/task_fastp.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task fastp {
4 | input {
5 | File read1
6 | File read2
7 | String samplename
8 | String docker = "quay.io/staphb/fastp:0.23.2"
9 | Int disk_size = 100
10 | Int fastp_window_size = 20
11 | Int fastp_quality_trim_score = 30
12 | Int fastp_minlen = 50
13 | # -g enables polyg trimming with default value of 10
14 | String fastp_args = "--detect_adapter_for_pe -g -5 20 -3 20"
15 | Int threads = 4
16 | }
17 | command <<<
18 | # date
19 | date | tee DATE
20 |
21 | fastp \
22 | --in1 ~{read1} --in2 ~{read2} \
23 | --out1 ~{samplename}_1P.fastq.gz --out2 ~{samplename}_2P.fastq.gz \
24 | --unpaired1 ~{samplename}_1U.fastq.gz --unpaired2 ~{samplename}_2U.fastq.gz \
25 | --cut_right --cut_right_window_size ~{fastp_window_size} --cut_right_mean_quality ~{fastp_quality_trim_score} \
26 | --length_required ~{fastp_minlen} \
27 | --thread ~{threads} \
28 | ~{fastp_args} \
29 | --html ~{samplename}_fastp.html --json ~{samplename}_fastp.json
30 | >>>
31 | output {
32 | File read1_trimmed = "~{samplename}_1P.fastq.gz"
33 | File read2_trimmed = "~{samplename}_2P.fastq.gz"
34 | File read1_trimmed_unpaired = "~{samplename}_1U.fastq.gz"
35 | File read2_trimmed_unpaired = "~{samplename}_2U.fastq.gz"
36 | File fastp_stats = "~{samplename}_fastp.html"
37 | String version = "~{docker}"
38 | String pipeline_date = read_string("DATE")
39 | }
40 | runtime {
41 | docker: "quay.io/staphb/fastp:0.23.2"
42 | memory: "8 GB"
43 | cpu: 4
44 | disks: "local-disk " + disk_size + " SSD"
45 | disk: disk_size + " GB"
46 | preemptible: 0
47 | maxRetries: 3
48 | }
49 | }
50 |
51 | task fastp_se {
52 | input {
53 | File read1
54 | String samplename
55 | String docker = "quay.io/staphb/fastp:0.23.2"
56 | Int disk_size = 100
57 | Int fastp_window_size = 20
58 | Int fastp_quality_trim_score = 30
59 | Int fastp_minlen = 50
60 | # -g enables polyg trimming with default value of 10
61 | # --detect_adapter_for_pe argument was removed
62 | String fastp_args = "-g -5 20 -3 20"
63 | Int threads = 4
64 | }
65 | command <<<
66 | # date
67 | date | tee DATE
68 |
69 | fastp \
70 | --in1 ~{read1} \
71 | --out1 ~{samplename}_1P.fastq.gz \
72 | --cut_right --cut_right_window_size ~{fastp_window_size} --cut_right_mean_quality ~{fastp_quality_trim_score} \
73 | --length_required ~{fastp_minlen} \
74 | --thread ~{threads} \
75 | ~{fastp_args} \
76 | --html ~{samplename}_fastp.html --json ~{samplename}_fastp.json
77 | >>>
78 | output {
79 | File read1_trimmed = "~{samplename}_1P.fastq.gz"
80 | File fastp_stats = "~{samplename}_fastp.html"
81 | String version = "~{docker}"
82 | String pipeline_date = read_string("DATE")
83 | }
84 | runtime {
85 | docker: "quay.io/staphb/fastp:0.23.2"
86 | memory: "8 GB"
87 | cpu: 4
88 | disks: "local-disk " + disk_size + " SSD"
89 | disk: disk_size + " GB"
90 | preemptible: 0
91 | maxRetries: 3
92 | }
93 | }
--------------------------------------------------------------------------------
/workflows/ecoli_char.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | workflow ecoli_char {
4 |
5 | input {
6 | String SRR
7 | File contigs
8 | }
9 |
10 | call abricate as abricate {
11 | input:
12 | samplename=SRR,
13 | contigs=contigs,
14 | database="ncbi"
15 | }
16 |
17 | call abricate as abricate_virfinder {
18 | input:
19 | samplename=SRR,
20 | contigs=contigs,
21 | database="ecoli_vf"
22 | }
23 |
24 | call amrfinderplus {
25 | input:
26 | samplename=SRR,
27 | contigs=contigs
28 | }
29 |
30 | call serotypefinder {
31 | input:
32 | samplename=SRR,
33 | contigs=contigs
34 | }
35 |
36 | output {
37 | File abricate_results = abricate.abricate_results
38 | File abricate_virfinder_results = abricate_virfinder.abricate_results
39 | File amrfinderplus_results = amrfinderplus.amrfinder_results
40 | File serotypefinder_results = serotypefinder.serotypefinder_results
41 | }
42 | }
43 |
44 | task abricate {
45 |
46 | input {
47 | File contigs
48 | String samplename
49 | String database
50 | }
51 |
52 | command {
53 | abricate --version | head -1 | tee VERSION
54 | abricate --db ${database} ${contigs} > ${samplename + '_abricate.tsv'}
55 | }
56 |
57 | output {
58 | File abricate_results="${samplename + '_abricate.tsv'}"
59 | }
60 |
61 | runtime {
62 | docker: "quay.io/staphb/abricate:1.0.0"
63 | memory: "8 GB"
64 | cpu: 2
65 | disks: "local-disk 100 SSD"
66 | preemptible: 0
67 | }
68 | }
69 |
70 | task amrfinderplus {
71 | input {
72 | File contigs
73 | String samplename
74 | }
75 |
76 | command {
77 | amrfinder --version | head -1 | tee VERSION
78 | amrfinder \
79 | --nucleotide ${contigs} \
80 | -o ${samplename + '_amrfinder.tsv'}
81 | }
82 |
83 | output {
84 | File amrfinder_results="${samplename + '_amrfinder.tsv'}"
85 | }
86 |
87 | runtime {
88 | docker: "quay.io/staphb/ncbi-amrfinderplus:3.8.28"
89 | memory: "8 GB"
90 | cpu: 2
91 | disks: "local-disk 100 SSD"
92 | preemptible: 0
93 | }
94 | }
95 |
96 | task serotypefinder {
97 |
98 | input {
99 | File contigs
100 | String samplename
101 | }
102 |
103 | command {
104 | serotypefinder.pl --version | head -1 | tee VERSION
105 | serotypefinder.pl \
106 | -i ${contigs} \
107 | -d /serotypefinder/database \
108 | -b /blast-2.2.26 \
109 | -s ecoli \
110 | -k 85.00 \
111 | -l 0.60 \
112 | -o ${samplename}
113 | }
114 |
115 | output {
116 | File serotypefinder_results="${samplename}/results_table.txt"
117 | }
118 |
119 | runtime {
120 | docker: "quay.io/staphb/serotypefinder:1.1"
121 | memory: "8 GB"
122 | cpu: 2
123 | disks: "local-disk 100 SSD"
124 | preemptible: 0
125 | }
126 | }
127 |
--------------------------------------------------------------------------------
/tasks/phylogenetic_inference/task_snp_dists.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task snp_dists {
4 | input {
5 | File alignment
6 | String cluster_name
7 | Int disk_size = 100
8 | }
9 | command <<<
10 | # date and version control
11 | date | tee DATE
12 | snp-dists -v | tee VERSION
13 |
14 | # create snp-dists matrix file
15 | snp-dists ~{alignment} > ~{cluster_name}_snp_distance_matrix.tsv
16 | >>>
17 | output {
18 | String date = read_string("DATE")
19 | String version = read_string("VERSION")
20 | File snp_matrix = "~{cluster_name}_snp_distance_matrix.tsv"
21 | }
22 | runtime {
23 | docker: "quay.io/staphb/snp-dists:0.8.2"
24 | memory: "2 GB"
25 | cpu: 2
26 | disks: "local-disk " + disk_size + " SSD"
27 | disk: disk_size + " GB"
28 | maxRetries: 3
29 | preemptible: 0
30 | }
31 | }
32 |
33 | task reorder_matrix {
34 | input {
35 | File input_tree
36 | File matrix
37 | String cluster_name
38 | Int disk_size = 100
39 | }
40 | command <<<
41 | # removing any "_contigs" suffixes from the tree and matrix
42 | sed 's/_contigs//g' ~{input_tree} > temporary_tree.nwk
43 | sed 's/_contigs//g' ~{matrix} > temporary_matrix.tsv
44 |
45 | python3 <>>
83 | output {
84 | File ordered_matrix = "~{cluster_name}_snp_matrix.csv"
85 | File tree = "~{cluster_name}_tree.nwk"
86 | }
87 | runtime {
88 | docker: "staphb/mykrobe:0.12.1" # used because it contains both biopython and pandas
89 | memory: "2 GB"
90 | cpu: 2
91 | disks: "local-disk " + disk_size + " SSD"
92 | disk: disk_size + " GB"
93 | # maxRetries: 3
94 | preemptible: 0
95 | }
96 | }
97 |
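The python heredoc inside `reorder_matrix` was elided from this dump. Based on the task's inputs, its declared outputs, and the biopython/pandas runtime comment, here is a hedged sketch of the likely logic (a reconstruction, not the original code):

```python
import pandas as pd
from Bio import Phylo

# Hedged reconstruction, not the original heredoc: read the cleaned tree,
# take its tip order, reorder the SNP matrix rows/columns to match, and
# write the renamed outputs the task declares.
cluster_name = "cluster"  # stand-in for the WDL ~{cluster_name} value

tree = Phylo.read("temporary_tree.nwk", "newick")
tip_order = [tip.name for tip in tree.get_terminals()]

matrix = pd.read_csv("temporary_matrix.tsv", sep="\t", index_col=0)
ordered = matrix.reindex(index=tip_order, columns=tip_order)

ordered.to_csv(f"{cluster_name}_snp_matrix.csv")
Phylo.write(tree, f"{cluster_name}_tree.nwk", "newick")
```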
--------------------------------------------------------------------------------
/tasks/quality_control/task_fastq_scan.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task fastq_scan_pe {
4 | input {
5 | File read1
6 | File read2
7 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq")
8 | String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq")
9 | Int disk_size = 100
10 | }
11 | command <<<
12 | # capture date and version
13 | date | tee DATE
14 | fastq-scan -v | tee VERSION
15 |
16 | # set cat command based on compression
17 | if [[ "~{read1}" == *".gz" ]] ; then
18 | cat_reads="zcat"
19 | else
20 | cat_reads="cat"
21 | fi
22 |
23 | # capture forward read stats
24 | eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS)
25 | read1_seqs=$(cat READ1_SEQS)
26 | eval "${cat_reads} ~{read2}" | fastq-scan | tee ~{read2_name}_fastq-scan.json >(jq .qc_stats.read_total > READ2_SEQS)
27 | read2_seqs=$(cat READ2_SEQS)
28 |
29 | # capture number of read pairs
30 | if [ "${read1_seqs}" == "${read2_seqs}" ]; then
31 | read_pairs=${read1_seqs}
32 | else
33 | read_pairs="Uneven pairs: R1=${read1_seqs}, R2=${read2_seqs}"
34 | fi
35 |
36 | echo $read_pairs | tee READ_PAIRS
37 | >>>
38 | output {
39 | File read1_fastq_scan_report = "~{read1_name}_fastq-scan.json"
40 | File read2_fastq_scan_report = "~{read2_name}_fastq-scan.json"
41 | Int read1_seq = read_int("READ1_SEQS")
42 | Int read2_seq = read_int("READ2_SEQS")
43 | String read_pairs = read_string("READ_PAIRS")
44 | String version = read_string("VERSION")
45 | String pipeline_date = read_string("DATE")
46 | }
47 | runtime {
48 | docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1"
49 | memory: "2 GB"
50 | cpu: 2
51 | disks: "local-disk " + disk_size + " SSD"
52 | disk: disk_size + " GB"
53 | preemptible: 0
54 | maxRetries: 3
55 | }
56 | }
57 |
58 | task fastq_scan_se {
59 | input {
60 | File read1
61 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq")
62 | Int disk_size = 100
63 | }
64 | command <<<
65 | # capture date and version
66 | date | tee DATE
67 | fastq-scan -v | tee VERSION
68 |
69 | # set cat command based on compression
70 | if [[ "~{read1}" == *".gz" ]] ; then
71 | cat_reads="zcat"
72 | else
73 | cat_reads="cat"
74 | fi
75 |
76 | # capture forward read stats
77 | eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS)
78 | >>>
79 | output {
80 | File fastq_scan_report = "~{read1_name}_fastq-scan.json"
81 | Int read1_seq = read_int("READ1_SEQS")
82 | String version = read_string("VERSION")
83 | String pipeline_date = read_string("DATE")
84 | }
85 | runtime {
86 | docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1"
87 | memory: "2 GB"
88 | cpu: 2
89 | disks: "local-disk " + disk_size + " SSD"
90 | disk: disk_size + " GB"
91 | preemptible: 0
92 | maxRetries: 3
93 | }
94 | }
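The jq calls above read `qc_stats.read_total` out of the fastq-scan JSON; the same check in Python, with stand-in filenames:

```python
import json

# Pull qc_stats.read_total from each fastq-scan report and compare the
# forward/reverse counts, mirroring the READ_PAIRS logic above.
def read_total(report_path):
    with open(report_path) as handle:
        return json.load(handle)["qc_stats"]["read_total"]

r1 = read_total("sample_R1_fastq-scan.json")  # stand-in filenames
r2 = read_total("sample_R2_fastq-scan.json")
print(r1 if r1 == r2 else f"Uneven pairs: R1={r1}, R2={r2}")
```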
--------------------------------------------------------------------------------
/tasks/quality_control/task_fastqc.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task fastqc_pe {
4 | input {
5 | File read1
6 | File read2
7 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq")
8 | String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq")
9 | Int? cpus = 2
10 | String docker="quay.io/staphb/fastqc:0.11.9"
11 | Int disk_size = 100
12 | }
13 | command <<<
14 | # capture date and version
15 | date | tee DATE
16 | fastqc --version | grep FastQC | tee VERSION
17 |
18 | fastqc --outdir $PWD --threads ~{cpus} ~{read1} ~{read2}
19 |
20 | unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS
21 | unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ2_SEQS
22 |
23 | READ1_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 )
24 | READ2_SEQS=$(unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 )
25 |
26 | if [ "$READ1_SEQS" == "$READ2_SEQS" ]; then
27 | read_pairs=$READ1_SEQS
28 | else
29 | read_pairs="Uneven pairs: R1=$READ1_SEQS, R2=$READ2_SEQS"
30 | fi
31 | echo $read_pairs | tee READ_PAIRS
32 | >>>
33 | output {
34 | File fastqc1_html = "~{read1_name}_fastqc.html"
35 | File fastqc1_zip = "~{read1_name}_fastqc.zip"
36 | File fastqc2_html = "~{read2_name}_fastqc.html"
37 | File fastqc2_zip = "~{read2_name}_fastqc.zip"
38 | Int read1_seq = read_int("READ1_SEQS")
39 | Int read2_seq = read_int("READ2_SEQS")
40 | String read_pairs = read_string("READ_PAIRS")
41 | String version = read_string("VERSION")
42 | String pipeline_date = read_string("DATE")
43 | }
44 | runtime {
45 | docker: "~{docker}"
46 | memory: "4 GB"
47 | cpu: 2
48 | disks: "local-disk " + disk_size + " SSD"
49 | disk: disk_size + " GB"
50 | maxRetries: 3
51 | preemptible: 0
52 | }
53 | }
54 |
55 | task fastqc_se {
56 | input {
57 | File read1
58 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq")
59 | Int? cpus = 2
60 | String docker="quay.io/staphb/fastqc:0.11.9"
61 | Int disk_size = 100
62 | }
63 | command <<<
64 | # capture date and version
65 | date | tee DATE
66 | fastqc --version | grep FastQC | tee VERSION
67 |
68 | fastqc --outdir $PWD --threads ~{cpus} ~{read1}
69 |
70 | unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS
71 |
72 | READ_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 )
73 | >>>
74 | output {
75 | File fastqc_html = "~{read1_name}_fastqc.html"
76 | File fastqc_zip = "~{read1_name}_fastqc.zip"
77 | Int number_reads = read_int("READ1_SEQS")
78 | String version = read_string("VERSION")
79 | String pipeline_date = read_string("DATE")
80 | }
81 | runtime {
82 | docker: "~{docker}"
83 | memory: "4 GB"
84 | cpu: 2
85 | disks: "local-disk " + disk_size + " SSD"
86 | disk: disk_size + " GB"
87 | maxRetries: 3
88 | preemptible: 0
89 | }
90 | }
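The unzip/grep pipeline above pulls "Total Sequences" out of `fastqc_data.txt`; an equivalent sketch using Python's zipfile (the filename is a stand-in):

```python
import zipfile

# Read "Total Sequences" from the fastqc_data.txt inside a FastQC zip,
# mirroring the unzip -p | grep | cut pipeline above.
def total_sequences(fastqc_zip):
    with zipfile.ZipFile(fastqc_zip) as archive:
        data_file = next(name for name in archive.namelist()
                         if name.endswith("fastqc_data.txt"))
        for line in archive.read(data_file).decode().splitlines():
            if line.startswith("Total Sequences"):
                return int(line.split("\t")[1])
    raise ValueError(f"'Total Sequences' not found in {fastqc_zip}")

print(total_sequences("sample_R1_fastqc.zip"))  # stand-in filename
```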
--------------------------------------------------------------------------------
/tasks/species_typing/task_ngmaster.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task ngmaster {
4 | meta {
5 | description: "Multi-antigen sequence typing for Neisseria gonorrhoeae"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "staphb/ngmaster:1.0.0"
11 | Int disk_size = 100
12 | Int cpu = 2
13 | }
14 | command <<<
15 | ngmaster --version 2>&1 | sed 's/^.*ngmaster //' | tee VERSION
16 |
17 | # run ngmaster on input assembly
18 | # unfortunately ngmaster 1.0.0 fails when either the mincov or minid flag is supplied (observed with both bioconda and manual installs)
19 | # so we're forced to stick with default minid of 90 and mincov of 10. https://github.com/MDU-PHL/ngmaster/issues/39
20 | # ngmaster --comments also does not work
21 | ngmaster \
22 | ~{assembly} \
23 | > ~{samplename}.ngmaster.tsv
24 |
25 | # parse output TSV
26 | # first one is tricky since MLSTs are in the 3rd column, separated by a /
27 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $3}' | cut -d '/' -f 1 | tee NGMAST_SEQUENCE_TYPE
28 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $3}' | cut -d '/' -f 2 | tee NGSTAR_SEQUENCE_TYPE
29 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $4}' | tee NGMAST_PORB
30 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $5}' | tee NGMAST_TBPB
31 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $6}' | tee NGSTAR_PENA
32 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $7}' | tee NGSTAR_MTRR
33 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $8}' | tee NGSTAR_PORB
34 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $9}' | tee NGSTAR_PONA
35 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $10}' | tee NGSTAR_GYRA
36 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $11}' | tee NGSTAR_PARC
37 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $12}' | tee NGSTAR_23S
38 |
39 | >>>
40 | output {
41 | File ngmaster_tsv = "~{samplename}.ngmaster.tsv"
42 | String ngmaster_version = read_string("VERSION")
43 | # NG-MAST scheme's MLST and alleles (only 2 loci)
44 | String ngmaster_ngmast_sequence_type = read_string("NGMAST_SEQUENCE_TYPE")
45 | String ngmaster_ngmast_porB_allele = read_string("NGMAST_PORB")
46 | String ngmaster_ngmast_tbpB_allele = read_string("NGMAST_TBPB")
47 | # NG-STAR scheme's MLST and alleles (7 loci)
48 | String ngmaster_ngstar_sequence_type = read_string("NGSTAR_SEQUENCE_TYPE")
49 | String ngmaster_ngstar_penA_allele = read_string("NGSTAR_PENA")
50 | String ngmaster_ngstar_mtrR_allele = read_string("NGSTAR_MTRR")
51 | String ngmaster_ngstar_porB_allele = read_string("NGSTAR_PORB")
52 | String ngmaster_ngstar_ponA_allele = read_string("NGSTAR_PONA")
53 | String ngmaster_ngstar_gyrA_allele = read_string("NGSTAR_GYRA")
54 | String ngmaster_ngstar_parC_allele = read_string("NGSTAR_PARC")
55 | String ngmaster_ngstar_23S_allele = read_string("NGSTAR_23S")
56 | }
57 | runtime {
58 | docker: "~{docker}"
59 | memory: "8 GB"
60 | cpu: cpu
61 | disks: "local-disk " + disk_size + " SSD"
62 | disk: disk_size + " GB"
63 | maxRetries: 3
64 | preemptible: 0
65 | }
66 | }
67 |
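A sketch of the same TSV parsing in Python. The column layout (both sequence types packed into the third column, separated by "/") is taken from the shell parsing above, not from ngmaster's docs:

```python
import csv

# Take the last data row of the ngmaster TSV (as tail -n 1 does above) and
# split the NG-MAST/NG-STAR sequence types packed into the third column.
with open("sample.ngmaster.tsv") as handle:  # stand-in filename
    row = list(csv.reader(handle, delimiter="\t"))[-1]

ngmast_st, ngstar_st = row[2].split("/")
print(f"NG-MAST ST: {ngmast_st}, NG-STAR ST: {ngstar_st}")
print(f"NG-MAST porB/tbpB alleles: {row[3]}/{row[4]}")
```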
--------------------------------------------------------------------------------
/tasks/species_typing/task_sonneityping.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task sonneityping {
4 | # Inputs
5 | input {
6 | File read1
7 | File? read2
8 | Boolean ont_data = false
9 | String samplename
10 | String docker = "staphb/mykrobe:0.12.1"
11 | Int disk_size = 100
12 | String? mykrobe_opts
13 | Int cpu = 4
14 | }
15 | command <<<
16 | # Print and save versions
17 | mykrobe --version | sed 's|mykrobe v||g' | tee MYKROBE_VERSION.txt
18 | # opting to skip capturing the sonneityping version since there is no --version flag or easy way to determine version
19 | # navigate here for docker image and version information: https://github.com/StaPH-B/docker-builds/tree/master/mykrobe
20 |
21 | # Run Mykrobe on the input read data
22 | mykrobe predict \
23 | -t ~{cpu} \
24 | --sample ~{samplename} \
25 | --species sonnei \
26 | --format json_and_csv \
27 | --out ~{samplename}.mykrobe \
28 | ~{true='--ont' false='' ont_data} \
29 | --seq ~{read1} ~{read2} \
30 | ~{mykrobe_opts}
31 |
32 | # use sonneityping script to produce final TSV; alleles.txt is required input for human-readable genotype names
33 | python /sonneityping/parse_mykrobe_predict.py \
34 | --jsons ~{samplename}.mykrobe.json --alleles /sonneityping/alleles.txt \
35 | --prefix ~{samplename}.sonneityping
36 |
37 | # rename output TSV to something prettier
38 | mv -v ~{samplename}.sonneityping_predictResults.tsv ~{samplename}.sonneityping.tsv
39 |
40 | # Run a python block to parse output sonneityping TSV file for terra data tables
41 | python3 <>>
60 | output {
61 | File sonneityping_mykrobe_report_csv = "~{samplename}.mykrobe.csv"
62 | File sonneityping_mykrobe_report_json = "~{samplename}.mykrobe.json"
63 | File sonneityping_final_report_tsv = "~{samplename}.sonneityping.tsv"
64 | String sonneityping_mykrobe_version = read_string("MYKROBE_VERSION.txt")
65 | String sonneityping_mykrobe_docker = docker
66 | String sonneityping_species = read_string("SPECIES.txt")
67 | String sonneityping_final_genotype = read_string("FINAL_GENOTYPE.txt")
68 | String sonneityping_genotype_confidence = read_string("CONFIDENCE.txt")
69 | String sonneityping_genotype_name = read_string("GENOTYPE_NAME.txt")
70 | }
71 | runtime {
72 | docker: "~{docker}"
73 | memory: "8 GB"
74 | cpu: cpu
75 | disks: "local-disk " + disk_size + " SSD"
76 | disk: disk_size + " GB"
77 | maxRetries: 3
78 | preemptible: 0
79 | }
80 | }
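The python heredoc that fills SPECIES.txt, FINAL_GENOTYPE.txt, CONFIDENCE.txt, and GENOTYPE_NAME.txt was elided from this dump. A hedged sketch of the likely logic; the TSV column names below are assumptions, not confirmed against sonneityping's output format:

```python
import csv

# Hedged reconstruction, not the original heredoc: copy selected fields
# from the sonneityping TSV into the one-value-per-file outputs the task
# declares. Column names here are assumed.
with open("sample.sonneityping.tsv") as handle:  # stand-in filename
    row = next(csv.DictReader(handle, delimiter="\t"))

for column, outfile in [("species", "SPECIES.txt"),
                        ("final_genotype", "FINAL_GENOTYPE.txt"),
                        ("confidence", "CONFIDENCE.txt"),
                        ("name", "GENOTYPE_NAME.txt")]:
    with open(outfile, "w") as out:
        out.write(row.get(column, "") + "\n")
```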
--------------------------------------------------------------------------------
/tasks/species_typing/task_ts_mlst.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task ts_mlst {
4 | meta {
5 | description: "Torsten Seeman's (TS) automatic MLST calling from assembled contigs"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "staphb/mlst:2.23.0"
11 | Int disk_size = 100
12 | Int cpu = 4
13 | # Parameters
14 | # --nopath Strip filename paths from FILE column (default OFF)
15 | # --scheme [X] Don't autodetect, force this scheme on all inputs (default '')
16 | # --minid [n.n] DNA %identity of full allele to consider 'similar' [~] (default '95')
17 | # --mincov [n.n] DNA %cov to report partial allele at all [?] (default '10')
18 | # --minscore [n.n] Minimum score out of 100 to match a scheme (when auto --scheme) (default '50')
19 | Boolean nopath = true
20 | String? scheme
21 | Float? minid
22 | Float? mincov
23 | Float? minscore
24 | }
25 | command <<<
26 | echo $(mlst --version 2>&1) | sed 's/mlst //' | tee VERSION
27 |
28 | #create output header
29 | echo -e "Filename\tPubMLST_Scheme_name\tSequence_Type_(ST)\tAllele_IDs" > ~{samplename}_ts_mlst.tsv
30 |
31 | mlst \
32 | --threads ~{cpu} \
33 | ~{true="--nopath" false="" nopath} \
34 | ~{'--scheme ' + scheme} \
35 | ~{'--minid ' + minid} \
36 | ~{'--mincov ' + mincov} \
37 | ~{'--minscore ' + minscore} \
38 | --novel ~{samplename}_novel_mlst_alleles.fasta \
39 | ~{assembly} \
40 | >> ~{samplename}_ts_mlst.tsv
41 |
42 | # parse ts mlst tsv for relevant outputs
43 | # if output TSV only contains one line (header line); no ST predicted
44 | if [ $(wc -l ~{samplename}_ts_mlst.tsv | awk '{ print $1 }') -eq 1 ]; then
45 | predicted_mlst="No ST predicted"
46 | pubmlst_scheme="NA"
47 | # else, TSV has more than one line, so parse outputs
48 | else
49 | pubmlst_scheme="$(cut -f2 ~{samplename}_ts_mlst.tsv | tail -n 1)"
50 | predicted_mlst="ST$(cut -f3 ~{samplename}_ts_mlst.tsv | tail -n 1)"
51 | # allelic_profile: take second line of output TSV; cut to take 4th column and beyond; replace tabs with commas
52 | allelic_profile="$(cut -f 4- ~{samplename}_ts_mlst.tsv | tail -n 1 | sed -e 's|\t|,|g')"
53 | if [ "$pubmlst_scheme" == "-" ]; then
54 | predicted_mlst="No ST predicted"
55 | pubmlst_scheme="NA"
56 | else
57 | if [ "$predicted_mlst" == "ST-" ]; then
58 | predicted_mlst="No ST predicted"
59 | fi
60 | fi
61 | fi
62 |
63 | echo "$predicted_mlst" | tee PREDICTED_MLST
64 | echo "$pubmlst_scheme" | tee PUBMLST_SCHEME
65 | echo "$allelic_profile" | tee ALLELIC_PROFILE.txt
66 | >>>
67 | output {
68 | File ts_mlst_results = "~{samplename}_ts_mlst.tsv"
69 | String ts_mlst_predicted_st = read_string("PREDICTED_MLST")
70 | String ts_mlst_pubmlst_scheme = read_string("PUBMLST_SCHEME")
71 | String ts_mlst_allelic_profile = read_string("ALLELIC_PROFILE.txt")
72 | File? ts_mlst_novel_alleles = "~{samplename}_novel_mlst_alleles.fasta"
73 | String ts_mlst_version = read_string("VERSION")
74 | }
75 | runtime {
76 | docker: "~{docker}"
77 | memory: "8 GB"
78 | cpu: cpu
79 | disks: "local-disk " + disk_size + " SSD"
80 | disk: disk_size + " GB"
81 | maxRetries: 3
82 | preemptible: 0
83 | }
84 | }
85 |
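The shell parsing above in Python form, for reference; the column positions (scheme in column 2, ST in column 3, alleles from column 4 on) come from the task's own header line:

```python
import csv

# Mirror of the shell logic: a header-only TSV means no ST was predicted;
# otherwise pull scheme/ST/alleles from the last row, normalizing "-".
with open("sample_ts_mlst.tsv") as handle:  # stand-in filename
    rows = list(csv.reader(handle, delimiter="\t"))

if len(rows) == 1:
    scheme, predicted_st, profile = "NA", "No ST predicted", "NA"
else:
    scheme, st = rows[-1][1], rows[-1][2]
    profile = ",".join(rows[-1][3:])
    predicted_st = "No ST predicted" if scheme == "-" or st == "-" else f"ST{st}"
    scheme = "NA" if scheme == "-" else scheme

print(scheme, predicted_st, profile)
```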
--------------------------------------------------------------------------------
/tasks/taxon_id/task_midas.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task midas {
4 | input {
5 | File read1
6 | File? read2
7 | File midas_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz"
8 | Int disk_size = 100
9 | String samplename
10 | String docker = "quay.io/fhcrc-microbiome/midas:v1.3.2--6"
11 | Int? memory = 32
12 | Int? cpu = 4
13 | }
14 | command <<<
15 | date | tee DATE
16 |
17 | # Decompress the Midas database
18 | mkdir db
19 | tar -C ./db/ -xzvf ~{midas_db}
20 |
21 | # Run Midas
22 | run_midas.py species ~{samplename} -1 ~{read1} ~{'-2 ' + read2} -d db/midas_db_v1.2/ -t ~{cpu}
23 |
24 | # rename output files
25 | mv ~{samplename}/species/species_profile.txt ~{samplename}/species/~{samplename}_species_profile.tsv
26 | mv ~{samplename}/species/log.txt ~{samplename}/species/~{samplename}_log.txt
27 |
28 | # Run a python block to parse output file for terra data tables
29 | # pandas is available in default docker image for python2 but not python3
30 | python2 <>>
64 | output {
65 | String midas_docker = docker
66 | String midas_analysis_date = read_string("DATE")
67 | File midas_report = "~{samplename}/species/~{samplename}_species_profile.tsv"
68 | File midas_log = "~{samplename}/species/~{samplename}_log.txt"
69 | String midas_primary_genus = read_string("PRIMARY_GENUS")
70 | String midas_secondary_genus = read_string("SECONDARY_GENUS")
71 | Float midas_secondary_genus_abundance = read_float("SECONDARY_GENUS_ABUNDANCE")
72 | }
73 | runtime {
74 | docker: "~{docker}"
75 | memory: "~{memory} GB"
76 | cpu: cpu
77 | disks: "local-disk " + disk_size + " SSD"
78 | disk: disk_size + " GB"
79 | maxRetries: 3
80 | preemptible: 0
81 | }
82 | }
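The python2 heredoc that fills PRIMARY_GENUS, SECONDARY_GENUS, and SECONDARY_GENUS_ABUNDANCE was elided from this dump. A hedged sketch of the likely logic; the species profile column names and the Genus_species_id naming are assumptions about MIDAS v1.2 output:

```python
import pandas as pd

# Hedged reconstruction, not the original heredoc: sum relative abundance
# per genus and take the top two. Column names are assumed.
profile = pd.read_csv("sample_species_profile.tsv", sep="\t")  # stand-in
profile["genus"] = profile["species_id"].str.split("_").str[0]

by_genus = (profile.groupby("genus")["relative_abundance"]
                   .sum()
                   .sort_values(ascending=False))

primary = by_genus.index[0]
secondary = by_genus.index[1] if len(by_genus) > 1 else "None"
secondary_abundance = by_genus.iloc[1] if len(by_genus) > 1 else 0.0
print(primary, secondary, secondary_abundance)
```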
--------------------------------------------------------------------------------
/.github/workflows/pytest-workflows.yml:
--------------------------------------------------------------------------------
1 | #
2 | # This workflow will run on Pushes and Pull Requests against the main branch. It
3 | # will run pytest with MiniWDL and Cromwell for any workflows with a change to
4 | # them or associated tasks.
5 | #
6 | name: Pytest Workflows
7 | on:
8 | push:
9 | branches: [main]
10 | pull_request:
11 | branches: [main]
12 |
13 | jobs:
14 | changes:
15 | name: Check for changes
16 | runs-on: ubuntu-latest
17 | outputs:
18 | # Expose workflows with changes
19 | workflows: ${{ steps.filter.outputs.changes }}
20 | steps:
21 | # Checkout the repo
22 | - uses: actions/checkout@v3
23 |
24 | # Select workflows with changes
25 | - uses: dorny/paths-filter@v2
26 | id: filter
27 | with:
28 | filters: "tests/config/pytest_filter.yml"
29 |
30 | check:
31 | runs-on: ubuntu-20.04
32 | name: ${{ matrix.tag }} ${{ matrix.engine }}
33 | needs: changes
34 | if: ${{ needs.changes.outputs.workflows != '[]' && needs.changes.outputs.workflows != '' }}
35 | strategy:
36 | fail-fast: false
37 | matrix:
38 | # For every workflow, test it with MiniWDL and Cromwell
39 | tag: ["${{ fromJson(needs.changes.outputs.workflows) }}"]
40 | engine: ["miniwdl", "cromwell"]
41 | defaults:
42 | run:
43 | # Play nicely with miniconda
44 | shell: bash -l {0}
45 | steps:
46 | # Checkout the repo
47 | - name: Checkout theiagen/public_health_bacterial_genomics
48 | uses: actions/checkout@v3
49 |
50 | # Import test data
51 | - name: Pull Test Data from bactopia/bactopia-tests
52 | uses: actions/checkout@v3
53 | with:
54 | repository: bactopia/bactopia-tests
55 | path: bactopia-tests
56 |
57 | # Setup Miniconda3
58 | - name: Setup miniconda
59 | uses: conda-incubator/setup-miniconda@v2
60 | with:
61 | activate-environment: actions
62 | auto-activate-base: false
63 |
64 | # Depends and env info (mostly for debug)
65 | - name: Install Dependencies
66 | run: |
67 | conda install -y -c conda-forge -c bioconda cromwell miniwdl=1.5.2 'python>=3.7' pytest pytest-workflow 'importlib-metadata<=4.13.0'
68 | uname -a && env
69 |
70 | - name: Test ${{ matrix.tag }}
71 | run: TMPDIR=~ pytest --tag ${{ matrix.tag }}_${{ matrix.engine }} --symlink --kwdof --color=yes
72 |
73 | - name: Upload logs on failure
74 | if: failure()
75 | uses: actions/upload-artifact@v3
76 | with:
77 | name: logs-${{ matrix.engine }}
78 | path: |
79 | /home/runner/pytest_workflow_*/**/stdout*
80 | /home/runner/pytest_workflow_*/**/stderr*
81 | /home/runner/pytest_workflow_*/**/script*
82 | /home/runner/pytest_workflow_*/**/rc
83 | /home/runner/pytest_workflow_*/**/command
84 | /home/runner/pytest_workflow_*/**/*.txt
85 | /home/runner/pytest_workflow_*/**/*.log
86 | /home/runner/pytest_workflow_*/**/*.out
87 | /home/runner/pytest_workflow_*/**/*.err
88 | /home/runner/pytest_workflow_*/**/DATE
89 | /home/runner/pytest_workflow_*/**/VERSION
90 | !/home/runner/pytest_workflow_*/**/*.bam*
91 | !/home/runner/pytest_workflow_*/**/*.fastq.gz
92 |
--------------------------------------------------------------------------------
/workflows/wf_read_QC_trim.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic
4 | import "../tasks/quality_control/task_fastp.wdl" as fastp
5 | import "../tasks/quality_control/task_bbduk.wdl" as bbduk
6 | import "../tasks/quality_control/task_fastq_scan.wdl" as fastq_scan
7 | import "../tasks/taxon_id/task_midas.wdl" as midas
8 |
9 | workflow read_QC_trim {
10 | meta {
11 | description: "Runs basic QC (fastq_scan), trimming (Trimmomatic), and adapter removal (bbduk) on illumina PE reads"
12 | }
13 |
14 | input {
15 | String samplename
16 | File read1_raw
17 | File read2_raw
18 | Int trim_window_size = 10
19 | Int trim_quality_trim_score = 20
20 | Int trim_minlen = 75
21 | Int bbduk_mem = 8
22 | Boolean call_midas = false
23 | File? midas_db
24 | String read_processing = "trimmomatic"
25 | String fastp_args = "--detect_adapter_for_pe -g -5 20 -3 20"
26 | }
27 | if (read_processing == "trimmomatic"){
28 | call trimmomatic.trimmomatic_pe {
29 | input:
30 | samplename = samplename,
31 | read1 = read1_raw,
32 | read2 = read2_raw,
33 | trimmomatic_window_size = trim_window_size,
34 | trimmomatic_quality_trim_score = trim_quality_trim_score,
35 | trimmomatic_minlen = trim_minlen
36 | }
37 | }
38 | if (read_processing == "fastp"){
39 | call fastp.fastp {
40 | input:
41 | samplename = samplename,
42 | read1 = read1_raw,
43 | read2 = read2_raw,
44 | fastp_window_size = trim_window_size,
45 | fastp_quality_trim_score = trim_quality_trim_score,
46 | fastp_minlen = trim_minlen,
47 | fastp_args = fastp_args
48 | }
49 | }
50 | call bbduk.bbduk_pe {
51 | input:
52 | samplename = samplename,
53 | read1_trimmed = select_first([trimmomatic_pe.read1_trimmed,fastp.read1_trimmed]),
54 | read2_trimmed = select_first([trimmomatic_pe.read2_trimmed,fastp.read2_trimmed]),
55 | mem_size_gb = bbduk_mem
56 | }
57 | call fastq_scan.fastq_scan_pe as fastq_scan_raw {
58 | input:
59 | read1 = read1_raw,
60 | read2 = read2_raw
61 | }
62 | call fastq_scan.fastq_scan_pe as fastq_scan_clean {
63 | input:
64 | read1 = bbduk_pe.read1_clean,
65 | read2 = bbduk_pe.read2_clean
66 | }
67 | if (call_midas) {
68 | call midas.midas as midas {
69 | input:
70 | samplename = samplename,
71 | read1 = read1_raw,
72 | read2 = read2_raw,
73 | midas_db = midas_db
74 | }
75 | }
76 |
77 | output {
78 | File read1_clean = bbduk_pe.read1_clean
79 | File read2_clean = bbduk_pe.read2_clean
80 | Int fastq_scan_raw1 = fastq_scan_raw.read1_seq
81 | Int fastq_scan_raw2 = fastq_scan_raw.read2_seq
82 | String fastq_scan_raw_pairs = fastq_scan_raw.read_pairs
83 | Int fastq_scan_clean1 = fastq_scan_clean.read1_seq
84 | Int fastq_scan_clean2 = fastq_scan_clean.read2_seq
85 | String fastq_scan_clean_pairs = fastq_scan_clean.read_pairs
86 | String fastq_scan_version = fastq_scan_raw.version
87 | String bbduk_docker = bbduk_pe.bbduk_docker
88 | String? trimmomatic_version = trimmomatic_pe.version
89 | String? fastp_version = fastp.version
90 | String? midas_docker = midas.midas_docker
91 | File? midas_report = midas.midas_report
92 | String? midas_primary_genus = midas.midas_primary_genus
93 | String? midas_secondary_genus = midas.midas_secondary_genus
94 | Float? midas_secondary_genus_abundance = midas.midas_secondary_genus_abundance
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/tasks/phylogenetic_inference/task_pirate.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task pirate {
4 | input {
5 | Array[File] gff3
6 | String cluster_name
7 | Boolean align = true # align all genes and produce core/pangenome alignments
8 | String steps = "50,60,70,80,90,95,98" # % identity thresholds to use for pangenome construction [default: 50,60,70,80,90,95,98]
9 | String features = "CDS" # features to use for pangenome construction [default: CDS]
10 | Boolean nucl = false # create a pangenome on CDS features using nucleotide identity, default: amino acid identity
11 | String? panopt # additional arguments to pass to pangenome_construction
12 | Int memory = 32
13 | Int cpu = 4
14 | String docker_image = "quay.io/biocontainers/pirate:1.0.5--hdfd78af_0"
15 | Int disk_size = 100
16 | }
17 | command <<<
18 |
19 | # date and version control
20 | date | tee DATE
21 | PIRATE -v | tee VERSION
22 |
23 | # pirate requires the directory containing the gff files as input
24 | mkdir INPUT_DIR
25 | ln -s ~{sep=' ' gff3} INPUT_DIR
26 |
27 | # run pirate on input gff
28 | PIRATE \
29 | --input INPUT_DIR \
30 | --output PIRATE \
31 | ~{'--steps ' + steps} \
32 | ~{'--features ' + features} \
33 | ~{true="--nucl" false="" nucl} \
34 | ~{true="--align" false="" align} \
35 | ~{'--pan-opt ' + panopt} \
36 | ~{'--threads ' + cpu}
37 |
38 | # generate gene_presence_absence.csv
39 | PIRATE_to_roary.pl -i PIRATE/PIRATE.*.tsv -o ~{cluster_name}_gene_presence_absence.csv
40 |
41 | # rename outputs with cluster name
42 | mv PIRATE/PIRATE.pangenome_summary.txt PIRATE/~{cluster_name}_pangenome_summary.txt
43 | mv PIRATE/PIRATE.log PIRATE/~{cluster_name}.log
44 | mv PIRATE/PIRATE.gene_families.ordered.tsv PIRATE/~{cluster_name}_gene_families.ordered.tsv
45 | mv PIRATE/PIRATE.unique_alleles.tsv PIRATE/~{cluster_name}_unique_alleles.tsv
46 | mv PIRATE/binary_presence_absence.fasta PIRATE/~{cluster_name}_binary_presence_absence.fasta
47 | mv PIRATE/binary_presence_absence.nwk PIRATE/~{cluster_name}_binary_presence_absence.nwk
48 | mv PIRATE/pangenome.gfa PIRATE/~{cluster_name}_pangenome.gfa
49 |
50 | if [[ ~{align} == "true" ]]; then
51 | mv PIRATE/pangenome_alignment.fasta PIRATE/~{cluster_name}_pangenome_alignment.fasta
52 | mv PIRATE/pangenome_alignment.gff PIRATE/~{cluster_name}_pangenome_alignment.gff
53 | mv PIRATE/core_alignment.fasta PIRATE/~{cluster_name}_core_alignment.fasta
54 | mv PIRATE/core_alignment.gff PIRATE/~{cluster_name}_core_alignment.gff
55 | fi
56 |
57 | >>>
58 | output {
59 | File pirate_pangenome_summary = "PIRATE/~{cluster_name}_pangenome_summary.txt"
60 | File pirate_gene_families_ordered = "PIRATE/~{cluster_name}_gene_families.ordered.tsv"
61 | File pirate_unique_alleles = "PIRATE/~{cluster_name}_unique_alleles.tsv"
62 | File pirate_binary_fasta = "PIRATE/~{cluster_name}_binary_presence_absence.fasta"
63 | File pirate_binary_tree = "PIRATE/~{cluster_name}_binary_presence_absence.nwk"
64 | File pirate_pangenome_gfa = "PIRATE/~{cluster_name}_pangenome.gfa"
65 | File? pirate_pangenome_alignment_fasta = "PIRATE/~{cluster_name}_pangenome_alignment.fasta"
66 | File? pirate_pangenome_alignment_gff = "PIRATE/~{cluster_name}_pangenome_alignment.gff"
67 | File? pirate_core_alignment_fasta = "PIRATE/~{cluster_name}_core_alignment.fasta"
68 | File? pirate_core_alignment_gff = "PIRATE/~{cluster_name}_core_alignment.gff"
69 | File? pirate_presence_absence_csv = "~{cluster_name}_gene_presence_absence.csv"
70 | String pirate_docker_image = docker_image
71 | }
72 | runtime {
73 | docker: "~{docker_image}"
74 | memory: "~{memory} GB"
75 | cpu: cpu
76 | disks: "local-disk " + disk_size + " SSD"
77 | disk: disk_size + " GB"
78 | maxRetries: 3
79 | preemptible: 0
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/workflows/wf_read_QC_trim_se.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic
4 | import "../tasks/quality_control/task_fastp.wdl" as fastp
5 | import "../tasks/quality_control/task_bbduk.wdl" as bbduk
6 | import "../tasks/quality_control/task_fastq_scan.wdl" as fastq_scan
7 | import "../tasks/taxon_id/task_midas.wdl" as midas
8 |
9 | workflow read_QC_trim {
10 | meta {
11 | description: "Runs basic QC (fastq_scan), trimming (Trimmomatic), and adapter removal (bbduk) on illumina SE reads"
12 | }
13 |
14 | input {
15 | String samplename
16 | File read1_raw
17 | Int trim_window_size = 4
18 | Int trim_quality_trim_score = 30
19 | Int trim_minlen = 25
20 | Int bbduk_mem = 8
21 | Boolean call_midas = false
22 | File? midas_db
23 | String read_processing = "trimmomatic"
24 | String fastp_args = "-g -5 20 -3 20"
25 | }
26 | # call read_clean.ncbi_scrub_se {
27 | # input:
28 | # samplename = samplename,
29 | # read1 = read1_raw
30 | # }
31 | if (read_processing == "trimmomatic"){
32 | call trimmomatic.trimmomatic_se {
33 | input:
34 | samplename = samplename,
35 | read1 = read1_raw,
36 | trimmomatic_window_size = trim_window_size,
37 | trimmomatic_quality_trim_score = trim_quality_trim_score,
38 | trimmomatic_minlen = trim_minlen
39 | }
40 | }
41 | if (read_processing == "fastp"){
42 | call fastp.fastp_se {
43 | input:
44 | samplename = samplename,
45 | read1 = read1_raw,
46 | fastp_window_size = trim_window_size,
47 | fastp_quality_trim_score = trim_quality_trim_score,
48 | fastp_minlen = trim_minlen,
49 | fastp_args = fastp_args
50 | }
51 | }
52 | call bbduk.bbduk_se {
53 | input:
54 | samplename = samplename,
55 | read1_trimmed = select_first([trimmomatic_se.read1_trimmed,fastp_se.read1_trimmed]),
56 | mem_size_gb = bbduk_mem
57 | }
58 | call fastq_scan.fastq_scan_se as fastq_scan_raw {
59 | input:
60 | read1 = read1_raw
61 | }
62 | call fastq_scan.fastq_scan_se as fastq_scan_clean {
63 | input:
64 | read1 = bbduk_se.read1_clean
65 | }
66 | if (call_midas) {
67 | call midas.midas as midas {
68 | input:
69 | samplename = samplename,
70 | read1 = read1_raw,
71 | midas_db = midas_db
72 | }
73 | }
74 | # call taxonID.kraken2 as kraken2_raw {
75 | # input:
76 | # samplename = samplename,
77 | # read1 = bbduk_se.read1_clean
78 | # }
79 | # call taxonID.kraken2 as kraken2_dehosted {
80 | # input:
81 | # samplename = samplename,
82 | # read1 = ncbi_scrub_se.read1_dehosted
83 | # }
84 |
85 | output {
86 | File read1_clean = bbduk_se.read1_clean
87 |
88 | Int fastq_scan_raw_number_reads = fastq_scan_raw.read1_seq
89 | Int fastq_scan_clean_number_reads = fastq_scan_clean.read1_seq
90 |
91 | # String kraken_version = kraken2_raw.version
92 | # Float kraken_human = kraken2_raw.percent_human
93 | # Float kraken_sc2 = kraken2_raw.percent_sc2
94 | # String kraken_report = kraken2_raw.kraken_report
95 | # Float kraken_human_dehosted = kraken2_dehosted.percent_human
96 | # Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2
97 | # String kraken_report_dehosted = kraken2_dehosted.kraken_report
98 |
99 | String fastq_scan_version = fastq_scan_raw.version
100 | String bbduk_docker = bbduk_se.bbduk_docker
101 | String? trimmomatic_version = trimmomatic_se.version
102 | String? fastp_version = fastp_se.version
103 | String? midas_docker = midas.midas_docker
104 | File? midas_report = midas.midas_report
105 | String? midas_primary_genus = midas.midas_primary_genus
106 | String? midas_secondary_genus = midas.midas_secondary_genus
107 | Float? midas_secondary_genus_abundance = midas.midas_secondary_genus_abundance
108 | }
109 | }
--------------------------------------------------------------------------------
/tasks/taxon_id/task_kraken2.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task kraken2_pe {
4 | input {
5 | File read1
6 | File read2
7 | File kraken2_db
8 | String samplename
9 | String docker = "quay.io/staphb/kraken2:2.1.2-no-db"
10 | Int disk_size = 100
11 |
12 | String? kraken2_args = ""
13 | String? classified_out = "classified#.fastq"
14 | String? unclassified_out = "unclassified#.fastq"
15 | Int? memory = 32
16 | Int? cpu = 4
17 | }
18 | command <<<
19 | echo $(kraken2 --version 2>&1) | sed 's/^.*Kraken version //;s/ .*$//' | tee VERSION
20 | date | tee DATE
21 |
22 | # Decompress the Kraken2 database
23 | mkdir db
24 | tar -C ./db/ -xzvf ~{kraken2_db}
25 |
26 | # Run Kraken2
27 | kraken2 \
28 | --db ./db/ \
29 | --threads ~{cpu} \
30 | --report ~{samplename}.report.txt \
31 | --gzip-compressed \
32 | --unclassified-out ~{samplename}.~{unclassified_out} \
33 | --classified-out ~{samplename}.~{classified_out} \
34 | --output ~{samplename}.classifiedreads.txt \
35 | --paired \
36 | ~{kraken2_args} \
37 | ~{read1} ~{read2}
38 |
39 | # Compress and cleanup
40 | gzip *.fastq
41 | gzip ~{samplename}.classifiedreads.txt
42 | >>>
43 | output {
44 | String kraken2_version = read_string("VERSION")
45 | String kraken2_docker = docker
46 | String analysis_date = read_string("DATE")
47 | File kraken2_report = "~{samplename}.report.txt"
48 | File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz"
49 | File kraken2_unclassified_read1 = "~{samplename}.unclassified_1.fastq.gz"
50 | File kraken2_unclassified_read2 = "~{samplename}.unclassified_2.fastq.gz"
51 | File kraken2_classified_read1 = "~{samplename}.classified_1.fastq.gz"
52 | File kraken2_classified_read2 = "~{samplename}.classified_2.fastq.gz"
53 | }
54 | runtime {
55 | docker: "~{docker}"
56 | memory: "~{memory} GB"
57 | cpu: cpu
58 | disks: "local-disk " + disk_size + " SSD"
59 | disk: disk_size + " GB"
60 | maxRetries: 3
61 | preemptible: 0
62 | }
63 | }
64 |
65 | task kraken2_se {
66 | input {
67 | File read1
68 | File kraken2_db
69 | String samplename
70 | String docker = "quay.io/staphb/kraken2:2.1.2-no-db"
71 | Int disk_size = 100
72 |
73 | String? kraken2_args = ""
74 | String? classified_out = "classified.fastq"
75 | String? unclassified_out = "unclassified.fastq"
76 | Int? memory = 32
77 | Int? cpu = 4
78 | }
79 | command <<<
80 | echo $(kraken2 --version 2>&1) | sed 's/^.*Kraken version //;s/ .*$//' | tee VERSION
81 | date | tee DATE
82 |
83 | # Decompress the Kraken2 database
84 | mkdir db
85 | tar -C ./db/ -xzvf ~{kraken2_db}
86 |
87 | # Run Kraken2
88 | kraken2 \
89 | --db ./db/ \
90 | --threads ~{cpu} \
91 | --report ~{samplename}.report.txt \
92 | --gzip-compressed \
93 | --unclassified-out ~{samplename}.~{unclassified_out} \
94 | --classified-out ~{samplename}.~{classified_out} \
95 | --output ~{samplename}.classifiedreads.txt \
96 | ~{kraken2_args} \
97 | ~{read1}
98 |
99 | # Compress and cleanup
100 | gzip *.fastq
101 | gzip ~{samplename}.classifiedreads.txt
102 | >>>
103 | output {
104 | String kraken2_version = read_string("VERSION")
105 | String kraken2_docker = docker
106 | String analysis_date = read_string("DATE")
107 | File kraken2_report = "~{samplename}.report.txt"
108 | File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz"
109 | File kraken2_unclassified_read1 = "~{samplename}.unclassified.fastq.gz"
110 | File kraken2_classified_read1 = "~{samplename}.classified.fastq.gz"
111 | }
112 | runtime {
113 | docker: "~{docker}"
114 | memory: "~{memory} GB"
115 | cpu: cpu
116 | disks: "local-disk " + disk_size + " SSD"
117 | disk: disk_size + " GB"
118 | maxRetries: 3
119 | preemptible: 0
120 | }
121 | }
122 |
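The report these tasks emit follows Kraken2's standard six-column layout (percent, clade reads, direct reads, rank code, taxid, indented name); a small sketch of reading a clade's percentage back out of it:

```python
# Return the percent of reads assigned to a named clade in a Kraken2
# report; returns 0.0 when the clade is absent.
def clade_percent(report_path, clade_name):
    with open(report_path) as handle:
        for line in handle:
            fields = line.rstrip("\n").split("\t")
            if fields[-1].strip() == clade_name:
                return float(fields[0])
    return 0.0

print(clade_percent("sample.report.txt", "Escherichia coli"))  # stand-in
```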
--------------------------------------------------------------------------------
/tasks/species_typing/task_agrvate.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task agrvate {
4 | meta {
5 | description: "Rapid identification of Staphylococcus aureus agr locus type and agr operon variants."
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "quay.io/biocontainers/agrvate:1.0.2--hdfd78af_0"
11 | Int disk_size = 50
12 | Int cpu = 1
13 |
14 | # Parameters
15 | # --typing_only agr typing only. Skips agr operon extraction and frameshift detection
16 | Boolean typing_only = false
17 | }
18 | command <<<
19 | # get version info
20 | agrvate -v 2>&1 | sed 's/agrvate v//;' | tee VERSION
21 |
22 | # run agrvate on assembly; usearch not available in biocontainer, cannot use that option
23 | # using -m flag for mummer frameshift detection since usearch is not available
24 | agrvate \
25 | ~{true="--typing-only" false="" typing_only} \
26 | -i ~{assembly} \
27 | -m
28 |
29 | # agrvate names its output directory and files after the input .fasta basename: an input X.fasta produces
30 | # an X-results/ directory containing X-results/X-summary.tab
31 | basename=$(basename ~{assembly})
32 | # strip off anything after the period
33 | fasta_prefix=${basename%.*}
34 |
35 | # rename outputs summary TSV to include samplename
36 | mv -v "${fasta_prefix}-results/${fasta_prefix}-summary.tab" ~{samplename}.agrvate.tsv
37 |
38 | # parse output summary TSV
39 | cut -f 2 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_GROUP
40 | cut -f 3 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MATCH_SCORE
41 | cut -f 4 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_CANONICAL
42 | cut -f 5 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MULTIPLE
43 | cut -f 6 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_NUM_FRAMESHIFTS
44 |
45 | # edit output string AGR_CANONICAL to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results
46 | if [[ $(cat AGR_CANONICAL) == 1 ]]; then
47 | echo "1. canonical agrD" >AGR_CANONICAL
48 | elif [[ $(cat AGR_CANONICAL) == 0 ]]; then
49 | echo "0. non-canonical agrD" >AGR_CANONICAL
50 | elif [[ $(cat AGR_CANONICAL) == "u" ]]; then
51 | echo "u. unknown agrD" >AGR_CANNONICAL
52 | else
53 | echo "result unrecognized, please see summary agrvate TSV file" >AGR_CANONICAL
54 | fi
55 |
56 | # edit output string AGR_MULTIPLE to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results
57 | if [[ $(cat AGR_MULTIPLE) == "s" ]]; then
58 | echo "s. single agr group found" >AGR_MULTIPLE
59 | elif [[ $(cat AGR_MULTIPLE) == "m" ]]; then
60 | echo "m. multiple agr groups found" >AGR_MULTIPLE
61 | elif [[ $(cat AGR_MULTIPLE) == "u" ]]; then
62 | echo "u. unknown agr groups found" >AGR_MULTIPLE
63 | else
64 | echo "result unrecognized, please see summary agrvate TSV file" >AGR_MULTIPLE
65 | fi
66 |
67 | # if AGR_NUM_FRAMESHIFTS is unknown, edit output string AGR_NUM_FRAMESHIFTS to be more informative, otherwise keep set to a number: https://github.com/VishnuRaghuram94/AgrVATE#results
68 | if [[ $(cat AGR_NUM_FRAMESHIFTS) == "u" ]]; then
69 | echo "u or unknown; agr operon not extracted" >AGR_NUM_FRAMESHIFTS
70 | fi
71 |
72 | # create tarball of all output files
73 | tar -czvf ~{samplename}.agrvate.tar.gz "${fasta_prefix}-results/"
74 | >>>
75 | output {
76 | File agrvate_summary = "~{samplename}.agrvate.tsv"
77 | File agrvate_results = "~{samplename}.agrvate.tar.gz"
78 | String agrvate_agr_group = read_string("AGR_GROUP")
79 | String agrvate_agr_match_score = read_string("AGR_MATCH_SCORE")
80 | String agrvate_agr_canonical = read_string("AGR_CANONICAL")
81 | String agrvate_agr_multiple = read_string("AGR_MULTIPLE")
82 | String agrvate_agr_num_frameshifts = read_string("AGR_NUM_FRAMESHIFTS")
83 | String agrvate_version = read_string("VERSION")
84 | String agrvate_docker = docker
85 | }
86 | runtime {
87 | docker: "~{docker}"
88 | memory: "4 GB"
89 | cpu: cpu
90 | disks: "local-disk " + disk_size + " SSD"
91 | disk: disk_size + " GB"
92 | maxRetries: 3
93 | preemptible: 0
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/tasks/quality_control/task_cg_pipeline.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task cg_pipeline {
4 | input {
5 | File read1
6 | File? read2
7 | String samplename
8 | String docker="quay.io/staphb/lyveset:1.1.4f"
9 | Int disk_size = 100
10 | String cg_pipe_opts="--fast"
11 | Int genome_length
12 | }
13 | command <<<
14 | # date and version control
15 | date | tee DATE
16 |
17 | run_assembly_readMetrics.pl ~{cg_pipe_opts} ~{read1} ~{read2} -e ~{genome_length} > ~{samplename}_readMetrics.tsv
18 |
19 | # repeat for concatenated read file
20 | # run_assembly_readMetrics.pl extension awareness
21 | if [[ "~{read1}" == *".gz" ]] ; then
22 | extension=".gz"
23 | else
24 | extension=""
25 | fi
26 | cat ~{read1} ~{read2} > ~{samplename}_concat.fastq"${extension}"
27 | run_assembly_readMetrics.pl ~{cg_pipe_opts} ~{samplename}_concat.fastq"${extension}" -e ~{genome_length} > ~{samplename}_concat_readMetrics.tsv
28 |
29 | python3 < R2_MEAN_Q
79 | fi
80 | # same for R2_MEAN_LENGTH
81 | if [[ ! -f R2_MEAN_LENGTH ]] ; then
82 | echo "0.0" > R2_MEAN_LENGTH
83 | fi
84 |
85 | >>>
86 | output {
87 | File cg_pipeline_report = "${samplename}_readMetrics.tsv"
88 | String cg_pipeline_docker = docker
89 | String pipeline_date = read_string("DATE")
90 | Float r1_mean_q = read_float("R1_MEAN_Q")
91 | Float r2_mean_q = read_float("R2_MEAN_Q")
92 | Float combined_mean_q = read_float("COMBINED_MEAN_Q")
93 | Float r1_mean_readlength = read_float("R1_MEAN_LENGTH")
94 | Float r2_mean_readlength = read_float("R2_MEAN_LENGTH")
95 | Float combined_mean_readlength = read_float("COMBINED_MEAN_LENGTH")
96 | Float est_coverage = read_float("EST_COVERAGE")
97 | }
98 | runtime {
99 | docker: "~{docker}"
100 | memory: "8 GB"
101 | cpu: 4
102 | disks: "local-disk " + disk_size + " SSD"
103 | disk: disk_size + " GB"
104 | maxRetries: 3
105 | preemptible: 0
106 | }
107 | }
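The python heredoc that splits the readMetrics TSVs into R1/R2/combined quality, length, and coverage files was elided from this dump. A hedged sketch of the likely logic; the avgQuality/avgReadLength/coverage column names are assumptions about run_assembly_readMetrics.pl output:

```python
import csv

# Hedged reconstruction, not the original heredoc: write one value per
# output file, as the task's read_float() outputs expect.
def write_value(path, value):
    with open(path, "w") as out:
        out.write(str(value) + "\n")

def read_metrics(tsv_path):
    with open(tsv_path) as handle:
        return list(csv.DictReader(handle, delimiter="\t"))

rows = read_metrics("sample_readMetrics.tsv")              # one row per read file
combined = read_metrics("sample_concat_readMetrics.tsv")[0]

write_value("R1_MEAN_Q", rows[0]["avgQuality"])
write_value("R1_MEAN_LENGTH", rows[0]["avgReadLength"])
if len(rows) > 1:
    write_value("R2_MEAN_Q", rows[1]["avgQuality"])
    write_value("R2_MEAN_LENGTH", rows[1]["avgReadLength"])
write_value("COMBINED_MEAN_Q", combined["avgQuality"])
write_value("COMBINED_MEAN_LENGTH", combined["avgReadLength"])
write_value("EST_COVERAGE", combined["coverage"])
```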
--------------------------------------------------------------------------------
/workflows/compile_ecoli_results.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | workflow compile_results {
4 |
5 | input {
6 | Array[String] SRR_array
7 | Array[File] serotypefinder_array
8 | Array[File] abricate_array
9 | Array[File] abricate_virfinder_array
10 | Array[File] amrfinder_array
11 | }
12 | call compile_abricate {
13 | input:
14 | array_srr=SRR_array,
15 | array_abr=abricate_array
16 | }
17 |
18 | call compile_abricate as compile_abricate_virfinder {
19 | input:
20 | array_srr=SRR_array,
21 | array_abr=abricate_virfinder_array
22 | }
23 |
24 | call compile_amrfinder {
25 | input:
26 | array_srr=SRR_array,
27 | array_afp=amrfinder_array
28 | }
29 |
30 | call compile_serotypefinder {
31 | input:
32 | array_srr=SRR_array,
33 | array_stf=serotypefinder_array
34 | }
35 |
36 | output {
37 | File compiled_serotypefinder_results=compile_serotypefinder.compiled_results
38 | File compiled_abricate_results=compile_abricate.compiled_results
39 | File compiled_abricate_virfinder_results=compile_abricate_virfinder.compiled_results
40 | File compiled_amrfinderplus_results=compile_amrfinder.compiled_results
41 | }
42 | }
43 |
44 |
45 | task compile_abricate {
46 | input {
47 | Array[String] array_srr
48 | Array[File] array_abr
49 | }
50 |
51 | command <<<
52 | touch results.txt
53 |
54 | srr_array=(~{sep=' ' array_srr})
55 | abr_array=(~{sep=' ' array_abr})
56 | echo "I am here"
57 |
58 | for index in ${!srr_array[@]}; do
59 | SRR=${srr_array[$index]}
60 | file=${abr_array[$index]}
61 | echo "$index"
62 | echo "$SRR"
63 | echo "$file"
64 |
65 | while IFS= read -r result
66 | do
67 | printf "%s %s\n" "$SRR" "$result" >> results.txt
68 | done < <(grep -E 'fasta' "$file")
69 |
70 | done
71 | >>>
72 |
73 | output {
74 | File compiled_results="results.txt"
75 | }
76 |
77 | runtime {
78 | docker: "quay.io/staphb/abricate:1.0.0"
79 | memory: "4 GB"
80 | cpu: 1
81 | disks: "local-disk 100 SSD"
82 | preemptible: 0
83 | }
84 | }
85 |
86 | task compile_amrfinder {
87 | input {
88 | Array[String] array_srr
89 | Array[File] array_afp
90 | }
91 |
92 | command <<<
93 | touch results.txt
94 |
95 | srr_array=(~{sep=' ' array_srr})
96 | afp_array=(~{sep=' ' array_afp})
97 | echo "I am here"
98 |
99 | for index in ${!srr_array[@]}; do
100 | SRR=${srr_array[$index]}
101 | file=${afp_array[$index]}
102 | echo "$index"
103 | echo "$SRR"
104 | echo "$file"
105 |
106 | while IFS= read -r result
107 | do
108 | printf "%s %s\n" "$SRR" "$result" >> results.txt
109 | done < <(grep -E 'contig' "$file")
110 |
111 | done
112 | >>>
113 |
114 | output {
115 | File compiled_results="results.txt"
116 | }
117 |
118 | runtime {
119 | docker: "quay.io/staphb/ncbi-amrfinderplus:3.8.28"
120 | memory: "4 GB"
121 | cpu: 1
122 | disks: "local-disk 100 SSD"
123 | preemptible: 0
124 | }
125 | }
126 |
127 |
128 | task compile_serotypefinder {
129 | input {
130 | Array[String] array_srr
131 | Array[File] array_stf
132 | }
133 |
134 | command <<<
135 | touch results.txt
136 |
137 | srr_array=(~{sep=' ' array_srr})
138 | stf_array=(~{sep=' ' array_stf})
139 | echo "I am here"
140 |
141 | for index in ${!srr_array[@]}; do
142 | SRR=${srr_array[$index]}
143 | file=${stf_array[$index]}
144 | echo "$index"
145 | echo "$SRR"
146 | echo "$file"
147 |
148 | while IFS= read -r result
149 | do
150 | printf "%s %s\n" "$SRR" "$result" >> results.txt
151 | done < <(grep -E 'fliC|wzy|wzx' "$file")
152 |
153 | done
154 | >>>
155 |
156 | output {
157 | File compiled_results="results.txt"
158 | }
159 |
160 | runtime {
161 | docker: "quay.io/staphb/serotypefinder:1.1"
162 | memory: "4 GB"
163 | cpu: 1
164 | disks: "local-disk 100 SSD"
165 | preemptible: 0
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/tasks/utilities/task_summarize_data.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task summarize_data {
4 | input {
5 | Array[String]? sample_names
6 | String? terra_project
7 | String? terra_workspace
8 | String? terra_table
9 | String? column_names # string of comma-delimited column names
10 | String? output_prefix
11 |
12 | Int disk_size = 100
13 | File? input_table
14 | Boolean phandango_coloring = true
15 | }
16 | command <<<
17 | # when running on terra, comment out all input_table mentions
18 | python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project}" --workspace "~{terra_workspace}" --entity_type ~{terra_table} --tsv_filename ~{terra_table}-data.tsv
19 |
20 | # when running locally, use the input_table in place of downloading from Terra
21 | #cp ~{input_table} ~{terra_table}-data.tsv
22 |
23 | if ~{phandango_coloring}; then
24 | export phandango_coloring="true"
25 | else
26 | export phandango_coloring="false"
27 | fi
28 |
29 | python3 <>>
104 | output {
105 | File summarized_data = "~{output_prefix}_summarized_data.csv"
106 | }
107 | runtime {
108 | docker: "broadinstitute/terra-tools:tqdm"
109 | memory: "8 GB"
110 | cpu: 1
111 | disks: "local-disk " + disk_size + " SSD"
112 | disk: disk_size + " GB"
113 | dx_instance_type: "mem1_ssd1_v2_x2"
114 | maxRetries: 3
115 | }
116 | }
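The python heredoc that writes `~{output_prefix}_summarized_data.csv` was elided from this dump. A hedged sketch of the likely logic: select the requested columns for the requested samples from the exported table. The `:colour` header suffix for phandango coloring is an assumption:

```python
import pandas as pd

# Hedged reconstruction, not the original heredoc. All names below are
# stand-ins for the WDL inputs.
terra_table = "sample"
sample_names = ["sample1", "sample2"]
column_names = "gambit_predicted_taxon,ts_mlst_predicted_st".split(",")
phandango_coloring = True

table = pd.read_csv(f"{terra_table}-data.tsv", sep="\t", dtype=str)
subset = table.set_index(table.columns[0]).loc[sample_names, column_names]

if phandango_coloring:
    # assumed suffix so phandango colors matching values per column
    subset.columns = [f"{name}:colour" for name in subset.columns]

subset.to_csv("prefix_summarized_data.csv")
```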
--------------------------------------------------------------------------------
/tasks/quality_control/task_mummer_ani.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task animummer {
4 | input {
5 | File assembly
6 | String samplename
7 | File? ref_genome
8 | Float mash_filter = 0.9
9 | String docker="staphb/mummer:4.0.0-rgdv2"
10 | Int disk_size = 100
11 | }
12 | command <<<
13 | # capture version
14 | mummer --version | tee MUMMER_VERSION
15 |
16 | # set the reference genome
17 | # if not defined by user, then use all 43 genomes in RGDv2
18 | if [[ -z "~{ref_genome}" ]]; then
19 | # ref genome is not defined. default to RGDv2
20 | # BASH variable
21 | REF_GENOME="$(ls /RGDv2/*.fasta)"
22 | echo "user did not define a reference genome, defaulting to 43 genomes in RGDv2"
23 | echo "REF_GENOME is set to: ${REF_GENOME}"
24 | else
25 | echo "User specified a reference genome, will use this instead of RGDv2"
26 | REF_GENOME="~{ref_genome}"
27 | echo "REF_GENOME is set to: ${REF_GENOME}"
28 | fi
29 |
30 | # call Lee's ani-m.pl script and compare query genome against reference genome
31 | # first does a mash check on relatedness between 2 genomes. If greater than mash_filter, then run dnadiff
32 | # --symmetric flag runs ANI on query vs. ref; followed by ref vs. query
33 | ani-m.pl --symmetric \
34 | --mash-filter ~{mash_filter} \
35 | ~{assembly} \
36 | ${REF_GENOME} | tee ~{samplename}.ani-mummer.out.tsv
37 |
38 | # check for a nearly blank TSV (only a header line), meaning the sample did not surpass the mash filter and ANI was not run
39 | LINE_COUNT_OUTPUT_TSV=$(wc -l ~{samplename}.ani-mummer.out.tsv | cut -d ' ' -f 1)
40 | echo "Number of lines in output TSV is: ${LINE_COUNT_OUTPUT_TSV}"
41 | if [[ ${LINE_COUNT_OUTPUT_TSV} -eq 1 ]]; then
42 | echo "~{samplename} did not surpass the minimum mash genetic distance filter, thus ANI was not performed"
43 | echo "The output TSV only contains the header line"
44 | # set output variables as 0s or descriptive strings
45 | echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED
46 | echo "0.0" > ANI_HIGHEST_PERCENT
47 | echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH
48 | # if output TSV has greater than 1 lines, then parse for appropriate outputs
49 | else
50 | ## parse out highest percentBases aligned
51 | cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED
52 | echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)"
53 |
54 | ## parse out ANI value using highest percentBases aligned value
55 | grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT
56 | echo "ANI value is: $(cat ANI_HIGHEST_PERCENT)"
57 |
58 | # have to separate out results for ani_top_species match because user-defined reference genome FASTAs will not be named as they are in RGDv2
59 | if [[ -z "~{ref_genome}" ]]; then
60 | ### ref genome is not user-defined, using RGDv2 and FASTA filenames ###
61 | # Parse out species name from reference fasta filename
62 | # use percent bases aligned to pull the relevant line, cut down to the query and ref fasta filenames, sed to remove the query filename, xargs to trim whitespace
63 | # cut on periods to pull out genus_species (in future this will include lineages for Listeria and other sub-species designations)
64 | # have to create assembly_file_basename bash variable since output TSV does not include full path to assembly file, only filename
65 | assembly_file_basename=$(basename ~{assembly})
66 | grep "$(cat ANI_HIGHEST_PERCENT)" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.' -f 3 | tee ANI_TOP_SPECIES_MATCH
67 | echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH)"
68 | else
69 | # User specified a reference genome, use fasta filename as output string
70 | basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH
71 | echo "Reference genome used for ANI is: ${REF_GENOME}"
72 | fi
73 | fi
74 |
75 | >>>
76 | output {
77 | Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT")
78 | Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED")
79 | File ani_output_tsv = "~{samplename}.ani-mummer.out.tsv"
80 | String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH")
81 | String ani_mummer_version = read_string("MUMMER_VERSION")
82 | }
83 | runtime {
84 | docker: "~{docker}"
85 | memory: "8 GB"
86 | cpu: 4
87 | disks: "local-disk " + disk_size + " SSD"
88 | disk: disk_size + " GB"
89 | maxRetries: 3
90 | preemptible: 0
91 | }
92 | }
--------------------------------------------------------------------------------
/workflows/wf_core_gene_snp.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | import "../tasks/phylogenetic_inference/task_pirate.wdl" as pirate
4 | import "../tasks/phylogenetic_inference/task_iqtree.wdl" as iqtree
5 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
6 | import "../tasks/task_versioning.wdl" as versioning
7 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary
8 |
9 |
10 | workflow core_gene_snp_workflow {
11 | input {
12 | Array[File] gff3
13 | String cluster_name
14 | # if align = true, the pirate task will produce core and pangenome alignments for the sample set,
15 | # otherwise, pirate will only produce a pangenome summary
16 | Boolean align = true
17 | # use core_tree = true to produce a phylogenetic tree and snp distance matrix from the core genome alignment
18 | Boolean core_tree = true
19 | # use pan_tree = true to produce a phylogenetic tree and snp distance matrix from the pangenome alignment
20 | Boolean pan_tree = false
21 | # data summary input variables
22 | Array[String]? sample_names
23 | String? data_summary_terra_project
24 | String? data_summary_terra_workspace
25 | String? data_summary_terra_table
26 | String? data_summary_column_names
27 | }
28 | call pirate.pirate as pirate {
29 | input:
30 | gff3 = gff3,
31 | cluster_name = cluster_name,
32 | align = align
33 | }
34 | if (align) {
35 | if (core_tree) {
36 | call iqtree.iqtree as core_iqtree {
37 | input:
38 | alignment = select_first([pirate.pirate_core_alignment_fasta]),
39 | cluster_name = cluster_name
40 | }
41 | call snp_dists.snp_dists as core_snp_dists {
42 | input:
43 | alignment = select_first([pirate.pirate_core_alignment_fasta]),
44 | cluster_name = cluster_name
45 | }
46 | call snp_dists.reorder_matrix as core_reorder_matrix {
47 | input:
48 | input_tree = core_iqtree.ml_tree,
49 | matrix = core_snp_dists.snp_matrix,
50 | cluster_name = cluster_name + "_core"
51 | }
52 | }
53 | if (pan_tree) {
54 | call iqtree.iqtree as pan_iqtree {
55 | input:
56 | alignment = select_first([pirate.pirate_pangenome_alignment_fasta]),
57 | cluster_name = cluster_name
58 | }
59 | call snp_dists.snp_dists as pan_snp_dists {
60 | input:
61 | alignment = select_first([pirate.pirate_pangenome_alignment_fasta]),
62 | cluster_name = cluster_name
63 | }
64 | call snp_dists.reorder_matrix as pan_reorder_matrix {
65 | input:
66 | input_tree = pan_iqtree.ml_tree,
67 | matrix = pan_snp_dists.snp_matrix,
68 | cluster_name = cluster_name + "_pan"
69 | }
70 | }
71 | }
72 | if (defined(data_summary_column_names)) {
73 | call data_summary.summarize_data {
74 | input:
75 | sample_names = sample_names,
76 | terra_project = data_summary_terra_project,
77 | terra_workspace = data_summary_terra_workspace,
78 | terra_table = data_summary_terra_table,
79 | column_names = data_summary_column_names,
80 | output_prefix = cluster_name
81 | }
82 | }
83 | call versioning.version_capture{
84 | input:
85 | }
86 | output {
87 | # Version Capture
88 | String core_gene_snp_wf_version = version_capture.phbg_version
89 | String core_gene_snp_wf_analysis_date = version_capture.date
90 | # pirate_outputs
91 | File pirate_pangenome_summary = pirate.pirate_pangenome_summary
92 | File pirate_gene_families_ordered = pirate.pirate_gene_families_ordered
93 | File? pirate_core_alignment_fasta = pirate.pirate_core_alignment_fasta
94 | File? pirate_core_alignment_gff = pirate.pirate_core_alignment_gff
95 | File? pirate_pan_alignment_fasta = pirate.pirate_pangenome_alignment_fasta
96 | File? pirate_pan_alignment_gff = pirate.pirate_pangenome_alignment_gff
97 | File? pirate_presence_absence_csv = pirate.pirate_presence_absence_csv
98 | String pirate_docker_image = pirate.pirate_docker_image
99 | # snp_dists outputs
100 | String? pirate_snps_dists_version = select_first([core_snp_dists.version,pan_snp_dists.version,""])
101 | # iqtree outputs
102 | String? pirate_iqtree_version = select_first([core_iqtree.version,pan_iqtree.version,""])
103 | # reorder matrix outputs
104 | File? pirate_core_snp_matrix = core_reorder_matrix.ordered_matrix
105 | File? pirate_iqtree_core_tree = core_reorder_matrix.tree
106 | File? pirate_pan_snp_matrix = pan_reorder_matrix.ordered_matrix
107 | File? pirate_iqtree_pan_tree = pan_reorder_matrix.tree
108 | # Data summary outputs
109 | File? pirate_summarized_data = summarize_data.summarized_data
110 | }
111 | }
112 |
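For a quick local run, an invocation along these lines should work with miniwdl (the GFF3 paths and cluster name are illustrative; repeating an array input appends to it):

    miniwdl run workflows/wf_core_gene_snp.wdl \
      gff3=sample01.gff3 gff3=sample02.gff3 \
      cluster_name=demo_cluster \
      pan_tree=true

Since align and core_tree default to true, this produces trees and reordered SNP matrices for both the core genome and the pangenome.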
--------------------------------------------------------------------------------
/tasks/gene_typing/task_resfinder.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task resfinder {
4 | input {
5 | File assembly # Input fasta file
6 | String samplename
7 |     String? organism # Species in the sample; enter the full scientific name in quotation marks (e.g. "escherichia coli")
8 | Boolean acquired = true # Run resfinder for acquired resistance genes
9 |     Float? min_cov = 0.6 # Minimum breadth-of-coverage for ResFinder hits
10 |     Float? threshold = 0.9 # Minimum identity threshold for ResFinder hits
11 | Boolean point = false # Run pointfinder for chromosomal mutations
12 | String docker = "staphb/resfinder:4.1.11"
13 | Int disk_size = 100
14 | }
15 | command <<<
16 | date | tee DATE
17 | run_resfinder.py --version | tee RESFINDER_VERSION
18 | echo "unmodified from resfinder docker container" > RESFINDER_DB_VERSION
19 |
20 | # set $resfinder_organism BASH variable based on gambit_predicted_taxon or user-defined input string
21 | if [[ "~{organism}" == *"Campylobacter"*"jejuni"* ]]; then
22 | resfinder_organism="campylobacter jejuni"
23 | elif [[ "~{organism}" == *"Campylobacter"*"coli"* ]]; then
24 | resfinder_organism="campylobacter coli"
25 | elif [[ "~{organism}" == *"Campylobacter"* ]]; then
26 | resfinder_organism="campylobacter"
27 | elif [[ "~{organism}" == *"Enterococcus"*"faecalis"* ]]; then
28 | resfinder_organism="enterococcus faecalis"
29 | elif [[ "~{organism}" == *"Enterococcus"*"faecium"* ]]; then
30 | resfinder_organism="enterococcus faecium"
31 | elif [[ "~{organism}" == *"Escherichia"*"coli"* ]]; then
32 | resfinder_organism="escherichia coli"
33 | elif [[ "~{organism}" == *"Klebsiella"* ]]; then
34 | resfinder_organism="klebsiella"
35 | elif [[ "~{organism}" == *"Neisseria"*"gonorrhoeae"* ]]; then
36 | resfinder_organism="neisseria gonorrhoeae"
37 | elif [[ "~{organism}" == *"Salmonella"* ]]; then
38 | resfinder_organism="salmonella"
39 | elif [[ "~{organism}" == *"Staphylococcus"*"aureus"* ]]; then
40 | resfinder_organism="staphylococcus aureus"
41 | elif [[ "~{organism}" == *"Mycobacterium"*"tuberculosis"* ]]; then
42 | resfinder_organism="mycobacterium tuberculosis"
43 | elif [[ "~{organism}" == *"Helicobacter"*"pylori"* ]]; then
44 | resfinder_organism="helicobacter pylori"
45 | else
46 |       echo "Either the Gambit-predicted taxon is not supported by resfinder or the user did not supply an organism as input."
47 | echo "Skipping the use of resfinder --species optional parameter."
48 | fi
49 |
50 | # if resfinder_organism variable is set, use --species flag, otherwise do not use --species flag
51 | if [[ -v resfinder_organism ]] ; then
52 | run_resfinder.py \
53 | --inputfasta ~{assembly} \
54 | --outputPath . \
55 | --species "${resfinder_organism}" \
56 | ~{true="--acquired" false="" acquired} \
57 | ~{'--min_cov ' + min_cov} \
58 | ~{'--threshold ' + threshold} \
59 | ~{true="--point" false="" point}
60 | else
61 | run_resfinder.py \
62 | --inputfasta ~{assembly} \
63 | --outputPath . \
64 | --species "other" \
65 | ~{true="--acquired" false="" acquired} \
66 | ~{'--min_cov ' + min_cov} \
67 | ~{'--threshold ' + threshold}
68 | fi
69 |
70 | # replace space in resfinder_organism with underscore
71 | resfinder_organism="${resfinder_organism// /_}"
72 |
73 | # rename files
74 | mv pheno_table.txt ~{samplename}_pheno_table.txt
75 | if [ -f pheno_table_${resfinder_organism}.txt ]; then
76 | mv pheno_table_${resfinder_organism}.txt ~{samplename}_pheno_table_species.txt
77 | fi
78 | mv ResFinder_Hit_in_genome_seq.fsa ~{samplename}_ResFinder_Hit_in_genome_seq.fsa
79 | mv ResFinder_Resistance_gene_seq.fsa ~{samplename}_ResFinder_Resistance_gene_seq.fsa
80 | mv ResFinder_results_tab.txt ~{samplename}_ResFinder_results_tab.txt
81 | if [ -f PointFinder_prediction.txt ]; then
82 | mv PointFinder_prediction.txt ~{samplename}_PointFinder_prediction.txt
83 | mv PointFinder_results.txt ~{samplename}_PointFinder_results.txt
84 | fi
85 |
86 | >>>
87 | output {
88 | File resfinder_pheno_table = "~{samplename}_pheno_table.txt"
89 | File? resfinder_pheno_table_species = "~{samplename}_pheno_table_species.txt"
90 | File resfinder_hit_in_genome_seq = "~{samplename}_ResFinder_Hit_in_genome_seq.fsa"
91 | File resfinder_resistance_gene_seq = "~{samplename}_ResFinder_Resistance_gene_seq.fsa"
92 | File resfinder_results_tab = "~{samplename}_ResFinder_results_tab.txt"
93 | File? pointfinder_pheno_table = "~{samplename}_PointFinder_prediction.txt"
94 | File? pointfinder_results = "~{samplename}_PointFinder_results.txt"
95 | String resfinder_docker = "~{docker}"
96 | String resfinder_version = read_string("RESFINDER_VERSION")
97 | String resfinder_db_version = read_string("RESFINDER_DB_VERSION")
98 | }
99 | runtime {
100 | memory: "8 GB"
101 | cpu: 4
102 | docker: docker
103 | disks: "local-disk " + disk_size + " SSD"
104 | disk: disk_size + " GB"
105 | maxRetries: 3
106 | }
107 | }
108 |
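As a sketch, running this task on its own with miniwdl might look like the following; note the quoted, full scientific name for organism, which the string matching above expects (paths and sample name are illustrative):

    miniwdl run tasks/gene_typing/task_resfinder.wdl --task resfinder \
      assembly=sample01_contigs.fasta \
      samplename=sample01 \
      organism="escherichia coli" \
      point=true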
--------------------------------------------------------------------------------
/tasks/species_typing/task_poppunk_streppneumo.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task poppunk {
4 | meta {
5 | description: "Using poppunk with GPS (Global Pneumococcal Sequencing project) database for Streptococcus pneumoniae typing"
6 | }
7 | input {
8 | File assembly
9 | String samplename
10 | String docker = "staphb/poppunk:2.4.0"
11 | Int disk_size = 100
12 | Int cpus = 4
13 | # database/reference files currently hosted on a public, requester-pays GCP bucket
14 | # hosting individually for speed purposes. Unzipping one big 20GB zip archive takes a long time, longer than downloading the files individually (which total 22GB uncompressed)
15 | # If future versions of the GPS database are released, we can update the links here or in Terra, and task should be future-proof
16 | File GPS_dists_npy = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.npy"
17 | File GPS_dists_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.pkl"
18 | File GPS_h5 = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.h5"
19 | File GPS_refs = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs"
20 | File GPS_refs_dists_npy = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.npy"
21 | File GPS_refs_dists_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.pkl"
22 | File GPS_refs_h5 = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.h5"
23 | File GPS_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_clusters.csv"
24 | File GPS_fit_npz = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.npz"
25 | File GPS_fit_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.pkl"
26 | File GPS_graph_gt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_graph.gt"
27 | File GPS_qcreport_txt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_qcreport.txt"
28 | File GPS_unword_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_unword_clusters.csv"
29 | File GPS_refs_graph_gt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6refs_graph.gt"
30 | File GPS_external_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_external_clusters.csv"
31 | }
32 | command <<<
33 | # get version information
34 | poppunk --version | sed 's/poppunk //' | tee VERSION
35 |
36 | # create input TSV
37 | echo -e "~{samplename}\t~{assembly}" > ~{samplename}_poppunk_input.tsv
38 |
39 |     # determine the database name, which serves both as the prefix for every file in the database and as the name of the directory (GPS_DB_NAME) that holds them
40 | # doing this for future proofing
41 | # get file name of primary h5 file, strip off suffix
42 | GPS_DB_NAME=$(basename ~{GPS_h5} | sed 's|.h5||')
43 | # sending GPS_DB_NAME into text file for logging/output purposes
44 | echo "${GPS_DB_NAME}" > GPS_DB_NAME
45 |
46 | # move all database/reference files into single directory to feed into poppunk
47 | mkdir -v "${GPS_DB_NAME}"
48 | ln -vs ~{GPS_dists_npy} ~{GPS_dists_pkl} ~{GPS_h5} ~{GPS_refs} \
49 | ~{GPS_refs_dists_npy} ~{GPS_refs_dists_pkl} ~{GPS_refs_h5} ~{GPS_clusters_csv} \
50 | ~{GPS_fit_npz} ~{GPS_fit_pkl} ~{GPS_graph_gt} ~{GPS_qcreport_txt} \
51 | ~{GPS_unword_clusters_csv} ~{GPS_refs_graph_gt} ~{GPS_external_clusters_csv} \
52 | "${GPS_DB_NAME}"/
53 |
54 | # to allow for compatibility with future versions of the database
55 | # poppunk requires this file to be explicitly passed as input
56 | GPS_EXTERNAL_CLUSTERS_CSV=$(ls "${GPS_DB_NAME}"/GPS_*_external_clusters.csv)
57 |
58 | # run poppunk
59 | poppunk_assign \
60 | --threads ~{cpus} \
61 | --db "${GPS_DB_NAME}" \
62 | --distances "${GPS_DB_NAME}/${GPS_DB_NAME}.dists" \
63 | --query ~{samplename}_poppunk_input.tsv \
64 | --output ~{samplename}_poppunk \
65 | --external-clustering "${GPS_EXTERNAL_CLUSTERS_CSV}"
66 |
67 | # parse output CSV for GPSC (Global Pneumococcal Sequence Cluster)
68 | if [ -f ~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv ]; then
69 | cut -d ',' -f 2 ~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv | tail -n 1 > GPSC.txt
70 |
71 | # if GPSC is "NA", overwrite with helpful message
72 | if [[ "$(cat GPSC.txt)" == "NA" ]]; then
73 | echo "Potential novel GPS Cluster identified, please email globalpneumoseq@gmail.com to have novel clusters added to the database and a GPSC cluster name assigned after you have checked for low level contamination which may contribute to biased accessory distances." >GPSC.txt
74 | fi
75 | else
76 | echo "poppunk failed" > GPSC.txt
77 | fi
78 |
79 | >>>
80 | output {
81 | String poppunk_gps_cluster = read_string("GPSC.txt")
82 | File? poppunk_gps_external_cluster_csv = "~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv"
83 | String poppunk_version = read_string("VERSION")
84 | String poppunk_docker = docker
85 | String poppunk_GPS_db_version = read_string("GPS_DB_NAME")
86 | }
87 | runtime {
88 | docker: "~{docker}"
89 | # poppunk with the GPS v6 db used upwards of 12GB ram at times
90 | memory: "16 GB"
91 | cpu: cpus
92 | disks: "local-disk " + disk_size + " SSD"
93 | disk: disk_size + " GB"
94 | maxRetries: 3
95 | preemptible: 0
96 | }
97 | }
98 |
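The future-proofing above hinges on deriving the database prefix from the primary .h5 file name, so a hypothetical future bundle would be picked up without task changes; the derivation itself is just:

    # hypothetical GPS_v7 release: only the file inputs need updating
    GPS_DB_NAME=$(basename GPS_v7.h5 | sed 's|.h5||')   # -> GPS_v7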
--------------------------------------------------------------------------------
/tasks/species_typing/task_srst2_vibrio.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task srst2_vibrio {
4 | meta {
5 | description: "Use of SRST2 to identify sequences of interest from a database of curated Vibrio sequences "
6 | }
7 | input {
8 | File reads1
9 | File? reads2
10 | String samplename
11 | Int srst2_min_cov
12 | Int srst2_max_divergence
13 | Int srst2_min_depth
14 | Int srst2_min_edge_depth
15 | Int srst2_gene_max_mismatch
16 | String docker = "quay.io/staphb/srst2:0.2.0-vcholerae"
17 | Int disk_size = 100
18 | Int cpu = 4
19 | }
20 | command <<<
21 | if [ -z "~{reads2}" ] ; then
22 | INPUT_READS="--input_se ~{reads1}"
23 | else
24 | # This task expects/requires that input FASTQ files end in "_1.clean.fastq.gz" and "_2.clean.fastq.gz"
25 | # which is the syntax from TheiaProk read cleaning tasks
26 | INPUT_READS="--input_pe ~{reads1} ~{reads2} --forward _1.clean --reverse _2.clean"
27 | fi
28 |
29 | srst2 --version 2>&1 | tee VERSION
30 | srst2 \
31 | ${INPUT_READS} \
32 | --gene_db /vibrio-cholerae-db/vibrio_230224.fasta \
33 | --output ~{samplename} \
34 | --min_coverage ~{srst2_min_cov} \
35 | --max_divergence ~{srst2_max_divergence} \
36 | --min_depth ~{srst2_min_depth} \
37 | --min_edge_depth ~{srst2_min_edge_depth} \
38 | --gene_max_mismatch ~{srst2_gene_max_mismatch}
39 |
40 | # capture output TSV
41 | mv ~{samplename}__genes__*__results.txt ~{samplename}.tsv
42 |
43 |     # capture detailed output TSV - not available if no results are output
44 | mv ~{samplename}__fullgenes__*__results.txt ~{samplename}.detailed.tsv || echo "No results" > ~{samplename}.detailed.tsv
45 |
46 | # parsing block to account for when output columns do not exist
47 |     python <<CODE
48 |     import csv
49 |
50 |     def tsv_to_dict(filename):
51 |     # read the single data row of the SRST2 genes TSV into a dict keyed
52 |     # by column name (gene cluster), with the allele call as the value
53 |     with open(filename, 'r') as tsv_file:
54 |     for row in csv.DictReader(tsv_file, delimiter="\t"):
55 |     return dict(row)
56 |     return {}
57 |
58 |     def conv(value):
59 |     # normalize a missing column or empty cell to the SRST2 absent marker
60 |     if value is None or value == "":
61 |     return "-"
62 |     return value
63 |
64 |     def translate_chars(value):
65 |     # SRST2 flags allele calls with "*" (imperfect alignment) and "?"
66 |     # (low depth/uncertain); translate those markers into readable notes
67 |     if value == "-":
68 |     return "(not detected)"
69 |     translation = []
70 |     if "*" in value:
71 |     translation.append("imperfect alignment")
72 |     if "?" in value:
73 |     translation.append("low depth/uncertain")
74 |
75 |     if len(translation) > 0:
76 | return '(' + ';'.join(translation) + ')'
77 | return ""
78 |
79 | # load output TSV as dict
80 | row = tsv_to_dict('~{samplename}.tsv')
81 |
82 | # presence or absence genes - ctxA, ompW and toxR
83 | with open("ctxA", "wb") as ctxA_fh:
84 | value = row.get("ctxA")
85 | presence = translate_chars(conv(value))
86 | if presence == "(not detected)":
87 | ctxA_fh.write(presence)
88 | else:
89 | result = "present" + ' ' + presence
90 | ctxA_fh.write(result.strip())
91 |
92 | with open("ompW", "wb") as ompW_fh:
93 | value = row.get("ompW")
94 | presence = translate_chars(conv(value))
95 | if presence == "(not detected)":
96 | ompW_fh.write(presence)
97 | else:
98 | result = "present" + ' ' + presence
99 | ompW_fh.write(result.strip())
100 |
101 | with open("toxR", "wb") as toxR_fh:
102 | value = row.get("toxR")
103 | presence = translate_chars(conv(value))
104 | if presence == "(not detected)":
105 | toxR_fh.write(presence)
106 | else:
107 | result = "present" + ' ' + presence
108 | toxR_fh.write(result.strip())
109 |
110 | # biotype - tcpA classical or tcpA ElTor
111 | with open("BIOTYPE", "wb") as biotype_fh:
112 | value_ElTor = translate_chars(conv(row.get("tcpA_ElTor")))
113 | value_classical = translate_chars(conv(row.get("tcpA_classical")))
114 |
115 | if value_ElTor == "(not detected)" and value_classical == "(not detected)":
116 | biotype_fh.write("(not detected)")
117 | else:
118 | if value_ElTor == "(not detected)":
119 | result = "tcpA_Classical" + ' ' + value_classical
120 | biotype_fh.write(result.strip())
121 | else:
122 | result = "tcpA_ElTor" + ' ' + value_ElTor
123 | biotype_fh.write(result.strip())
124 |
125 | # serogroup - O1 or O139
126 | with open("SEROGROUP", "wb") as serotype_fh:
127 | value_O1 = translate_chars(conv(row.get("wbeN_O1")))
128 | value_O139 = translate_chars(conv(row.get("wbfR_O139")))
129 |
130 | if value_O1 == "(not detected)" and value_O139 == "(not detected)":
131 | serotype_fh.write("(not detected)")
132 | else:
133 | if value_O1 == "(not detected)":
134 | result = "O139" + ' ' + value_O139
135 | serotype_fh.write(result.strip())
136 | else:
137 | result = "O1" + ' ' + value_O1
138 | serotype_fh.write(result.strip())
139 | CODE
140 | >>>
141 | output {
142 | File srst2_detailed_tsv = "~{samplename}.detailed.tsv"
143 | String srst2_version = read_string("VERSION")
144 | String srst2_vibrio_ctxA = read_string("ctxA")
145 | String srst2_vibrio_ompW = read_string("ompW")
146 | String srst2_vibrio_toxR = read_string("toxR")
147 | String srst2_vibrio_biotype = read_string("BIOTYPE")
148 | String srst2_vibrio_serogroup = read_string("SEROGROUP")
149 | }
150 | runtime {
151 | docker: "~{docker}"
152 | memory: "8 GB"
153 | cpu: cpu
154 | disks: "local-disk " + disk_size + " SSD"
155 | disk: disk_size + " GB"
156 | maxRetries: 3
157 | preemptible: 0
158 | }
159 | }
160 |
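Because the paired-end branch above hard-codes the _1.clean/_2.clean suffixes, reads that did not come through the TheiaProk cleaning tasks need renaming first; a minimal sketch with illustrative file names:

    mv sample01_R1.fastq.gz sample01_1.clean.fastq.gz
    mv sample01_R2.fastq.gz sample01_2.clean.fastq.gz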
--------------------------------------------------------------------------------
/tasks/assembly/task_shovill.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task shovill_pe {
4 | input {
5 | File read1_cleaned
6 | File read2_cleaned
7 | String samplename
8 | String docker = "quay.io/staphb/shovill:1.1.0"
9 | Int disk_size = 100
10 |
11 | ## SHOVILL optional parameters
12 | ## --depth [INT] Sub-sample --R1/--R2 to this depth. Disable with --depth 0 (default: 150)
13 | ## --gsize [STRING] Estimated genome size eg. 3.2M (default: '')
14 | ## --minlen [INT] Minimum contig length <0=AUTO> (default: 0)
15 | ## --mincov [FLOAT] Minimum contig coverage <0=AUTO> (default: 2)
16 | ## --assembler [STRING] Assembler: skesa velvet megahit spades (default: 'spades')
17 | ## --opts [STRING] Extra assembler options in quotes eg. spades: "--untrusted-contigs locus.fna" ... (default: '')
18 | ## --kmers [STRING] K-mers to use (default: '')
19 | ## --trim [BOOLEAN] Enable adaptor trimming (default: OFF)
20 | ## --noreadcorr [BOOLEAN] Disable read error correction (default: OFF)
21 | ## --nostitch [BOOLEAN] Disable read stitching (default: OFF)
22 | ## --nocorr [BOOLEAN] Disable post-assembly correction (default: OFF)
23 |
24 |
25 | Int? depth
26 | String? genome_size
27 | Int min_contig_length = 200
28 | Float? min_coverage
29 | String assembler = "skesa"
30 | String? assembler_options
31 | String? kmers
32 | Boolean trim = false
33 | Boolean noreadcorr = false
34 | Boolean nostitch = false
35 | Boolean nocorr = false
36 | }
37 | command <<<
38 | shovill --version | head -1 | tee VERSION
39 | shovill \
40 | --outdir out \
41 | --R1 ~{read1_cleaned} \
42 | --R2 ~{read2_cleaned} \
43 | --minlen ~{min_contig_length} \
44 | ~{'--depth ' + depth} \
45 | ~{'--gsize ' + genome_size} \
46 | ~{'--mincov ' + min_coverage} \
47 | ~{'--assembler ' + assembler} \
48 | ~{'--opts ' + assembler_options} \
49 | ~{'--kmers ' + kmers} \
50 | ~{true='--trim' false='' trim} \
51 | ~{true='--noreadcorr' false='' noreadcorr} \
52 | ~{true='--nostitch' false='' nostitch} \
53 | ~{true='--nocorr' false='' nocorr}
54 |
55 | mv out/contigs.fa out/~{samplename}_contigs.fasta
56 |
57 | if [ "~{assembler}" == "spades" ] ; then
58 | mv out/contigs.gfa out/~{samplename}_contigs.gfa
59 | elif [ "~{assembler}" == "megahit" ] ; then
60 | mv out/contigs.fastg out/~{samplename}_contigs.fastg
61 | elif [ "~{assembler}" == "velvet" ] ; then
62 | mv out/contigs.LastGraph out/~{samplename}_contigs.LastGraph
63 | fi
64 |
65 | >>>
66 | output {
67 | File assembly_fasta = "out/~{samplename}_contigs.fasta"
68 | File? contigs_gfa = "out/~{samplename}_contigs.gfa"
69 | File? contigs_fastg = "out/~{samplename}_contigs.fastg"
70 | File? contigs_lastgraph = "out/~{samplename}_contigs.LastGraph"
71 | String shovill_version = read_string("VERSION")
72 | }
73 | runtime {
74 | docker: "~{docker}"
75 | memory: "16 GB"
76 | cpu: 4
77 | disks: "local-disk " + disk_size + " SSD"
78 | disk: disk_size + " GB"
79 | maxRetries: 3
80 | preemptible: 0
81 | }
82 | }
83 |
84 | task shovill_se {
85 | input {
86 | File read1_cleaned
87 | String samplename
88 | String docker = "quay.io/staphb/shovill-se:1.1.0"
89 | Int disk_size = 100
90 |
91 | ## SHOVILL optional parameters
92 |     ## --depth [INT] Sub-sample reads to this depth. Disable with --depth 0 (default: 150)
93 | ## --gsize [STRING] Estimated genome size eg. 3.2M (default: '')
94 | ## --minlen [INT] Minimum contig length <0=AUTO> (default: 0)
95 | ## --mincov [FLOAT] Minimum contig coverage <0=AUTO> (default: 2)
96 | ## --assembler [STRING] Assembler: skesa velvet megahit spades (default: 'spades')
97 | ## --opts [STRING] Extra assembler options in quotes eg. spades: "--untrusted-contigs locus.fna" ... (default: '')
98 | ## --kmers [STRING] K-mers to use (default: '')
99 | ## --trim [BOOLEAN] Enable adaptor trimming (default: OFF)
100 | ## --noreadcorr [BOOLEAN] Disable read error correction (default: OFF)
101 | ## --nocorr [BOOLEAN] Disable post-assembly correction (default: OFF)
102 |
103 | Int? depth
104 | String? genome_size
105 | Int min_contig_length = 200
106 | Float? min_coverage
107 | String assembler = "spades"
108 | String? assembler_options
109 | String? kmers
110 | Boolean trim = false
111 | Boolean noreadcorr = false
112 | Boolean nocorr = false
113 | }
114 | command <<<
115 | shovill-se --version | head -1 | tee VERSION
116 | shovill-se \
117 | --outdir out \
118 |       --se ~{read1_cleaned} \
119 | --minlen ~{min_contig_length} \
120 | ~{'--depth ' + depth} \
121 | ~{'--gsize ' + genome_size} \
122 | ~{'--mincov ' + min_coverage} \
123 | ~{'--assembler ' + assembler} \
124 | ~{'--opts ' + assembler_options} \
125 | ~{'--kmers ' + kmers} \
126 | ~{true='--trim' false='' trim} \
127 | ~{true='--noreadcorr' false='' noreadcorr} \
128 | ~{true='--nocorr' false='' nocorr}
129 |
130 | mv out/contigs.fa out/~{samplename}_contigs.fasta
131 |
132 | if [ "~{assembler}" == "spades" ] ; then
133 | mv out/contigs.gfa out/~{samplename}_contigs.gfa
134 | elif [ "~{assembler}" == "megahit" ] ; then
135 | mv out/contigs.fastg out/~{samplename}_contigs.fastg
136 | elif [ "~{assembler}" == "velvet" ] ; then
137 | mv out/contigs.LastGraph out/~{samplename}_contigs.LastGraph
138 | fi
139 | >>>
140 | output {
141 | File assembly_fasta = "out/~{samplename}_contigs.fasta"
142 | File? contigs_gfa = "out/~{samplename}_contigs.gfa"
143 | File? contigs_fastg = "out/~{samplename}_contigs.fastg"
144 | File? contigs_lastgraph = "out/~{samplename}_contigs.LastGraph"
145 | String shovill_version = read_string("VERSION")
146 | }
147 | runtime {
148 | docker: "~{docker}"
149 | memory: "16 GB"
150 | cpu: 4
151 | disks: "local-disk " + disk_size + " SSD"
152 | disk: disk_size + " GB"
153 | maxRetries: 3
154 | preemptible: 0
155 | }
156 | }
157 |
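For reference, a direct invocation of the paired-end task with miniwdl might look like this (file names are illustrative); the assembler choice decides which optional graph output appears: spades emits a .gfa, megahit a .fastg, velvet a .LastGraph, and skesa no graph at all, which is why those outputs are declared File?:

    miniwdl run tasks/assembly/task_shovill.wdl --task shovill_pe \
      read1_cleaned=sample01_1.clean.fastq.gz \
      read2_cleaned=sample01_2.clean.fastq.gz \
      samplename=sample01 \
      assembler=spades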
--------------------------------------------------------------------------------
/tasks/species_typing/task_kleborate.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task kleborate {
4 | # Inputs
5 | input {
6 | File assembly
7 | String samplename
8 | String kleborate_docker_image = "quay.io/staphb/kleborate:2.2.0"
9 | Int disk_size = 100
10 |
11 | # Parameters
12 | # --resistance Turn on resistance genes screening (default: no resistance gene screening)
13 |     # --kaptive Equivalent to --kaptive_k --kaptive_o
14 | # --min_identity MIN_IDENTITY Minimum alignment percent identity for main results (default: 90.0)
15 | # --min_coverage MIN_COVERAGE Minimum alignment percent coverage for main results (default: 80.0)
16 | # --min_spurious_identity MIN_SPURIOUS_IDENTITY Minimum alignment percent identity for spurious results (default: 80.0)
17 | # --min_spurious_coverage MIN_SPURIOUS_COVERAGE Minimum alignment percent coverage for spurious results (default: 40.0)
18 | # --min_kaptive_confidence {None,Low,Good,High,Very_high,Perfect} Minimum Kaptive confidence to call K/O loci - confidence levels below this will be reported as unknown (default: Good)
19 | Boolean skip_resistance = false
20 | Boolean skip_kaptive = false
21 | Float min_identity = 90.0
22 | Float min_coverage = 80.0
23 | Float min_spurious_identity = 80.0
24 | Float min_spurious_coverage = 40.0
25 | String min_kaptive_confidence = "Good"
26 | }
27 | command <<<
28 | # capture date and version
29 | # Print and save date
30 | date | tee DATE
31 | # Print and save version
32 | kleborate --version | tee VERSION
33 | # Run Kleborate on the input assembly with the --all flag and output with samplename prefix
34 | kleborate \
35 | ~{true="" false="--resistance" skip_resistance} \
36 | ~{true="" false="--kaptive" skip_kaptive} \
37 | ~{'--min_identity ' + min_identity} \
38 | ~{'--min_coverage ' + min_coverage} \
39 | ~{'--min_spurious_identity ' + min_spurious_identity} \
40 | ~{'--min_spurious_coverage ' + min_spurious_coverage} \
41 | ~{'--min_kaptive_confidence ' + min_kaptive_confidence} \
42 | --outfile ~{samplename}_kleborate_out.tsv \
43 | --assemblies ~{assembly} \
44 | --all
45 | # parse outputs
46 |     python3 <<CODE
47 |     import csv
48 |
49 |     # load the Kleborate TSV; row 1 is the header, row 2 the single sample
50 |     with open("~{samplename}_kleborate_out.tsv", 'r') as tsv_file:
51 |     tsv_reader = csv.reader(tsv_file, delimiter="\t")
52 |     tsv_data = list(tsv_reader)
53 |     tsv_dict = dict(zip(tsv_data[0], tsv_data[1]))
54 |
55 |     def write_column(filename, column):
56 |     # write a single Kleborate column to a file for WDL read_string();
57 |     # columns absent from the TSV (e.g. when resistance or kaptive
58 |     # screening was skipped) are written as empty strings
59 |     with open(filename, 'wt') as fh:
60 |     fh.write(tsv_dict.get(column, ""))
61 |
62 |     # one output file per column of interest (column names per Kleborate v2)
63 |     write_column("MLST_SEQUENCE_TYPE", "ST")
64 |     write_column("VIRULENCE_SCORE", "virulence_score")
65 |     write_column("RESISTANCE_SCORE", "resistance_score")
66 |     write_column("NUM_RESISTANCE_GENES", "num_resistance_genes")
67 |     write_column("BLA_RESISTANCE_GENES", "Bla_acquired")
68 |     write_column("ESBL_RESISTANCE_GENES", "Bla_ESBL_acquired")
69 |     write_column("K_LOCUS", "K_locus")
70 |     write_column("K_TYPE", "K_type")
71 |     write_column("O_LOCUS", "O_locus")
72 |     write_column("O_TYPE", "O_type")
73 |     write_column("K_LOCUS_CONFIDENCE", "K_locus_confidence")
74 |     write_column("O_LOCUS_CONFIDENCE", "O_locus_confidence")
75 |
76 |     # composite summaries: which columns feed these two outputs is an
77 |     # assumption based on the Kleborate v2 schema; adjust as needed
78 |     def write_joined(filename, columns):
79 |     values = [tsv_dict.get(c, "-") for c in columns]
80 |     values = [v for v in values if v and v != "-"]
81 |     with open(filename, 'wt') as fh:
82 |     fh.write(";".join(values))
83 |
84 |     write_joined("KEY_RESISTANCE_GENES", ["Bla_ESBL_acquired", "Bla_Carb_acquired"])
85 |     write_joined("GENOMIC_RESISTANCE_MUTATIONS", ["SHV_mutations", "Omp_mutations", "Col_mutations", "Flq_mutations"])
86 |     CODE
87 |   >>>
119 | output {
120 | File kleborate_output_file = "~{samplename}_kleborate_out.tsv"
121 | String kleborate_version = read_string("VERSION")
122 | String kleborate_docker = kleborate_docker_image
123 | String kleborate_mlst_sequence_type = read_string("MLST_SEQUENCE_TYPE")
124 | String kleborate_virulence_score = read_string("VIRULENCE_SCORE")
125 | String kleborate_resistance_score = read_string("RESISTANCE_SCORE")
126 | String kleborate_num_resistance_genes = read_string("NUM_RESISTANCE_GENES")
127 | String kleborate_bla_resistance_genes = read_string("BLA_RESISTANCE_GENES")
128 | String kleborate_esbl_resistance_genes = read_string("ESBL_RESISTANCE_GENES")
129 | String kleborate_key_resistance_genes = read_string("KEY_RESISTANCE_GENES")
130 | String kleborate_genomic_resistance_mutations = read_string("GENOMIC_RESISTANCE_MUTATIONS")
131 | String kleborate_klocus = read_string("K_LOCUS")
132 | String kleborate_ktype = read_string("K_TYPE")
133 | String kleborate_otype = read_string("O_TYPE")
134 | String kleborate_olocus = read_string("O_LOCUS")
135 | String kleborate_klocus_confidence = read_string("K_LOCUS_CONFIDENCE")
136 | String kleborate_olocus_confidence = read_string("O_LOCUS_CONFIDENCE")
137 | }
138 | runtime {
139 | docker: "~{kleborate_docker_image}"
140 | memory: "16 GB"
141 | cpu: 8
142 | disks: "local-disk " + disk_size + " SSD"
143 | disk: disk_size + " GB"
144 | maxRetries: 3
145 | }
146 | }
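Note the inverted sense of the skip_* booleans above: the ~{true="" false="--flag" ...} expressions emit a screening flag only when skipping is off, so with the defaults both screens run:

    # skip_resistance = false (default) -> kleborate --resistance ...
    # skip_resistance = true            -> the --resistance flag is omitted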
--------------------------------------------------------------------------------
/tasks/species_typing/task_shigeifinder.wdl:
--------------------------------------------------------------------------------
1 | version 1.0
2 |
3 | task shigeifinder {
4 | input {
5 | File assembly
6 | String samplename
7 | String docker = "staphb/shigeifinder:1.3.3"
8 | Int disk_size = 100
9 | Int cpu = 2
10 | }
11 | command <<<
12 | # capture date
13 | date | tee DATE
14 |     # shigeifinder does not have a --version flag, so we rely on the docker image tag (maintained by StaPH-B/Curtis) for the version
15 | echo "~{docker}" | sed 's|staphb/shigeifinder:||' | tee VERSION.txt
16 |
17 | # ShigEiFinder checks that all dependencies are installed before running
18 | echo "checking for shigeifinder dependencies and running ShigEiFinder..."
19 |     # run shigeifinder on the assembly; the default is 4 CPUs, but 2 suffices since it is already very fast
20 | shigeifinder -i ~{assembly} \
21 | -t ~{cpu} \
22 | --hits \
23 | --output ~{samplename}_shigeifinder.tsv
24 |
25 | # parse output TSV
26 | echo "Parsing ShigEiFinder output TSV..."
27 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 2 >shigeifinder_ipaH_presence_absence.txt
28 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 3 >shigeifinder_num_virulence_plasmid_genes.txt
29 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 4 >shigeifinder_cluster.txt
30 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 5 >shigeifinder_serotype.txt
31 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 6 >shigeifinder_O_antigen.txt
32 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 7 >shigeifinder_H_antigen.txt
33 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 8 >shigeifinder_notes.txt
34 |
35 | # set helpful output strings if field in TSV is blank by overwriting output TXT files
36 | if [ "$(cat shigeifinder_ipaH_presence_absence.txt)" == "" ]; then
37 | echo "ShigEiFinder ipaH field was empty" > shigeifinder_ipaH_presence_absence.txt
38 | fi
39 | if [ "$(cat shigeifinder_num_virulence_plasmid_genes.txt)" == "" ]; then
40 | echo "ShigEiFinder number of virulence plasmid genes field was empty" > shigeifinder_num_virulence_plasmid_genes.txt
41 | fi
42 | if [ "$(cat shigeifinder_cluster.txt)" == "" ]; then
43 | echo "ShigEiFinder cluster field was empty" > shigeifinder_cluster.txt
44 | fi
45 | if [ "$(cat shigeifinder_serotype.txt)" == "" ]; then
46 | echo "ShigEiFinder serotype field was empty" > shigeifinder_serotype.txt
47 | fi
48 | if [ "$(cat shigeifinder_O_antigen.txt)" == "" ]; then
49 | echo "ShigEiFinder O antigen field was empty" > shigeifinder_O_antigen.txt
50 | fi
51 | if [ "$(cat shigeifinder_H_antigen.txt)" == "" ]; then
52 | echo "ShigEiFinder H antigen field was empty" > shigeifinder_H_antigen.txt
53 | fi
54 | if [ "$(cat shigeifinder_notes.txt)" == "" ]; then
55 | echo "ShigEiFinder notes field was empty" > shigeifinder_notes.txt
56 | fi
57 |
58 | >>>
59 | output {
60 | File shigeifinder_report = "~{samplename}_shigeifinder.tsv"
61 | String shigeifinder_docker = docker
62 | String shigeifinder_version = read_string("VERSION.txt")
63 | String shigeifinder_ipaH_presence_absence = read_string("shigeifinder_ipaH_presence_absence.txt")
64 | String shigeifinder_num_virulence_plasmid_genes = read_string("shigeifinder_num_virulence_plasmid_genes.txt")
65 | String shigeifinder_cluster = read_string("shigeifinder_cluster.txt")
66 | String shigeifinder_serotype = read_string("shigeifinder_serotype.txt")
67 | String shigeifinder_O_antigen = read_string("shigeifinder_O_antigen.txt")
68 | String shigeifinder_H_antigen = read_string("shigeifinder_H_antigen.txt")
69 | String shigeifinder_notes = read_string("shigeifinder_notes.txt")
70 | }
71 | runtime {
72 | docker: "~{docker}"
73 | memory: "8 GB"
74 | cpu: cpu
75 | disks: "local-disk " + disk_size + " SSD"
76 | disk: disk_size + " GB"
77 | preemptible: 0
78 | maxRetries: 3
79 | }
80 | }
81 | task shigeifinder_reads {
82 | input {
83 | File read1
84 | File? read2
85 | String samplename
86 | String docker = "staphb/shigeifinder:1.3.3"
87 | Int disk_size = 100
88 | Int cpu = 4
89 | Boolean paired_end = true
90 | }
91 | command <<<
92 | # capture date
93 | date | tee DATE
94 |     # shigeifinder does not have a --version flag, so we rely on the docker image tag (maintained by StaPH-B/Curtis) for the version
95 | echo "~{docker}" | sed 's|staphb/shigeifinder:||' | tee VERSION.txt
96 |
97 | # ShigEiFinder checks that all dependencies are installed before running
98 | echo "checking for shigeifinder dependencies and running ShigEiFinder..."
99 |     # run shigeifinder on reads; keep the default of 4 CPUs since read alignment is involved
100 | shigeifinder -r -i ~{read1} ~{read2} \
101 | ~{true='' false='--single_end' paired_end} \
102 | -t ~{cpu} \
103 | --hits \
104 | --output ~{samplename}_shigeifinder.tsv
105 |
106 | # parse output TSV
107 | echo "Parsing ShigEiFinder output TSV..."
108 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 2 >shigeifinder_ipaH_presence_absence.txt
109 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 3 >shigeifinder_num_virulence_plasmid_genes.txt
110 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 4 >shigeifinder_cluster.txt
111 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 5 >shigeifinder_serotype.txt
112 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 6 >shigeifinder_O_antigen.txt
113 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 7 >shigeifinder_H_antigen.txt
114 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 8 >shigeifinder_notes.txt
115 |
116 | # set helpful output strings if field in TSV is blank by overwriting output TXT files
117 | if [ "$(cat shigeifinder_ipaH_presence_absence.txt)" == "" ]; then
118 | echo "ShigEiFinder ipaH field was empty" > shigeifinder_ipaH_presence_absence.txt
119 | fi
120 | if [ "$(cat shigeifinder_num_virulence_plasmid_genes.txt)" == "" ]; then
121 | echo "ShigEiFinder number of virulence plasmid genes field was empty" > shigeifinder_num_virulence_plasmid_genes.txt
122 | fi
123 | if [ "$(cat shigeifinder_cluster.txt)" == "" ]; then
124 | echo "ShigEiFinder cluster field was empty" > shigeifinder_cluster.txt
125 | fi
126 | if [ "$(cat shigeifinder_serotype.txt)" == "" ]; then
127 | echo "ShigEiFinder serotype field was empty" > shigeifinder_serotype.txt
128 | fi
129 | if [ "$(cat shigeifinder_O_antigen.txt)" == "" ]; then
130 | echo "ShigEiFinder O antigen field was empty" > shigeifinder_O_antigen.txt
131 | fi
132 | if [ "$(cat shigeifinder_H_antigen.txt)" == "" ]; then
133 | echo "ShigEiFinder H antigen field was empty" > shigeifinder_H_antigen.txt
134 | fi
135 | if [ "$(cat shigeifinder_notes.txt)" == "" ]; then
136 | echo "ShigEiFinder notes field was empty" > shigeifinder_notes.txt
137 | fi
138 |
139 | >>>
140 | output {
141 | File shigeifinder_report = "~{samplename}_shigeifinder.tsv"
142 | String shigeifinder_docker = docker
143 | String shigeifinder_version = read_string("VERSION.txt")
144 | String shigeifinder_ipaH_presence_absence = read_string("shigeifinder_ipaH_presence_absence.txt")
145 | String shigeifinder_num_virulence_plasmid_genes = read_string("shigeifinder_num_virulence_plasmid_genes.txt")
146 | String shigeifinder_cluster = read_string("shigeifinder_cluster.txt")
147 | String shigeifinder_serotype = read_string("shigeifinder_serotype.txt")
148 | String shigeifinder_O_antigen = read_string("shigeifinder_O_antigen.txt")
149 | String shigeifinder_H_antigen = read_string("shigeifinder_H_antigen.txt")
150 | String shigeifinder_notes = read_string("shigeifinder_notes.txt")
151 | }
152 | runtime {
153 | docker: "~{docker}"
154 | memory: "8 GB"
155 | cpu: cpu
156 | disks: "local-disk " + disk_size + " SSD"
157 | disk: disk_size + " GB"
158 | preemptible: 0
159 | maxRetries: 3
160 | }
161 | }
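A single-end run of the read-based task with miniwdl might look like the following (paths are illustrative); setting paired_end=false makes the command above include --single_end:

    miniwdl run tasks/species_typing/task_shigeifinder.wdl --task shigeifinder_reads \
      read1=sample01.fastq.gz \
      samplename=sample01 \
      paired_end=false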
--------------------------------------------------------------------------------