├── tests ├── README.md ├── inputs │ ├── empty-for-test.txt │ ├── wf_theiaprok_illumina_pe_cromwell.json │ └── wf_theiaprok_illumina_pe.json └── config │ └── pytest_filter.yml ├── validation_files └── kleb_assembly_input.json ├── workflows ├── wf_bc_n_qc_pe_local-dev.wdl ├── wf_rasusa.wdl ├── wf_serotypefinder.wdl ├── fetch_sra_to_fastq.wdl ├── wf_gambit_query.wdl ├── wf_kraken2_se.wdl ├── wf_pmga.wdl ├── wf_mycosnp_consensus_assembly.wdl ├── wf_kraken2_pe.wdl ├── wf_mycosnp_tree.wdl ├── wf_kleborate.wdl ├── wf_amrfinderplus.wdl ├── wf_mashtree_fasta.wdl ├── wf_tbprofiler_ont.wdl ├── wf_tbprofiler_pe.wdl ├── de_novo_assembly.wdl ├── wf_ksnp3.wdl ├── ecoli_char.wdl ├── wf_read_QC_trim.wdl ├── wf_read_QC_trim_se.wdl ├── compile_ecoli_results.wdl └── wf_core_gene_snp.wdl ├── tasks ├── task_versioning.wdl ├── gene_typing │ ├── task_gamma.wdl │ ├── task_prokka.wdl │ ├── task_abricate.wdl │ ├── task_plasmidfinder.wdl │ ├── task_bakta.wdl │ └── task_resfinder.wdl ├── species_typing │ ├── task_ssuissero.wdl │ ├── task_hpsuissero.wdl │ ├── task_lissero.wdl │ ├── task_legsta.wdl │ ├── task_pasty.wdl │ ├── task_seroba.wdl │ ├── task_spatyper.wdl │ ├── task_sistr.wdl │ ├── task_serotypefinder.wdl │ ├── task_pmga.wdl │ ├── task_ectyper.wdl │ ├── task_pbptyper.wdl │ ├── task_staphopiasccmec.wdl │ ├── task_meningotype.wdl │ ├── task_seqsero2.wdl │ ├── task_hicap.wdl │ ├── task_emmtyper.wdl │ ├── task_genotyphi.wdl │ ├── task_shigatyper.wdl │ ├── task_ngmaster.wdl │ ├── task_sonneityping.wdl │ ├── task_ts_mlst.wdl │ ├── task_agrvate.wdl │ ├── task_poppunk_streppneumo.wdl │ ├── task_srst2_vibrio.wdl │ ├── task_kleborate.wdl │ └── task_shigeifinder.wdl ├── phylogenetic_inference │ ├── task_iqtree.wdl │ ├── task_mashtree.wdl │ ├── task_mycosnp_tree.wdl │ ├── task_ksnp3.wdl │ ├── task_snp_dists.wdl │ └── task_pirate.wdl ├── assembly │ ├── task_mycosnp_consensus_assembly.wdl │ └── task_shovill.wdl ├── quality_control │ ├── task_quast.wdl │ ├── task_busco.wdl │ ├── task_trimmomatic.wdl │ ├── task_bbduk.wdl │ ├── task_fastp.wdl │ ├── task_fastq_scan.wdl │ ├── task_fastqc.wdl │ ├── task_cg_pipeline.wdl │ └── task_mummer_ani.wdl ├── utilities │ ├── task_rasusa.wdl │ └── task_summarize_data.wdl └── taxon_id │ ├── task_midas.wdl │ └── task_kraken2.wdl ├── README.md ├── .github └── workflows │ ├── miniwdl-check.yml │ └── pytest-workflows.yml └── .dockstore.yml /tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing info 2 | -------------------------------------------------------------------------------- /tests/inputs/empty-for-test.txt: -------------------------------------------------------------------------------- 1 | This file is empty for test purposes. 
-------------------------------------------------------------------------------- /validation_files/kleb_assembly_input.json: -------------------------------------------------------------------------------- 1 | { 2 | "kleborate_wf.assembly": "./validation_files/GCF_000240185.1_ASM24018v2_genomic.fna", 3 | "kleborate_wf.samplename": "Sample1" 4 | } 5 | -------------------------------------------------------------------------------- /workflows/wf_bc_n_qc_pe_local-dev.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "wf_bc_n_qc_pe.wdl" as assembly 4 | 5 | workflow bc_n_qc_local { 6 | input { 7 | Array[Pair[Array[String], Pair[File,File]]] inputSamples 8 | } 9 | 10 | scatter (sample in inputSamples) { 11 | call assembly.bc_n_qc_pe { 12 | input: 13 | samplename = sample.left[0], 14 | read1_raw = sample.right.left, 15 | read2_raw = sample.right.right 16 | } 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /tests/inputs/wf_theiaprok_illumina_pe_cromwell.json: -------------------------------------------------------------------------------- 1 | { 2 | "theiaprok_illumina_pe.samplename": "test", 3 | "theiaprok_illumina_pe.read1_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R1.fastq.gz", 4 | "theiaprok_illumina_pe.read2_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R2.fastq.gz", 5 | "theiaprok_illumina_pe.skip_screen": true, 6 | "theiaprok_illumina_pe.read_QC_trim.call_midas": false, 7 | "theiaprok_illumina_pe.genome_annotation": "prokka", 8 | "theiaprok_illumina_pe.shovill_pe.assembler": "skesa", 9 | "theiaprok_illumina_pe.merlin_magic.call_poppunk": false 10 | } 11 | -------------------------------------------------------------------------------- /tasks/task_versioning.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task version_capture { 4 | input { 5 | String? 
timezone 6 | } 7 | meta { 8 | volatile: true 9 | } 10 | command { 11 | PHBG_Version="PHBG v1.3.0" 12 | ~{default='' 'export TZ=' + timezone} 13 | date +"%Y-%m-%d" > TODAY 14 | echo "$PHBG_Version" > PHBG_VERSION 15 | } 16 | output { 17 | String date = read_string("TODAY") 18 | String phbg_version = read_string("PHBG_VERSION") 19 | } 20 | runtime { 21 | memory: "1 GB" 22 | cpu: 1 23 | docker: "quay.io/theiagen/utility:1.1" 24 | disks: "local-disk 10 HDD" 25 | dx_instance_type: "mem1_ssd1_v2_x2" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_gamma.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task gamma_one_sample { 4 | input { 5 | File assembly_fasta 6 | String samplename 7 | String docker = "quay.io/biocontainers/gamma:1.4--hdfd78af_0" 8 | File gamma_database 9 | Int disk_size = 100 10 | } 11 | String database_name = basename(gamma_database) 12 | command <<< 13 | GAMMA.py ~{assembly_fasta} ~{gamma_database} ~{samplename} 14 | 15 | mv ~{samplename}.gamma ~{samplename}_gamma_report.tsv 16 | 17 | >>> 18 | output { 19 | File gamma_results = "~{samplename}_gamma_report.tsv" 20 | String gamma_database_version = database_name 21 | String gamma_docker = docker 22 | } 23 | runtime { 24 | memory: "8 GB" 25 | cpu: 4 26 | docker: "~{docker}" 27 | disks: "local-disk " + disk_size + " SSD" 28 | disk: disk_size + " GB" 29 | maxRetries: 3 30 | } 31 | } -------------------------------------------------------------------------------- /workflows/wf_rasusa.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/utilities/task_rasusa.wdl" as rasusa 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow rasusa_workflow { 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | Float coverage 12 | String genome_size 13 | } 14 | call rasusa.rasusa as rasusa_task { 15 | input: 16 | read1 = read1, 17 | read2 = read2, 18 | samplename = samplename, 19 | genome_size = genome_size, 20 | coverage = coverage 21 | } 22 | call versioning.version_capture{ 23 | input: 24 | } 25 | output { 26 | String rasusa_wf_version = version_capture.phbg_version 27 | String rasusa_wf_analysis_date = version_capture.date 28 | 29 | String rasusa_version = rasusa_task.rasusa_version 30 | File read1_subsampled = rasusa_task.read1_subsampled 31 | File? 
read2_subsampled = rasusa_task.read2_subsampled
 32 |   }
 33 | }
--------------------------------------------------------------------------------
/workflows/wf_serotypefinder.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/task_taxon_id.wdl" as taxon_ID
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | 
 6 | workflow serotypefinder {
 7 |   input {
 8 |     String samplename
 9 |     File ecoli_assembly
10 |   }
11 |   call taxon_ID.serotypefinder_one_sample {
12 |     input:
13 |       samplename = samplename,
14 |       ecoli_assembly = ecoli_assembly
15 |   }
16 |   call versioning.version_capture{
17 |     input:
18 |   }
19 |   output {
20 |     String serotypefinder_wf_version = version_capture.phbg_version
21 |     String serotypefinder_wf_analysis_date = version_capture.date
22 | 
23 |     String serotypefinder_report = serotypefinder_one_sample.serotypefinder_report
24 |     String serotypefinder_docker = serotypefinder_one_sample.serotypefinder_docker
25 |     String serotypefinder_serotype = serotypefinder_one_sample.serotypefinder_serotype
26 |   }
27 | }
28 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_ssuissero.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task ssuissero {
 4 |   meta {
 5 |     description: "Serotype prediction of Streptococcus suis assemblies"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/biocontainers/ssuissero:1.0.1--hdfd78af_0"
11 |     Int disk_size = 100
12 |     Int? cpu = 4
13 |     String version = "1.0.1"
14 |   }
15 |   command <<<
16 |     # SsuisSero does not output a version
17 |     echo ~{version} | tee VERSION
18 |     SsuisSero.sh \
19 |       -i ~{assembly} \
20 |       -o ./ \
21 |       -s ~{samplename} \
22 |       -x fasta \
23 |       -t ~{cpu}
24 |   >>>
25 |   output {
26 |     File ssuissero_results = "~{samplename}.tsv"
27 |     String ssuissero_version = read_string("VERSION")
28 |   }
29 |   runtime {
30 |     docker: "~{docker}"
31 |     memory: "8 GB"
32 |     cpu: 4
33 |     disks: "local-disk " + disk_size + " SSD"
34 |     disk: disk_size + " GB"
35 |     maxRetries: 3
36 |     preemptible: 0
37 |   }
38 | }
39 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_hpsuissero.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task hpsuissero {
 4 |   meta {
 5 |     description: "Serotype prediction of Haemophilus parasuis assemblies"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/biocontainers/hpsuissero:1.0.1--hdfd78af_0"
11 |     Int disk_size = 100
12 |     Int?
cpu = 4 13 | String version = "1.0.1" 14 | } 15 | command <<< 16 | # Does not output a version 17 | echo ~{version} | tee VERSION 18 | HpsuisSero.sh \ 19 | -i ~{assembly} \ 20 | -o ./ \ 21 | -s ~{samplename} \ 22 | -x fasta \ 23 | -t ~{cpu} 24 | >>> 25 | output { 26 | File hpsuissero_results = "~{samplename}.tsv" 27 | String hpsuissero_version = read_string("VERSION") 28 | } 29 | runtime { 30 | docker: "~{docker}" 31 | memory: "8 GB" 32 | cpu: 4 33 | disks: "local-disk " + disk_size + " SSD" 34 | disk: disk_size + " GB" 35 | maxRetries: 3 36 | preemptible: 0 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /workflows/fetch_sra_to_fastq.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow fetch_sra_to_fastq { 4 | 5 | input { 6 | String SRR 7 | } 8 | 9 | call prefetch_fastq_dump { 10 | input: 11 | sra_id=SRR 12 | } 13 | 14 | output { 15 | File read1 =prefetch_fastq_dump.read1 16 | File read2 =prefetch_fastq_dump.read2 17 | } 18 | } 19 | 20 | task prefetch_fastq_dump { 21 | 22 | input { 23 | String sra_id 24 | } 25 | 26 | command { 27 | prefetch --version | head -1 | tee VERSION 28 | prefetch ${sra_id} 29 | fastq-dump --version | head -1 | tee VERSION 30 | fastq-dump \ 31 | --gzip \ 32 | --split-files \ 33 | ${sra_id} 34 | } 35 | 36 | output { 37 | File read1="${sra_id}_1.fastq.gz" 38 | File read2="${sra_id}_2.fastq.gz" 39 | } 40 | 41 | runtime { 42 | docker: "quay.io/staphb/sratoolkit:2.9.2" 43 | memory: "8 GB" 44 | cpu: 2 45 | disks: "local-disk 100 SSD" 46 | preemptible: 1 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /workflows/wf_gambit_query.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_gambit.wdl" as gambit 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow gambit_query { 7 | input { 8 | File assembly_fasta 9 | String samplename 10 | } 11 | call gambit.gambit { 12 | input: 13 | assembly = assembly_fasta, 14 | samplename = samplename, 15 | } 16 | call versioning.version_capture { 17 | input: 18 | } 19 | output { 20 | String gambit_query_wf_version = version_capture.phbg_version 21 | String gambit_query_wf_analysis_date = version_capture.date 22 | #Taxon ID 23 | File gambit_report = gambit.gambit_report_file 24 | File gambit_closest_genomes = gambit.gambit_closest_genomes_file 25 | String gambit_predicted_taxon = gambit.gambit_predicted_taxon 26 | String gambit_predicted_taxon_rank = gambit.gambit_predicted_taxon_rank 27 | String gambit_version = gambit.gambit_version 28 | String gambit_db_version = gambit.gambit_db_version 29 | String gambit_docker = gambit.gambit_docker 30 | } 31 | } -------------------------------------------------------------------------------- /tests/config/pytest_filter.yml: -------------------------------------------------------------------------------- 1 | wf_theiaprok_illumina_pe: 2 | - workflows/wf_theiaprok_illumina_pe.wdl 3 | - tasks/assembly/task_shovill.wdl 4 | - tasks/quality_control/task_quast.wdl 5 | - tasks/quality_control/task_cg_pipeline.wdl 6 | - tasks/quality_control/task_screen.wdl 7 | - tasks/taxon_id/task_gambit.wdl 8 | - tasks/gene_typing/task_amrfinderplus.wdl 9 | - tasks/species_typing/task_ts_mlst.wdl 10 | - tasks/task_versioning.wdl 11 | - tasks/utilities/task_broad_terra_tools.wdl 12 | - workflows/wf_read_QC_trim.wdl 13 | - 
tasks/quality_control/task_trimmomatic.wdl
14 |   - tasks/quality_control/task_bbduk.wdl
15 |   - tasks/quality_control/task_fastq_scan.wdl
16 |   - workflows/wf_merlin_magic.wdl
17 |   - tasks/species_typing/task_serotypefinder.wdl
18 |   - tasks/species_typing/task_ectyper.wdl
19 |   - tasks/species_typing/task_lissero.wdl
20 |   - tasks/species_typing/task_sistr.wdl
21 |   - tasks/species_typing/task_seqsero2.wdl
22 |   - tasks/species_typing/task_kleborate.wdl
23 |   - tasks/species_typing/task_tbprofiler.wdl
24 |   - tasks/species_typing/task_legsta.wdl
25 |   - tasks/species_typing/task_genotyphi.wdl
26 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | **NOTE: WORKFLOWS FROM THIS REPOSITORY HAVE BEEN MIGRATED TO THE PUBLIC HEALTH BIOINFORMATICS (PHB) REPOSITORY. FUTURE DEVELOPMENTS AND UPDATES FOR THESE WORKFLOWS WILL OCCUR IN [https://github.com/theiagen/public_health_bioinformatics](https://github.com/theiagen/public_health_bioinformatics).**
 3 | 
 4 | ----
 5 | 
 6 | # Public Health Bacterial Genomics
 7 | 
 8 | Bioinformatics workflows for genomic characterization, submission preparation, and genomic epidemiology of bacterial pathogens of concern.
 9 | 
10 | **More information about the steps undertaken in these workflows is available via the [Theiagen Public Resources Documentation](https://theiagen.notion.site/Theiagen-Public-Health-Resources-a4bd134b0c5c4fe39870e21029a30566).**
11 | 
12 | # Note to Users
13 | This repository and the workflows within are in the early stages of development. We recommend using our stable version releases, as the main and development branches are subject to routine updates. Please contact support@terrapublichealth.zendesk.com if you would like to be added to our PHBG mailing list and
14 | receive updates and announcements regarding any resource associated with this repository.
15 | 
16 | 
17 | 
--------------------------------------------------------------------------------
/workflows/wf_kraken2_se.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/taxon_id/task_kraken2.wdl" as kraken2
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | 
 6 | workflow kraken2_se_wf {
 7 |   meta {
 8 |     description: "Classify single-end reads using Kraken2"
 9 |   }
10 | 
11 |   input {
12 |     String samplename
13 |     File read1
14 |     File kraken2_db
15 |   }
16 |   call kraken2.kraken2_se {
17 |     input:
18 |       samplename = samplename,
19 |       read1 = read1,
20 |       kraken2_db = kraken2_db
21 |   }
22 |   call versioning.version_capture{
23 |     input:
24 |   }
25 |   output {
26 |     # PHBG Version Captures
27 |     String kraken2_se_wf_version = version_capture.phbg_version
28 |     String kraken2_se_wf_analysis_date = version_capture.date
29 |     # Kraken2
30 |     String kraken2_version = kraken2_se.kraken2_version
31 |     String kraken2_docker = kraken2_se.kraken2_docker
32 |     File kraken2_report = kraken2_se.kraken2_report
33 |     File kraken2_classified_report = kraken2_se.kraken2_classified_report
34 |     File kraken2_unclassified_read1 = kraken2_se.kraken2_unclassified_read1
35 |     File kraken2_classified_read1 = kraken2_se.kraken2_classified_read1
36 |   }
37 | }
38 | 
--------------------------------------------------------------------------------
/workflows/wf_pmga.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/species_typing/task_pmga.wdl" as pmga
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | 
 6 | workflow pmga_wf {
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |   }
11 |   call pmga.pmga {
12 |     input:
13 |       assembly = assembly,
14 |       samplename = samplename
15 |   }
16 |   call versioning.version_capture{
17 |     input:
18 |   }
19 |   output {
20 |     String pmga_wf_version = version_capture.phbg_version
21 |     String pmga_wf_analysis_date = version_capture.date
22 |     String pmga_version = pmga.version
23 |     String pmga_docker = pmga.pmga_docker
24 |     String pmga_speciesdb = pmga.pmga_speciesdb
25 |     String pmga_serotype = pmga.pmga_serotype
26 |     String pmga_genes = pmga.pmga_genes
27 |     String pmga_notes = pmga.pmga_notes
28 |     File pmga_results = pmga.pmga_results
29 |     File pmga_allele_matrix = pmga.pmga_allele_matrix
30 |     File pmga_blast_final = pmga.pmga_blast_final
31 |     File pmga_blast_raw = pmga.pmga_blast_raw
32 |     File pmga_loci_counts = pmga.pmga_loci_counts
33 |     File pmga_gff = pmga.pmga_gff
34 |   }
35 | }
36 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_lissero.wdl:
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task lissero { 4 | meta { 5 | description: "Serogroup typing prediction for Listeria monocytogenes" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/lissero:0.4.9--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | 14 | # Parameters 15 | # --min_id Minimum percent identity to accept a match [Default 95.0] 16 | # --min_cov Minimum coverage of the gene to accept a match [Default 95.0] 17 | Float min_id = 95.0 18 | Float min_cov = 95.0 19 | } 20 | command <<< 21 | echo $(lissero --version 2>&1) | sed 's/^.*LisSero //' | tee VERSION 22 | lissero \ 23 | ~{'--min_id ' + min_id} \ 24 | ~{'--min_cov ' + min_cov} \ 25 | ~{assembly} \ 26 | > ~{samplename}.tsv 27 | 28 | # pull out serotype 29 | tail -n+2 ~{samplename}.tsv | cut -f2 | tee SEROTYPE 30 | >>> 31 | output { 32 | File lissero_results = "~{samplename}.tsv" 33 | String lissero_version = read_string("VERSION") 34 | String lissero_serotype = read_string("SEROTYPE") 35 | } 36 | runtime { 37 | docker: "~{docker}" 38 | memory: "8 GB" 39 | cpu: 2 40 | disks: "local-disk " + disk_size + " SSD" 41 | disk: disk_size + " GB" 42 | maxRetries: 3 43 | preemptible: 0 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /workflows/wf_mycosnp_consensus_assembly.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/assembly/task_mycosnp_consensus_assembly.wdl" as mycosnp_nf 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow mycosnp_consensus_assembly { 7 | meta { 8 | description: "A WDL wrapper around the qc, processing and consensus assembly components of mycosnp-nf, for whole genome sequencing analysis of fungal organisms, including Candida auris." 9 | } 10 | input { 11 | File read1 12 | File read2 13 | String samplename 14 | } 15 | call mycosnp_nf.mycosnp { 16 | input: 17 | read1 = read1, 18 | read2 = read2, 19 | samplename = samplename 20 | } 21 | call versioning.version_capture{ 22 | input: 23 | } 24 | output { 25 | #Version Captures 26 | String mycosnp_consensus_assembly_version = version_capture.phbg_version 27 | String mycosnp_consensus_assembly_analysis_date = version_capture.date 28 | #MycoSNP QC and Assembly 29 | String mycosnp_version = mycosnp.mycosnp_version 30 | String mycosnp_docker = mycosnp.mycosnp_docker 31 | String analysis_date = mycosnp.analysis_date 32 | String reference_strain = mycosnp.reference_strain 33 | String reference_accession = mycosnp.reference_accession 34 | File assembly_fasta = mycosnp.assembly_fasta 35 | File full_results = mycosnp.full_results 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /tasks/species_typing/task_legsta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task legsta { 4 | meta { 5 | description: "Typing of Legionella pneumophila assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/legsta:0.5.1--hdfd78af_2" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | } 14 | command <<< 15 | echo $(legsta --version 2>&1) | sed 's/^.*legsta //; s/ .*\$//;' | tee VERSION 16 | legsta \ 17 | ~{assembly} > ~{samplename}.tsv 18 | 19 | # parse outputs 20 | if [ ! 
-f ~{samplename}.tsv ]; then 21 | SBT="No SBT predicted" 22 | else 23 | SBT="ST$(tail -n 1 ~{samplename}.tsv | cut -f 2)" 24 | if [ "$SBT" == "ST-" ]; then 25 | SBT="No SBT predicted" 26 | else 27 | if [ "$SBT" == "ST" ]; then 28 | SBT="No SBT predicted" 29 | fi 30 | fi 31 | fi 32 | 33 | echo $SBT | tee LEGSTA_SBT 34 | 35 | >>> 36 | output { 37 | File legsta_results = "~{samplename}.tsv" 38 | String legsta_predicted_sbt = read_string("LEGSTA_SBT") 39 | String legsta_version = read_string("VERSION") 40 | } 41 | runtime { 42 | docker: "~{docker}" 43 | memory: "8 GB" 44 | cpu: 2 45 | disks: "local-disk " + disk_size + " SSD" 46 | disk: disk_size + " GB" 47 | maxRetries: 3 48 | preemptible: 0 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /workflows/wf_kraken2_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/taxon_id/task_kraken2.wdl" as kraken2 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow kraken2_pe_wf { 7 | meta { 8 | description: "Classify paired-end reads using Kraken2" 9 | } 10 | 11 | input { 12 | String samplename 13 | File read1 14 | File read2 15 | File kraken2_db 16 | } 17 | call kraken2.kraken2_pe { 18 | input: 19 | samplename = samplename, 20 | read1 = read1, 21 | read2 = read2, 22 | kraken2_db = kraken2_db 23 | } 24 | call versioning.version_capture{ 25 | input: 26 | } 27 | output { 28 | # PHBG Version Captures 29 | String kraken2_pe_wf_version = version_capture.phbg_version 30 | String kraken2_pe_wf_analysis_date = version_capture.date 31 | # Kraken2 32 | String kraken2_version = kraken2_pe.kraken2_version 33 | String kraken2_docker = kraken2_pe.kraken2_docker 34 | File kraken2_report = kraken2_pe.kraken2_report 35 | File kraken2_classified_report = kraken2_pe.kraken2_classified_report 36 | File kraken2_unclassified_read1 = kraken2_pe.kraken2_unclassified_read1 37 | File kraken2_unclassified_read2 = kraken2_pe.kraken2_unclassified_read2 38 | File kraken2_classified_read1 = kraken2_pe.kraken2_classified_read1 39 | File kraken2_classified_read2 = kraken2_pe.kraken2_classified_read2 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /workflows/wf_mycosnp_tree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_mycosnp_tree.wdl" as mycosnptree_nf 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow mycosnp_tree { 7 | meta { 8 | description: "A WDL wrapper around the phylogeny components of mycosnp-nf, for whole genome sequencing analysis of fungal organisms, including Candida auris." 
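    # A minimal inputs JSON for this wrapper might look like the following
    # (sample names and paths are hypothetical; assembly_fasta takes the
    # consensus FASTAs produced by the mycosnp_consensus_assembly workflow):
    #   {
    #     "mycosnp_tree.samplename": ["sample01", "sample02", "sample03"],
    #     "mycosnp_tree.assembly_fasta": ["sample01.fasta.gz", "sample02.fasta.gz", "sample03.fasta.gz"]
    #   }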
9 | } 10 | input { 11 | Array[String] samplename 12 | Array[File] assembly_fasta 13 | } 14 | call mycosnptree_nf.mycosnptree { 15 | input: 16 | assembly_fasta = assembly_fasta, 17 | samplename = samplename 18 | } 19 | call versioning.version_capture{ 20 | input: 21 | } 22 | output { 23 | #Version Captures 24 | String mycosnp_tree_version = version_capture.phbg_version 25 | String mycosnp_tree_analysis_date = version_capture.date 26 | #MycoSNP QC and Assembly 27 | String mycosnp_version = mycosnptree.mycosnptree_version 28 | String mycosnp_docker = mycosnptree.mycosnptree_docker 29 | String analysis_date = mycosnptree.analysis_date 30 | String reference_strain = mycosnptree.reference_strain 31 | String reference_accession = mycosnptree.reference_accession 32 | File mycosnp_tree_finaltree = mycosnptree.mycosnptree_tree 33 | File mycosnp_tree_iqtree_log = mycosnptree.mycosnptree_iqtree_log 34 | File mycosnp_tree_full_results = mycosnptree.mycosnptree_full_results 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /workflows/wf_kleborate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow kleborate_wf { 8 | input { 9 | File assembly 10 | String samplename 11 | } 12 | call taxon.kleborate_one_sample { 13 | input: 14 | assembly = assembly, 15 | samplename = samplename 16 | } 17 | call versioning.version_capture{ 18 | input: 19 | } 20 | output { 21 | String kleborate_wf_version = version_capture.phbg_version 22 | String kleborate_wf_analysis_date = version_capture.date 23 | 24 | File kleborate_report = kleborate_one_sample.kleborate_output_file 25 | String kleborate_version = kleborate_one_sample.version 26 | String kleborate_mlst_sequence_type = kleborate_one_sample.mlst_sequence_type 27 | String kleborate_virulence_score = kleborate_one_sample.virulence_score 28 | String kleborate_resistance_score = kleborate_one_sample.resistance_score 29 | String kleborate_num_resistance_genes = kleborate_one_sample.num_resistance_genes 30 | String kleborate_bla_resistance_genes = kleborate_one_sample.bla_resistance_genes 31 | String kleborate_esbl_resistance_genes = kleborate_one_sample.esbl_resistance_genes 32 | String kleborate_key_resistance_genes = kleborate_one_sample.key_resistance_genes 33 | String kleborate_resistance_mutations = kleborate_one_sample.resistance_mutations 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_iqtree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task iqtree { 4 | input { 5 | File alignment 6 | String cluster_name 7 | String iqtree_model = "GTR+I+G" # For comparison to other tools use HKY for bactopia, GTR+F+I for grandeur, GTR+G4 for nullarbor, GTR+G for dryad 8 | String iqtree_bootstraps = 1000 # Ultrafast bootstrap replicates 9 | String alrt = 1000 # SH-like approximate likelihood ratio test (SH-aLRT) replicates 10 | String? 
iqtree_opts = "" 11 | String docker = "staphb/iqtree:1.6.7" 12 | Int disk_size = 100 13 | } 14 | command <<< 15 | # date and version control 16 | date | tee DATE 17 | iqtree --version | grep version | sed 's/.*version/version/;s/ for Linux.*//' | tee VERSION 18 | 19 | numGenomes=`grep -o '>' ~{alignment} | wc -l` 20 | if [ $numGenomes -gt 3 ] 21 | then 22 | cp ~{alignment} ./msa.fasta 23 | iqtree \ 24 | -nt AUTO \ 25 | -s msa.fasta \ 26 | -m ~{iqtree_model} \ 27 | -bb ~{iqtree_bootstraps} \ 28 | -alrt ~{alrt} \ 29 | ~{iqtree_opts} 30 | 31 | cp msa.fasta.contree ~{cluster_name}_msa.tree 32 | fi 33 | >>> 34 | output { 35 | String date = read_string("DATE") 36 | String version = read_string("VERSION") 37 | File ml_tree = "~{cluster_name}_msa.tree" 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "32 GB" 42 | cpu: 4 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | preemptible: 0 46 | maxRetries: 3 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_prokka.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task prokka { 4 | input { 5 | File assembly 6 | String samplename 7 | Int cpu = 8 8 | Int memory = 16 9 | String docker = "staphb/prokka:1.14.5" 10 | Int disk_size = 100 11 | # Parameters 12 | # proteins recommended: when you have good quality reference genomes and want to ensure gene naming is consistent [false] 13 | # prodigal_tf: prodigal training file 14 | # prokka_arguments: free string to add any other additional prokka arguments 15 | Boolean proteins = false 16 | Boolean compliant = true 17 | File? prodigal_tf 18 | String? prokka_arguments 19 | } 20 | command <<< 21 | date | tee DATE 22 | prokka --version | tee PROKKA_VERSION 23 | 24 | prokka \ 25 | ~{prokka_arguments} \ 26 | --cpus 0 \ 27 | --prefix ~{samplename} \ 28 | ~{true='--compliant' false='' compliant} \ 29 | ~{true='--proteins' false='' proteins} \ 30 | ~{'--prodigaltf ' + prodigal_tf} \ 31 | ~{assembly} 32 | 33 | 34 | >>> 35 | output { 36 | File prokka_gff = "~{samplename}/~{samplename}.gff" 37 | File prokka_gbk = "~{samplename}/~{samplename}.gbk" 38 | File prokka_sqn = "~{samplename}/~{samplename}.sqn" 39 | Array[File] prokka_outs = glob("~{samplename}/~{samplename}*") 40 | String prokka_version = read_string("PROKKA_VERSION") 41 | } 42 | runtime { 43 | memory: "~{memory} GB" 44 | cpu: cpu 45 | docker: docker 46 | disks: "local-disk " + disk_size + " SSD" 47 | disk: disk_size + " GB" 48 | maxRetries: 3 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /workflows/wf_amrfinderplus.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/gene_typing/task_amrfinderplus.wdl" as amrfindertask 4 | import "../tasks/task_versioning.wdl" as versioning 5 | 6 | workflow amrfinderplus_wf { 7 | input { 8 | File assembly 9 | String samplename 10 | } 11 | call amrfindertask.amrfinderplus_nuc { 12 | input: 13 | assembly = assembly, 14 | samplename = samplename 15 | } 16 | call versioning.version_capture{ 17 | input: 18 | } 19 | output { 20 | String amrfinderplus_version = amrfinderplus_nuc.amrfinderplus_version 21 | String amrfinderplus_db_version = amrfinderplus_nuc.amrfinderplus_db_version 22 | String amrfinderplus_wf_version = version_capture.phbg_version 23 | String amrfinderplus_wf_analysis_date = version_capture.date 24 | File 
amrfinderplus_all_report = amrfinderplus_nuc.amrfinderplus_all_report 25 | File amrfinderplus_amr_report = amrfinderplus_nuc.amrfinderplus_amr_report 26 | File amrfinderplus_stress_report = amrfinderplus_nuc.amrfinderplus_stress_report 27 | File amrfinderplus_virulence_report = amrfinderplus_nuc.amrfinderplus_virulence_report 28 | String amrfinderplus_amr_genes = amrfinderplus_nuc.amrfinderplus_amr_genes 29 | String amrfinderplus_stress_genes = amrfinderplus_nuc.amrfinderplus_stress_genes 30 | String amrfinderplus_virulence_genes = amrfinderplus_nuc.amrfinderplus_virulence_genes 31 | String amrfinderplus_amr_classes = amrfinderplus_nuc.amrfinderplus_amr_classes 32 | String amrfinderplus_amr_subclasses = amrfinderplus_nuc.amrfinderplus_amr_subclasses 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_mashtree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task mashtree_fasta { 4 | input { 5 | Array[File] assembly_fasta 6 | String cluster_name 7 | Int truncLength = 250 8 | String sort_order = "ABC" 9 | Int genomesize = 5000000 10 | Int mindepth = 5 11 | Int kmerlength = 21 12 | Int sketchsize = 10000 13 | Int cpu = 16 14 | Int memory = 64 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | # date and version control 19 | date | tee DATE 20 | mashtree -v | tee VERSION 21 | 22 | # organize input assemblies 23 | mkdir mash_assemblies 24 | mv ~{sep=' ' assembly_fasta} mash_assemblies 25 | #run mashtree 26 | mashtree \ 27 | ~{'--truncLength ' + truncLength} \ 28 | ~{'--sort-order ' + sort_order} \ 29 | ~{'--genomesize ' + genomesize} \ 30 | ~{'--mindepth ' + mindepth} \ 31 | ~{'--kmerlength ' + kmerlength} \ 32 | ~{'--sketch-size ' + sketchsize} \ 33 | ~{'--numcpus ' + cpu} \ 34 | ~{'--outmatrix ' + cluster_name + '.tsv'} \ 35 | ~{'--outtree ' + cluster_name + '.nwk'} \ 36 | mash_assemblies/* 37 | 38 | >>> 39 | output { 40 | String date = read_string("DATE") 41 | String version = read_string("VERSION") 42 | File mashtree_matrix = "~{cluster_name}.tsv" 43 | File mashtree_tree = "~{cluster_name}.nwk" 44 | } 45 | runtime { 46 | docker: "quay.io/staphb/mashtree:1.2.0" 47 | memory: "~{memory} GB" 48 | cpu: cpu 49 | disks: "local-disk " + disk_size + " SSD" 50 | disk: disk_size + " GB" 51 | maxRetries: 3 52 | preemptible: 0 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /tasks/species_typing/task_pasty.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pasty { 4 | input { 5 | File assembly 6 | String samplename 7 | Int min_pident = 95 8 | Int min_coverage = 95 9 | String docker = "staphb/pasty:1.0.2" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # date and version control 14 | date | tee DATE 15 | pasty --version > VERSION && sed -i -e 's/pasty\, version //' VERSION 16 | pasty \ 17 | --assembly ~{assembly} \ 18 | --min_pident ~{min_pident} \ 19 | --min_coverage ~{min_coverage} \ 20 | --prefix ~{samplename} \ 21 | --outdir . 
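    # split the per-sample summary row (line 2 of the TSV) into one file per
    # field for the outputs below; the column order (serogroup, coverage,
    # fragments, comment in fields 2-5) is assumed from pasty's summary layout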
22 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f2 > SEROGROUP 23 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f3 > COVERAGE 24 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f4 > FRAGMENTS 25 | awk 'FNR==2' "~{samplename}.tsv" | cut -d$'\t' -f5 > COMMENT 26 | >>> 27 | output { 28 | String pasty_serogroup = read_string("SEROGROUP") 29 | Float pasty_serogroup_coverage = read_float("COVERAGE") 30 | Int pasty_serogroup_fragments = read_int("FRAGMENTS") 31 | File pasty_summary_tsv = "~{samplename}.tsv" 32 | File pasty_blast_hits = "~{samplename}.blastn.tsv" 33 | File pasty_all_serogroups = "~{samplename}.details.tsv" 34 | String pasty_version = read_string("VERSION") 35 | String pasty_pipeline_date = read_string("DATE") 36 | String pasty_docker = docker 37 | String pasty_comment = read_string("COMMENT") 38 | } 39 | runtime { 40 | docker: "~{docker}" 41 | memory: "4 GB" 42 | cpu: 2 43 | disks: "local-disk " + disk_size + " SSD" 44 | disk: disk_size + " GB" 45 | maxRetries: 3 46 | preemptible: 0 47 | } 48 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_seroba.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task seroba { 4 | input { 5 | File read1 6 | File? read2 7 | String samplename 8 | String docker = "staphb/seroba:1.0.2" 9 | Int disk_size = 100 10 | } 11 | command <<< 12 | # grab version 13 | seroba version > VERSION 14 | 15 | # database path will need to be changed if/when docker image is updated. 16 | seroba runSerotyping /seroba-1.0.2/database/ ~{read1} ~{read2} ~{samplename} 17 | 18 | # check for serotype grouping & contamination flag 19 | cut -f2 ~{samplename}/pred.tsv > SEROTYPE 20 | 21 | # check for detailed serogroup information 22 | if [ -f ~{samplename}/detailed_serogroup_info.txt ]; then 23 | grep "Serotype predicted by ariba" ~{samplename}/detailed_serogroup_info.txt | cut -f2 | sed 's/://' > ARIBA_SEROTYPE 24 | grep "assembly from ariba" ~{samplename}/detailed_serogroup_info.txt | cut -f2 | sed 's/://' > ARIBA_IDENTITY 25 | else 26 | # if the details do not exist, output blanks to ariba columns 27 | echo "" > ARIBA_SEROTYPE 28 | echo "" > ARIBA_IDENTITY 29 | fi 30 | >>> 31 | output { 32 | String seroba_version = read_string("VERSION") 33 | String seroba_docker = docker 34 | String seroba_serotype = read_string("SEROTYPE") 35 | String seroba_ariba_serotype = read_string("ARIBA_SEROTYPE") 36 | String seroba_ariba_identity = read_string("ARIBA_IDENTITY") 37 | File? 
seroba_details = "~{samplename}/detailed_serogroup_info.txt"
38 |   }
39 |   runtime {
40 |     docker: "~{docker}"
41 |     memory: "16 GB"
42 |     cpu: 8
43 |     disks: "local-disk " + disk_size + " SSD"
44 |     disk: disk_size + " GB"
45 |     maxRetries: 3
46 |   }
47 | }
--------------------------------------------------------------------------------
/tasks/assembly/task_mycosnp_consensus_assembly.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task mycosnp {
 4 |   input {
 5 |     File read1
 6 |     File read2
 7 |     String samplename
 8 |     String docker = "quay.io/theiagen/mycosnp:dev"
 9 |     String strain = "B11205"
10 |     String accession = "GCA_016772135"
11 |     Int memory = 16
12 |     Int cpu = 4
13 |     Int disk_size = 100
14 |   }
15 |   command <<<
16 |     date | tee DATE
17 |     echo $(nextflow pull rpetit3/mycosnp-nf 2>&1) | sed 's/^.*revision: //;' | tee MYCOSNP_VERSION
18 | 
19 |     # Make sample FOFN
20 |     echo "sample,fastq_1,fastq_2" > sample.csv
21 |     echo "~{samplename},~{read1},~{read2}" >> sample.csv
22 | 
23 |     # Run MycoSNP
24 |     mkdir ~{samplename}
25 |     cd ~{samplename}
26 |     if nextflow run rpetit3/mycosnp-nf --input ../sample.csv --ref_dir /reference/~{accession} --publish_dir_mode copy --skip_phylogeny; then
27 |       # Everything finished, pack up the results and clean up
28 |       rm -rf .nextflow/ work/
29 |       cd ..
30 |       tar -cf - ~{samplename}/ | gzip -n --best > ~{samplename}.tar.gz
31 |     else
32 |       # Run failed
33 |       exit 1
34 |     fi
35 |   >>>
36 |   output {
37 |     String mycosnp_version = read_string("MYCOSNP_VERSION")
38 |     String mycosnp_docker = docker
39 |     String analysis_date = read_string("DATE")
40 |     String reference_strain = strain
41 |     String reference_accession = accession
42 |     File assembly_fasta = "~{samplename}/results/combined/consensus/~{samplename}.fasta.gz"
43 |     File full_results = "~{samplename}.tar.gz"
44 |   }
45 |   runtime {
46 |     docker: "~{docker}"
47 |     memory: "~{memory} GB"
48 |     cpu: cpu
49 |     disks: "local-disk " + disk_size + " SSD"
50 |     disk: disk_size + " GB"
51 |     maxRetries: 3
52 |     preemptible: 0
53 |   }
54 | }
55 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_spatyper.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task spatyper {
 4 |   meta {
 5 |     description: "Computational method for finding spa types in Staphylococcus aureus"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/biocontainers/spatyper:0.3.3--pyhdfd78af_3"
11 |     Int disk_size = 100
12 |     Int cpu = 4
13 | 
14 |     # Parameters
15 |     # --do_enrich    Do PCR product enrichment
16 |     Boolean do_enrich = false
17 |   }
18 |   command <<<
19 |     spaTyper --version 2>&1 | sed 's/^.*spaTyper //' | tee VERSION
20 |     spaTyper \
21 |       ~{true="--do_enrich" false="" do_enrich} \
22 |       --fasta ~{assembly} \
23 |       --output ~{samplename}.tsv
24 | 
25 |     python3 <<CODE
26 | import csv
27 | 
28 | # parse the spaTyper TSV; the "Repeats" and "Type" column headers are
29 | # assumed from spaTyper's default output, and each value is written to
30 | # its own file for the read_string() outputs below
31 | with open("./~{samplename}.tsv", 'r') as tsv_file:
32 |   tsv_reader = csv.DictReader(tsv_file, delimiter="\t")
33 |   for line in tsv_reader:
34 |     with open("REPEATS", 'wt') as repeats:
35 |       repeats.write(line["Repeats"])
36 |     with open("TYPE", 'wt') as spa_type:
37 |       spa_type.write(line["Type"])
38 | CODE
39 | 
40 |     # guard against an empty spaTyper result so read_string() succeeds
41 |     if [ ! -f REPEATS ]; then echo "" > REPEATS; fi
42 |     if [ ! -f TYPE ]; then echo "" > TYPE; fi
43 | 
44 |   >>>
45 |   output {
46 |     File spatyper_tsv = "~{samplename}.tsv"
47 |     String spatyper_repeats = read_string("REPEATS")
48 |     String spatyper_type = read_string("TYPE")
49 |     String spatyper_version = read_string("VERSION")
50 |     String spatyper_docker = "~{docker}"
51 |   }
52 |   runtime {
53 |     docker: "~{docker}"
54 |     memory: "8 GB"
55 |     cpu: cpu
56 |     disks: "local-disk " + disk_size + " SSD"
57 |     disk: disk_size + " GB"
58 |     maxRetries: 3
59 |     preemptible: 0
60 |   }
61 | }
62 | 
--------------------------------------------------------------------------------
/tasks/gene_typing/task_abricate.wdl:
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task abricate { 4 | input { 5 | File assembly 6 | String samplename 7 | String database 8 | # Parameters 9 | # --minid Minimum DNA %identity [80] 10 | # --mincov Minimum DNA %coverage [80] 11 | Int? minid 12 | Int? mincov 13 | Int cpu = 2 14 | String docker = "staphb/abricate:1.0.1-abaum-plasmid" 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | date | tee DATE 19 | abricate -v | tee ABRICATE_VERSION 20 | abricate --list 21 | abricate --check 22 | 23 | abricate \ 24 | --db ~{database} \ 25 | ~{'--minid ' + minid} \ 26 | ~{'--mincov ' + mincov} \ 27 | --threads ~{cpu} \ 28 | --nopath \ 29 | ~{assembly} > ~{samplename}_abricate_hits.tsv 30 | 31 | # parse out gene names into list of strings, comma-separated, final comma at end removed by sed 32 | abricate_genes=$(awk -F '\t' '{ print $6 }' ~{samplename}_abricate_hits.tsv | tail -n+2 | tr '\n' ',' | sed 's/.$//') 33 | 34 | # if variable for list of genes is EMPTY, write string saying it is empty to float to Terra table 35 | if [ -z "${abricate_genes}" ]; then 36 | abricate_genes="No genes detected by ABRicate" 37 | fi 38 | 39 | # create final output strings 40 | echo "${abricate_genes}" > ABRICATE_GENES 41 | >>> 42 | output { 43 | File abricate_results = "~{samplename}_abricate_hits.tsv" 44 | String abricate_genes = read_string("ABRICATE_GENES") 45 | String abricate_database = database 46 | String abricate_version = read_string("ABRICATE_VERSION") 47 | String abricate_docker = docker 48 | } 49 | runtime { 50 | memory: "8 GB" 51 | cpu: cpu 52 | docker: docker 53 | disks: "local-disk " + disk_size + " SSD" 54 | disk: disk_size + " GB" 55 | maxRetries: 3 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /tasks/species_typing/task_sistr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task sistr { 4 | meta { 5 | description: "Serovar prediction of Salmonella assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/sistr_cmd:1.1.1--pyh864c0ab_2" 11 | Int disk_size = 100 12 | Int? cpu = 4 13 | 14 | # Parameters 15 | # --use-full-cgmlst-db Use the full set of cgMLST alleles which can include highly similar alleles. By default the smaller "centroid" alleles or representative alleles are used for each marker. 
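    #   (the full allele set is larger, so expect longer runtimes; the
    #   default centroid set is generally sufficient for serovar calls)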
16 |     Boolean use_full_cgmlst_db = false
17 |   }
18 |   command <<<
19 |     echo $(sistr --version 2>&1) | sed 's/^.*sistr_cmd //; s/ .*\$//' | tee VERSION
20 |     sistr \
21 |       --qc \
22 |       ~{true="--use-full-cgmlst-db" false="" use_full_cgmlst_db} \
23 |       --threads ~{cpu} \
24 |       --alleles-output ~{samplename}-allele.json \
25 |       --novel-alleles ~{samplename}-allele.fasta \
26 |       --cgmlst-profiles ~{samplename}-cgmlst.csv \
27 |       --output-prediction ~{samplename} \
28 |       --output-format tab \
29 |       ~{assembly}
30 | 
31 |     mv ~{samplename}.tab ~{samplename}.tsv
32 | 
33 |     # parse the serovar prediction (column 15) out of the sistr TSV
34 |     cut -f 15 ~{samplename}.tsv | tail -n 1 | tee PREDICTED_SEROTYPE
35 | 
36 |   >>>
37 |   output {
38 |     File sistr_results = "~{samplename}.tsv"
39 |     File sistr_allele_json = "~{samplename}-allele.json"
40 |     File sistr_allele_fasta = "~{samplename}-allele.fasta"
41 |     File sistr_cgmlst = "~{samplename}-cgmlst.csv"
42 |     String sistr_version = read_string("VERSION")
43 |     String sistr_predicted_serotype = read_string("PREDICTED_SEROTYPE")
44 |   }
45 |   runtime {
46 |     docker: "~{docker}"
47 |     memory: "8 GB"
48 |     cpu: 4
49 |     disks: "local-disk " + disk_size + " SSD"
50 |     disk: disk_size + " GB"
51 |     maxRetries: 3
52 |     preemptible: 0
53 |   }
54 | }
55 | 
--------------------------------------------------------------------------------
/tasks/species_typing/task_serotypefinder.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task serotypefinder {
 4 |   input {
 5 |     File assembly
 6 |     String samplename
 7 |     String docker = "quay.io/staphb/serotypefinder:2.0.1"
 8 |     Int disk_size = 100
 9 |   }
10 |   command <<<
11 |     # capture date and version
12 |     date | tee DATE
13 | 
14 |     serotypefinder.py -i ~{assembly} -x -o .
15 |     mv results_tab.tsv ~{samplename}_results_tab.tsv
16 | 
17 |     # set H and O type based on serotypefinder outputs
18 |     python3 <<CODE
19 | import csv
20 | import re
21 | 
22 | # collect each antigen call from the results table; the "Serotype"
23 | # column header is assumed from SerotypeFinder's results_tab output
24 | antigens = []
25 | h_re = re.compile("H[0-9]*$")
26 | o_re = re.compile("O[0-9]*$")
27 | with open("~{samplename}_results_tab.tsv", 'r') as tsv_file:
28 |   tsv_reader = csv.DictReader(tsv_file, delimiter="\t")
29 |   for row in tsv_reader:
30 |     if row.get("Serotype") not in antigens:
31 |       antigens.append(row.get("Serotype"))
32 | print("Antigens: " + str(antigens))
33 | 
34 | # merge all H-antigen and O-antigen hits into slash-delimited strings
35 | h_type = "/".join(sorted(set(filter(h_re.match, antigens))))
36 | o_type = "/".join(sorted(set(filter(o_re.match, antigens))))
37 | print("H type: " + h_type)
38 | print("O type: " + o_type)
39 | 
40 | # combine into an O:H serotype; report NA when neither antigen was found
41 | serotype = "{}:{}".format(o_type, h_type)
42 | if serotype == ":":
43 |   serotype = "NA"
44 | with open("STF_SEROTYPE", 'wt') as stf_serotype:
45 |   stf_serotype.write(str(serotype))
46 | CODE
47 |   >>>
47 |   output {
48 |     File serotypefinder_report = "~{samplename}_results_tab.tsv"
49 |     String serotypefinder_docker = docker
50 |     String serotypefinder_serotype = read_string("STF_SEROTYPE")
51 |   }
52 |   runtime {
53 |     docker: "~{docker}"
54 |     memory: "8 GB"
55 |     cpu: 2
56 |     disks: "local-disk " + disk_size + " SSD"
57 |     disk: disk_size + " GB"
58 |     maxRetries: 3
59 |     preemptible: 0
60 |   }
61 | }
--------------------------------------------------------------------------------
/tasks/quality_control/task_quast.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task quast {
 4 |   input {
 5 |     File assembly
 6 |     String samplename
 7 |     String docker = "quay.io/staphb/quast:5.0.2"
 8 |     Int disk_size = 100
 9 |   }
10 |   command <<<
11 |     # capture date and version
12 |     date | tee DATE
13 |     quast.py --version | grep QUAST | tee VERSION
14 | 
15 |     quast.py ~{assembly} -o .
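    # QUAST always writes a fixed-name report.tsv in the output directory;
    # the rename below scopes it to the sample so downstream parsing and
    # delocalization are unambiguous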
16 |     mv report.tsv ~{samplename}_report.tsv
17 | 
18 |     python <<CODE
19 | import csv
20 | 
21 | # grab genome length, contig count, N50 and GC% by row label; the labels
22 | # ("Total length", "# contigs", "N50", "GC (%)") are assumed from the
23 | # layout of QUAST's report.tsv
24 | with open("~{samplename}_report.tsv", 'r') as tsv_file:
25 |   tsv_reader = csv.reader(tsv_file, delimiter="\t")
26 |   for line in tsv_reader:
27 |     if line[0] == "Total length":
28 |       with open("GENOME_LENGTH", 'wt') as genome_length:
29 |         genome_length.write(line[1])
30 |     if line[0] == "# contigs":
31 |       with open("NUMBER_CONTIGS", 'wt') as number_contigs:
32 |         number_contigs.write(line[1])
33 |     if line[0] == "N50":
34 |       with open("N50_VALUE", 'wt') as n50_value:
35 |         n50_value.write(line[1])
36 |     if line[0] == "GC (%)":
37 |       with open("GC_PERCENT", 'wt') as gc_percent:
38 |         gc_percent.write(line[1])
39 | CODE
40 |   >>>
40 |   output {
41 |     File quast_report = "~{samplename}_report.tsv"
42 |     String version = read_string("VERSION")
43 |     String pipeline_date = read_string("DATE")
44 |     Int genome_length = read_int("GENOME_LENGTH")
45 |     Int number_contigs = read_int("NUMBER_CONTIGS")
46 |     Int n50_value = read_int("N50_VALUE")
47 |     Float gc_percent = read_float("GC_PERCENT")
48 |   }
49 |   runtime {
50 |     docker: "~{docker}"
51 |     memory: "2 GB"
52 |     cpu: 2
53 |     disks: "local-disk " + disk_size + " SSD"
54 |     disk: disk_size + " GB"
55 |     maxRetries: 3
56 |     preemptible: 0
57 |   }
58 | }
--------------------------------------------------------------------------------
/tasks/species_typing/task_pmga.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | task pmga {
 4 |   meta {
 5 |     description: "Serogrouping and serotyping of all Neisseria species and Haemophilus influenzae"
 6 |   }
 7 |   input {
 8 |     File assembly
 9 |     String samplename
10 |     String docker = "quay.io/staphb/pmga:3.0.2"
11 |     Int disk_size = 100
12 |     Int? cpu = 4
13 |   }
14 |   command <<<
15 |     echo $(pmga --version 2>&1) | sed 's/.*pmga //; s/ .*\$//' | tee VERSION
16 |     pmga \
17 |       ~{assembly} \
18 |       --blastdir /data/blastdbs \
19 |       --threads ~{cpu} \
20 |       --prefix ~{samplename}
21 |     # Parse pmga TSV
22 |     # https://github.com/rpetit3/pmga#pmga-output-files
23 |     cut -f 2 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_SPECIESDB
24 |     cut -f 3 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_SEROTYPE
25 |     cut -f 4 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_GENES
26 |     cut -f 5 pmga/~{samplename}.txt | tail -n 1 | tee PMGA_NOTES
27 |   >>>
28 |   output {
29 |     String version = read_string("VERSION")
30 |     String pmga_docker = "~{docker}"
31 |     String pmga_speciesdb = read_string("PMGA_SPECIESDB")
32 |     String pmga_serotype = read_string("PMGA_SEROTYPE")
33 |     String pmga_genes = read_string("PMGA_GENES")
34 |     String pmga_notes = read_string("PMGA_NOTES")
35 |     File pmga_results = "./pmga/~{samplename}.txt"
36 |     File pmga_allele_matrix = "./pmga/~{samplename}-allele-matrix.txt"
37 |     File pmga_blast_final = "./pmga/~{samplename}-blast-final-results.json.gz"
38 |     File pmga_blast_raw = "./pmga/~{samplename}-blast-raw-results.json.gz"
39 |     File pmga_loci_counts = "./pmga/~{samplename}-loci-counts.txt"
40 |     File pmga_gff = "./pmga/~{samplename}.gff.gz"
41 |   }
42 |   runtime {
43 |     docker: "~{docker}"
44 |     memory: "8 GB"
45 |     cpu: 4
46 |     disks: "local-disk " + disk_size + " SSD"
47 |     disk: disk_size + " GB"
48 |     maxRetries: 3
49 |     preemptible: 0
50 |   }
51 | }
52 | 
--------------------------------------------------------------------------------
/workflows/wf_mashtree_fasta.wdl:
--------------------------------------------------------------------------------
 1 | version 1.0
 2 | 
 3 | import "../tasks/phylogenetic_inference/task_mashtree.wdl" as mashtree
 4 | import "../tasks/task_versioning.wdl" as versioning
 5 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary
 6 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists
 7 | 
 8 | 
 9 | workflow mashtree_fasta {
10 |   input {
11 |     Array[File] assembly_fasta
12 |     String cluster_name
13 |     Array[String]? sample_names
14 |     String? data_summary_terra_project
15 |     String? data_summary_terra_workspace
16 |     String? data_summary_terra_table
17 |     String?
data_summary_column_names 18 | } 19 | call mashtree.mashtree_fasta as mashtree_task { 20 | input: 21 | assembly_fasta = assembly_fasta, 22 | cluster_name = cluster_name 23 | } 24 | call snp_dists.reorder_matrix { 25 | input: 26 | input_tree = mashtree_task.mashtree_tree, 27 | matrix = mashtree_task.mashtree_matrix, 28 | cluster_name = cluster_name 29 | } 30 | if (defined(data_summary_column_names)) { 31 | call data_summary.summarize_data { 32 | input: 33 | sample_names = sample_names, 34 | terra_project = data_summary_terra_project, 35 | terra_workspace = data_summary_terra_workspace, 36 | terra_table = data_summary_terra_table, 37 | column_names = data_summary_column_names, 38 | output_prefix = cluster_name 39 | } 40 | } 41 | call versioning.version_capture{ 42 | input: 43 | } 44 | output { 45 | # Versioning 46 | String mashtree_wf_version = version_capture.phbg_version 47 | String mashtree_wf_analysis_date = version_capture.date 48 | # Masthree Out 49 | File mashtree_matrix = reorder_matrix.ordered_matrix 50 | File mashtree_tree = reorder_matrix.tree 51 | String mashtree_version = mashtree_task.version 52 | # Data Summary Out 53 | File? mashtree_summarized_data = summarize_data.summarized_data 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /workflows/wf_tbprofiler_ont.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow tbprofiler_wf { 8 | input { 9 | File reads 10 | String samplename 11 | String? mapper = "minimap2" 12 | String? caller = "bcftools" 13 | Int? min_depth = 20 14 | Float? min_af = 0.1 15 | Float? min_af_pred = 0.1 16 | Int? 
cov_frac_threshold = 1 17 | } 18 | call taxon.tbprofiler_one_sample_ont { 19 | input: 20 | reads = reads, 21 | samplename = samplename, 22 | mapper = mapper, 23 | caller = caller, 24 | min_depth = min_depth, 25 | min_af = min_af, 26 | min_af_pred = min_af_pred, 27 | cov_frac_threshold = cov_frac_threshold 28 | } 29 | call versioning.version_capture{ 30 | input: 31 | } 32 | output { 33 | String tb_profiler_wf_version = version_capture.phbg_version 34 | String tb_profiler_wf_analysis_date = version_capture.date 35 | File tbprofiler_output_alignment_bam = tbprofiler_one_sample_ont.tbprofiler_output_bam 36 | File tbprofiler_output_alignment_bai = tbprofiler_one_sample_ont.tbprofiler_output_bai 37 | File tb_profiler_report_csv = tbprofiler_one_sample_ont.tbprofiler_output_csv 38 | File tb_profiler_report_tsv =tbprofiler_one_sample_ont.tbprofiler_output_tsv 39 | String tb_profiler_version = tbprofiler_one_sample_ont.version 40 | String tb_profiler_main_lineage = tbprofiler_one_sample_ont.tb_profiler_main_lineage 41 | String tb_profiler_sub_lineage = tbprofiler_one_sample_ont.tb_profiler_sub_lineage 42 | String tb_profiler_dr_type = tbprofiler_one_sample_ont.tb_profiler_dr_type 43 | String tb_profiler_num_dr_variants = tbprofiler_one_sample_ont.tb_profiler_num_dr_variants 44 | String tb_profiler_num_other_variants = tbprofiler_one_sample_ont.tb_profiler_num_other_variants 45 | String tb_profiler_resistance_genes = tbprofiler_one_sample_ont.tb_profiler_resistance_genes 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /workflows/wf_tbprofiler_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | 4 | import "../tasks/task_taxon_id.wdl" as taxon 5 | import "../tasks/task_versioning.wdl" as versioning 6 | 7 | workflow tbprofiler_wf { 8 | input { 9 | File read1 10 | File read2 11 | String samplename 12 | String? mapper = "bwa" 13 | String? caller = "bcftools" 14 | Int? min_depth = 10 15 | Float? min_af = 0.1 16 | Float? min_af_pred = 0.1 17 | Int? 
cov_frac_threshold = 1 18 | } 19 | call taxon.tbprofiler_one_sample_pe { 20 | input: 21 | read1 = read1, 22 | read2 = read2, 23 | samplename = samplename, 24 | mapper = mapper, 25 | caller = caller, 26 | min_depth = min_depth, 27 | min_af = min_af, 28 | min_af_pred = min_af_pred, 29 | cov_frac_threshold = cov_frac_threshold 30 | } 31 | call versioning.version_capture{ 32 | input: 33 | } 34 | output { 35 | String tb_profiler_wf_version = version_capture.phbg_version 36 | String tb_profiler_wf_analysis_date = version_capture.date 37 | File tb_profiler_report_csv = tbprofiler_one_sample_pe.tbprofiler_output_csv 38 | File tb_profiler_report_tsv = tbprofiler_one_sample_pe.tbprofiler_output_tsv 39 | File tbprofiler_output_alignment_bam = tbprofiler_one_sample_pe.tbprofiler_output_bam 40 | File tbprofiler_output_alignment_bai = tbprofiler_one_sample_pe.tbprofiler_output_bai 41 | String tb_profiler_version = tbprofiler_one_sample_pe.version 42 | String tb_profiler_main_lineage = tbprofiler_one_sample_pe.tb_profiler_main_lineage 43 | String tb_profiler_sub_lineage = tbprofiler_one_sample_pe.tb_profiler_sub_lineage 44 | String tb_profiler_dr_type = tbprofiler_one_sample_pe.tb_profiler_dr_type 45 | String tb_profiler_num_dr_variants = tbprofiler_one_sample_pe.tb_profiler_num_dr_variants 46 | String tb_profiler_num_other_variants = tbprofiler_one_sample_pe.tb_profiler_num_other_variants 47 | String tb_profiler_resistance_genes = tbprofiler_one_sample_pe.tb_profiler_resistance_genes 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /.github/workflows/miniwdl-check.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow will run on Pushes and Pull Requests against the main branch. It 3 | # will only run "miniwdl check" on wdl files that have had a change in the push 4 | # or PR. 
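# The same check can be reproduced locally before pushing (assuming
# miniwdl is available, e.g. via `pip3 install miniwdl`):
#
#   miniwdl check workflows/wf_rasusa.wdl
#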
5 | # 6 | name: MiniWDL Check 7 | on: 8 | push: 9 | branches: [main] 10 | pull_request: 11 | branches: [main] 12 | 13 | jobs: 14 | changes: 15 | name: Check for changes 16 | runs-on: ubuntu-latest 17 | outputs: 18 | # Expose workflows with changes 19 | workflows: ${{ steps.filter.outputs.wf }} 20 | workflows_files: ${{ steps.filter.outputs.wf_files }} 21 | steps: 22 | # Checkout the repo 23 | - uses: actions/checkout@v3 24 | 25 | # Select wdl files with changes 26 | - uses: dorny/paths-filter@v2 27 | id: filter 28 | with: 29 | filters: | 30 | wf: 31 | - 'tasks/**' 32 | - 'workflows/**' 33 | list-files: json 34 | 35 | check: 36 | runs-on: ubuntu-20.04 37 | name: ${{ matrix.wf }} 38 | needs: changes 39 | if: ${{ needs.changes.outputs.workflows != '[]' && needs.changes.outputs.workflows != '' }} 40 | strategy: 41 | fail-fast: false 42 | matrix: 43 | wf: ${{ fromJson(needs.changes.outputs.workflows_files) }} 44 | steps: 45 | # Checkout the repo 46 | - uses: actions/checkout@v3 47 | 48 | # Install a version of Python3 49 | - name: Set up Python 50 | uses: actions/setup-python@v2 51 | with: 52 | python-version: "3.x" 53 | 54 | # Install MiniWDL (WDL syntax) and ShellCheck (shell syntax) 55 | - name: install dependencies 56 | run: | 57 | sudo apt-get update 58 | sudo apt-get -y install shellcheck 59 | pip3 -q install miniwdl 'importlib-metadata==4.13.0' 60 | 61 | # Run MiniWDL check on each of the changed WDLs 62 | - name: MiniWDL Check ${{ matrix.wf }} 63 | run: miniwdl check ${{ matrix.wf }} 64 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_plasmidfinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task plasmidfinder { 4 | input { 5 | File assembly 6 | String samplename 7 | Int cpu = 8 8 | Int memory = 16 9 | String docker = "staphb/plasmidfinder:2.1.6" 10 | Int disk_size = 100 11 | String? database 12 | String? database_path 13 | String? method_path 14 | # minimum coverage threshold 15 | Float? min_cov 16 | # minimum blast identity threshold 17 | Float? threshold 18 | 19 | } 20 | command <<< 21 | date | tee DATE 22 | 23 | if [[ ! -z "~{database}" ]]; then 24 | echo "User database identified; ~{database} will be utilized for analysis" 25 | plasmidfinder_db_version="~{database}" 26 | else 27 | plasmidfinder_db_version="unmodified from plasmidfinder docker container" 28 | fi 29 | 30 | echo ${plasmidfinder_db_version} | tee PLASMIDFINDER_DB_VERSION 31 | 32 | plasmidfinder.py \ 33 | -i ~{assembly} \ 34 | -x \ 35 | ~{'-d ' + database} \ 36 | ~{'-p ' + database_path} \ 37 | ~{'-mp ' + method_path} \ 38 | ~{'-l ' + min_cov} \ 39 | ~{'-t ' + threshold} 40 | 41 | # parse outputs 42 | if [ ! 
-f results_tab.tsv ]; then 43 | PF="No plasmids detected in database" 44 | else 45 | PF="$(tail -n +2 results_tab.tsv | cut -f 2 | sort | uniq -u | paste -s -d, - )" 46 | if [ "$PF" == "" ]; then 47 | PF="No plasmids detected in database" 48 | fi 49 | fi 50 | echo $PF | tee PLASMIDS 51 | 52 | mv results_tab.tsv ~{samplename}_results.tsv 53 | mv Hit_in_genome_seq.fsa ~{samplename}_seqs.fsa 54 | 55 | >>> 56 | output { 57 | String plasmidfinder_plasmids = read_string("PLASMIDS") 58 | File plasmidfinder_results = "~{samplename}_results.tsv" 59 | File plasmidfinder_seqs = "~{samplename}_seqs.fsa" 60 | String plasmidfinder_docker = docker 61 | String plasmidfinder_db_version = read_string("PLASMIDFINDER_DB_VERSION") 62 | } 63 | runtime { 64 | memory: "~{memory} GB" 65 | cpu: cpu 66 | docker: "~{docker}" 67 | disks: "local-disk " + disk_size + " SSD" 68 | disk: disk_size + " GB" 69 | maxRetries: 3 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /tasks/quality_control/task_busco.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task busco { 4 | meta { 5 | description: "Run BUSCO on assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "ezlabgva/busco:v5.3.2_cv1" 11 | Int disk_size = 100 12 | Boolean eukaryote = false 13 | } 14 | command <<< 15 | # get version 16 | busco --version | tee "VERSION" 17 | 18 | # run busco 19 | # -i input assembly 20 | # -m geno for genome input 21 | # -o output file tag 22 | # --auto-lineage-euk looks at only eukaryotic organisms 23 | # --auto-lineage-prok looks at only prokaryotic organisms; default 24 | busco \ 25 | -i ~{assembly} \ 26 | -m geno \ 27 | -o ~{samplename} \ 28 | ~{true='--auto-lineage-euk' false='--auto-lineage-prok' eukaryote} 29 | 30 | # check for existence of output file; otherwise display a string that says the output was not created 31 | if [ -f ~{samplename}/short_summary.specific.*.~{samplename}.txt ]; then 32 | 33 | # grab the database version and format it according to BUSCO recommendations 34 | cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "dataset is:" | cut -d' ' -f 6,9 | sed 's/,//' | sed 's/ / (/' | sed 's/$/)/' | tee DATABASE 35 | 36 | # extract the results string 37 | cat ~{samplename}/short_summary.specific.*.~{samplename}.txt | grep "C:" | tee BUSCO_RESULTS 38 | 39 | cp ~{samplename}/short_summary.specific.*.~{samplename}.txt ~{samplename}_busco-summary.txt 40 | else 41 | echo "BUSCO FAILED" | tee BUSCO_RESULTS 42 | echo "NA" > DATABASE 43 | fi 44 | >>> 45 | output { 46 | String busco_version = read_string("VERSION") 47 | String busco_database = read_string("DATABASE") 48 | String busco_results = read_string("BUSCO_RESULTS") 49 | File? busco_report = "~{samplename}_busco-summary.txt" 50 | } 51 | runtime { 52 | docker: "~{docker}" 53 | memory: "8 GB" 54 | cpu: 2 55 | disks: "local-disk " + disk_size + " SSD" 56 | disk: disk_size + " GB" 57 | maxRetries: 3 58 | preemptible: 0 59 | } 60 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_ectyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ectyper { 4 | meta { 5 | description: "In-silico prediction of Escherichia coli serotype" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/ectyper:1.0.0--pyhdfd78af_1" 11 | Int disk_size = 100 12 | Int? 
cpu = 4 13 | 14 | # ECTyper Parameters 15 | # --opid [integer] Percent identity required for an O antigen allele match [default: 90] 16 | # --opcov [integer] Minimum percent coverage required for an O antigen allele match [default: 90] 17 | # --hpid [integer] Percent identity required for an H antigen allele match [default: 95] 18 | # --hpcov [integer] Minimum percent coverage required for an H antigen allele match [default: 50] 19 | # --verify [boolean] Enable E. coli species verification 20 | # --print_alleles [boolean] Prints the allele sequences if enabled as the final column 21 | Int opid = 90 22 | Int hpid = 95 23 | Int opcov = 90 24 | Int hpcov = 50 25 | Boolean verify = false 26 | Boolean print_alleles = false 27 | } 28 | command <<< 29 | echo $(ectyper --version 2>&1) | sed 's/.*ectyper //; s/ .*$//' | tee VERSION 30 | ectyper \ 31 | ~{'-opid ' + opid} \ 32 | ~{'-hpid ' + hpid} \ 33 | ~{'-opcov ' + opcov} \ 34 | ~{'-hpcov ' + hpcov} \ 35 | ~{true="--verify" false="" verify} \ 36 | ~{true="-s" false="" print_alleles} \ 37 | --cores ~{cpu} \ 38 | --output ./ \ 39 | --input ~{assembly} 40 | mv output.tsv ~{samplename}.tsv 41 | # parse ECTyper TSV 42 | cut -f 5 ~{samplename}.tsv | tail -n 1 | tee PREDICTED_SEROTYPE 43 | >>> 44 | output { 45 | File ectyper_results = "~{samplename}.tsv" 46 | String ectyper_predicted_serotype = read_string("PREDICTED_SEROTYPE") 47 | String ectyper_version = read_string("VERSION") 48 | } 49 | runtime { 50 | docker: "~{docker}" 51 | memory: "8 GB" 52 | cpu: 4 53 | disks: "local-disk " + disk_size + " SSD" 54 | disk: disk_size + " GB" 55 | maxRetries: 3 56 | preemptible: 0 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /tasks/species_typing/task_pbptyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pbptyper { 4 | meta { 5 | description: "In silico Penicillin Binding Protein (PBP) typer for Streptococcus pneumoniae assemblies. https://github.com/rpetit3/pbptyper" 6 | } 7 | input { 8 | File assembly # An assembly in FASTA format (compressed with gzip, or uncompressed) to predict the PBP type on. 9 | String samplename 10 | String? db # A path to a directory containing FASTA files for 1A, 2B, and 2X proteins. In most cases using the default value will be all that is needed.
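# For quick local debugging, a single-task file like this one can be run
# directly with miniwdl (a sketch; "sample1.fasta" is a hypothetical input,
# and the threshold shown merely illustrates overriding the defaults below):
#   miniwdl run tasks/species_typing/task_pbptyper.wdl \
#     assembly=sample1.fasta samplename=sample1 min_pident=90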
11 | Int min_pident = 95 # Minimum percent identity to count a hit [default: 95] 12 | Int min_coverage = 95 # Minimum percent coverage to count a hit [default: 95] 13 | String docker = "staphb/pbptyper:1.0.4" 14 | Int disk_size = 100 15 | Int cpus = 4 16 | 17 | } 18 | command <<< 19 | # get version information 20 | pbptyper --version | sed 's/pbptyper, //' | tee VERSION 21 | 22 | # run pbptyper 23 | pbptyper \ 24 | --assembly ~{assembly} \ 25 | ~{'--db ' + db} \ 26 | ~{'--min_pident ' + min_pident} \ 27 | ~{'--min_coverage ' + min_coverage} \ 28 | --prefix "~{samplename}" \ 29 | --outdir ./ 30 | 31 | # parse output tsv for pbptype 32 | cut -f 2 ~{samplename}.tsv | tail -n 1 > pbptype.txt 33 | 34 | >>> 35 | output { 36 | String pbptyper_predicted_1A_2B_2X = read_string("pbptype.txt") 37 | File pbptyper_pbptype_predicted_tsv = "~{samplename}.tsv" # A tab-delimited file with the predicted PBP type 38 | File pbptyper_pbptype_1A_tsv = "~{samplename}-1A.tblastn.tsv" # A tab-delimited file of all blast hits against 1A 39 | File pbptyper_pbptype_2B_tsv = "~{samplename}-2B.tblastn.tsv" # A tab-delimited file of all blast hits against 2B 40 | File pbptyper_pbptype_2X_tsv = "~{samplename}-2X.tblastn.tsv" # A tab-delimited file of all blast hits against 2X 41 | String pbptyper_version = read_string("VERSION") 42 | String pbptyper_docker = docker 43 | } 44 | runtime { 45 | docker: "~{docker}" 46 | memory: "16 GB" 47 | cpu: cpus 48 | disks: "local-disk " + disk_size + " SSD" 49 | disk: disk_size + " GB" 50 | maxRetries: 3 51 | preemptible: 0 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /.dockstore.yml: -------------------------------------------------------------------------------- 1 | version: 1.2 2 | workflows: 3 | - name: kSNP3 4 | subclass: WDL 5 | primaryDescriptorPath: /workflows/wf_ksnp3.wdl 6 | testParameterFiles: 7 | - empty.json 8 | - name: Gambit_Query 9 | subclass: WDL 10 | primaryDescriptorPath: /workflows/wf_gambit_query.wdl 11 | testParameterFiles: 12 | - empty.json 13 | - name: Kleborate 14 | subclass: WDL 15 | primaryDescriptorPath: /workflows/wf_kleborate.wdl 16 | testParameterFiles: 17 | - empty.json 18 | - name: SerotypeFinder 19 | subclass: WDL 20 | primaryDescriptorPath: /workflows/wf_serotypefinder.wdl 21 | testParameterFiles: 22 | - empty.json 23 | - name: TBProfiler_Illumina_PE 24 | subclass: WDL 25 | primaryDescriptorPath: /workflows/wf_tbprofiler_pe.wdl 26 | testParameterFiles: 27 | - empty.json 28 | - name: TBProfiler_ONT 29 | subclass: WDL 30 | primaryDescriptorPath: /workflows/wf_tbprofiler_ont.wdl 31 | testParameterFiles: 32 | - empty.json 33 | - name: TheiaProk_Illumina_PE 34 | subclass: WDL 35 | primaryDescriptorPath: /workflows/wf_theiaprok_illumina_pe.wdl 36 | testParameterFiles: 37 | - empty.json 38 | - name: TheiaProk_Illumina_SE 39 | subclass: WDL 40 | primaryDescriptorPath: /workflows/wf_theiaprok_illumina_se.wdl 41 | testParameterFiles: 42 | - empty.json 43 | - name: MashTree_FASTA 44 | subclass: WDL 45 | primaryDescriptorPath: /workflows/wf_mashtree_fasta.wdl 46 | testParameterFiles: 47 | - empty.json 48 | - name: NCBI-AMRFinderPlus 49 | subclass: WDL 50 | primaryDescriptorPath: /workflows/wf_amrfinderplus.wdl 51 | testParameterFiles: 52 | - empty.json 53 | - name: Kraken2_PE 54 | subclass: WDL 55 | primaryDescriptorPath: /workflows/wf_kraken2_pe.wdl 56 | testParameterFiles: 57 | - empty.json 58 | - name: Kraken2_SE 59 | subclass: WDL 60 | primaryDescriptorPath: /workflows/wf_kraken2_se.wdl 61 | 
testParameterFiles: 62 | - empty.json 63 | - name: RASUSA 64 | subclass: WDL 65 | primaryDescriptorPath: /workflows/wf_rasusa.wdl 66 | testParameterFiles: 67 | - empty.json 68 | - name: Core_Gene_SNP 69 | subclass: WDL 70 | primaryDescriptorPath: /workflows/wf_core_gene_snp.wdl 71 | testParameterFiles: 72 | - empty.json -------------------------------------------------------------------------------- /tasks/utilities/task_rasusa.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task rasusa { 4 | meta { 5 | description: "Randomly subsample sequencing reads to a specified coverage (https://github.com/mbhall88/rasusa)" 6 | } 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | String docker = "staphb/rasusa:0.7.0" 12 | Int disk_size = 100 13 | Int cpu = 4 14 | # RASUSA Parameters 15 | # --bases [STRING] Explicitly set the number of bases required e.g., 4.3kb, 7Tb, 9000, 4.1MB. If this option is given, --coverage and --genome-size are ignored 16 | # --coverage [FLOAT] The desired coverage to sub-sample the reads to. If --bases is not provided, this option and --genome-size are required 17 | # --genome-size [STRING] Genome size to calculate coverage with respect to. e.g., 4.3kb, 7Tb, 9000, 4.1MB 18 | # --seed [INTEGER] Random seed to use 19 | # --frac [FLOAT] Subsample to a fraction of the reads - e.g., 0.5 samples half the reads 20 | # --num [INTEGER] Subsample to a specific number of reads 21 | String? bases 22 | Float coverage 23 | String genome_size 24 | Int? seed 25 | Float? frac 26 | Int? num 27 | } 28 | command <<< 29 | rasusa --version | tee VERSION 30 | # set single-end or paired-end outputs 31 | if [ -z "~{read2}" ]; then 32 | OUTPUT_FILES="~{samplename}_subsampled_R1.fastq.gz" 33 | else 34 | OUTPUT_FILES="~{samplename}_subsampled_R1.fastq.gz ~{samplename}_subsampled_R2.fastq.gz" 35 | fi 36 | # ignore coverage values if frac input provided 37 | if [ -z "~{frac}" ]; then 38 | COVERAGE="--coverage ~{coverage} --genome-size ~{genome_size}" 39 | else 40 | COVERAGE="" 41 | fi 42 | # run rasusa 43 | rasusa \ 44 | -i ~{read1} ~{read2} \ 45 | ${COVERAGE} \ 46 | ~{'--seed ' + seed} \ 47 | ~{'--bases ' + bases} \ 48 | ~{'--frac ' + frac} \ 49 | ~{'--num ' + num} \ 50 | -o ${OUTPUT_FILES} 51 | >>> 52 | output { 53 | File read1_subsampled = "~{samplename}_subsampled_R1.fastq.gz" 54 | File?
read2_subsampled = "~{samplename}_subsampled_R2.fastq.gz" 55 | String rasusa_version = read_string("VERSION") 56 | } 57 | runtime { 58 | docker: "~{docker}" 59 | memory: "8 GB" 60 | cpu: cpu 61 | disks: "local-disk " + disk_size + " SSD" 62 | disk: disk_size + " GB" 63 | maxRetries: 3 64 | preemptible: 0 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tasks/species_typing/task_staphopiasccmec.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task staphopiasccmec { 4 | meta { 5 | description: "Primer based SCCmec typing of Staphylococcus aureus genomes" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/staphopia-sccmec:1.0.0--hdfd78af_0" 11 | Int disk_size = 100 12 | Int cpu = 1 13 | } 14 | command <<< 15 | # get version 16 | staphopia-sccmec --version 2>&1 | sed 's/^.*staphopia-sccmec //' | tee VERSION 17 | 18 | # run staphopia-sccmec on input assembly; hamming option OFF; outputs are true/false 19 | staphopia-sccmec \ 20 | --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.summary.tsv 21 | 22 | # run staphopia-sccmec on input assembly; hamming option ON; outputs are the hamming distance; 0 is exact match 23 | staphopia-sccmec \ 24 | --hamming \ 25 | --assembly ~{assembly} > ~{samplename}.staphopia-sccmec.hamming.tsv 26 | 27 | # please excuse this ugly bash code below :) 28 | 29 | # parse output summary TSV for true matches 30 | # look for columns that contain the word "True" and print the column numbers in a list to a file col_headers.txt 31 | awk '{ for (i=1; i<=NF; ++i) { if ($i ~ "True") print i } }' ~{samplename}.staphopia-sccmec.summary.tsv | tee col_headers.txt 32 | 33 | # use column number list to print column headers (example: IV, mecA, etc.) 
to a file type.txt 34 | cat col_headers.txt | while read -r COL_NUMBER; do \ 35 | cut -f "$COL_NUMBER" ~{samplename}.staphopia-sccmec.summary.tsv | head -n 1 >>type.txt 36 | echo "," >>type.txt 37 | done 38 | 39 | # remove newlines, remove trailing comma; generate output string of comma separated values 40 | cat type.txt | tr -d '\n' | sed 's|.$||g' | tee TYPES_AND_MECA.txt 41 | 42 | >>> 43 | output { 44 | File staphopiasccmec_results_tsv = "~{samplename}.staphopia-sccmec.summary.tsv" 45 | File staphopiasccmec_hamming_distance_tsv = "~{samplename}.staphopia-sccmec.hamming.tsv" 46 | String staphopiasccmec_types_and_mecA_presence = read_string("TYPES_AND_MECA.txt") 47 | String staphopiasccmec_version = read_string("VERSION") 48 | String staphopiasccmec_docker = docker 49 | } 50 | runtime { 51 | docker: "~{docker}" 52 | memory: "4 GB" 53 | cpu: cpu 54 | disks: "local-disk " + disk_size + " SSD" 55 | disk: disk_size + " GB" 56 | maxRetries: 3 57 | preemptible: 0 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_bakta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task bakta { 4 | input { 5 | File assembly 6 | File bakta_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/bakta_db_2022-08-29.tar.gz" 7 | String samplename 8 | Int cpu = 8 9 | Int memory = 16 10 | String docker = "quay.io/biocontainers/bakta:1.5.1--pyhdfd78af_0" 11 | Int disk_size = 100 12 | # Parameters 13 | # proteins: Fasta file of trusted protein sequences for CDS annotation 14 | # prodigal_tf: Prodigal training file to use for CDS prediction 15 | # bakta_opts: any additional bakta arguments 16 | Boolean proteins = false 17 | Boolean compliant = false 18 | File? prodigal_tf 19 | String? 
bakta_opts 20 | } 21 | command <<< 22 | date | tee DATE 23 | bakta --version | tee BAKTA_VERSION 24 | 25 | # Extract Bakta DB 26 | mkdir db 27 | time tar xzvf ~{bakta_db} --strip-components=1 -C ./db 28 | 29 | # Install amrfinderplus db 30 | amrfinder_update --database db/amrfinderplus-db 31 | amrfinder --database_version | tee AMRFINDER_DATABASE_VERSION 32 | 33 | bakta \ 34 | ~{bakta_opts} \ 35 | --db db/ \ 36 | --threads ~{cpu} \ 37 | --prefix ~{samplename} \ 38 | --output ~{samplename} \ 39 | ~{true='--compliant' false='' compliant} \ 40 | ~{true='--proteins' false='' proteins} \ 41 | ~{'--prodigal-tf ' + prodigal_tf} \ 42 | ~{assembly} 43 | 44 | # rename gff3 to gff for compatibility with downstream analysis (pirate) 45 | mv "~{samplename}/~{samplename}.gff3" "~{samplename}/~{samplename}.gff" 46 | 47 | >>> 48 | output { 49 | File bakta_embl = "~{samplename}/~{samplename}.embl" 50 | File bakta_faa = "~{samplename}/~{samplename}.faa" 51 | File bakta_ffn = "~{samplename}/~{samplename}.ffn" 52 | File bakta_fna = "~{samplename}/~{samplename}.fna" 53 | File bakta_gbff = "~{samplename}/~{samplename}.gbff" 54 | File bakta_gff3 = "~{samplename}/~{samplename}.gff" 55 | File bakta_hypotheticals_faa = "~{samplename}/~{samplename}.hypotheticals.faa" 56 | File bakta_hypotheticals_tsv = "~{samplename}/~{samplename}.hypotheticals.tsv" 57 | File bakta_tsv = "~{samplename}/~{samplename}.tsv" 58 | File bakta_txt = "~{samplename}/~{samplename}.txt" 59 | String bakta_version = read_string("BAKTA_VERSION") 60 | } 61 | runtime { 62 | memory: "~{memory} GB" 63 | cpu: cpu 64 | docker: docker 65 | disks: "local-disk " + disk_size + " SSD" 66 | disk: disk_size + " GB" 67 | maxRetries: 3 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /tasks/species_typing/task_meningotype.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task meningotype { 4 | meta { 5 | description: "Serotyping of Neisseria meningitidis" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/meningotype:0.8.5--pyhdfd78af_0" 11 | Int disk_size = 100 12 | Int cpu = 2 13 | } 14 | command <<< 15 | 16 | # Parameters 17 | # --finetype perform porA and fetA fine typing (default=off) 18 | # --porB perform porB sequence typing (NEIS2020) (default=off) 19 | # --bast perform Bexsero antigen sequence typing (BAST) (default=off) 20 | # --mlst perform MLST (default=off) 21 | # --all perform MLST, porA, fetA, porB, BAST typing (default=off) 22 | 23 | echo $(meningotype --version 2>&1) | sed 's/^.*meningotype v//' | tee VERSION 24 | meningotype \ 25 | --finetype \ 26 | --porB \ 27 | --bast \ 28 | --cpus ~{cpu} \ 29 | ~{assembly} \ 30 | > ~{samplename}.tsv 31 | 32 | tail -1 ~{samplename}.tsv | awk '{print $2}' | tee MENINGOTYPE_SEROTYPE 33 | tail -1 ~{samplename}.tsv | awk '{print $5}' | tee MENINGOTYPE_PORA 34 | tail -1 ~{samplename}.tsv | awk '{print $6}' | tee MENINGOTYPE_FETA 35 | tail -1 ~{samplename}.tsv | awk '{print $7}' | tee MENINGOTYPE_PORB 36 | tail -1 ~{samplename}.tsv | awk '{print $8}' | tee MENINGOTYPE_FHBP 37 | tail -1 ~{samplename}.tsv | awk '{print $9}' | tee MENINGOTYPE_NHBA 38 | tail -1 ~{samplename}.tsv | awk '{print $10}' | tee MENINGOTYPE_NADA 39 | tail -1 ~{samplename}.tsv | awk '{print $11}' | tee MENINGOTYPE_BAST 40 | 41 | >>> 42 | output { 43 | File meningotype_tsv = "~{samplename}.tsv" 44 | String meningotype_version = read_string("VERSION") 45 | String meningotype_serogroup = 
read_string("MENINGOTYPE_SEROTYPE") 46 | String meningotype_PorA = read_string("MENINGOTYPE_PORA") 47 | String meningotype_FetA = read_string("MENINGOTYPE_FETA") 48 | String meningotype_PorB = read_string("MENINGOTYPE_PORB") 49 | String meningotype_fHbp = read_string("MENINGOTYPE_FHBP") 50 | String meningotype_NHBA = read_string("MENINGOTYPE_NHBA") 51 | String meningotype_NadA = read_string("MENINGOTYPE_NADA") 52 | String meningotype_BAST = read_string("MENINGOTYPE_BAST") 53 | } 54 | runtime { 55 | docker: "~{docker}" 56 | memory: "8 GB" 57 | cpu: cpu 58 | disks: "local-disk " + disk_size + " SSD" 59 | disk: disk_size + " GB" 60 | maxRetries: 3 61 | preemptible: 0 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /tests/inputs/wf_theiaprok_illumina_pe.json: -------------------------------------------------------------------------------- 1 | { 2 | "theiaprok_illumina_pe.samplename": "test", 3 | "theiaprok_illumina_pe.read1_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R1.fastq.gz", 4 | "theiaprok_illumina_pe.read2_raw": "bactopia-tests/data/species/portiera/illumina/SRR2838702_R2.fastq.gz", 5 | "theiaprok_illumina_pe.skip_screen": true, 6 | "theiaprok_illumina_pe.read_QC_trim.read_processing": "trimmomatic", 7 | "theiaprok_illumina_pe.read_QC_trim.call_midas": false, 8 | "theiaprok_illumina_pe.read_QC_trim.midas.midas_db" : "./tests/inputs/empty-for-test.txt", 9 | "theiaprok_illumina_pe.genome_annotation": "prokka", 10 | "theiaprok_illumina_pe.shovill_pe.assembler": "skesa", 11 | "theiaprok_illumina_pe.merlin_magic.call_poppunk": false, 12 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_dists_npy" : "./tests/inputs/empty-for-test.txt", 13 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_dists_pkl" : "./tests/inputs/empty-for-test.txt", 14 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_h5" : "./tests/inputs/empty-for-test.txt", 15 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs" : "./tests/inputs/empty-for-test.txt", 16 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_dists_npy" : "./tests/inputs/empty-for-test.txt", 17 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_dists_pkl" : "./tests/inputs/empty-for-test.txt", 18 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_h5" : "./tests/inputs/empty-for-test.txt", 19 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_clusters_csv" : "./tests/inputs/empty-for-test.txt", 20 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_fit_npz" : "./tests/inputs/empty-for-test.txt", 21 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_fit_pkl" : "./tests/inputs/empty-for-test.txt", 22 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_graph_gt" : "./tests/inputs/empty-for-test.txt", 23 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_qcreport_txt" : "./tests/inputs/empty-for-test.txt", 24 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_unword_clusters_csv" : "./tests/inputs/empty-for-test.txt", 25 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_refs_graph_gt" : "./tests/inputs/empty-for-test.txt", 26 | "theiaprok_illumina_pe.merlin_magic.poppunk_task.GPS_external_clusters_csv" : "./tests/inputs/empty-for-test.txt", 27 | "theiaprok_illumina_pe.bakta.bakta_db" : "./tests/inputs/empty-for-test.txt" 28 | } 29 | -------------------------------------------------------------------------------- /tasks/species_typing/task_seqsero2.wdl: 
-------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task seqsero2 { 4 | # Inputs 5 | input { 6 | File read1 7 | File? read2 8 | String samplename 9 | String mode ="a" 10 | String seqsero2_docker_image = "quay.io/staphb/seqsero2:1.2.1" 11 | Int disk_size = 100 12 | Boolean paired_end 13 | } 14 | 15 | command <<< 16 | # capture date and version 17 | # Print and save date 18 | date | tee DATE 19 | # Print and save version 20 | SeqSero2_package.py --version | tee VERSION 21 | # Run SeqSero2 on the input read data 22 | SeqSero2_package.py \ 23 | -p 8 \ 24 | ~{true='-t 2' false='-t 3' paired_end} \ 25 | -m ~{mode} \ 26 | -n ~{samplename} \ 27 | -d ~{samplename}_seqseqro2_output_dir \ 28 | -i ~{read1} ~{read2} 29 | # Run a python block to parse output file for terra data tables 30 | python3 <>> 53 | output { 54 | File seqsero2_report = "./~{samplename}_seqseqro2_output_dir/SeqSero_result.tsv" 55 | String seqsero2_version = read_string("VERSION") 56 | String seqsero2_predicted_antigenic_profile = read_string("PREDICTED_ANTIGENIC_PROFILE") 57 | String seqsero2_predicted_serotype = read_string("PREDICTED_SEROTYPE") 58 | String seqsero2_predicted_contamination = read_string("CONTAMINATION") 59 | } 60 | runtime { 61 | docker: "~{seqsero2_docker_image}" 62 | memory: "16 GB" 63 | cpu: 8 64 | disks: "local-disk " + disk_size + " SSD" 65 | disk: disk_size + " GB" 66 | preemptible: 0 67 | maxRetries: 3 68 | } 69 | } -------------------------------------------------------------------------------- /tasks/quality_control/task_trimmomatic.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task trimmomatic_pe { 4 | input { 5 | File read1 6 | File read2 7 | String samplename 8 | String docker = "quay.io/staphb/trimmomatic:0.39" 9 | Int? trimmomatic_window_size = 10 10 | Int? trimmomatic_quality_trim_score = 20 11 | Int? trimmomatic_minlen = 75 12 | Int? threads = 4 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | # date and version control 17 | date | tee DATE 18 | trimmomatic -version > VERSION && sed -i -e 's/^/Trimmomatic /' VERSION 19 | 20 | trimmomatic PE \ 21 | -threads ~{threads} \ 22 | ~{read1} ~{read2} \ 23 | -baseout ~{samplename}.fastq.gz \ 24 | SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \ 25 | MINLEN:~{trimmomatic_minlen} &> ~{samplename}.trim.stats.txt 26 | >>> 27 | output { 28 | File read1_trimmed = "~{samplename}_1P.fastq.gz" 29 | File read2_trimmed = "~{samplename}_2P.fastq.gz" 30 | File trimmomatic_stats = "~{samplename}.trim.stats.txt" 31 | String version = read_string("VERSION") 32 | String pipeline_date = read_string("DATE") 33 | } 34 | runtime { 35 | docker: "~{docker}" 36 | memory: "8 GB" 37 | cpu: 4 38 | disks: "local-disk " + disk_size + " SSD" 39 | disk: disk_size + " GB" 40 | maxRetries: 3 41 | preemptible: 0 42 | } 43 | } 44 | 45 | task trimmomatic_se { 46 | input { 47 | File read1 48 | String samplename 49 | String docker="quay.io/staphb/trimmomatic:0.39" 50 | Int? trimmomatic_window_size = 4 51 | Int? trimmomatic_quality_trim_score = 30 52 | Int? trimmomatic_minlen = 25 53 | Int? 
threads = 4 54 | Int disk_size = 100 55 | } 56 | command <<< 57 | # date and version control 58 | date | tee DATE 59 | trimmomatic -version > VERSION && sed -i -e 's/^/Trimmomatic /' VERSION 60 | 61 | trimmomatic SE \ 62 | -threads ~{threads} \ 63 | ~{read1} \ 64 | ~{samplename}_trimmed.fastq.gz \ 65 | SLIDINGWINDOW:~{trimmomatic_window_size}:~{trimmomatic_quality_trim_score} \ 66 | MINLEN:~{trimmomatic_minlen} > ~{samplename}.trim.stats.txt 67 | >>> 68 | output { 69 | File read1_trimmed = "${samplename}_trimmed.fastq.gz" 70 | File trimmomatic_stats = "${samplename}.trim.stats.txt" 71 | String version = read_string("VERSION") 72 | String pipeline_date = read_string("DATE") 73 | } 74 | runtime { 75 | docker: "~{docker}" 76 | memory: "8 GB" 77 | cpu: 4 78 | disks: "local-disk " + disk_size + " SSD" 79 | disk: disk_size + " GB" 80 | maxRetries: 3 81 | preemptible: 0 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /tasks/species_typing/task_hicap.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task hicap { 4 | meta { 5 | description: "Identify cap locus serotype and structure in your Haemophilus influenzae assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/hicap:1.0.3--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 4 13 | 14 | # Parameters 15 | # --gene_coverage GENE_COVERAGE Minimum percentage coverage to consider a single gene complete. [default: 0.80] 16 | # --gene_identity GENE_IDENTITY Minimum percentage identity to consider a single gene complete. [default: 0.70] 17 | # --broken_gene_length BROKEN_GENE_LENGTH Minimum length to consider a broken gene. [default: 60] 18 | # --broken_gene_identity BROKEN_GENE_IDENTITY Minimum percentage identity to consider a broken gene. [default: 0.80] 19 | Float gene_coverage = 0.8 20 | Float gene_identity = 0.7 21 | Int broken_gene_length = 60 22 | Float broken_gene_identity = 0.8 23 | Boolean full_sequence = false 24 | Boolean debug = false 25 | } 26 | command <<< 27 | echo $( hicap --version 2>&1 ) | sed 's/^.*hicap //' | tee VERSION 28 | hicap \ 29 | --query_fp ~{assembly} \ 30 | ~{'--gene_coverage ' + gene_coverage} \ 31 | ~{'--gene_identity ' + gene_identity} \ 32 | ~{'--broken_gene_length ' + broken_gene_length} \ 33 | ~{'--broken_gene_identity ' + broken_gene_identity} \ 34 | ~{true="--full_sequence" false="" full_sequence} \ 35 | ~{true="--debug" false="" debug} \ 36 | --threads ~{cpu} \ 37 | -o ./ 38 | 39 | if [ !
-f ~{samplename}.tsv ]; then 40 | # No hits, make a tab-delimited file to say so for downstream merging 41 | echo -e "isolate\tpredicted_serotype\tattributes\tgenes_identified\tlocus_location\tregion_I_genes\tregion_II_genes\tregion_III_genes\tIS1016_hits" > ~{samplename}.tsv 42 | echo -e "~{samplename}\tcap_not_found\t-\t-\t-\t-\t-\t-\t-" >> ~{samplename}.tsv 43 | else 44 | sed -i 's/#isolate/isolate/' ~{samplename}.tsv 45 | fi 46 | >>> 47 | output { 48 | File hicap_results = "~{samplename}.tsv" 49 | File hicap_genbank = "~{samplename}.gbk" 50 | File hicap_image = "~{samplename}.svg" 51 | String hicap_version = read_string("VERSION") 52 | } 53 | runtime { 54 | docker: "~{docker}" 55 | memory: "8 GB" 56 | cpu: 4 57 | disks: "local-disk " + disk_size + " SSD" 58 | disk: disk_size + " GB" 59 | maxRetries: 3 60 | preemptible: 0 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_mycosnp_tree.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task mycosnptree { 4 | input { 5 | Array[File] assembly_fasta 6 | Array[String] samplename 7 | String docker="quay.io/theiagen/mycosnp:dev" 8 | Int disk_size = 100 9 | String strain="B11205" 10 | String accession="GCA_016772135" 11 | } 12 | command <<< 13 | date | tee DATE 14 | echo $(nextflow pull rpetit3/mycosnp-nf 2>&1) | sed 's/^.*revision: //;' | tee MYCOSNPTREE_VERSION 15 | 16 | assembly_array=(~{sep=' ' assembly_fasta}) 17 | assembly_array_len=$(echo "${#assembly_array[@]}") 18 | samplename_array=(~{sep=' ' samplename}) 19 | samplename_array_len=$(echo "${#samplename_array[@]}") 20 | 21 | # Ensure assembly, and samplename arrays are of equal length 22 | if [ "$assembly_array_len" -ne "$samplename_array_len" ]; then 23 | echo "Assembly array (length: $assembly_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2 24 | exit 1 25 | fi 26 | 27 | # Make sample FOFN 28 | echo "sample,fasta" > samples.csv 29 | for index in ${!assembly_array[@]}; do 30 | assembly=${assembly_array[$index]} 31 | samplename=${samplename_array[$index]} 32 | echo -e "${samplename},${assembly}" >> samples.csv 33 | done 34 | 35 | # Run MycoSNP 36 | mkdir mycosnptree 37 | cd mycosnptree 38 | if nextflow run rpetit3/mycosnp-nf -entry NFCORE_MYCOSNPTREE --input ../samples.csv --fasta /reference/~{accession}/masked/reference-consensus.fa --publish_dir_mode copy --rapidnj False --fasttree False --iqtree; then 39 | # Everything finished, pack up the results and clean up 40 | find work/ -name "*.iqtree" | xargs -I {} cp {} ./ 41 | rm -rf .nextflow/ work/ 42 | cd ..
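# Archive the full results directory for export. "gzip -n" omits the input
# name and timestamp from the gzip header, so identical results compress to
# byte-identical archives across re-runs (useful for caching); "--best"
# trades CPU time for maximum compression.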
43 | tar -cf - mycosnptree/ | gzip -n --best > mycosnptree.tar.gz 44 | else 45 | # Run failed 46 | exit 1 47 | fi 48 | >>> 49 | output { 50 | String mycosnptree_version = read_string("MYCOSNPTREE_VERSION") 51 | String mycosnptree_docker = docker 52 | String analysis_date = read_string("DATE") 53 | String reference_strain = strain 54 | String reference_accession = accession 55 | File mycosnptree_tree = "mycosnptree/results/combined/phylogeny/iqtree/alignment.fasta.treefile" 56 | File mycosnptree_iqtree_log = "mycosnptree/alignment.fasta.iqtree" 57 | File mycosnptree_full_results = "mycosnptree.tar.gz" 58 | } 59 | runtime { 60 | docker: "~{docker}" 61 | memory: "32 GB" 62 | cpu: 4 63 | disks: "local-disk " + disk_size + " SSD" 64 | disk: disk_size + " GB" 65 | maxRetries: 3 66 | preemptible: 0 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /tasks/species_typing/task_emmtyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task emmtyper { 4 | meta { 5 | description: "emm-typing of Streptococcus pyogenes assemblies" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/emmtyper:0.2.0--py_0" 11 | Int disk_size = 100 12 | Int? cpu = 2 13 | 14 | # Parameters 15 | # --workflow [blast|pcr] Choose workflow [default: blast] 16 | # --cluster-distance INTEGER Distance between cluster of matches to consider as different clusters. [default: 500] 17 | # --percent-identity INTEGER [BLAST] Minimal percent identity of sequence. [default: 95] 18 | # --culling-limit INTEGER [BLAST] Total hits to return in a position. [default: 5] 19 | # --mismatch INTEGER [BLAST] Threshold for number of mismatch to allow in BLAST hit. [default: 4] 20 | # --align-diff INTEGER [BLAST] Threshold for difference between alignment length and subject length in BLAST hit. [default: 5] 21 | # --gap INTEGER [BLAST] Threshold gap to allow in BLAST hit. [default: 2] 22 | # --min-perfect INTEGER [isPcr] Minimum size of perfect match at 3' primer end. [default: 15] 23 | # --min-good INTEGER [isPcr] Minimum size where there must be 2 matches for each mismatch. [default: 15] 24 | # --max-size INTEGER [isPcr] Maximum size of PCR product. 
[default: 2000] 25 | 26 | String wf = "blast" 27 | Int cluster_distance = 500 28 | Int percid = 95 29 | Int culling_limit = 5 30 | Int mismatch = 4 31 | Int align_diff = 5 32 | Int gap = 2 33 | Int min_perfect = 15 34 | Int min_good = 15 35 | Int max_size = 2000 36 | } 37 | command <<< 38 | echo $(emmtyper --version 2>&1) | sed 's/^.*emmtyper v//' | tee VERSION 39 | emmtyper \ 40 | ~{'--workflow ' + wf} \ 41 | ~{'--cluster-distance ' + cluster_distance} \ 42 | ~{'--percent-identity ' + percid} \ 43 | ~{'--culling-limit ' + culling_limit} \ 44 | ~{'--mismatch ' + mismatch} \ 45 | ~{'--align-diff ' + align_diff} \ 46 | ~{'--gap ' + gap} \ 47 | ~{'--min-perfect ' + min_perfect} \ 48 | ~{'--min-good ' + min_good} \ 49 | ~{'--max-size ' + max_size} \ 50 | ~{assembly} \ 51 | > ~{samplename}.tsv 52 | >>> 53 | output { 54 | File emmtyper_results = "~{samplename}.tsv" 55 | String emmtyper_version = read_string("VERSION") 56 | } 57 | runtime { 58 | docker: "~{docker}" 59 | memory: "8 GB" 60 | cpu: 2 61 | disks: "local-disk " + disk_size + " SSD" 62 | disk: disk_size + " GB" 63 | maxRetries: 3 64 | preemptible: 0 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tasks/quality_control/task_bbduk.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task bbduk_pe { 4 | input { 5 | File read1_trimmed 6 | File read2_trimmed 7 | String samplename 8 | Int mem_size_gb=8 9 | String docker = "quay.io/staphb/bbtools:38.76" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # date and version control 14 | date | tee DATE 15 | 16 | repair.sh in1=~{read1_trimmed} in2=~{read2_trimmed} out1=~{samplename}.paired_1.fastq.gz out2=~{samplename}.paired_2.fastq.gz 17 | 18 | bbduk.sh in1=~{samplename}.paired_1.fastq.gz in2=~{samplename}.paired_2.fastq.gz out1=~{samplename}.rmadpt_1.fastq.gz out2=~{samplename}.rmadpt_2.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo 19 | 20 | bbduk.sh in1=~{samplename}.rmadpt_1.fastq.gz in2=~{samplename}.rmadpt_2.fastq.gz out1=~{samplename}_1.clean.fastq.gz out2=~{samplename}_2.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt 21 | 22 | >>> 23 | output { 24 | File read1_clean = "~{samplename}_1.clean.fastq.gz" 25 | File read2_clean = "~{samplename}_2.clean.fastq.gz" 26 | File adapter_stats = "~{samplename}.adapters.stats.txt" 27 | File phiX_stats = "~{samplename}.phix.stats.txt" 28 | String bbduk_docker = docker 29 | String pipeline_date = read_string("DATE") 30 | } 31 | runtime { 32 | docker: "~{docker}" 33 | memory: "~{mem_size_gb} GB" 34 | cpu: 4 35 | disks: "local-disk " + disk_size + " SSD" 36 | disk: disk_size + " GB" 37 | preemptible: 0 38 | maxRetries: 3 39 | } 40 | } 41 | 42 | task bbduk_se { 43 | input { 44 | File read1_trimmed 45 | String samplename 46 | Int mem_size_gb=8 47 | String docker="quay.io/staphb/bbtools:38.76" 48 | Int disk_size = 100 49 | } 50 | command <<< 51 | # date and version control 52 | date | tee DATE 53 | 54 | bbduk.sh in1=~{read1_trimmed} out1=~{samplename}.rmadpt_1.fastq.gz ref=/bbmap/resources/adapters.fa stats=~{samplename}.adapters.stats.txt ktrim=r k=23 mink=11 hdist=1 tpe tbo 55 | 56 | bbduk.sh in1=~{read1_trimmed} out1=~{samplename}_1.clean.fastq.gz outm=~{samplename}.matched_phix.fq ref=/bbmap/resources/phix174_ill.ref.fa.gz k=31 hdist=1 stats=~{samplename}.phix.stats.txt 57 | >>> 58 | output
{ 59 | File read1_clean = "~{samplename}_1.clean.fastq.gz" 60 | File adapter_stats = "~{samplename}.adapters.stats.txt" 61 | File phiX_stats = "~{samplename}.phix.stats.txt" 62 | String bbduk_docker = docker 63 | String pipeline_date = read_string("DATE") 64 | } 65 | runtime { 66 | docker: "~{docker}" 67 | memory: "~{mem_size_gb} GB" 68 | cpu: 4 69 | disks: "local-disk " + disk_size + " SSD" 70 | disk: disk_size + " GB" 71 | preemptible: 0 72 | maxRetries: 3 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /tasks/species_typing/task_genotyphi.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task genotyphi { 4 | # Inputs 5 | input { 6 | File read1 7 | File? read2 8 | Boolean ont_data=false 9 | String samplename 10 | String genotyphi_docker_image = "staphb/mykrobe:0.11.0" 11 | Int disk_size = 100 12 | Int cpu = 4 13 | } 14 | command <<< 15 | # Print and save versions 16 | mykrobe --version | sed 's|mykrobe v||g' | tee MYKROBE_VERSION 17 | # super ugly oneliner since "python /genotyphi/genotyphi.py --version" does NOT work due to python syntax error 18 | grep '__version__ =' /genotyphi/genotyphi.py | sed "s|__version__ = '||" | sed "s|'||" | tee GENOTYPHI_VERSION 19 | 20 | # Run Mykrobe on the input read data 21 | mykrobe predict \ 22 | -t ~{cpu} \ 23 | --sample ~{samplename} \ 24 | --species typhi \ 25 | --format json \ 26 | --out ~{samplename}.mykrobe_genotyphi.json \ 27 | ~{true='--ont' false='' ont_data} \ 28 | --seq ~{read1} ~{read2} 29 | 30 | # use genotyphi script to produce TSV 31 | python /genotyphi/parse_typhi_mykrobe.py \ 32 | --jsons ~{samplename}.mykrobe_genotyphi.json \ 33 | --prefix ~{samplename}_mykrobe_genotyphi 34 | 35 | # Run a python block to parse output file for terra data tables 36 | python3 <>> 55 | output { 56 | File genotyphi_report_tsv = "./~{samplename}_mykrobe_genotyphi_predictResults.tsv" 57 | File genotyphi_mykrobe_json = "./~{samplename}.mykrobe_genotyphi.json" 58 | String genotyphi_version = read_string("GENOTYPHI_VERSION") 59 | String genotyphi_species = read_string("SPECIES") 60 | Float genotyphi_st_probes_percent_coverage = read_string("SPP_PERCENT") 61 | String genotyphi_final_genotype = read_string("FINAL_GENOTYPE") 62 | String genotyphi_genotype_confidence = read_string("CONFIDENCE") 63 | } 64 | runtime { 65 | docker: "~{genotyphi_docker_image}" 66 | memory: "8 GB" 67 | cpu: cpu 68 | disks: "local-disk " + disk_size + " SSD" 69 | disk: disk_size + " GB" 70 | preemptible: 0 71 | maxRetries: 3 72 | } 73 | } -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_ksnp3.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ksnp3 { 4 | input { 5 | Array[File] assembly_fasta 6 | Array[String] samplename 7 | String cluster_name 8 | Int kmer_size = 19 9 | String? 
ksnp3_args = "" # add -ML to calculate a maximum likelihood tree or -NJ to calculate a neighbor-joining tree 10 | String docker_image = "quay.io/staphb/ksnp3:3.1" 11 | Int memory = 8 12 | Int cpu = 4 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | assembly_array=(~{sep=' ' assembly_fasta}) 17 | assembly_array_len=$(echo "${#assembly_array[@]}") 18 | samplename_array=(~{sep=' ' samplename}) 19 | samplename_array_len=$(echo "${#samplename_array[@]}") 20 | 21 | # Ensure assembly, and samplename arrays are of equal length 22 | if [ "$assembly_array_len" -ne "$samplename_array_len" ]; then 23 | echo "Assembly array (length: $assembly_array_len) and samplename array (length: $samplename_array_len) are of unequal length." >&2 24 | exit 1 25 | fi 26 | 27 | # create file of filenames for kSNP3 input 28 | touch ksnp3_input.tsv 29 | for index in ${!assembly_array[@]}; do 30 | assembly=${assembly_array[$index]} 31 | samplename=${samplename_array[$index]} 32 | echo -e "${assembly}\t${samplename}" >> ksnp3_input.tsv 33 | done 34 | # run ksnp3 on input assemblies 35 | kSNP3 -in ksnp3_input.tsv -outdir ksnp3 -k ~{kmer_size} -core -vcf ~{ksnp3_args} 36 | 37 | # rename ksnp3 outputs with cluster name 38 | mv -v ksnp3/core_SNPs_matrix.fasta ksnp3/~{cluster_name}_core_SNPs_matrix.fasta 39 | mv -v ksnp3/tree.core.tre ksnp3/~{cluster_name}_core.nwk 40 | mv -v ksnp3/VCF.*.vcf ksnp3/~{cluster_name}_core.vcf 41 | mv -v ksnp3/SNPs_all_matrix.fasta ksnp3/~{cluster_name}_pan_SNPs_matrix.fasta 42 | mv -v ksnp3/tree.parsimony.tre ksnp3/~{cluster_name}_pan_parsimony.nwk 43 | 44 | if [ -f ksnp3/tree.ML.tre ]; then 45 | mv -v ksnp3/tree.ML.tre ksnp3/~{cluster_name}_ML.nwk 46 | fi 47 | if [ -f ksnp3/tree.NJ.tre ]; then 48 | mv -v ksnp3/tree.NJ.tre ksnp3/~{cluster_name}_NJ.nwk 49 | fi 50 | 51 | >>> 52 | output { 53 | File ksnp3_core_matrix = "ksnp3/${cluster_name}_core_SNPs_matrix.fasta" 54 | File ksnp3_core_tree = "ksnp3/${cluster_name}_core.nwk" 55 | File ksnp3_core_vcf = "ksnp3/${cluster_name}_core.vcf" 56 | File ksnp3_pan_matrix = "ksnp3/~{cluster_name}_pan_SNPs_matrix.fasta" 57 | File ksnp3_pan_parsimony_tree = "ksnp3/~{cluster_name}_pan_parsimony.nwk" 58 | File? ksnp3_ml_tree = "ksnp3/~{cluster_name}_ML.nwk" 59 | File? ksnp3_nj_tree = "ksnp3/~{cluster_name}_NJ.nwk" 60 | File number_snps = "ksnp3/COUNT_SNPs" 61 | Array[File] ksnp_outs = glob("ksnp3/*") 62 | String ksnp3_docker_image = docker_image 63 | } 64 | runtime { 65 | docker: docker_image 66 | memory: "~{memory} GB" 67 | cpu: cpu 68 | disks: "local-disk " + disk_size + " SSD" 69 | disk: disk_size + " GB" 70 | preemptible: 0 71 | maxRetries: 3 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /workflows/de_novo_assembly.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow de_novo_assembly { 4 | 5 | input { 6 | String SRR 7 | File read1 8 | File read2 9 | } 10 | 11 | call seqyclean { 12 | input: 13 | samplename=SRR, 14 | read1=read1, 15 | read2=read2 16 | } 17 | 18 | call shovill { 19 | input: 20 | samplename=SRR, 21 | read1_cleaned=seqyclean.read1_cleaned, 22 | read2_cleaned=seqyclean.read2_cleaned 23 | } 24 | 25 | output { 26 | File read1_cleaned =seqyclean.read1_cleaned 27 | File read2_cleaned =seqyclean.read2_cleaned 28 | File contigs_fasta =shovill.contigs_fasta 29 | File contigs_gfa =shovill.contigs_gfa 30 | } 31 | } 32 | 33 | task seqyclean { 34 | 35 | input { 36 | File read1 37 | File read2 38 | String samplename 39 | File? 
adapters 40 | Int? seqyclean_minlen=25 41 | String? seqyclean_qual="20 20" 42 | Boolean? compress=true 43 | Boolean? seqyclean_dup=false 44 | Boolean? seqyclean_no_adapter_trim=false 45 | } 46 | 47 | command { 48 | seqyclean --version | head -1 | tee VERSION 49 | seqyclean \ 50 | ${'-minlen ' + seqyclean_minlen} \ 51 | ${'-qual ' + seqyclean_qual} \ 52 | ${'-c ' + adapters} \ 53 | ${true="-dup" false="" seqyclean_dup} \ 54 | ${true="-no_adapter_trim " false="" seqyclean_no_adapter_trim} \ 55 | ${true="-gz " false="" compress} \ 56 | ${'-1 ' + read1} \ 57 | ${'-2 ' + read2} \ 58 | ${'-o ' + samplename} 59 | } 60 | 61 | output { 62 | File read1_cleaned = "${samplename}_PE1.fastq.gz" 63 | File read2_cleaned = "${samplename}_PE2.fastq.gz" 64 | String seqyclean_version = read_string("VERSION") 65 | } 66 | 67 | runtime { 68 | docker: "quay.io/staphb/seqyclean:1.10.09" 69 | memory: "8 GB" 70 | cpu: 2 71 | disks: "local-disk 100 SSD" 72 | preemptible: 0 73 | } 74 | } 75 | 76 | task shovill { 77 | 78 | input { 79 | File read1_cleaned 80 | File read2_cleaned 81 | String samplename 82 | } 83 | 84 | command { 85 | shovill --version | head -1 | tee VERSION 86 | shovill \ 87 | --outdir out \ 88 | --R1 ${read1_cleaned} \ 89 | --R2 ${read2_cleaned} 90 | mv out/contigs.fa out/${samplename}_contigs.fasta 91 | mv out/contigs.gfa out/${samplename}_contigs.gfa 92 | } 93 | 94 | output { 95 | File contigs_fasta = "out/${samplename}_contigs.fasta" 96 | File contigs_gfa = "out/${samplename}_contigs.gfa" 97 | String shovill_version = read_string("VERSION") 98 | } 99 | 100 | runtime { 101 | docker: "quay.io/staphb/shovill:1.1.0" 102 | memory: "16 GB" 103 | cpu: 4 104 | disks: "local-disk 100 SSD" 105 | preemptible: 0 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /workflows/wf_ksnp3.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_ksnp3.wdl" as ksnp3 4 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists 5 | import "../tasks/task_versioning.wdl" as versioning 6 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary 7 | 8 | workflow ksnp3_workflow { 9 | input { 10 | Array[File] assembly_fasta 11 | Array[String] samplename 12 | String cluster_name 13 | String? data_summary_terra_project 14 | String? data_summary_terra_workspace 15 | String? data_summary_terra_table 16 | String? 
data_summary_column_names # string of comma delimited column names 17 | } 18 | call ksnp3.ksnp3 as ksnp3_task { 19 | input: 20 | assembly_fasta = assembly_fasta, 21 | samplename = samplename, 22 | cluster_name = cluster_name 23 | } 24 | call snp_dists.snp_dists as core_snp_dists { 25 | input: 26 | cluster_name = cluster_name, 27 | alignment = ksnp3_task.ksnp3_core_matrix 28 | } 29 | call snp_dists.snp_dists as pan_snp_dists { 30 | input: 31 | cluster_name = cluster_name, 32 | alignment = ksnp3_task.ksnp3_pan_matrix 33 | } 34 | call snp_dists.reorder_matrix as core_reorder_matrix { 35 | input: 36 | input_tree = ksnp3_task.ksnp3_core_tree, 37 | matrix = core_snp_dists.snp_matrix, 38 | cluster_name = cluster_name + "_core" 39 | } 40 | call snp_dists.reorder_matrix as pan_reorder_matrix { 41 | input: 42 | input_tree = ksnp3_task.ksnp3_pan_parsimony_tree, 43 | matrix = pan_snp_dists.snp_matrix, 44 | cluster_name = cluster_name + "_pan" 45 | } 46 | if (defined(data_summary_column_names)) { 47 | call data_summary.summarize_data { 48 | input: 49 | sample_names = samplename, 50 | terra_project = data_summary_terra_project, 51 | terra_workspace = data_summary_terra_workspace, 52 | terra_table = data_summary_terra_table, 53 | column_names = data_summary_column_names, 54 | output_prefix = cluster_name 55 | } 56 | } 57 | call versioning.version_capture{ 58 | input: 59 | } 60 | output { 61 | # Version Capture 62 | String ksnp3_wf_version = version_capture.phbg_version 63 | String ksnp3_wf_analysis_date = version_capture.date 64 | String ksnp3_docker = ksnp3_task.ksnp3_docker_image 65 | # ksnp3_outputs 66 | String ksnp3_snp_dists_version = pan_snp_dists.version 67 | File ksnp3_core_vcf = ksnp3_task.ksnp3_core_vcf 68 | # ordered matrixes and reordered trees 69 | File ksnp3_core_snp_matrix = core_reorder_matrix.ordered_matrix 70 | File ksnp3_core_tree = core_reorder_matrix.tree 71 | File ksnp3_pan_snp_matrix = pan_reorder_matrix.ordered_matrix 72 | File ksnp3_pan_tree = pan_reorder_matrix.tree 73 | # optional tree outputs 74 | File? ksnp3_ml_tree = ksnp3_task.ksnp3_ml_tree 75 | File? ksnp3_nj_tree = ksnp3_task.ksnp3_nj_tree 76 | # data summary output 77 | File? ksnp3_summarized_data = summarize_data.summarized_data 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /tasks/species_typing/task_shigatyper.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task shigatyper { 4 | meta { 5 | description: "ShigaTyper is a quick and easy tool designed to determine Shigella serotype using Illumina (single or paired-end) or Oxford Nanopore reads with low computation requirement. https://github.com/CFSAN-Biostatistics/shigatyper" 6 | } 7 | input { 8 | File read1 9 | File? read2 10 | String samplename 11 | String docker = "staphb/shigatyper:2.0.3" 12 | Int disk_size = 100 13 | Int cpus = 4 14 | Boolean read1_is_ont = false 15 | } 16 | command <<< 17 | # get version information 18 | shigatyper --version | sed 's/ShigaTyper //' | tee VERSION.txt 19 | 20 | # if read2 DOES NOT EXIST, ASSUME SINGLE END OR ONT 21 | if [ -z "~{read2}" ] ; then 22 | INPUT_READS="--SE ~{read1}" 23 | # if read1_is_ont is set to TRUE, then use ONT flags 24 | if [ "~{read1_is_ont}" == "true" ]; then 25 | INPUT_READS="--SE ~{read1} --ont" 26 | fi 27 | # else read2 DOES EXIST, ASSUME PAIRED END 28 | else 29 | INPUT_READS="--R1 ~{read1} --R2 ~{read2}" 30 | fi 31 | echo "INPUT_READS set to: ${INPUT_READS}" 32 | echo 33 | 34 | # run shigatyper. 
2 output files will be ~{samplename}.tsv and ~{samplename}-hits.tsv 35 | echo "Running ShigaTyper..." 36 | shigatyper \ 37 | ${INPUT_READS} \ 38 | -n ~{samplename} 39 | 40 | # rename output TSVs to be more descriptive 41 | mv -v ~{samplename}.tsv ~{samplename}_shigatyper_summary.tsv 42 | mv -v ~{samplename}-hits.tsv ~{samplename}_shigatyper_hits.tsv 43 | 44 | # parse summary tsv for prediction, ipaB absence/presence, and notes 45 | cut -f 2 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_prediction.txt 46 | cut -f 3 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_ipaB_presence_absence.txt 47 | cut -f 4 ~{samplename}_shigatyper_summary.tsv | tail -n 1 > shigatyper_notes.txt 48 | 49 | # if shigatyper notes field (really the txt file) is EMPTY, write string saying it is empty to float to Terra table 50 | if [ "$(cat shigatyper_notes.txt)" == "" ]; then 51 | echo "ShigaTyper notes field was empty" > shigatyper_notes.txt 52 | fi 53 | 54 | >>> 55 | output { 56 | String shigatyper_predicted_serotype = read_string("shigatyper_prediction.txt") 57 | String shigatyper_ipaB_presence_absence = read_string("shigatyper_ipaB_presence_absence.txt") 58 | String shigatyper_notes = read_string("shigatyper_notes.txt") 59 | File shigatyper_hits_tsv = "~{samplename}_shigatyper_hits.tsv" # A tab-delimited detailed report file 60 | File shigatyper_summary_tsv = "~{samplename}_shigatyper_summary.tsv" # A tab-delimited summary report file 61 | String shigatyper_version = read_string("VERSION.txt") 62 | String shigatyper_docker = docker 63 | } 64 | runtime { 65 | docker: "~{docker}" 66 | memory: "16 GB" 67 | cpu: cpus 68 | disks: "local-disk " + disk_size + " SSD" 69 | disk: disk_size + " GB" 70 | maxRetries: 3 71 | preemptible: 0 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /tasks/quality_control/task_fastp.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task fastp { 4 | input { 5 | File read1 6 | File read2 7 | String samplename 8 | String docker = "quay.io/staphb/fastp:0.23.2" 9 | Int disk_size = 100 10 | Int fastp_window_size = 20 11 | Int fastp_quality_trim_score = 30 12 | Int fastp_minlen = 50 13 | # -g enables polyg trimming with default value of 10 14 | String fastp_args = "--detect_adapter_for_pe -g -5 20 -3 20" 15 | Int threads = 4 16 | } 17 | command <<< 18 | # date 19 | date | tee DATE 20 | 21 | fastp \ 22 | --in1 ~{read1} --in2 ~{read2} \ 23 | --out1 ~{samplename}_1P.fastq.gz --out2 ~{samplename}_2P.fastq.gz \ 24 | --unpaired1 ~{samplename}_1U.fastq.gz --unpaired2 ~{samplename}_2U.fastq.gz \ 25 | --cut_right --cut_right_window_size ~{fastp_window_size} --cut_right_mean_quality ~{fastp_quality_trim_score} \ 26 | --length_required ~{fastp_minlen} \ 27 | --thread ~{threads} \ 28 | ~{fastp_args} \ 29 | --html ~{samplename}_fastp.html --json ~{samplename}_fastp.json 30 | >>> 31 | output { 32 | File read1_trimmed = "~{samplename}_1P.fastq.gz" 33 | File read2_trimmed = "~{samplename}_2P.fastq.gz" 34 | File read1_trimmed_unpaired = "~{samplename}_1U.fastq.gz" 35 | File read2_trimmed_unpaired = "~{samplename}_2U.fastq.gz" 36 | File fastp_stats = "~{samplename}_fastp.html" 37 | String version = "~{docker}" 38 | String pipeline_date = read_string("DATE") 39 | } 40 | runtime { 41 | docker: "~{docker}" 42 | memory: "8 GB" 43 | cpu: 4 44 | disks: "local-disk " + disk_size + " SSD" 45 | disk: disk_size + " GB" 46 | preemptible: 0 47 | maxRetries: 3 48 | } 49
| } 50 | 51 | task fastp_se { 52 | input { 53 | File read1 54 | String samplename 55 | String docker = "quay.io/staphb/fastp:0.23.2" 56 | Int disk_size = 100 57 | Int fastp_window_size = 20 58 | Int fastp_quality_trim_score = 30 59 | Int fastp_minlen = 50 60 | # -g enables polyg trimming with default value of 10 61 | # --detect_adapter_for_pe argument was removed 62 | String fastp_args = "-g -5 20 -3 20" 63 | Int threads = 4 64 | } 65 | command <<< 66 | # date 67 | date | tee DATE 68 | 69 | fastp \ 70 | --in1 ~{read1} \ 71 | --out1 ~{samplename}_1P.fastq.gz \ 72 | --cut_right --cut_right_window_size ~{fastp_window_size} --cut_right_mean_quality ~{fastp_quality_trim_score} \ 73 | --length_required ~{fastp_minlen} \ 74 | --thread ~{threads} \ 75 | ~{fastp_args} \ 76 | --html ~{samplename}_fastp.html --json ~{samplename}_fastp.json 77 | >>> 78 | output { 79 | File read1_trimmed = "~{samplename}_1P.fastq.gz" 80 | File fastp_stats = "~{samplename}_fastp.html" 81 | String version = "~{docker}" 82 | String pipeline_date = read_string("DATE") 83 | } 84 | runtime { 85 | docker: "~{docker}" 86 | memory: "8 GB" 87 | cpu: 4 88 | disks: "local-disk " + disk_size + " SSD" 89 | disk: disk_size + " GB" 90 | preemptible: 0 91 | maxRetries: 3 92 | } 93 | } -------------------------------------------------------------------------------- /workflows/ecoli_char.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow ecoli_char { 4 | 5 | input { 6 | String SRR 7 | File contigs 8 | } 9 | 10 | call abricate as abricate { 11 | input: 12 | samplename=SRR, 13 | contigs=contigs, 14 | database="ncbi" 15 | } 16 | 17 | call abricate as abricate_virfinder { 18 | input: 19 | samplename=SRR, 20 | contigs=contigs, 21 | database="ecoli_vf" 22 | } 23 | 24 | call amrfinderplus { 25 | input: 26 | samplename=SRR, 27 | contigs=contigs 28 | } 29 | 30 | call serotypefinder { 31 | input: 32 | samplename=SRR, 33 | contigs=contigs 34 | } 35 | 36 | output { 37 | File abricate_results = abricate.abricate_results 38 | File abricate_virfinder_results = abricate_virfinder.abricate_results 39 | File amrfinderplus_results = amrfinderplus.amrfinder_results 40 | File serotypefinder_results = serotypefinder.serotypefinder_results 41 | } 42 | } 43 | 44 | task abricate { 45 | 46 | input { 47 | File contigs 48 | String samplename 49 | String database 50 | } 51 | 52 | command { 53 | abricate --version | head -1 | tee VERSION 54 | abricate --db ${database} ${contigs} > ${samplename + '_abricate.tsv'} 55 | } 56 | 57 | output { 58 | File abricate_results="${samplename + '_abricate.tsv'}" 59 | } 60 | 61 | runtime { 62 | docker: "quay.io/staphb/abricate:1.0.0" 63 | memory: "8 GB" 64 | cpu: 2 65 | disks: "local-disk 100 SSD" 66 | preemptible: 0 67 | } 68 | } 69 | 70 | task amrfinderplus { 71 | input { 72 | File contigs 73 | String samplename 74 | } 75 | 76 | command { 77 | amrfinder --version | head -1 | tee VERSION 78 | amrfinder \ 79 | --nucleotide ${contigs} \ 80 | -o ${samplename + '_amrfinder.tsv'} 81 | } 82 | 83 | output { 84 | File amrfinder_results="${samplename + '_amrfinder.tsv'}" 85 | } 86 | 87 | runtime { 88 | docker: "quay.io/staphb/ncbi-amrfinderplus:3.8.28" 89 | memory: "8 GB" 90 | cpu: 2 91 | disks: "local-disk 100 SSD" 92 | preemptible: 0 93 | } 94 | } 95 | 96 | task serotypefinder { 97 | 98 | input { 99 | File contigs 100 | String samplename 101 | } 102 | 103 | command { 104 | serotypefinder.pl --version | head -1 | tee VERSION 105 | serotypefinder.pl \ 106 | -i
${contigs} \ 107 | -d /serotypefinder/database \ 108 | -b /blast-2.2.26 \ 109 | -s ecoli \ 110 | -k 85.00 \ 111 | -l 0.60 \ 112 | -o ${samplename} 113 | } 114 | 115 | output { 116 | File serotypefinder_results="${samplename}/results_table.txt" 117 | } 118 | 119 | runtime { 120 | docker: "quay.io/staphb/serotypefinder:1.1" 121 | memory: "8 GB" 122 | cpu: 2 123 | disks: "local-disk 100 SSD" 124 | preemptible: 0 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_snp_dists.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task snp_dists { 4 | input { 5 | File alignment 6 | String cluster_name 7 | Int disk_size = 100 8 | } 9 | command <<< 10 | # date and version control 11 | date | tee DATE 12 | snp-dists -v | tee VERSION 13 | 14 | # create snp-dists matrix file 15 | snp-dists ~{alignment} > ~{cluster_name}_snp_distance_matrix.tsv 16 | >>> 17 | output { 18 | String date = read_string("DATE") 19 | String version = read_string("VERSION") 20 | File snp_matrix = "~{cluster_name}_snp_distance_matrix.tsv" 21 | } 22 | runtime { 23 | docker: "quay.io/staphb/snp-dists:0.8.2" 24 | memory: "2 GB" 25 | cpu: 2 26 | disks: "local-disk " + disk_size + " SSD" 27 | disk: disk_size + " GB" 28 | maxRetries: 3 29 | preemptible: 0 30 | } 31 | } 32 | 33 | task reorder_matrix { 34 | input { 35 | File input_tree 36 | File matrix 37 | String cluster_name 38 | Int disk_size = 100 39 | } 40 | command <<< 41 | # removing any "_contigs" suffixes from the tree and matrix 42 | sed 's/_contigs//g' ~{input_tree} > temporary_tree.nwk 43 | sed 's/_contigs//g' ~{matrix} > temporary_matrix.tsv 44 | 45 | python3 <>> 83 | output{ 84 | File ordered_matrix = "~{cluster_name}_snp_matrix.csv" 85 | File tree = "~{cluster_name}_tree.nwk" 86 | } 87 | runtime { 88 | docker: "staphb/mykrobe:0.12.1" # used because it contains both biopython and pandas 89 | memory: "2 GB" 90 | cpu: 2 91 | disks: "local-disk " + disk_size + " SSD" 92 | disk: disk_size + " GB" 93 | # maxRetries: 3 94 | preemptible: 0 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /tasks/quality_control/task_fastq_scan.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task fastq_scan_pe { 4 | input { 5 | File read1 6 | File read2 7 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 8 | String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") 9 | Int disk_size = 100 10 | } 11 | command <<< 12 | # capture date and version 13 | date | tee DATE 14 | fastq-scan -v | tee VERSION 15 | 16 | # set cat command based on compression 17 | if [[ "~{read1}" == *".gz" ]] ; then 18 | cat_reads="zcat" 19 | else 20 | cat_reads="cat" 21 | fi 22 | 23 | # capture forward read stats 24 | eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS) 25 | read1_seqs=$(cat READ1_SEQS) 26 | eval "${cat_reads} ~{read2}" | fastq-scan | tee ~{read2_name}_fastq-scan.json >(jq .qc_stats.read_total > READ2_SEQS) 27 | read2_seqs=$(cat READ2_SEQS) 28 | 29 | # capture number of read pairs 30 | if [ "${read1_seqs}" == "${read2_seqs}" ]; then 31 | read_pairs=${read1_seqs} 32 | else 33 | read_pairs="Uneven pairs: R1=${read1_seqs}, R2=${read2_seqs}" 34 | fi 35 | 36 | echo $read_pairs | tee READ_PAIRS 37 | >>> 38 | output { 39 | File read1_fastq_scan_report 
= "~{read1_name}_fastq-scan.json" 40 | File read2_fastq_scan_report = "~{read2_name}_fastq-scan.json" 41 | Int read1_seq = read_string("READ1_SEQS") 42 | Int read2_seq = read_string("READ2_SEQS") 43 | String read_pairs = read_string("READ_PAIRS") 44 | String version = read_string("VERSION") 45 | String pipeline_date = read_string("DATE") 46 | } 47 | runtime { 48 | docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" 49 | memory: "2 GB" 50 | cpu: 2 51 | disks: "local-disk " + disk_size + " SSD" 52 | disk: disk_size + " GB" 53 | preemptible: 0 54 | maxRetries: 3 55 | } 56 | } 57 | 58 | task fastq_scan_se { 59 | input { 60 | File read1 61 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 62 | Int disk_size = 100 63 | } 64 | command <<< 65 | # capture date and version 66 | date | tee DATE 67 | fastq-scan -v | tee VERSION 68 | 69 | # set cat command based on compression 70 | if [[ "~{read1}" == *".gz" ]] ; then 71 | cat_reads="zcat" 72 | else 73 | cat_reads="cat" 74 | fi 75 | 76 | # capture forward read stats 77 | eval "${cat_reads} ~{read1}" | fastq-scan | tee ~{read1_name}_fastq-scan.json >(jq .qc_stats.read_total > READ1_SEQS) 78 | >>> 79 | output { 80 | File fastq_scan_report = "~{read1_name}_fastq-scan.json" 81 | Int read1_seq = read_string("READ1_SEQS") 82 | String version = read_string("VERSION") 83 | String pipeline_date = read_string("DATE") 84 | } 85 | runtime { 86 | docker: "quay.io/biocontainers/fastq-scan:0.4.4--h7d875b9_1" 87 | memory: "2 GB" 88 | cpu: 2 89 | disks: "local-disk " + disk_size + " SSD" 90 | disk: disk_size + " GB" 91 | preemptible: 0 92 | maxRetries: 3 93 | } 94 | } -------------------------------------------------------------------------------- /tasks/quality_control/task_fastqc.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task fastqc_pe { 4 | input { 5 | File read1 6 | File read2 7 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 8 | String read2_name = basename(basename(basename(read2, ".gz"), ".fastq"), ".fq") 9 | Int? 
cpus = 2 10 | String docker="quay.io/staphb/fastqc:0.11.9" 11 | Int disk_size = 100 12 | } 13 | command <<< 14 | # capture date and version 15 | date | tee DATE 16 | fastqc --version | grep FastQC | tee VERSION 17 | 18 | fastqc --outdir $PWD --threads ~{cpus} ~{read1} ~{read2} 19 | 20 | unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS 21 | unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ2_SEQS 22 | 23 | READ1_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) 24 | READ2_SEQS=$(unzip -p ~{read2_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) 25 | 26 | if [ $READ1_SEQS == $READ2_SEQS ]; then 27 | read_pairs=$READ1_SEQS 28 | else 29 | read_pairs="Uneven pairs: R1=$READ1_SEQS, R2=$READ2_SEQS" 30 | fi 31 | echo $read_pairs | tee READ_PAIRS 32 | >>> 33 | output { 34 | File fastqc1_html = "~{read1_name}_fastqc.html" 35 | File fastqc1_zip = "~{read1_name}_fastqc.zip" 36 | File fastqc2_html = "~{read2_name}_fastqc.html" 37 | File fastqc2_zip = "~{read2_name}_fastqc.zip" 38 | Int read1_seq = read_string("READ1_SEQS") 39 | Int read2_seq = read_string("READ2_SEQS") 40 | String read_pairs = read_string("READ_PAIRS") 41 | String version = read_string("VERSION") 42 | String pipeline_date = read_string("DATE") 43 | } 44 | runtime { 45 | docker: "~{docker}" 46 | memory: "4 GB" 47 | cpu: 2 48 | disks: "local-disk " + disk_size + " SSD" 49 | disk: disk_size + " GB" 50 | maxRetries: 3 51 | preemptible: 0 52 | } 53 | } 54 | 55 | task fastqc_se { 56 | input { 57 | File read1 58 | String read1_name = basename(basename(basename(read1, ".gz"), ".fastq"), ".fq") 59 | Int? cpus = 2 60 | String docker="quay.io/staphb/fastqc:0.11.9" 61 | Int disk_size = 100 62 | } 63 | command <<< 64 | # capture date and version 65 | date | tee DATE 66 | fastqc --version | grep FastQC | tee VERSION 67 | 68 | fastqc --outdir $PWD --threads ~{cpus} ~{read1} 69 | 70 | unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 | tee READ1_SEQS 71 | 72 | READ_SEQS=$(unzip -p ~{read1_name}_fastqc.zip */fastqc_data.txt | grep "Total Sequences" | cut -f 2 ) 73 | >>> 74 | output { 75 | File fastqc_html = "~{read1_name}_fastqc.html" 76 | File fastqc_zip = "~{read1_name}_fastqc.zip" 77 | Int number_reads = read_string("READ1_SEQS") 78 | String version = read_string("VERSION") 79 | String pipeline_date = read_string("DATE") 80 | } 81 | runtime { 82 | docker: "~{docker}" 83 | memory: "4 GB" 84 | cpu: 2 85 | disks: "local-disk " + disk_size + " SSD" 86 | disk: disk_size + " GB" 87 | maxRetries: 3 88 | preemptible: 0 89 | } 90 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_ngmaster.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ngmaster { 4 | meta { 5 | description: "Multi-antigen sequence typing for Neisseria gonorrhoeae" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "staphb/ngmaster:1.0.0" 11 | Int disk_size = 100 12 | Int cpu = 2 13 | } 14 | command <<< 15 | ngmaster --version 2>&1 | sed 's/^.*ngmaster //' | tee VERSION 16 | 17 | # run ngmaster on input assembly 18 | # unfortunately ngmaster 1.0.0 fails when either mincov or minid flags are supplied (this is with different install strategies too - bioconda & manually) 19 | # so we're forced to stick with default minid of 90 and 
mincov of 10. https://github.com/MDU-PHL/ngmaster/issues/39 20 | # ngmaster --comments also does not work 21 | ngmaster \ 22 | ~{assembly} \ 23 | > ~{samplename}.ngmaster.tsv 24 | 25 | # parse output TSV 26 | # first one is tricky since MLSTs are in the 3rd column, separated by a / 27 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $3}' | cut -d '/' -f 1 | tee NGMAST_SEQUENCE_TYPE 28 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $3}' | cut -d '/' -f 2 | tee NGSTAR_SEQUENCE_TYPE 29 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $4}' | tee NGMAST_PORB 30 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $5}' | tee NGMAST_TBPB 31 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $6}' | tee NGSTAR_PENA 32 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $7}' | tee NGSTAR_MTRR 33 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $8}' | tee NGSTAR_PORB 34 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $9}' | tee NGSTAR_PONA 35 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $10}' | tee NGSTAR_GYRA 36 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $11}' | tee NGSTAR_PARC 37 | tail -n 1 ~{samplename}.ngmaster.tsv | awk '{print $12}' | tee NGSTAR_23S 38 | 39 | >>> 40 | output { 41 | File ngmaster_tsv = "~{samplename}.ngmaster.tsv" 42 | String ngmaster_version = read_string("VERSION") 43 | # NG-MAST scheme's MLST and alleles (only 2 loci) 44 | String ngmaster_ngmast_sequence_type = read_string("NGMAST_SEQUENCE_TYPE") 45 | String ngmaster_ngmast_porB_allele = read_string("NGMAST_PORB") 46 | String ngmaster_ngmast_tbpB_allele = read_string("NGMAST_TBPB") 47 | # NG-STAR scheme's MLST and alleles (7 loci) 48 | String ngmaster_ngstar_sequence_type = read_string("NGSTAR_SEQUENCE_TYPE") 49 | String ngmaster_ngstar_penA_allele = read_string("NGSTAR_PENA") 50 | String ngmaster_ngstar_mtrR_allele = read_string("NGSTAR_MTRR") 51 | String ngmaster_ngstar_porB_allele = read_string("NGSTAR_PORB") 52 | String ngmaster_ngstar_ponA_allele = read_string("NGSTAR_PONA") 53 | String ngmaster_ngstar_gyrA_allele = read_string("NGSTAR_GYRA") 54 | String ngmaster_ngstar_parC_allele = read_string("NGSTAR_PARC") 55 | String ngmaster_ngstar_23S_allele = read_string("NGSTAR_23S") 56 | } 57 | runtime { 58 | docker: "~{docker}" 59 | memory: "8 GB" 60 | cpu: cpu 61 | disks: "local-disk " + disk_size + " SSD" 62 | disk: disk_size + " GB" 63 | maxRetries: 3 64 | preemptible: 0 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /tasks/species_typing/task_sonneityping.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task sonneityping { 4 | # Inputs 5 | input { 6 | File read1 7 | File? read2 8 | Boolean ont_data = false 9 | String samplename 10 | String docker = "staphb/mykrobe:0.12.1" 11 | Int disk_size = 100 12 | String? 
mykrobe_opts 13 | Int cpu = 4 14 | } 15 | command <<< 16 | # Print and save versions 17 | mykrobe --version | sed 's|mykrobe v||g' | tee MYKROBE_VERSION.txt 18 | # opting to skip capturing the sonneityping version since there is no --version flag or easy way to determine version 19 | # navigate here for docker image and version information: https://github.com/StaPH-B/docker-builds/tree/master/mykrobe 20 | 21 | # Run Mykrobe on the input read data 22 | mykrobe predict \ 23 | -t ~{cpu} \ 24 | --sample ~{samplename} \ 25 | --species sonnei \ 26 | --format json_and_csv \ 27 | --out ~{samplename}.mykrobe \ 28 | ~{true='--ont' false='' ont_data} \ 29 | --seq ~{read1} ~{read2} \ 30 | ~{mykrobe_opts} 31 | 32 | # use sonneityping script to produce final TSV; alleles.txt is required input for human-readable genotype names 33 | python /sonneityping/parse_mykrobe_predict.py \ 34 | --jsons ~{samplename}.mykrobe.json --alleles /sonneityping/alleles.txt \ 35 | --prefix ~{samplename}.sonneityping 36 | 37 | # rename output TSV to something prettier 38 | mv -v ~{samplename}.sonneityping_predictResults.tsv ~{samplename}.sonneityping.tsv 39 | 40 | # Run a python block to parse output sonneityping TSV file for terra data tables 41 | python3 <>> 60 | output { 61 | File sonneityping_mykrobe_report_csv = "~{samplename}.mykrobe.csv" 62 | File sonneityping_mykrobe_report_json = "~{samplename}.mykrobe.json" 63 | File sonneityping_final_report_tsv = "~{samplename}.sonneityping.tsv" 64 | String sonneityping_mykrobe_version = read_string("MYKROBE_VERSION.txt") 65 | String sonneityping_mykrobe_docker = docker 66 | String sonneityping_species = read_string("SPECIES.txt") 67 | String sonneityping_final_genotype = read_string("FINAL_GENOTYPE.txt") 68 | String sonneityping_genotype_confidence = read_string("CONFIDENCE.txt") 69 | String sonneityping_genotype_name = read_string("GENOTYPE_NAME.txt") 70 | } 71 | runtime { 72 | docker: "~{docker}" 73 | memory: "8 GB" 74 | cpu: cpu 75 | disks: "local-disk " + disk_size + " SSD" 76 | disk: disk_size + " GB" 77 | maxRetries: 3 78 | preemptible: 0 79 | } 80 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_ts_mlst.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task ts_mlst { 4 | meta { 5 | description: "Torsten Seemann's (TS) automatic MLST calling from assembled contigs" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "staphb/mlst:2.23.0" 11 | Int disk_size = 100 12 | Int cpu = 4 13 | # Parameters 14 | # --nopath Strip filename paths from FILE column (default OFF) 15 | # --scheme [X] Don't autodetect, force this scheme on all inputs (default '') 16 | # --minid [n.n] DNA %identity of full allele to consider 'similar' [~] (default '95') 17 | # --mincov [n.n] DNA %cov to report partial allele at all [?] (default '10') 18 | # --minscore [n.n] Minimum score out of 100 to match a scheme (when auto --scheme) (default '50') 19 | Boolean nopath = true 20 | String? scheme 21 | Float? minid 22 | Float? mincov 23 | Float?
minscore 24 | } 25 | command <<< 26 | echo $(mlst --version 2>&1) | sed 's/mlst //' | tee VERSION 27 | 28 | #create output header 29 | echo -e "Filename\tPubMLST_Scheme_name\tSequence_Type_(ST)\tAllele_IDs" > ~{samplename}_ts_mlst.tsv 30 | 31 | mlst \ 32 | --threads ~{cpu} \ 33 | ~{true="--nopath" false="" nopath} \ 34 | ~{'--scheme ' + scheme} \ 35 | ~{'--minid ' + minid} \ 36 | ~{'--mincov ' + mincov} \ 37 | ~{'--minscore ' + minscore} \ 38 | --novel ~{samplename}_novel_mlst_alleles.fasta \ 39 | ~{assembly} \ 40 | >> ~{samplename}_ts_mlst.tsv 41 | 42 | # parse ts mlst tsv for relevant outputs 43 | # if output TSV only contains one line (header line); no ST predicted 44 | if [ $(wc -l ~{samplename}_ts_mlst.tsv | awk '{ print $1 }') -eq 1 ]; then 45 | predicted_mlst="No ST predicted" 46 | pubmlst_scheme="NA" 47 | # else, TSV has more than one line, so parse outputs 48 | else 49 | pubmlst_scheme="$(cut -f2 ~{samplename}_ts_mlst.tsv | tail -n 1)" 50 | predicted_mlst="ST$(cut -f3 ~{samplename}_ts_mlst.tsv | tail -n 1)" 51 | # allelic_profile: take second line of output TSV; cut to take 4th column and beyond; replace tabs with commas 52 | allelic_profile="$(cut -f 4- ~{samplename}_ts_mlst.tsv | tail -n 1 | sed -e 's|\t|,|g')" 53 | if [ "$pubmlst_scheme" == "-" ]; then 54 | predicted_mlst="No ST predicted" 55 | pubmlst_scheme="NA" 56 | else 57 | if [ "$predicted_mlst" == "ST-" ]; then 58 | predicted_mlst="No ST predicted" 59 | fi 60 | fi 61 | fi 62 | 63 | echo "$predicted_mlst" | tee PREDICTED_MLST 64 | echo "$pubmlst_scheme" | tee PUBMLST_SCHEME 65 | echo "$allelic_profile" | tee ALLELIC_PROFILE.txt 66 | >>> 67 | output { 68 | File ts_mlst_results = "~{samplename}_ts_mlst.tsv" 69 | String ts_mlst_predicted_st = read_string("PREDICTED_MLST") 70 | String ts_mlst_pubmlst_scheme = read_string("PUBMLST_SCHEME") 71 | String ts_mlst_allelic_profile = read_string("ALLELIC_PROFILE.txt") 72 | File? ts_mlst_novel_alleles = "~{samplename}_novel_mlst_alleles.fasta" 73 | String ts_mlst_version = read_string("VERSION") 74 | } 75 | runtime { 76 | docker: "~{docker}" 77 | memory: "8 GB" 78 | cpu: 4 79 | disks: "local-disk " + disk_size + " SSD" 80 | disk: disk_size + " GB" 81 | maxRetries: 3 82 | preemptible: 0 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /tasks/taxon_id/task_midas.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task midas { 4 | input { 5 | File read1 6 | File? read2 7 | File midas_db = "gs://theiagen-public-files-rp/terra/theiaprok-files/midas/midas_db_v1.2.tar.gz" 8 | Int disk_size = 100 9 | String samplename 10 | String docker = "quay.io/fhcrc-microbiome/midas:v1.3.2--6" 11 | Int? memory = 32 12 | Int? 
cpu = 4 13 | } 14 | command <<< 15 | date | tee DATE 16 | 17 | # Decompress the Midas database 18 | mkdir db 19 | tar -C ./db/ -xzvf ~{midas_db} 20 | 21 | # Run Midas 22 | run_midas.py species ~{samplename} -1 ~{read1} ~{'-2 ' + read2} -d db/midas_db_v1.2/ -t ~{cpu} 23 | 24 | # rename output files 25 | mv ~{samplename}/species/species_profile.txt ~{samplename}/species/~{samplename}_species_profile.tsv 26 | mv ~{samplename}/species/log.txt ~{samplename}/species/~{samplename}_log.txt 27 | 28 | # Run a python block to parse output file for terra data tables 29 | # pandas is available in default docker image for python2 but not python3 30 | python2 <>> 64 | output { 65 | String midas_docker = docker 66 | String midas_analysis_date = read_string("DATE") 67 | File midas_report = "~{samplename}/species/~{samplename}_species_profile.tsv" 68 | File midas_log = "~{samplename}/species/~{samplename}_log.txt" 69 | String midas_primary_genus = read_string("PRIMARY_GENUS") 70 | String midas_secondary_genus = read_string("SECONDARY_GENUS") 71 | Float midas_secondary_genus_abundance = read_string("SECONDARY_GENUS_ABUNDANCE") 72 | } 73 | runtime { 74 | docker: "~{docker}" 75 | memory: "~{memory} GB" 76 | cpu: cpu 77 | disks: "local-disk " + disk_size + " SSD" 78 | disk: disk_size + " GB" 79 | maxRetries: 3 80 | preemptible: 0 81 | } 82 | } -------------------------------------------------------------------------------- /.github/workflows/pytest-workflows.yml: -------------------------------------------------------------------------------- 1 | # 2 | # This workflow will run on Pushes and Pull Requests against the main branch. It 3 | # will run pytest with MiniWDL and Cromwell for any workflows with a change to 4 | # them or associated tasks. 5 | # 6 | name: Pytest Workflows 7 | on: 8 | push: 9 | branches: [main] 10 | pull_request: 11 | branches: [main] 12 | 13 | jobs: 14 | changes: 15 | name: Check for changes 16 | runs-on: ubuntu-latest 17 | outputs: 18 | # Expose workflows with changes 19 | workflows: ${{ steps.filter.outputs.changes }} 20 | steps: 21 | # Checkout the repo 22 | - uses: actions/checkout@v3 23 | 24 | # Select workflows with changes 25 | - uses: dorny/paths-filter@v2 26 | id: filter 27 | with: 28 | filters: "tests/config/pytest_filter.yml" 29 | 30 | check: 31 | runs-on: ubuntu-20.04 32 | name: ${{ matrix.tag }} ${{ matrix.engine }} 33 | needs: changes 34 | if: ${{ needs.changes.outputs.workflows != '[]' && needs.changes.outputs.workflows != '' }} 35 | strategy: 36 | fail-fast: false 37 | matrix: 38 | # For every workflow, test it with MiniWDL and Cromwell 39 | tag: ["${{ fromJson(needs.changes.outputs.workflows) }}"] 40 | engine: ["miniwdl", "cromwell"] 41 | defaults: 42 | run: 43 | # Play nicely with miniconda 44 | shell: bash -l {0} 45 | steps: 46 | # Checkout the repo 47 | - name: Checkout theiagen/public_health_bacterial_genomics 48 | uses: actions/checkout@v3 49 | 50 | # Import test data 51 | - name: Pull Test Data from bactopia/bactopia-tests 52 | uses: actions/checkout@v3 53 | with: 54 | repository: bactopia/bactopia-tests 55 | path: bactopia-tests 56 | 57 | # Setup Miniconda3 58 | - name: Setup miniconda 59 | uses: conda-incubator/setup-miniconda@v2 60 | with: 61 | activate-environment: actions 62 | auto-activate-base: false 63 | 64 | # Depends and env info (mostly for debug) 65 | - name: Install Dependencies 66 | run: | 67 | conda install -y -c conda-forge -c bioconda cromwell miniwdl=1.5.2 'python>=3.7' pytest pytest-workflow 'importlib-metadata<=4.13.0' 68 | uname -a && env 69 | 70 
| - name: Test ${{ matrix.tag }} 71 | run: TMPDIR=~ pytest --tag ${{ matrix.tag }}_${{ matrix.engine }} --symlink --kwdof --color=yes 72 | 73 | - name: Upload logs on failure 74 | if: failure() 75 | uses: actions/upload-artifact@v3 76 | with: 77 | name: logs-${{ matrix.engine }} 78 | path: | 79 | /home/runner/pytest_workflow_*/**/stdout* 80 | /home/runner/pytest_workflow_*/**/stderr* 81 | /home/runner/pytest_workflow_*/**/script* 82 | /home/runner/pytest_workflow_*/**/rc 83 | /home/runner/pytest_workflow_*/**/command 84 | /home/runner/pytest_workflow_*/**/*.txt 85 | /home/runner/pytest_workflow_*/**/*.log 86 | /home/runner/pytest_workflow_*/**/*.out 87 | /home/runner/pytest_workflow_*/**/*.err 88 | /home/runner/pytest_workflow_*/**/DATE 89 | /home/runner/pytest_workflow_*/**/VERSION 90 | !/home/runner/pytest_workflow_*/**/*.bam* 91 | !/home/runner/pytest_workflow_*/**/*.fastq.gz 92 | -------------------------------------------------------------------------------- /workflows/wf_read_QC_trim.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic 4 | import "../tasks/quality_control/task_fastp.wdl" as fastp 5 | import "../tasks/quality_control/task_bbduk.wdl" as bbduk 6 | import "../tasks/quality_control/task_fastq_scan.wdl" as fastq_scan 7 | import "../tasks/taxon_id/task_midas.wdl" as midas 8 | 9 | workflow read_QC_trim { 10 | meta { 11 | description: "Runs basic QC (fastq_scan), trimming (Trimmomatic), and adapter removal (bbduk) on illumina PE reads" 12 | } 13 | 14 | input { 15 | String samplename 16 | File read1_raw 17 | File read2_raw 18 | Int trim_window_size = 10 19 | Int trim_quality_trim_score = 20 20 | Int trim_minlen = 75 21 | Int bbduk_mem = 8 22 | Boolean call_midas = false 23 | File? 
midas_db 24 | String read_processing = "trimmomatic" 25 | String fastp_args = "--detect_adapter_for_pe -g -5 20 -3 20" 26 | } 27 | if (read_processing == "trimmomatic"){ 28 | call trimmomatic.trimmomatic_pe { 29 | input: 30 | samplename = samplename, 31 | read1 = read1_raw, 32 | read2 = read2_raw, 33 | trimmomatic_window_size = trim_window_size, 34 | trimmomatic_quality_trim_score = trim_quality_trim_score, 35 | trimmomatic_minlen = trim_minlen 36 | } 37 | } 38 | if (read_processing == "fastp"){ 39 | call fastp.fastp { 40 | input: 41 | samplename = samplename, 42 | read1 = read1_raw, 43 | read2 = read2_raw, 44 | fastp_window_size = trim_window_size, 45 | fastp_quality_trim_score = trim_quality_trim_score, 46 | fastp_minlen = trim_minlen, 47 | fastp_args = fastp_args 48 | } 49 | } 50 | call bbduk.bbduk_pe { 51 | input: 52 | samplename = samplename, 53 | read1_trimmed = select_first([trimmomatic_pe.read1_trimmed,fastp.read1_trimmed]), 54 | read2_trimmed = select_first([trimmomatic_pe.read2_trimmed,fastp.read2_trimmed]), 55 | mem_size_gb = bbduk_mem 56 | } 57 | call fastq_scan.fastq_scan_pe as fastq_scan_raw { 58 | input: 59 | read1 = read1_raw, 60 | read2 = read2_raw, 61 | } 62 | call fastq_scan.fastq_scan_pe as fastq_scan_clean { 63 | input: 64 | read1 = bbduk_pe.read1_clean, 65 | read2 = bbduk_pe.read2_clean 66 | } 67 | if (call_midas) { 68 | call midas.midas as midas { 69 | input: 70 | samplename = samplename, 71 | read1 = read1_raw, 72 | read2 = read2_raw, 73 | midas_db = midas_db 74 | } 75 | } 76 | 77 | output { 78 | File read1_clean = bbduk_pe.read1_clean 79 | File read2_clean = bbduk_pe.read2_clean 80 | Int fastq_scan_raw1 = fastq_scan_raw.read1_seq 81 | Int fastq_scan_raw2 = fastq_scan_raw.read2_seq 82 | String fastq_scan_raw_pairs = fastq_scan_raw.read_pairs 83 | Int fastq_scan_clean1 = fastq_scan_clean.read1_seq 84 | Int fastq_scan_clean2 = fastq_scan_clean.read2_seq 85 | String fastq_scan_clean_pairs = fastq_scan_clean.read_pairs 86 | String fastq_scan_version = fastq_scan_raw.version 87 | String bbduk_docker = bbduk_pe.bbduk_docker 88 | String? trimmomatic_version = trimmomatic_pe.version 89 | String? fastp_version = fastp.version 90 | String? midas_docker = midas.midas_docker 91 | File? midas_report = midas.midas_report 92 | String? midas_primary_genus = midas.midas_primary_genus 93 | String? midas_secondary_genus = midas.midas_secondary_genus 94 | Float? midas_secondary_genus_abundance = midas.midas_secondary_genus_abundance 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /tasks/phylogenetic_inference/task_pirate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task pirate { 4 | input { 5 | Array[File] gff3 6 | String cluster_name 7 | Boolean align = true # align all genes and produce core/pangenome alignments 8 | String steps = "50,60,70,80,90,95,98" # % identity thresholds to use for pangenome construction [default: 50,60,70,80,90,95,98] 9 | String features = "CDS" # features to use for pangenome construction [default: CDS] 10 | Boolean nucl = false # create a pangenome on CDS features using nucleotide identity, default: amino acid identity 11 | String? 
panopt # additional arguments to pass to pangenome_construction 12 | Int memory = 32 13 | Int cpu = 4 14 | String docker_image = "quay.io/biocontainers/pirate:1.0.5--hdfd78af_0" 15 | Int disk_size = 100 16 | } 17 | command <<< 18 | 19 | # date and version control 20 | date | tee DATE 21 | PIRATE -v | tee VERSION 22 | 23 | # pirate requires the directory containing the gff files as input 24 | mkdir INPUT_DIR 25 | ln -s ~{sep=' ' gff3} INPUT_DIR 26 | 27 | # run pirate on input gff 28 | PIRATE \ 29 | --input INPUT_DIR \ 30 | --output PIRATE \ 31 | ~{'--steps ' + steps} \ 32 | ~{'--features ' + features} \ 33 | ~{true="--nucl" false="" nucl} \ 34 | ~{true="--align" false="" align} \ 35 | ~{'--pan-opt ' + panopt} \ 36 | ~{'--threads ' + cpu} 37 | 38 | # generate gene_presence_absence.csv 39 | PIRATE_to_roary.pl -i PIRATE/PIRATE.*.tsv -o ~{cluster_name}_gene_presence_absence.csv 40 | 41 | # rename outputs with cluster name 42 | mv PIRATE/PIRATE.pangenome_summary.txt PIRATE/~{cluster_name}_pangenome_summary.txt 43 | mv PIRATE/PIRATE.log PIRATE/~{cluster_name}.log 44 | mv PIRATE/PIRATE.gene_families.ordered.tsv PIRATE/~{cluster_name}_gene_families.ordered.tsv 45 | mv PIRATE/PIRATE.unique_alleles.tsv PIRATE/~{cluster_name}_unique_alleles.tsv 46 | mv PIRATE/binary_presence_absence.fasta PIRATE/~{cluster_name}_binary_presence_absence.fasta 47 | mv PIRATE/binary_presence_absence.nwk PIRATE/~{cluster_name}_binary_presence_absence.nwk 48 | mv PIRATE/pangenome.gfa PIRATE/~{cluster_name}_pangenome.gfa 49 | 50 | if [[ ~{align} == "true" ]]; then 51 | mv PIRATE/pangenome_alignment.fasta PIRATE/~{cluster_name}_pangenome_alignment.fasta 52 | mv PIRATE/pangenome_alignment.gff PIRATE/~{cluster_name}_pangenome_alignment.gff 53 | mv PIRATE/core_alignment.fasta PIRATE/~{cluster_name}_core_alignment.fasta 54 | mv PIRATE/core_alignment.gff PIRATE/~{cluster_name}_core_alignment.gff 55 | fi 56 | 57 | >>> 58 | output { 59 | File pirate_pangenome_summary = "PIRATE/~{cluster_name}_pangenome_summary.txt" 60 | File pirate_gene_families_ordered = "PIRATE/~{cluster_name}_gene_families.ordered.tsv" 61 | File pirate_unique_alleles = "PIRATE/~{cluster_name}_unique_alleles.tsv" 62 | File pirate_binary_fasta = "PIRATE/~{cluster_name}_binary_presence_absence.fasta" 63 | File pirate_binary_tree = "PIRATE/~{cluster_name}_binary_presence_absence.nwk" 64 | File pirate_pangenome_gfa = "PIRATE/~{cluster_name}_pangenome.gfa" 65 | File? pirate_pangenome_alignment_fasta = "PIRATE/~{cluster_name}_pangenome_alignment.fasta" 66 | File? pirate_pangenome_alignment_gff = "PIRATE/~{cluster_name}_pangenome_alignment.gff" 67 | File? pirate_core_alignment_fasta = "PIRATE/~{cluster_name}_core_alignment.fasta" 68 | File? pirate_core_alignment_gff = "PIRATE/~{cluster_name}_core_alignment.gff" 69 | File?
pirate_presence_absence_csv = "~{cluster_name}_gene_presence_absence.csv" 70 | String pirate_docker_image = docker_image 71 | } 72 | runtime { 73 | docker: "~{docker_image}" 74 | memory: "~{memory} GB" 75 | cpu: cpu 76 | disks: "local-disk " + disk_size + " SSD" 77 | disk: disk_size + " GB" 78 | maxRetries: 3 79 | preemptible: 0 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /workflows/wf_read_QC_trim_se.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/quality_control/task_trimmomatic.wdl" as trimmomatic 4 | import "../tasks/quality_control/task_fastp.wdl" as fastp 5 | import "../tasks/quality_control/task_bbduk.wdl" as bbduk 6 | import "../tasks/quality_control/task_fastq_scan.wdl" as fastq_scan 7 | import "../tasks/taxon_id/task_midas.wdl" as midas 8 | 9 | workflow read_QC_trim { 10 | meta { 11 | description: "Runs basic QC (fastq_scan), trimming (Trimmomatic), and adapter removal (bbduk) on illumina SE reads" 12 | } 13 | 14 | input { 15 | String samplename 16 | File read1_raw 17 | Int trim_window_size = 4 18 | Int trim_quality_trim_score = 30 19 | Int trim_minlen = 25 20 | Int bbduk_mem = 8 21 | Boolean call_midas = false 22 | File? midas_db 23 | String read_processing = "trimmomatic" 24 | String fastp_args = "-g -5 20 -3 20" 25 | } 26 | # call read_clean.ncbi_scrub_se { 27 | # input: 28 | # samplename = samplename, 29 | # read1 = read1_raw 30 | # } 31 | if (read_processing == "trimmomatic"){ 32 | call trimmomatic.trimmomatic_se { 33 | input: 34 | samplename = samplename, 35 | read1 = read1_raw, 36 | trimmomatic_window_size = trim_window_size, 37 | trimmomatic_quality_trim_score = trim_quality_trim_score, 38 | trimmomatic_minlen = trim_minlen 39 | } 40 | } 41 | if (read_processing == "fastp"){ 42 | call fastp.fastp_se { 43 | input: 44 | samplename = samplename, 45 | read1 = read1_raw, 46 | fastp_window_size = trim_window_size, 47 | fastp_quality_trim_score = trim_quality_trim_score, 48 | fastp_minlen = trim_minlen, 49 | fastp_args = fastp_args 50 | } 51 | } 52 | call bbduk.bbduk_se { 53 | input: 54 | samplename = samplename, 55 | read1_trimmed = select_first([trimmomatic_se.read1_trimmed,fastp_se.read1_trimmed]), 56 | mem_size_gb = bbduk_mem 57 | } 58 | call fastq_scan.fastq_scan_se as fastq_scan_raw { 59 | input: 60 | read1 = read1_raw 61 | } 62 | call fastq_scan.fastq_scan_se as fastq_scan_clean { 63 | input: 64 | read1 = bbduk_se.read1_clean 65 | } 66 | if (call_midas) { 67 | call midas.midas as midas { 68 | input: 69 | samplename = samplename, 70 | read1 = read1_raw, 71 | midas_db = midas_db 72 | } 73 | } 74 | # call taxonID.kraken2 as kraken2_raw { 75 | # input: 76 | # samplename = samplename, 77 | # read1 = bbduk_se.read1_clean 78 | # } 79 | # call taxonID.kraken2 as kraken2_dehosted { 80 | # input: 81 | # samplename = samplename, 82 | # read1 = ncbi_scrub_se.read1_dehosted 83 | # } 84 | 85 | output { 86 | File read1_clean = bbduk_se.read1_clean 87 | 88 | Int fastq_scan_raw_number_reads = fastq_scan_raw.read1_seq 89 | Int fastq_scan_clean_number_reads = fastq_scan_clean.read1_seq 90 | 91 | # String kraken_version = kraken2_raw.version 92 | # Float kraken_human = kraken2_raw.percent_human 93 | # Float kraken_sc2 = kraken2_raw.percent_sc2 94 | # String kraken_report = kraken2_raw.kraken_report 95 | # Float kraken_human_dehosted = kraken2_dehosted.percent_human 96 | # Float kraken_sc2_dehosted = kraken2_dehosted.percent_sc2 97 | # String 
kraken_report_dehosted = kraken2_dehosted.kraken_report 98 | 99 | String fastq_scan_version = fastq_scan_raw.version 100 | String bbduk_docker = bbduk_se.bbduk_docker 101 | String? trimmomatic_version = trimmomatic_se.version 102 | String? fastp_version = fastp_se.version 103 | String? midas_docker = midas.midas_docker 104 | File? midas_report = midas.midas_report 105 | String? midas_primary_genus = midas.midas_primary_genus 106 | String? midas_secondary_genus = midas.midas_secondary_genus 107 | Float? midas_secondary_genus_abundance = midas.midas_secondary_genus_abundance 108 | } 109 | } -------------------------------------------------------------------------------- /tasks/taxon_id/task_kraken2.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task kraken2_pe { 4 | input { 5 | File read1 6 | File read2 7 | File kraken2_db 8 | String samplename 9 | String docker = "quay.io/staphb/kraken2:2.1.2-no-db" 10 | Int disk_size = 100 11 | 12 | String? kraken2_args = "" 13 | String? classified_out = "classified#.fastq" 14 | String? unclassified_out = "unclassified#.fastq" 15 | Int? memory = 32 16 | Int? cpu = 4 17 | } 18 | command <<< 19 | echo $(kraken2 --version 2>&1) | sed 's/^.*Kraken version //;s/ .*$//' | tee VERSION 20 | date | tee DATE 21 | 22 | # Decompress the Kraken2 database 23 | mkdir db 24 | tar -C ./db/ -xzvf ~{kraken2_db} 25 | 26 | # Run Kraken2 27 | kraken2 \ 28 | --db ./db/ \ 29 | --threads ~{cpu} \ 30 | --report ~{samplename}.report.txt \ 31 | --gzip-compressed \ 32 | --unclassified-out ~{samplename}.~{unclassified_out} \ 33 | --classified-out ~{samplename}.~{classified_out} \ 34 | --output ~{samplename}.classifiedreads.txt \ 35 | --paired \ 36 | ~{kraken2_args} \ 37 | ~{read1} ~{read2} 38 | 39 | # Compress and cleanup 40 | gzip *.fastq 41 | gzip ~{samplename}.classifiedreads.txt 42 | >>> 43 | output { 44 | String kraken2_version = read_string("VERSION") 45 | String kraken2_docker = docker 46 | String analysis_date = read_string("DATE") 47 | File kraken2_report = "~{samplename}.report.txt" 48 | File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" 49 | File kraken2_unclassified_read1 = "~{samplename}.unclassified_1.fastq.gz" 50 | File kraken2_unclassified_read2 = "~{samplename}.unclassified_2.fastq.gz" 51 | File kraken2_classified_read1 = "~{samplename}.classified_1.fastq.gz" 52 | File kraken2_classified_read2 = "~{samplename}.classified_2.fastq.gz" 53 | } 54 | runtime { 55 | docker: "~{docker}" 56 | memory: "~{memory} GB" 57 | cpu: cpu 58 | disks: "local-disk " + disk_size + " SSD" 59 | disk: disk_size + " GB" 60 | maxRetries: 3 61 | preemptible: 0 62 | } 63 | } 64 | 65 | task kraken2_se { 66 | input { 67 | File read1 68 | File kraken2_db 69 | String samplename 70 | String docker = "quay.io/staphb/kraken2:2.1.2-no-db" 71 | Int disk_size = 100 72 | 73 | String? kraken2_args = "" 74 | String? classified_out = "classified.fastq" 75 | String? unclassified_out = "unclassified.fastq" 76 | Int? memory = 32 77 | Int?
cpu = 4 78 | } 79 | command <<< 80 | echo $(kraken2 --version 2>&1) | sed 's/^.*Kraken version //;s/ .*$//' | tee VERSION 81 | date | tee DATE 82 | 83 | # Decompress the Kraken2 database 84 | mkdir db 85 | tar -C ./db/ -xzvf ~{kraken2_db} 86 | 87 | # Run Kraken2 88 | kraken2 \ 89 | --db ./db/ \ 90 | --threads ~{cpu} \ 91 | --report ~{samplename}.report.txt \ 92 | --gzip-compressed \ 93 | --unclassified-out ~{samplename}.~{unclassified_out} \ 94 | --classified-out ~{samplename}.~{classified_out} \ 95 | --output ~{samplename}.classifiedreads.txt \ 96 | ~{kraken2_args} \ 97 | ~{read1} 98 | 99 | # Compress and cleanup 100 | gzip *.fastq 101 | gzip ~{samplename}.classifiedreads.txt 102 | >>> 103 | output { 104 | String kraken2_version = read_string("VERSION") 105 | String kraken2_docker = docker 106 | String analysis_date = read_string("DATE") 107 | File kraken2_report = "~{samplename}.report.txt" 108 | File kraken2_classified_report = "~{samplename}.classifiedreads.txt.gz" 109 | File kraken2_unclassified_read1 = "~{samplename}.unclassified.fastq.gz" 110 | File kraken2_classified_read1 = "~{samplename}.classified.fastq.gz" 111 | } 112 | runtime { 113 | docker: "~{docker}" 114 | memory: "~{memory} GB" 115 | cpu: cpu 116 | disks: "local-disk " + disk_size + " SSD" 117 | disk: disk_size + " GB" 118 | maxRetries: 3 119 | preemptible: 0 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /tasks/species_typing/task_agrvate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task agrvate { 4 | meta { 5 | description: "Rapid identification of Staphylococcus aureus agr locus type and agr operon variants." 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "quay.io/biocontainers/agrvate:1.0.2--hdfd78af_0" 11 | Int disk_size = 50 12 | Int cpu = 1 13 | 14 | # Parameters 15 | # --typing_only agr typing only. Skips agr operon extraction and frameshift detection 16 | Boolean typing_only = false 17 | } 18 | command <<< 19 | # get version info 20 | agrvate -v 2>&1 | sed 's/agrvate v//;' | tee VERSION 21 | 22 | # run agrvate on assembly; usearch not available in biocontainer, cannot use that option 23 | # using -m flag for mummer frameshift detection since usearch is not available 24 | agrvate \ 25 | ~{true="--typing-only" false="" typing_only} \ 26 | -i ~{assembly} \ 27 | -m 28 | 29 | # agrvate names output directory and file based on name of .fasta file, so .fasta as input results in -results/ outdir 30 | # and results in -results/-summary.tab files 31 | basename=$(basename ~{assembly}) 32 | # strip off anything after the period 33 | fasta_prefix=${basename%.*} 34 | 35 | # rename outputs summary TSV to include samplename 36 | mv -v "${fasta_prefix}-results/${fasta_prefix}-summary.tab" ~{samplename}.agrvate.tsv 37 | 38 | # parse output summary TSV 39 | cut -f 2 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_GROUP 40 | cut -f 3 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MATCH_SCORE 41 | cut -f 4 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_CANONICAL 42 | cut -f 5 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_MULTIPLE 43 | cut -f 6 ~{samplename}.agrvate.tsv | tail -n 1 | tee AGR_NUM_FRAMESHIFTS 44 | 45 | # edit output string AGR_CANONICAL to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results 46 | if [[ $(cat AGR_CANONICAL) == 1 ]]; then 47 | echo "1. canonical agrD" >AGR_CANONICAL 48 | elif [[ $(cat AGR_CANONICAL) == 0 ]]; then 49 | echo "0. 
non-canonical agrD" >AGR_CANONICAL 50 | elif [[ $(cat AGR_CANONICAL) == "u" ]]; then 51 | echo "u. unknown agrD" >AGR_CANNONICAL 52 | else 53 | echo "result unrecognized, please see summary agrvate TSV file" >AGR_CANONICAL 54 | fi 55 | 56 | # edit output string AGR_MULTIPLE to be more informative: https://github.com/VishnuRaghuram94/AgrVATE#results 57 | if [[ $(cat AGR_MULTIPLE) == "s" ]]; then 58 | echo "s. single agr group found" >AGR_MULTIPLE 59 | elif [[ $(cat AGR_MULTIPLE) == "m" ]]; then 60 | echo "m. multiple agr groups found" >AGR_MULTIPLE 61 | elif [[ $(cat AGR_MULTIPLE) == "u" ]]; then 62 | echo "u. unknown agr groups found" >AGR_MULTIPLE 63 | else 64 | echo "result unrecognized, please see summary agrvate TSV file" >AGR_MULTIPLE 65 | fi 66 | 67 | # if AGR_NUM_FRAMESHIFTS is unknown, edit output string AGR_NUM_FRAMESHIFTS to be more informative, otherwise keep set to a number: https://github.com/VishnuRaghuram94/AgrVATE#results 68 | if [[ $(cat AGR_NUM_FRAMESHIFTS) == "u" ]]; then 69 | echo "u or unknown; agr operon not extracted" >AGR_NUM_FRAMESHIFTS 70 | fi 71 | 72 | # create tarball of all output files 73 | tar -czvf ~{samplename}.agrvate.tar.gz "${fasta_prefix}-results/" 74 | >>> 75 | output { 76 | File agrvate_summary = "~{samplename}.agrvate.tsv" 77 | File agrvate_results = "~{samplename}.agrvate.tar.gz" 78 | String agrvate_agr_group = read_string("AGR_GROUP") 79 | String agrvate_agr_match_score = read_string("AGR_MATCH_SCORE") 80 | String agrvate_agr_canonical = read_string("AGR_CANONICAL") 81 | String agrvate_agr_multiple = read_string("AGR_MULTIPLE") 82 | String agrvate_agr_num_frameshifts = read_string("AGR_NUM_FRAMESHIFTS") 83 | String agrvate_version = read_string("VERSION") 84 | String agrvate_docker = docker 85 | } 86 | runtime { 87 | docker: "~{docker}" 88 | memory: "4 GB" 89 | cpu: cpu 90 | disks: "local-disk " + disk_size + " SSD" 91 | disk: disk_size + " GB" 92 | maxRetries: 3 93 | preemptible: 0 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /tasks/quality_control/task_cg_pipeline.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task cg_pipeline { 4 | input { 5 | File read1 6 | File? read2 7 | String samplename 8 | String docker="quay.io/staphb/lyveset:1.1.4f" 9 | Int disk_size = 100 10 | String cg_pipe_opts="--fast" 11 | Int genome_length 12 | } 13 | command <<< 14 | # date and version control 15 | date | tee DATE 16 | 17 | run_assembly_readMetrics.pl ~{cg_pipe_opts} ~{read1} ~{read2} -e ~{genome_length} > ~{samplename}_readMetrics.tsv 18 | 19 | # repeat for concatenated read file 20 | # run_assembly_readMetrics.pl extension awareness 21 | if [[ "~{read1}" == *".gz" ]] ; then 22 | extension=".gz" 23 | else 24 | extension="" 25 | fi 26 | cat ~{read1} ~{read2} > ~{samplename}_concat.fastq"${extension}" 27 | run_assembly_readMetrics.pl ~{cg_pipe_opts} ~{samplename}_concat.fastq"${extension}" -e ~{genome_length} > ~{samplename}_concat_readMetrics.tsv 28 | 29 | python3 < R2_MEAN_Q 79 | fi 80 | # same for R2_MEAN_LENGTH 81 | if [[ ! 
-f R2_MEAN_LENGTH ]] ; then 82 | echo "0.0" > R2_MEAN_LENGTH 83 | fi 84 | 85 | >>> 86 | output { 87 | File cg_pipeline_report = "${samplename}_readMetrics.tsv" 88 | String cg_pipeline_docker = docker 89 | String pipeline_date = read_string("DATE") 90 | Float r1_mean_q = read_float("R1_MEAN_Q") 91 | Float r2_mean_q = read_float("R2_MEAN_Q") 92 | Float combined_mean_q = read_float("COMBINED_MEAN_Q") 93 | Float r1_mean_readlength = read_float("R1_MEAN_LENGTH") 94 | Float r2_mean_readlength = read_float("R2_MEAN_LENGTH") 95 | Float combined_mean_readlength = read_float("COMBINED_MEAN_LENGTH") 96 | Float est_coverage = read_float("EST_COVERAGE") 97 | } 98 | runtime { 99 | docker: "~{docker}" 100 | memory: "8 GB" 101 | cpu: 4 102 | disks: "local-disk " + disk_size + " SSD" 103 | disk: disk_size + " GB" 104 | maxRetries: 3 105 | preemptible: 0 106 | } 107 | } -------------------------------------------------------------------------------- /workflows/compile_ecoli_results.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | workflow compile_results { 4 | 5 | input { 6 | Array[String] SRR_array 7 | Array[File] serotypefinder_array 8 | Array[File] abricate_array 9 | Array[File] abricate_virfinder_array 10 | Array[File] amrfinder_array 11 | } 12 | call compile_abricate { 13 | input: 14 | array_srr=SRR_array, 15 | array_abr=abricate_array 16 | } 17 | 18 | call compile_abricate as compile_abricate_virfinder { 19 | input: 20 | array_srr=SRR_array, 21 | array_abr=abricate_virfinder_array 22 | } 23 | 24 | call compile_amrfinder { 25 | input: 26 | array_srr=SRR_array, 27 | array_afp=amrfinder_array 28 | } 29 | 30 | call compile_serotypefinder { 31 | input: 32 | array_srr=SRR_array, 33 | array_stf=serotypefinder_array 34 | } 35 | 36 | output { 37 | File compiled_serotypefinder_results=compile_serotypefinder.compiled_results 38 | File compiled_abricate_results=compile_abricate.compiled_results 39 | File compiled_abricate_virfinder_results=compile_abricate_virfinder.compiled_results 40 | File compiled_amrfinderplus_results=compile_amrfinder.compiled_results 41 | } 42 | } 43 | 44 | 45 | task compile_abricate { 46 | input { 47 | Array[String] array_srr 48 | Array[File] array_abr 49 | } 50 | 51 | command <<< 52 | touch results.txt 53 | 54 | srr_array=(~{sep=' ' array_srr}) 55 | abr_array=(~{sep=' ' array_abr}) 56 | echo "I am here" 57 | 58 | for index in ${!srr_array[@]}; do 59 | SRR=${srr_array[$index]} 60 | file=${abr_array[$index]} 61 | echo "$index" 62 | echo "$SRR" 63 | echo "$file" 64 | 65 | while IFS= read -r result 66 | do 67 | printf "%s %s\n" "$SRR $result" >> results.txt 68 | done < <(grep -E 'fasta' "$file") 69 | 70 | done 71 | >>> 72 | 73 | output { 74 | File compiled_results="results.txt" 75 | } 76 | 77 | runtime { 78 | docker: "quay.io/staphb/abricate:1.0.0" 79 | memory: "4 GB" 80 | cpu: 1 81 | disks: "local-disk 100 SSD" 82 | preemptible: 0 83 | } 84 | } 85 | 86 | task compile_amrfinder { 87 | input { 88 | Array[String] array_srr 89 | Array[File] array_afp 90 | } 91 | 92 | command <<< 93 | touch results.txt 94 | 95 | srr_array=(~{sep=' ' array_srr}) 96 | afp_array=(~{sep=' ' array_afp}) 97 | echo "I am here" 98 | 99 | for index in ${!srr_array[@]}; do 100 | SRR=${srr_array[$index]} 101 | file=${afp_array[$index]} 102 | echo "$index" 103 | echo "$SRR" 104 | echo "$file" 105 | 106 | while IFS= read -r result 107 | do 108 | printf "%s %s\n" "$SRR $result" >> results.txt 109 | done < <(grep -E 'contig' "$file") 110 | 111 | done 112 | >>> 113 | 
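  # Note on the pattern shared by the compile_* tasks in this file: the two input
  # arrays are walked in lockstep -- "${!srr_array[@]}" expands to the indices of
  # srr_array, and the same index selects the matching report file, so sample IDs
  # and report files must be supplied in the same order. Reading from process
  # substitution (`done < <(grep ...)`) keeps the while-read loop in the current
  # shell, and only report lines matching the grep pattern ('contig' here,
  # 'fasta' and 'fliC|wzy|wzx' in the sibling tasks) are kept, each prefixed
  # with its sample ID before being appended to results.txt.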
114 | output { 115 | File compiled_results="results.txt" 116 | } 117 | 118 | runtime { 119 | docker: "quay.io/staphb/ncbi-amrfinderplus:3.8.28" 120 | memory: "4 GB" 121 | cpu: 1 122 | disks: "local-disk 100 SSD" 123 | preemptible: 0 124 | } 125 | } 126 | 127 | 128 | task compile_serotypefinder { 129 | input { 130 | Array[String] array_srr 131 | Array[File] array_stf 132 | } 133 | 134 | command <<< 135 | touch results.txt 136 | 137 | srr_array=(~{sep=' ' array_srr}) 138 | stf_array=(~{sep=' ' array_stf}) 139 | echo "I am here" 140 | 141 | for index in ${!srr_array[@]}; do 142 | SRR=${srr_array[$index]} 143 | file=${stf_array[$index]} 144 | echo "$index" 145 | echo "$SRR" 146 | echo "$file" 147 | 148 | while IFS= read -r result 149 | do 150 | printf "%s %s\n" "$SRR $result" >> results.txt 151 | done < <(grep -E 'fliC|wzy|wzx' "$file") 152 | 153 | done 154 | >>> 155 | 156 | output { 157 | File compiled_results="results.txt" 158 | } 159 | 160 | runtime { 161 | docker: "quay.io/staphb/serotypefinder:1.1" 162 | memory: "4 GB" 163 | cpu: 1 164 | disks: "local-disk 100 SSD" 165 | preemptible: 0 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /tasks/utilities/task_summarize_data.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task summarize_data { 4 | input { 5 | Array[String]? sample_names 6 | String? terra_project 7 | String? terra_workspace 8 | String? terra_table 9 | String? column_names # string of comma-delimited column names 10 | String? output_prefix 11 | 12 | Int disk_size = 100 13 | File? input_table 14 | Boolean phandango_coloring = true 15 | } 16 | command <<< 17 | # when running on terra, comment out all input_table mentions 18 | python3 /scripts/export_large_tsv/export_large_tsv.py --project "~{terra_project}" --workspace "~{terra_workspace}" --entity_type ~{terra_table} --tsv_filename ~{terra_table}-data.tsv 19 | 20 | # when running locally, use the input_table in place of downloading from Terra 21 | #cp ~{input_table} ~{terra_table}-data.tsv 22 | 23 | if ~{phandango_coloring}; then 24 | export phandango_coloring="true" 25 | else 26 | export phandango_coloring="false" 27 | fi 28 | 29 | python3 <>> 104 | output { 105 | File summarized_data = "~{output_prefix}_summarized_data.csv" 106 | } 107 | runtime { 108 | docker: "broadinstitute/terra-tools:tqdm" 109 | memory: "8 GB" 110 | cpu: 1 111 | disks: "local-disk " + disk_size + " SSD" 112 | disk: disk_size + " GB" 113 | dx_instance_type: "mem1_ssd1_v2_x2" 114 | maxRetries: 3 115 | } 116 | } -------------------------------------------------------------------------------- /tasks/quality_control/task_mummer_ani.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task animummer { 4 | input { 5 | File assembly 6 | String samplename 7 | File? ref_genome 8 | Float mash_filter = 0.9 9 | String docker="staphb/mummer:4.0.0-rgdv2" 10 | Int disk_size = 100 11 | } 12 | command <<< 13 | # capture and version 14 | mummer --version | tee MUMMER_VERSION 15 | 16 | # set the reference genome 17 | # if not defined by user, then use all 43 genomes in RGDv2 18 | if [[ -z "~{ref_genome}" ]]; then 19 | # ref genome is not defined. 
default to RGDv2 20 | # BASH variable 21 | REF_GENOME="$(ls /RGDv2/*.fasta)" 22 | echo "user did not define a reference genome, defaulting to 43 genomes in RGDv2" 23 | echo "REF_GENOME is set to: ${REF_GENOME}" 24 | else 25 | echo "User specified a reference genome, will use this instead of RGDv2" 26 | REF_GENOME="~{ref_genome}" 27 | echo "REF_GENOME is set to: ${REF_GENOME}" 28 | fi 29 | 30 | # call Lee's ani-m.pl script and compare query genome against reference genome 31 | # first does a mash check on relatedness between 2 genomes. If greater than mash_filter, then run dnadiff 32 | # --symmetric flag runs ANI on query vs. ref; followed by ref vs. query 33 | ani-m.pl --symmetric \ 34 | --mash-filter ~{mash_filter} \ 35 | ~{assembly} \ 36 | ${REF_GENOME} | tee ~{samplename}.ani-mummer.out.tsv 37 | 38 | # CHECK FOR A NEARLY BLANK TSV (ONLY HEADER LINE), meaning the sample did not surpass the mash-filter and thus no ANI was run 39 | LINE_COUNT_OUTPUT_TSV=$(wc -l ~{samplename}.ani-mummer.out.tsv | cut -d ' ' -f 1) 40 | echo "Number of lines in output TSV is: ${LINE_COUNT_OUTPUT_TSV}" 41 | if [[ ${LINE_COUNT_OUTPUT_TSV} -eq 1 ]]; then 42 | echo "~{samplename} did not surpass the minimum mash genetic distance filter, thus ANI was not performed" 43 | echo "The output TSV only contains the header line" 44 | # set output variables as 0s or descriptive strings 45 | echo "0.0" > ANI_HIGHEST_PERCENT_BASES_ALIGNED 46 | echo "0.0" > ANI_HIGHEST_PERCENT 47 | echo "ANI skipped due to high genetic divergence from reference genomes" > ANI_TOP_SPECIES_MATCH 48 | # if output TSV has more than 1 line, then parse for appropriate outputs 49 | else 50 | ## parse out highest percentBases aligned 51 | cut -f 5 ~{samplename}.ani-mummer.out.tsv | sort -nr | head -n 1 | tee ANI_HIGHEST_PERCENT_BASES_ALIGNED 52 | echo "highest percent bases aligned is: $(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" 53 | 54 | ## parse out ANI value using highest percentBases aligned value 55 | grep "$(cat ANI_HIGHEST_PERCENT_BASES_ALIGNED)" ~{samplename}.ani-mummer.out.tsv | cut -f 3 | tee ANI_HIGHEST_PERCENT 56 | echo "ANI value is: $(cat ANI_HIGHEST_PERCENT)" 57 | 58 | # have to separate out results for ani_top_species match because user-defined reference genome FASTAs will not be named as they are in RGDv2 59 | if [[ -z "~{ref_genome}" ]]; then 60 | ### ref genome is not user-defined, using RGDv2 and FASTA filenames ### 61 | # Parse out species name from reference fasta filename 62 | # use percent bases aligned to pull relevant line, cut down to query and ref fasta filenames, sed to remove your query filename, xargs to remove whitespaces & stuff 63 | # cut on periods to pull out genus_species (in future this will include lineages for Listeria and other sub-species designations) 64 | # have to create assembly_file_basename bash variable since output TSV does not include full path to assembly file, only filename 65 | assembly_file_basename=$(basename ~{assembly}) 66 | grep "$(cat ANI_HIGHEST_PERCENT)" ~{samplename}.ani-mummer.out.tsv | cut -f 1,2 | sed "s|${assembly_file_basename}||g" | xargs | cut -d '.'
-f 3 | tee ANI_TOP_SPECIES_MATCH 67 | echo "ANI top species match is: $(cat ANI_TOP_SPECIES_MATCH)" 68 | else 69 | # User specified a reference genome, use fasta filename as output string 70 | basename "${REF_GENOME}" > ANI_TOP_SPECIES_MATCH 71 | echo "Reference genome used for ANI is: ${REF_GENOME}" 72 | fi 73 | fi 74 | 75 | >>> 76 | output { 77 | Float ani_highest_percent = read_float("ANI_HIGHEST_PERCENT") 78 | Float ani_highest_percent_bases_aligned = read_float("ANI_HIGHEST_PERCENT_BASES_ALIGNED") 79 | File ani_output_tsv = "~{samplename}.ani-mummer.out.tsv" 80 | String ani_top_species_match = read_string("ANI_TOP_SPECIES_MATCH") 81 | String ani_mummer_version = read_string("MUMMER_VERSION") 82 | } 83 | runtime { 84 | docker: "~{docker}" 85 | memory: "8 GB" 86 | cpu: 4 87 | disks: "local-disk " + disk_size + " SSD" 88 | disk: disk_size + " GB" 89 | maxRetries: 3 90 | preemptible: 0 91 | } 92 | } -------------------------------------------------------------------------------- /workflows/wf_core_gene_snp.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | import "../tasks/phylogenetic_inference/task_pirate.wdl" as pirate 4 | import "../tasks/phylogenetic_inference/task_iqtree.wdl" as iqtree 5 | import "../tasks/phylogenetic_inference/task_snp_dists.wdl" as snp_dists 6 | import "../tasks/task_versioning.wdl" as versioning 7 | import "../tasks/utilities/task_summarize_data.wdl" as data_summary 8 | 9 | 10 | workflow core_gene_snp_workflow { 11 | input { 12 | Array[File] gff3 13 | String cluster_name 14 | # if align = true, the pirate task will produce core and pangenome alignments for the sample set, 15 | # otherwise, pirate will only produce a pangenome summary 16 | Boolean align = true 17 | # use core_tree = true to produce a phylogenetic tree and snp distance matrix from the core genome alignment 18 | Boolean core_tree = true 19 | # use pan_tree = true to produce a phylogenetic tree and snp distance matrix from the pangenome alignment 20 | Boolean pan_tree = false 21 | # data summary input variables 22 | Array[String]? sample_names 23 | String? data_summary_terra_project 24 | String? data_summary_terra_workspace 25 | String? data_summary_terra_table 26 | String? 
data_summary_column_names 27 | } 28 | call pirate.pirate as pirate { 29 | input: 30 | gff3 = gff3, 31 | cluster_name = cluster_name, 32 | align = align 33 | } 34 | if (align) { 35 | if (core_tree) { 36 | call iqtree.iqtree as core_iqtree { 37 | input: 38 | alignment = select_first([pirate.pirate_core_alignment_fasta]), 39 | cluster_name = cluster_name 40 | } 41 | call snp_dists.snp_dists as core_snp_dists { 42 | input: 43 | alignment = select_first([pirate.pirate_core_alignment_fasta]), 44 | cluster_name = cluster_name 45 | } 46 | call snp_dists.reorder_matrix as core_reorder_matrix { 47 | input: 48 | input_tree = core_iqtree.ml_tree, 49 | matrix = core_snp_dists.snp_matrix, 50 | cluster_name = cluster_name + "_core" 51 | } 52 | } 53 | if (pan_tree) { 54 | call iqtree.iqtree as pan_iqtree { 55 | input: 56 | alignment = select_first([pirate.pirate_pangenome_alignment_fasta]), 57 | cluster_name = cluster_name 58 | } 59 | call snp_dists.snp_dists as pan_snp_dists { 60 | input: 61 | alignment = select_first([pirate.pirate_pangenome_alignment_fasta]), 62 | cluster_name = cluster_name 63 | } 64 | call snp_dists.reorder_matrix as pan_reorder_matrix { 65 | input: 66 | input_tree = pan_iqtree.ml_tree, 67 | matrix = pan_snp_dists.snp_matrix, 68 | cluster_name = cluster_name + "_pan" 69 | } 70 | } 71 | } 72 | if (defined(data_summary_column_names)) { 73 | call data_summary.summarize_data { 74 | input: 75 | sample_names = sample_names, 76 | terra_project = data_summary_terra_project, 77 | terra_workspace = data_summary_terra_workspace, 78 | terra_table = data_summary_terra_table, 79 | column_names = data_summary_column_names, 80 | output_prefix = cluster_name 81 | } 82 | } 83 | call versioning.version_capture{ 84 | input: 85 | } 86 | output { 87 | # Version Capture 88 | String core_gene_snp_wf_version = version_capture.phbg_version 89 | String core_gene_snp_wf_analysis_date = version_capture.date 90 | # pirate_outputs 91 | File pirate_pangenome_summary = pirate.pirate_pangenome_summary 92 | File pirate_gene_families_ordered = pirate.pirate_gene_families_ordered 93 | File? pirate_core_alignment_fasta = pirate.pirate_core_alignment_fasta 94 | File? pirate_core_alignment_gff = pirate.pirate_core_alignment_gff 95 | File? pirate_pan_alignment_fasta = pirate.pirate_pangenome_alignment_fasta 96 | File? pirate_pan_alignment_gff = pirate.pirate_pangenome_alignment_gff 97 | File? pirate_presence_absence_csv = pirate.pirate_presence_absence_csv 98 | String pirate_docker_image = pirate.pirate_docker_image 99 | # snp_dists outputs 100 | String? pirate_snps_dists_version = select_first([core_snp_dists.version,pan_snp_dists.version,""]) 101 | # iqtree outputs 102 | String? pirate_iqtree_version = select_first([core_iqtree.version,pan_iqtree.version,""]) 103 | # reorder matrix outputs 104 | File? pirate_core_snp_matrix = core_reorder_matrix.ordered_matrix 105 | File? pirate_iqtree_core_tree = core_reorder_matrix.tree 106 | File? pirate_pan_snp_matrix = pan_reorder_matrix.ordered_matrix 107 | File? pirate_iqtree_pan_tree = pan_reorder_matrix.tree 108 | # Data summary outputs 109 | File? pirate_summarized_data = summarize_data.summarized_data 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /tasks/gene_typing/task_resfinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task resfinder { 4 | input { 5 | File assembly # Input fasta file 6 | String samplename 7 | String? 
organism # Species in the sample, species should be entered with their full scientific names (e.g. "escherichia coli"), using quotation marks 8 | Boolean acquired = true # Run resfinder for acquired resistance genes 9 | Float? min_cov = 0.6 # Minimum (breadth-of) coverage of ResFinder 10 | Float? threshold = 0.9 # Threshold for identity of ResFinder 11 | Boolean point = false # Run pointfinder for chromosomal mutations 12 | String docker = "staphb/resfinder:4.1.11" 13 | Int disk_size = 100 14 | } 15 | command <<< 16 | date | tee DATE 17 | run_resfinder.py --version | tee RESFINDER_VERSION 18 | echo "unmodified from resfinder docker container" > RESFINDER_DB_VERSION 19 | 20 | # set $resfinder_organism BASH variable based on gambit_predicted_taxon or user-defined input string 21 | if [[ "~{organism}" == *"Campylobacter"*"jejuni"* ]]; then 22 | resfinder_organism="campylobacter jejuni" 23 | elif [[ "~{organism}" == *"Campylobacter"*"coli"* ]]; then 24 | resfinder_organism="campylobacter coli" 25 | elif [[ "~{organism}" == *"Campylobacter"* ]]; then 26 | resfinder_organism="campylobacter" 27 | elif [[ "~{organism}" == *"Enterococcus"*"faecalis"* ]]; then 28 | resfinder_organism="enterococcus faecalis" 29 | elif [[ "~{organism}" == *"Enterococcus"*"faecium"* ]]; then 30 | resfinder_organism="enterococcus faecium" 31 | elif [[ "~{organism}" == *"Escherichia"*"coli"* ]]; then 32 | resfinder_organism="escherichia coli" 33 | elif [[ "~{organism}" == *"Klebsiella"* ]]; then 34 | resfinder_organism="klebsiella" 35 | elif [[ "~{organism}" == *"Neisseria"*"gonorrhoeae"* ]]; then 36 | resfinder_organism="neisseria gonorrhoeae" 37 | elif [[ "~{organism}" == *"Salmonella"* ]]; then 38 | resfinder_organism="salmonella" 39 | elif [[ "~{organism}" == *"Staphylococcus"*"aureus"* ]]; then 40 | resfinder_organism="staphylococcus aureus" 41 | elif [[ "~{organism}" == *"Mycobacterium"*"tuberculosis"* ]]; then 42 | resfinder_organism="mycobacterium tuberculosis" 43 | elif [[ "~{organism}" == *"Helicobacter"*"pylori"* ]]; then 44 | resfinder_organism="helicobacter pylori" 45 | else 46 | echo "Either Gambit predicted taxon is not supported by resfinder or the user did not supply an organism as input." 47 | echo "Skipping the use of resfinder --species optional parameter." 48 | fi 49 | 50 | # if resfinder_organism variable is set, use --species flag, otherwise do not use --species flag 51 | if [[ -v resfinder_organism ]] ; then 52 | run_resfinder.py \ 53 | --inputfasta ~{assembly} \ 54 | --outputPath . \ 55 | --species "${resfinder_organism}" \ 56 | ~{true="--acquired" false="" acquired} \ 57 | ~{'--min_cov ' + min_cov} \ 58 | ~{'--threshold ' + threshold} \ 59 | ~{true="--point" false="" point} 60 | else 61 | run_resfinder.py \ 62 | --inputfasta ~{assembly} \ 63 | --outputPath . 
70 | # replace space in resfinder_organism with underscore 71 | resfinder_organism="${resfinder_organism// /_}" 72 | 73 | # rename files 74 | mv pheno_table.txt ~{samplename}_pheno_table.txt 75 | if [ -f pheno_table_${resfinder_organism}.txt ]; then 76 | mv pheno_table_${resfinder_organism}.txt ~{samplename}_pheno_table_species.txt 77 | fi 78 | mv ResFinder_Hit_in_genome_seq.fsa ~{samplename}_ResFinder_Hit_in_genome_seq.fsa 79 | mv ResFinder_Resistance_gene_seq.fsa ~{samplename}_ResFinder_Resistance_gene_seq.fsa 80 | mv ResFinder_results_tab.txt ~{samplename}_ResFinder_results_tab.txt 81 | if [ -f PointFinder_prediction.txt ]; then 82 | mv PointFinder_prediction.txt ~{samplename}_PointFinder_prediction.txt 83 | mv PointFinder_results.txt ~{samplename}_PointFinder_results.txt 84 | fi 85 | 86 | >>> 87 | output { 88 | File resfinder_pheno_table = "~{samplename}_pheno_table.txt" 89 | File? resfinder_pheno_table_species = "~{samplename}_pheno_table_species.txt" 90 | File resfinder_hit_in_genome_seq = "~{samplename}_ResFinder_Hit_in_genome_seq.fsa" 91 | File resfinder_resistance_gene_seq = "~{samplename}_ResFinder_Resistance_gene_seq.fsa" 92 | File resfinder_results_tab = "~{samplename}_ResFinder_results_tab.txt" 93 | File? pointfinder_pheno_table = "~{samplename}_PointFinder_prediction.txt" 94 | File? pointfinder_results = "~{samplename}_PointFinder_results.txt" 95 | String resfinder_docker = "~{docker}" 96 | String resfinder_version = read_string("RESFINDER_VERSION") 97 | String resfinder_db_version = read_string("RESFINDER_DB_VERSION") 98 | } 99 | runtime { 100 | memory: "8 GB" 101 | cpu: 4 102 | docker: docker 103 | disks: "local-disk " + disk_size + " SSD" 104 | disk: disk_size + " GB" 105 | maxRetries: 3 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /tasks/species_typing/task_poppunk_streppneumo.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task poppunk { 4 | meta { 5 | description: "Using poppunk with GPS (Global Pneumococcal Sequencing project) database for Streptococcus pneumoniae typing" 6 | } 7 | input { 8 | File assembly 9 | String samplename 10 | String docker = "staphb/poppunk:2.4.0" 11 | Int disk_size = 100 12 | Int cpus = 4 13 | # database/reference files currently hosted on a public, requester-pays GCP bucket 14 | # hosting individually for speed purposes.
Unzipping one big 20GB zip archive takes longer than downloading the files individually (22GB uncompressed in total) 15 | # If future versions of the GPS database are released, we can update the links here or in Terra, and the task should remain future-proof 16 | File GPS_dists_npy = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.npy" 17 | File GPS_dists_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.dists.pkl" 18 | File GPS_h5 = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.h5" 19 | File GPS_refs = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs" 20 | File GPS_refs_dists_npy = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.npy" 21 | File GPS_refs_dists_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.dists.pkl" 22 | File GPS_refs_h5 = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6.refs.h5" 23 | File GPS_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_clusters.csv" 24 | File GPS_fit_npz = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.npz" 25 | File GPS_fit_pkl = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_fit.pkl" 26 | File GPS_graph_gt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_graph.gt" 27 | File GPS_qcreport_txt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_qcreport.txt" 28 | File GPS_unword_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_unword_clusters.csv" 29 | File GPS_refs_graph_gt = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6refs_graph.gt" 30 | File GPS_external_clusters_csv = "gs://theiagen-public-files-rp/terra/theiaprok-files/GPS_v6/GPS_v6_external_clusters.csv" 31 | } 32 | command <<< 33 | # get version information 34 | poppunk --version | sed 's/poppunk //' | tee VERSION 35 | 36 | # create input TSV 37 | echo -e "~{samplename}\t~{assembly}" > ~{samplename}_poppunk_input.tsv 38 | 39 | # determine the database name, which serves both as the prefix for every file in the database 40 | # and as the name of the directory (GPS_DB_NAME) that the database files are placed in; 41 | # it is derived from the primary h5 filename (suffix stripped) to keep the task future-proof 42 | GPS_DB_NAME=$(basename ~{GPS_h5} | sed 's|.h5||')
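# (Illustrative note: with the default GPS_h5 input above, basename yields "GPS_v6.h5" and the
#  sed expression strips the ".h5" suffix, so GPS_DB_NAME becomes "GPS_v6" and the database
#  directory assembled below is GPS_v6/)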
43 | # sending GPS_DB_NAME into text file for logging/output purposes 44 | echo "${GPS_DB_NAME}" > GPS_DB_NAME 45 | 46 | # move all database/reference files into single directory to feed into poppunk 47 | mkdir -v "${GPS_DB_NAME}" 48 | ln -vs ~{GPS_dists_npy} ~{GPS_dists_pkl} ~{GPS_h5} ~{GPS_refs} \ 49 | ~{GPS_refs_dists_npy} ~{GPS_refs_dists_pkl} ~{GPS_refs_h5} ~{GPS_clusters_csv} \ 50 | ~{GPS_fit_npz} ~{GPS_fit_pkl} ~{GPS_graph_gt} ~{GPS_qcreport_txt} \ 51 | ~{GPS_unword_clusters_csv} ~{GPS_refs_graph_gt} ~{GPS_external_clusters_csv} \ 52 | "${GPS_DB_NAME}"/ 53 | 54 | # poppunk requires the external clusters file to be passed explicitly; glob it by pattern 55 | # so the task stays compatible with future versions of the database 56 | GPS_EXTERNAL_CLUSTERS_CSV=$(ls "${GPS_DB_NAME}"/GPS_*_external_clusters.csv) 57 | 58 | # run poppunk 59 | poppunk_assign \ 60 | --threads ~{cpus} \ 61 | --db "${GPS_DB_NAME}" \ 62 | --distances "${GPS_DB_NAME}/${GPS_DB_NAME}.dists" \ 63 | --query ~{samplename}_poppunk_input.tsv \ 64 | --output ~{samplename}_poppunk \ 65 | --external-clustering "${GPS_EXTERNAL_CLUSTERS_CSV}" 66 | 67 | # parse output CSV for GPSC (Global Pneumococcal Sequence Cluster) 68 | if [ -f ~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv ]; then 69 | cut -d ',' -f 2 ~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv | tail -n 1 > GPSC.txt 70 | 71 | # if GPSC is "NA", overwrite with helpful message 72 | if [[ "$(cat GPSC.txt)" == "NA" ]]; then 73 | echo "Potential novel GPS Cluster identified. Please email globalpneumoseq@gmail.com to have novel clusters added to the database and a GPSC cluster name assigned, after you have checked for low-level contamination, which may contribute to biased accessory distances." >GPSC.txt 74 | fi 75 | else 76 | echo "poppunk failed" > GPSC.txt 77 | fi 78 | 79 | >>> 80 | output { 81 | String poppunk_gps_cluster = read_string("GPSC.txt") 82 | File? poppunk_gps_external_cluster_csv = "~{samplename}_poppunk/~{samplename}_poppunk_external_clusters.csv" 83 | String poppunk_version = read_string("VERSION") 84 | String poppunk_docker = docker 85 | String poppunk_GPS_db_version = read_string("GPS_DB_NAME") 86 | } 87 | runtime { 88 | docker: "~{docker}" 89 | # poppunk with the GPS v6 db used upwards of 12GB of RAM at times 90 | memory: "16 GB" 91 | cpu: cpus 92 | disks: "local-disk " + disk_size + " SSD" 93 | disk: disk_size + " GB" 94 | maxRetries: 3 95 | preemptible: 0 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /tasks/species_typing/task_srst2_vibrio.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task srst2_vibrio { 4 | meta { 5 | description: "Use of SRST2 to identify sequences of interest from a database of curated Vibrio sequences" 6 | } 7 | input { 8 | File reads1 9 | File? reads2 10 | String samplename 11 | Int srst2_min_cov 12 | Int srst2_max_divergence 13 | Int srst2_min_depth 14 | Int srst2_min_edge_depth 15 | Int srst2_gene_max_mismatch 16 | String docker = "quay.io/staphb/srst2:0.2.0-vcholerae" 17 | Int disk_size = 100 18 | Int cpu = 4 19 | } 20 | command <<< 21 | if [ -z "~{reads2}" ] ; then 22 | INPUT_READS="--input_se ~{reads1}" 23 | else 24 | # This task requires that paired-end input FASTQ files end in "_1.clean.fastq.gz" and "_2.clean.fastq.gz", 25 | # the naming produced by the TheiaProk read-cleaning tasks 26 | INPUT_READS="--input_pe ~{reads1} ~{reads2} --forward _1.clean --reverse _2.clean" 27 | fi 28 |
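# For example (filenames are placeholders, not task inputs): for a paired-end run the assignment
# above expands to
#   INPUT_READS="--input_pe sample_1.clean.fastq.gz sample_2.clean.fastq.gz --forward _1.clean --reverse _2.clean"
# which lets SRST2 recover the shared sample name by stripping the _1.clean/_2.clean suffixes
# from the read filenames.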
29 | srst2 --version 2>&1 | tee VERSION 30 | srst2 \ 31 | ${INPUT_READS} \ 32 | --gene_db /vibrio-cholerae-db/vibrio_230224.fasta \ 33 | --output ~{samplename} \ 34 | --min_coverage ~{srst2_min_cov} \ 35 | --max_divergence ~{srst2_max_divergence} \ 36 | --min_depth ~{srst2_min_depth} \ 37 | --min_edge_depth ~{srst2_min_edge_depth} \ 38 | --gene_max_mismatch ~{srst2_gene_max_mismatch} 39 | 40 | # capture output TSV 41 | mv ~{samplename}__genes__*__results.txt ~{samplename}.tsv 42 | 43 | # capture detailed output TSV - not available if no results are output 44 | mv ~{samplename}__fullgenes__*__results.txt ~{samplename}.detailed.tsv || echo "No results" > ~{samplename}.detailed.tsv 45 | 46 | # parsing block to account for when output columns do not exist 47 | python <<CODE 48 | import csv 49 | 50 | # NOTE: these helper implementations are reconstructed from their call sites below - 51 | # a best-effort sketch rather than the original code 52 | def tsv_to_dict(filename): 53 | # read the single SRST2 results row into a {column: value} dict 54 | with open(filename, 'r') as tsv_file: 55 | reader = csv.DictReader(tsv_file, delimiter='\t') 56 | return next(reader, {}) 57 | 58 | def conv(value): 59 | # normalize a missing column or empty cell to SRST2's "absent" marker 60 | if value is None or value == '': 61 | return '-' 62 | return value 63 | 64 | def translate_chars(value): 65 | # SRST2 marks a low-depth hit with '?' and an imperfect (mismatched) hit with '*'; 66 | # '-' means the gene was not detected at all 67 | translation = [] 68 | if value == '-': 69 | translation.append('not detected') 70 | if '?' in value: 71 | translation.append('low depth') 72 | if '*' in value: 73 | translation.append('imperfect match') 74 | 75 | if len(translation) > 0: 76 | return '(' + ';'.join(translation) + ')' 77 | return "" 78 | 79 | # load output TSV as dict 80 | row = tsv_to_dict('~{samplename}.tsv') 81 | 82 | # presence or absence genes - ctxA, ompW and toxR 83 | with open("ctxA", "wb") as ctxA_fh: 84 | value = row.get("ctxA") 85 | presence = translate_chars(conv(value)) 86 | if presence == "(not detected)": 87 | ctxA_fh.write(presence) 88 | else: 89 | result = "present" + ' ' + presence 90 | ctxA_fh.write(result.strip()) 91 | 92 | with open("ompW", "wb") as ompW_fh: 93 | value = row.get("ompW") 94 | presence = translate_chars(conv(value)) 95 | if presence == "(not detected)": 96 | ompW_fh.write(presence) 97 | else: 98 | result = "present" + ' ' + presence 99 | ompW_fh.write(result.strip()) 100 | 101 | with open("toxR", "wb") as toxR_fh: 102 | value = row.get("toxR") 103 | presence = translate_chars(conv(value)) 104 | if presence == "(not detected)": 105 | toxR_fh.write(presence) 106 | else: 107 | result = "present" + ' ' + presence 108 | toxR_fh.write(result.strip()) 109 | 110 | # biotype - tcpA classical or tcpA ElTor 111 | with open("BIOTYPE", "wb") as biotype_fh: 112 | value_ElTor = translate_chars(conv(row.get("tcpA_ElTor"))) 113 | value_classical = translate_chars(conv(row.get("tcpA_classical"))) 114 | 115 | if value_ElTor == "(not detected)" and value_classical == "(not detected)": 116 | biotype_fh.write("(not detected)") 117 | else: 118 | if value_ElTor == "(not detected)": 119 | result = "tcpA_Classical" + ' ' + value_classical 120 | biotype_fh.write(result.strip()) 121 | else: 122 | result = "tcpA_ElTor" + ' ' + value_ElTor 123 | biotype_fh.write(result.strip()) 124 | 125 | # serogroup - O1 or O139 126 | with open("SEROGROUP", "wb") as serotype_fh: 127 | value_O1 = translate_chars(conv(row.get("wbeN_O1"))) 128 | value_O139 = translate_chars(conv(row.get("wbfR_O139"))) 129 | 130 | if value_O1 == "(not detected)" and value_O139 == "(not detected)": 131 | serotype_fh.write("(not detected)") 132 | else: 133 | if value_O1 == "(not detected)": 134 | result = "O139" + ' ' + value_O139 135 | serotype_fh.write(result.strip()) 136 | else:
137 | result = "O1" + ' ' + value_O1 138 | serotype_fh.write(result.strip()) 139 | CODE 140 | >>> 141 | output { 142 | File srst2_detailed_tsv = "~{samplename}.detailed.tsv" 143 | String srst2_version = read_string("VERSION") 144 | String srst2_vibrio_ctxA = read_string("ctxA") 145 | String srst2_vibrio_ompW = read_string("ompW") 146 | String srst2_vibrio_toxR = read_string("toxR") 147 | String srst2_vibrio_biotype = read_string("BIOTYPE") 148 | String srst2_vibrio_serogroup = read_string("SEROGROUP") 149 | } 150 | runtime { 151 | docker: "~{docker}" 152 | memory: "8 GB" 153 | cpu: cpu 154 | disks: "local-disk " + disk_size + " SSD" 155 | disk: disk_size + " GB" 156 | maxRetries: 3 157 | preemptible: 0 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /tasks/assembly/task_shovill.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task shovill_pe { 4 | input { 5 | File read1_cleaned 6 | File read2_cleaned 7 | String samplename 8 | String docker = "quay.io/staphb/shovill:1.1.0" 9 | Int disk_size = 100 10 | 11 | ## SHOVILL optional parameters 12 | ## --depth [INT] Sub-sample --R1/--R2 to this depth. Disable with --depth 0 (default: 150) 13 | ## --gsize [STRING] Estimated genome size eg. 3.2M (default: '') 14 | ## --minlen [INT] Minimum contig length <0=AUTO> (default: 0) 15 | ## --mincov [FLOAT] Minimum contig coverage <0=AUTO> (default: 2) 16 | ## --assembler [STRING] Assembler: skesa velvet megahit spades (default: 'spades') 17 | ## --opts [STRING] Extra assembler options in quotes eg. spades: "--untrusted-contigs locus.fna" ... (default: '') 18 | ## --kmers [STRING] K-mers to use (default: '') 19 | ## --trim [BOOLEAN] Enable adaptor trimming (default: OFF) 20 | ## --noreadcorr [BOOLEAN] Disable read error correction (default: OFF) 21 | ## --nostitch [BOOLEAN] Disable read stitching (default: OFF) 22 | ## --nocorr [BOOLEAN] Disable post-assembly correction (default: OFF) 23 | 24 | 25 | Int? depth 26 | String? genome_size 27 | Int min_contig_length = 200 28 | Float? min_coverage 29 | String assembler = "skesa" 30 | String? assembler_options 31 | String? kmers 32 | Boolean trim = false 33 | Boolean noreadcorr = false 34 | Boolean nostitch = false 35 | Boolean nocorr = false 36 | } 37 | command <<< 38 | shovill --version | head -1 | tee VERSION 39 | shovill \ 40 | --outdir out \ 41 | --R1 ~{read1_cleaned} \ 42 | --R2 ~{read2_cleaned} \ 43 | --minlen ~{min_contig_length} \ 44 | ~{'--depth ' + depth} \ 45 | ~{'--gsize ' + genome_size} \ 46 | ~{'--mincov ' + min_coverage} \ 47 | ~{'--assembler ' + assembler} \ 48 | ~{'--opts ' + assembler_options} \ 49 | ~{'--kmers ' + kmers} \ 50 | ~{true='--trim' false='' trim} \ 51 | ~{true='--noreadcorr' false='' noreadcorr} \ 52 | ~{true='--nostitch' false='' nostitch} \ 53 | ~{true='--nocorr' false='' nocorr} 54 | 55 | mv out/contigs.fa out/~{samplename}_contigs.fasta 56 | 57 | if [ "~{assembler}" == "spades" ] ; then 58 | mv out/contigs.gfa out/~{samplename}_contigs.gfa 59 | elif [ "~{assembler}" == "megahit" ] ; then 60 | mv out/contigs.fastg out/~{samplename}_contigs.fastg 61 | elif [ "~{assembler}" == "velvet" ] ; then 62 | mv out/contigs.LastGraph out/~{samplename}_contigs.LastGraph 63 | fi 64 | 65 | >>> 66 | output { 67 | File assembly_fasta = "out/~{samplename}_contigs.fasta" 68 | File? contigs_gfa = "out/~{samplename}_contigs.gfa" 69 | File? contigs_fastg = "out/~{samplename}_contigs.fastg" 70 | File? 
contigs_lastgraph = "out/~{samplename}_contigs.LastGraph" 71 | String shovill_version = read_string("VERSION") 72 | } 73 | runtime { 74 | docker: "~{docker}" 75 | memory: "16 GB" 76 | cpu: 4 77 | disks: "local-disk " + disk_size + " SSD" 78 | disk: disk_size + " GB" 79 | maxRetries: 3 80 | preemptible: 0 81 | } 82 | } 83 | 84 | task shovill_se { 85 | input { 86 | File read1_cleaned 87 | String samplename 88 | String docker = "quay.io/staphb/shovill-se:1.1.0" 89 | Int disk_size = 100 90 | 91 | ## SHOVILL optional parameters 92 | ## --depth [INT] Sub-sample reads to this depth. Disable with --depth 0 (default: 150) 93 | ## --gsize [STRING] Estimated genome size eg. 3.2M (default: '') 94 | ## --minlen [INT] Minimum contig length <0=AUTO> (default: 0) 95 | ## --mincov [FLOAT] Minimum contig coverage <0=AUTO> (default: 2) 96 | ## --assembler [STRING] Assembler: skesa velvet megahit spades (default: 'spades') 97 | ## --opts [STRING] Extra assembler options in quotes eg. spades: "--untrusted-contigs locus.fna" ... (default: '') 98 | ## --kmers [STRING] K-mers to use (default: '') 99 | ## --trim [BOOLEAN] Enable adaptor trimming (default: OFF) 100 | ## --noreadcorr [BOOLEAN] Disable read error correction (default: OFF) 101 | ## --nocorr [BOOLEAN] Disable post-assembly correction (default: OFF) 102 | 103 | Int? depth 104 | String? genome_size 105 | Int min_contig_length = 200 106 | Float? min_coverage 107 | String assembler = "spades" 108 | String? assembler_options 109 | String? kmers 110 | Boolean trim = false 111 | Boolean noreadcorr = false 112 | Boolean nocorr = false 113 | } 114 | command <<< 115 | shovill-se --version | head -1 | tee VERSION 116 | shovill-se \ 117 | --outdir out \ 118 | --se ~{read1_cleaned} \ 119 | --minlen ~{min_contig_length} \ 120 | ~{'--depth ' + depth} \ 121 | ~{'--gsize ' + genome_size} \ 122 | ~{'--mincov ' + min_coverage} \ 123 | ~{'--assembler ' + assembler} \ 124 | ~{'--opts ' + assembler_options} \ 125 | ~{'--kmers ' + kmers} \ 126 | ~{true='--trim' false='' trim} \ 127 | ~{true='--noreadcorr' false='' noreadcorr} \ 128 | ~{true='--nocorr' false='' nocorr} 129 |
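# For reference (an illustrative rendering; sample.fastq.gz is a placeholder path): with the
# default inputs above, the placeholders for unset optional inputs and false Booleans render
# as empty strings, so the command collapses to
#   shovill-se --outdir out --se sample.fastq.gz --minlen 200 --assembler spades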
130 | mv out/contigs.fa out/~{samplename}_contigs.fasta 131 | 132 | if [ "~{assembler}" == "spades" ] ; then 133 | mv out/contigs.gfa out/~{samplename}_contigs.gfa 134 | elif [ "~{assembler}" == "megahit" ] ; then 135 | mv out/contigs.fastg out/~{samplename}_contigs.fastg 136 | elif [ "~{assembler}" == "velvet" ] ; then 137 | mv out/contigs.LastGraph out/~{samplename}_contigs.LastGraph 138 | fi 139 | >>> 140 | output { 141 | File assembly_fasta = "out/~{samplename}_contigs.fasta" 142 | File? contigs_gfa = "out/~{samplename}_contigs.gfa" 143 | File? contigs_fastg = "out/~{samplename}_contigs.fastg" 144 | File? contigs_lastgraph = "out/~{samplename}_contigs.LastGraph" 145 | String shovill_version = read_string("VERSION") 146 | } 147 | runtime { 148 | docker: "~{docker}" 149 | memory: "16 GB" 150 | cpu: 4 151 | disks: "local-disk " + disk_size + " SSD" 152 | disk: disk_size + " GB" 153 | maxRetries: 3 154 | preemptible: 0 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /tasks/species_typing/task_kleborate.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task kleborate { 4 | # Inputs 5 | input { 6 | File assembly 7 | String samplename 8 | String kleborate_docker_image = "quay.io/staphb/kleborate:2.2.0" 9 | Int disk_size = 100 10 | 11 | # Parameters 12 | # --resistance Turn on resistance genes screening (default: no resistance gene screening) 13 | # --kaptive Equivalent to --kaptive_k --kaptive_o 14 | # --min_identity MIN_IDENTITY Minimum alignment percent identity for main results (default: 90.0) 15 | # --min_coverage MIN_COVERAGE Minimum alignment percent coverage for main results (default: 80.0) 16 | # --min_spurious_identity MIN_SPURIOUS_IDENTITY Minimum alignment percent identity for spurious results (default: 80.0) 17 | # --min_spurious_coverage MIN_SPURIOUS_COVERAGE Minimum alignment percent coverage for spurious results (default: 40.0) 18 | # --min_kaptive_confidence {None,Low,Good,High,Very_high,Perfect} Minimum Kaptive confidence to call K/O loci - confidence levels below this will be reported as unknown (default: Good) 19 | Boolean skip_resistance = false 20 | Boolean skip_kaptive = false 21 | Float min_identity = 90.0 22 | Float min_coverage = 80.0 23 | Float min_spurious_identity = 80.0 24 | Float min_spurious_coverage = 40.0 25 | String min_kaptive_confidence = "Good" 26 | } 27 | command <<< 28 | # capture date and version 29 | # Print and save date 30 | date | tee DATE 31 | # Print and save version 32 | kleborate --version | tee VERSION 33 | # Run Kleborate on the input assembly and write output with the samplename prefix; resistance and Kaptive screening run unless the skip_* inputs are set (the --all flag is not used because it is equivalent to --resistance --kaptive and would override the skip_* inputs) 34 | kleborate \ 35 | ~{true="" false="--resistance" skip_resistance} \ 36 | ~{true="" false="--kaptive" skip_kaptive} \ 37 | ~{'--min_identity ' + min_identity} \ 38 | ~{'--min_coverage ' + min_coverage} \ 39 | ~{'--min_spurious_identity ' + min_spurious_identity} \ 40 | ~{'--min_spurious_coverage ' + min_spurious_coverage} \ 41 | ~{'--min_kaptive_confidence ' + min_kaptive_confidence} \ 42 | --outfile ~{samplename}_kleborate_out.tsv \ 43 | --assemblies ~{assembly} 44 | 45 | # parse outputs 46 | python3 <>> 119 | output { 120 | File kleborate_output_file = "~{samplename}_kleborate_out.tsv" 121 | String kleborate_version = read_string("VERSION") 122 | String kleborate_docker = kleborate_docker_image 123 | String kleborate_mlst_sequence_type = read_string("MLST_SEQUENCE_TYPE") 124 | String kleborate_virulence_score = read_string("VIRULENCE_SCORE") 125 | String kleborate_resistance_score = read_string("RESISTANCE_SCORE") 126 | String kleborate_num_resistance_genes = read_string("NUM_RESISTANCE_GENES") 127 | String kleborate_bla_resistance_genes = read_string("BLA_RESISTANCE_GENES") 128 | String kleborate_esbl_resistance_genes = read_string("ESBL_RESISTANCE_GENES") 129 | String kleborate_key_resistance_genes = read_string("KEY_RESISTANCE_GENES") 130 | String kleborate_genomic_resistance_mutations = read_string("GENOMIC_RESISTANCE_MUTATIONS") 131 | String kleborate_klocus = read_string("K_LOCUS") 132 | String kleborate_ktype =
read_string("K_TYPE") 133 | String kleborate_otype = read_string("O_TYPE") 134 | String kleborate_olocus = read_string("O_LOCUS") 135 | String kleborate_klocus_confidence = read_string("K_LOCUS_CONFIDENCE") 136 | String kleborate_olocus_confidence = read_string("O_LOCUS_CONFIDENCE") 137 | } 138 | runtime { 139 | docker: "~{kleborate_docker_image}" 140 | memory: "16 GB" 141 | cpu: 8 142 | disks: "local-disk " + disk_size + " SSD" 143 | disk: disk_size + " GB" 144 | maxRetries: 3 145 | } 146 | } -------------------------------------------------------------------------------- /tasks/species_typing/task_shigeifinder.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task shigeifinder { 4 | input { 5 | File assembly 6 | String samplename 7 | String docker = "staphb/shigeifinder:1.3.3" 8 | Int disk_size = 100 9 | Int cpu = 2 10 | } 11 | command <<< 12 | # capture date 13 | date | tee DATE 14 | # shigeifinder does not have a --version flag, relying upon the docker image tag for the version - which StaPH-B/Curtis maintains 15 | echo "~{docker}" | sed 's|staphb/shigeifinder:||' | tee VERSION.txt 16 | 17 | # ShigEiFinder checks that all dependencies are installed before running 18 | echo "checking for shigeifinder dependencies and running ShigEiFinder..." 19 | # run shigeifinder on assembly; default is 4cpus, so turning down to 2 since it's already very fast 20 | shigeifinder -i ~{assembly} \ 21 | -t ~{cpu} \ 22 | --hits \ 23 | --output ~{samplename}_shigeifinder.tsv 24 | 25 | # parse output TSV 26 | echo "Parsing ShigEiFinder output TSV..." 27 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 2 >shigeifinder_ipaH_presence_absence.txt 28 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 3 >shigeifinder_num_virulence_plasmid_genes.txt 29 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 4 >shigeifinder_cluster.txt 30 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 5 >shigeifinder_serotype.txt 31 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 6 >shigeifinder_O_antigen.txt 32 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 7 >shigeifinder_H_antigen.txt 33 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 8 >shigeifinder_notes.txt 34 | 35 | # set helpful output strings if field in TSV is blank by overwriting output TXT files 36 | if [ "$(cat shigeifinder_ipaH_presence_absence.txt)" == "" ]; then 37 | echo "ShigEiFinder ipaH field was empty" > shigeifinder_ipaH_presence_absence.txt 38 | fi 39 | if [ "$(cat shigeifinder_num_virulence_plasmid_genes.txt)" == "" ]; then 40 | echo "ShigEiFinder number of virulence plasmid genes field was empty" > shigeifinder_num_virulence_plasmid_genes.txt 41 | fi 42 | if [ "$(cat shigeifinder_cluster.txt)" == "" ]; then 43 | echo "ShigEiFinder cluster field was empty" > shigeifinder_cluster.txt 44 | fi 45 | if [ "$(cat shigeifinder_serotype.txt)" == "" ]; then 46 | echo "ShigEiFinder serotype field was empty" > shigeifinder_serotype.txt 47 | fi 48 | if [ "$(cat shigeifinder_O_antigen.txt)" == "" ]; then 49 | echo "ShigEiFinder O antigen field was empty" > shigeifinder_O_antigen.txt 50 | fi 51 | if [ "$(cat shigeifinder_H_antigen.txt)" == "" ]; then 52 | echo "ShigEiFinder H antigen field was empty" > shigeifinder_H_antigen.txt 53 | fi 54 | if [ "$(cat shigeifinder_notes.txt)" == "" ]; then 55 | echo "ShigEiFinder notes field was empty" > shigeifinder_notes.txt 56 | fi 57 | 58 | >>> 59 | output { 60 | File 
shigeifinder_report = "~{samplename}_shigeifinder.tsv" 61 | String shigeifinder_docker = docker 62 | String shigeifinder_version = read_string("VERSION.txt") 63 | String shigeifinder_ipaH_presence_absence = read_string("shigeifinder_ipaH_presence_absence.txt") 64 | String shigeifinder_num_virulence_plasmid_genes = read_string("shigeifinder_num_virulence_plasmid_genes.txt") 65 | String shigeifinder_cluster = read_string("shigeifinder_cluster.txt") 66 | String shigeifinder_serotype = read_string("shigeifinder_serotype.txt") 67 | String shigeifinder_O_antigen = read_string("shigeifinder_O_antigen.txt") 68 | String shigeifinder_H_antigen = read_string("shigeifinder_H_antigen.txt") 69 | String shigeifinder_notes = read_string("shigeifinder_notes.txt") 70 | } 71 | runtime { 72 | docker: "~{docker}" 73 | memory: "8 GB" 74 | cpu: cpu 75 | disks: "local-disk " + disk_size + " SSD" 76 | disk: disk_size + " GB" 77 | preemptible: 0 78 | maxRetries: 3 79 | } 80 | } 81 | task shigeifinder_reads { 82 | input { 83 | File read1 84 | File? read2 85 | String samplename 86 | String docker = "staphb/shigeifinder:1.3.3" 87 | Int disk_size = 100 88 | Int cpu = 4 89 | Boolean paired_end = true 90 | } 91 | command <<< 92 | # capture date 93 | date | tee DATE 94 | # shigeifinder does not have a --version flag, so we rely on the docker image tag for the version (maintained by StaPH-B/Curtis) 95 | echo "~{docker}" | sed 's|staphb/shigeifinder:||' | tee VERSION.txt 96 | 97 | # ShigEiFinder checks that all dependencies are installed before running 98 | echo "checking for shigeifinder dependencies and running ShigEiFinder..." 99 | # run shigeifinder on reads; the default of 4 CPUs is kept here since this mode performs read alignment 100 | shigeifinder -r -i ~{read1} ~{read2} \ 101 | ~{true='' false='--single_end' paired_end} \ 102 | -t ~{cpu} \ 103 | --hits \ 104 | --output ~{samplename}_shigeifinder.tsv 105 | 106 | # parse output TSV 107 | echo "Parsing ShigEiFinder output TSV..."
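# (Each field below is extracted with the same idiom: head -n 2 keeps the header plus the first
# and only result row, tail -n 1 drops the header, and cut -f N selects the Nth tab-delimited column.)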
108 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 2 >shigeifinder_ipaH_presence_absence.txt 109 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 3 >shigeifinder_num_virulence_plasmid_genes.txt 110 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 4 >shigeifinder_cluster.txt 111 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 5 >shigeifinder_serotype.txt 112 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 6 >shigeifinder_O_antigen.txt 113 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 7 >shigeifinder_H_antigen.txt 114 | head -n 2 ~{samplename}_shigeifinder.tsv | tail -n 1 | cut -f 8 >shigeifinder_notes.txt 115 | 116 | # set helpful output strings if field in TSV is blank by overwriting output TXT files 117 | if [ "$(cat shigeifinder_ipaH_presence_absence.txt)" == "" ]; then 118 | echo "ShigEiFinder ipaH field was empty" > shigeifinder_ipaH_presence_absence.txt 119 | fi 120 | if [ "$(cat shigeifinder_num_virulence_plasmid_genes.txt)" == "" ]; then 121 | echo "ShigEiFinder number of virulence plasmid genes field was empty" > shigeifinder_num_virulence_plasmid_genes.txt 122 | fi 123 | if [ "$(cat shigeifinder_cluster.txt)" == "" ]; then 124 | echo "ShigEiFinder cluster field was empty" > shigeifinder_cluster.txt 125 | fi 126 | if [ "$(cat shigeifinder_serotype.txt)" == "" ]; then 127 | echo "ShigEiFinder serotype field was empty" > shigeifinder_serotype.txt 128 | fi 129 | if [ "$(cat shigeifinder_O_antigen.txt)" == "" ]; then 130 | echo "ShigEiFinder O antigen field was empty" > shigeifinder_O_antigen.txt 131 | fi 132 | if [ "$(cat shigeifinder_H_antigen.txt)" == "" ]; then 133 | echo "ShigEiFinder H antigen field was empty" > shigeifinder_H_antigen.txt 134 | fi 135 | if [ "$(cat shigeifinder_notes.txt)" == "" ]; then 136 | echo "ShigEiFinder notes field was empty" > shigeifinder_notes.txt 137 | fi 138 | 139 | >>> 140 | output { 141 | File shigeifinder_report = "~{samplename}_shigeifinder.tsv" 142 | String shigeifinder_docker = docker 143 | String shigeifinder_version = read_string("VERSION.txt") 144 | String shigeifinder_ipaH_presence_absence = read_string("shigeifinder_ipaH_presence_absence.txt") 145 | String shigeifinder_num_virulence_plasmid_genes = read_string("shigeifinder_num_virulence_plasmid_genes.txt") 146 | String shigeifinder_cluster = read_string("shigeifinder_cluster.txt") 147 | String shigeifinder_serotype = read_string("shigeifinder_serotype.txt") 148 | String shigeifinder_O_antigen = read_string("shigeifinder_O_antigen.txt") 149 | String shigeifinder_H_antigen = read_string("shigeifinder_H_antigen.txt") 150 | String shigeifinder_notes = read_string("shigeifinder_notes.txt") 151 | } 152 | runtime { 153 | docker: "~{docker}" 154 | memory: "8 GB" 155 | cpu: cpu 156 | disks: "local-disk " + disk_size + " SSD" 157 | disk: disk_size + " GB" 158 | preemptible: 0 159 | maxRetries: 3 160 | } 161 | } --------------------------------------------------------------------------------