├── .dockerignore ├── .editorconfig ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── create_pairs_report.py ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ ├── report.py │ ├── tests │ ├── __init__.py │ └── test_test.py │ └── util.py ├── data └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── main.nf ├── modules └── local │ ├── 4dn.nf │ ├── common.nf │ └── pore-c.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows └── local │ └── prepare_genome.nf └── test_data ├── bams ├── barcode01 │ └── porec_test.concatemers.bam └── barcode02 │ └── porec_test.concatemers.bam ├── bams_dir ├── shard_0001.bam ├── shard_0002.bam ├── shard_0003.bam └── shard_0004.bam ├── porec_test.concatemers.bam ├── porec_test.concatemers.fastq ├── porec_test.fasta ├── porec_test.fasta.fai ├── porec_test.monomer.fastq ├── porec_test.params.json ├── porec_test.phased_variants.vcf.gz ├── porec_test.phased_variants.vcf.gz.tbi ├── porec_test_no_index.phased_variants.vcf.gz ├── sample_sheet.csv ├── sample_sheet_cutter.csv ├── sample_sheet_no_tbi.csv ├── sample_sheet_no_vcf.csv └── tests.pairs.stats.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | indent_size = 4 9 | indent_style = space 10 | 11 | [*.{md,yml,yaml,html,css,scss,js,cff}] 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 
20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 
128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Include shared CI 2 | include: 3 | - project: "epi2melabs/ci-templates" 4 | file: "wf-containers.yaml" 5 | 6 | 7 | variables: 8 | # Workflow inputs given to nextflow. 9 | # The workflow should define `--out_dir`, the CI template sets this. 10 | # Only common file inputs and option values need to be given here 11 | # (not things such as -profile) 12 | NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && wget -O ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz -C ${CI_PROJECT_NAME}/data/" 13 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 14 | --bam ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.concatemers.bam --chunk_size 100 --ref \ 15 | ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.fasta \ 16 | --cutter NlaIII \ 17 | --vcf ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.phased_variants.vcf.gz \ 18 | --paired_end_minimum_distance 100 --paired_end_maximum_distance 200 --hi_c --mcool --paired_end" 19 | CI_FLAVOUR: "new" 20 | 21 | macos-run: 22 | tags: 23 | - macos 24 | - x86 25 | 26 | docker-run: 27 | parallel: 28 | matrix: 29 | - MATRIX_NAME: [ 30 | "no-sample-sheet-chunk", "sample-sheet", "sample-sheet-cutter", 31 | "fastq", "chromunity", "input-dir-hic", "demo", 32 | "sample-sheet-and-chunk-size", "vcf-no-tbi", 33 | "sample-sheet-no-vcf", "sample-sheet-vcf-no-tbi"] 34 | rules: 35 | - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") 36 | when: never 37 | - if: $MATRIX_NAME == "no-sample-sheet-chunk" 38 | variables: 39 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 40 | --bam test_data/porec_test.concatemers.bam --chunk_size 100 --ref \ 41 | test_data/porec_test.fasta \ 42 | --cutter NlaIII \ 43 | --vcf test_data/porec_test.phased_variants.vcf.gz --pairs \ 44 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 45 | --paired_end_maximum_distance 200 --bed" 46 | - if: $MATRIX_NAME == "sample-sheet" 47 | variables: 48 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 49 | --bam test_data/bams --ref \ 50 | test_data/porec_test.fasta \ 51 | --pairs --chunk_size 0 \ 52 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 53 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet.csv \ 54 | --max_monomers 8" 55 | NF_IGNORE_PROCESSES: "index_bam" 56 | - if: $MATRIX_NAME == "sample-sheet-cutter" 57 | variables: 58 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 59 | --bam test_data/bams --ref \ 60 | test_data/porec_test.fasta \ 61 | --pairs --chunk_size 0 \ 62 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 63 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet_cutter.csv \ 64 | --max_monomers 8" 65 | NF_IGNORE_PROCESSES: "index_bam" 66 | - if: $MATRIX_NAME == "input-dir-hic" 67 | variables: 68 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 69 | --bam test_data/bams_dir 
--chunk_size 500 --ref \ 70 | test_data/porec_test.fasta \ 71 | --cutter NlaIII \ 72 | --vcf test_data/porec_test.phased_variants.vcf.gz \ 73 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 74 | --paired_end_maximum_distance 200 --hi_c" 75 | - if: $MATRIX_NAME == "fastq" 76 | variables: 77 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 78 | --fastq test_data/porec_test.concatemers.fastq --ref \ 79 | test_data/porec_test.fasta \ 80 | --cutter NlaIII \ 81 | --vcf test_data/porec_test.phased_variants.vcf.gz --pairs \ 82 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 83 | --paired_end_maximum_distance 200" 84 | NF_IGNORE_PROCESSES: "index_bam" 85 | - if: $MATRIX_NAME == "chromunity" 86 | variables: 87 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 88 | --fastq test_data/porec_test.concatemers.fastq --ref \ 89 | test_data/porec_test.fasta \ 90 | --mcool_resolutions 1000,2000,3000 --cutter NlaIII \ 91 | --vcf test_data/porec_test.phased_variants.vcf.gz --pairs \ 92 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 93 | --paired_end_maximum_distance 200 --chromunity --chromunity_merge_distance 5 --mcool" 94 | NF_IGNORE_PROCESSES: "index_bam" 95 | - if: $MATRIX_NAME == "demo" 96 | variables: 97 | NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && wget -O ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz -C ${CI_PROJECT_NAME}/data/" 98 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 99 | --bam ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.concatemers.bam --chunk_size 100 --ref \ 100 | ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.fasta \ 101 | --cutter NlaIII \ 102 | --vcf ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.phased_variants.vcf.gz \ 103 | --paired_end_minimum_distance 100 --paired_end_maximum_distance 200 --hi_c --mcool --paired_end" 104 | - if: $MATRIX_NAME == "sample-sheet-and-chunk-size" 105 | variables: 106 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 107 | --bam test_data/bams --ref \ 108 | test_data/porec_test.fasta \ 109 | --pairs --chunk_size 100 \ 110 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 111 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet.csv" 112 | - if: $MATRIX_NAME == "vcf-no-tbi" 113 | variables: 114 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 115 | --fastq test_data/porec_test.concatemers.fastq --ref \ 116 | test_data/porec_test.fasta \ 117 | --cutter NlaIII \ 118 | --vcf test_data/porec_test_no_index.phased_variants.vcf.gz --pairs \ 119 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 120 | --paired_end_maximum_distance 20" 121 | - if: $MATRIX_NAME == "sample-sheet-no-vcf" 122 | variables: 123 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 124 | --bam test_data/bams --ref \ 125 | test_data/porec_test.fasta \ 126 | --pairs --chunk_size 100 \ 127 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 128 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet_no_vcf.csv" 129 | - if: $MATRIX_NAME == "sample-sheet-vcf-no-tbi" 130 | variables: 131 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 132 | --bam test_data/bams --ref \ 133 | test_data/porec_test.fasta \ 134 | --pairs --chunk_size 100 \ 135 | --chromunity --paired_end 
--filter_pairs --paired_end_minimum_distance 100 \ 136 | --paired_end_maximum_distance 200 \ 137 | --sample_sheet test_data/sample_sheet_no_tbi.csv" -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.57 12 | - id: build_models 13 | name: build_models 14 | entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py 15 | language: python 16 | files: 'results_schema.yml' 17 | pass_filenames: false 18 | additional_dependencies: 19 | - datamodel-code-generator 20 | - repo: https://github.com/pycqa/flake8 21 | rev: 5.0.4 22 | hooks: 23 | - id: flake8 24 | pass_filenames: false 25 | additional_dependencies: 26 | - flake8-rst-docstrings 27 | - flake8-docstrings 28 | - flake8-import-order 29 | - flake8-forbid-visual-indent 30 | - pep8-naming 31 | - flake8-no-types 32 | - flake8-builtins 33 | - flake8-absolute-import 34 | - flake8-print 35 | args: [ 36 | "bin", 37 | "--import-order-style=google", 38 | "--statistics", 39 | "--max-line-length=88", 40 | "--extend-exclude=bin/workflow_glue/results_schema.py", 41 | ] 42 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.3.0] 8 | ### Added 9 | - Workflow will generate VCF index file if it doesn't exist. 10 | ### Fixed 11 | - Set format of `--bam` and `--fastq` in schema to `path`, to enable directories to be selected as input in the EPI2ME application. 12 | ### Removed 13 | - Empty alignment stats plots which are not relevant to this workflow. 14 | 15 | ### [v1.2.2] 16 | ### Fixed 17 | - Capitalised modified base tags additionally removed from monomers if no modified bases for a monomer. 18 | 19 | ## [v1.2.1] 20 | ### Fixed 21 | - bamindex fetch error when running more than one sample and `--chunk_size` is greater than 0. 22 | 23 | ## [v1.2.0] 24 | ### Fixed 25 | - `--bed` parameter will now output BED file using the paired_end BAM file. 26 | - Reduce memory usage of BED file creation and sorting. 27 | - Increased memory allocation for `prepare_hic` and `merge_mcool` processes. 28 | - If sample sheet provided and cutter column not present the workflow will instead use `--cutter` parameter. 29 | ### Changed 30 | - Bump pore-c-py to v2.1.4 to prevent issues with modified base tags and strip minimap2 tags from inputs. 
31 | ### Added 32 | - Reduce peak memory usage of minimap2 by adding `--cap-kalloc 100m --cap-sw-mem 50m` to the minimap2 command in the `digest_align_annotate` process. 33 | 34 | ## [v1.1.0] 35 | ### Added 36 | - `--bed` parameter which if set to true will output a BED file that is compatible with downstream tools including the scaffolder [Yahs](https://github.com/c-zhou/yahs). 37 | - `--pairtools_chunksize` parameter which exposes pairtools dedup chunksize parameter, in case peak memory usage of hi_c process needs to be reduced. 38 | - `digest_align_annotate` process uses dedicated pore_c_py container. 39 | - `--max_monomers` parameter, which is set to 250 by default, will filter out any reads that have more than this number of monomers. These reads will not be included in the analysis. 40 | - Output a `filtered_out/{alias}.bam` with any reads that are filtered out due to the max_monomers parameter. 41 | 42 | ## [v1.0.0] 43 | ### Changed 44 | - New documentation. 45 | 46 | ## [v0.2.0] 47 | ### Fixed 48 | - Pairtools merge step single quote the input directory so it will not error with Argument list too long. 49 | - Chromunity parquet files now contain the correct column names. 50 | ### Changed 51 | - `--ubam` parameter has been renamed `--bam` 52 | - All other ubam related parameters have been renamed with bam for consistency 53 | - The `--bam_map_threads`, `--digest_annotate_threads` and `bam_bam2fq_threads` threading parameters are now automatically extracted from the `--threads` specifying the maximum number of threads to use for a process. 54 | ### Removed 55 | - Default local executor CPU and RAM limits. 56 | 57 | ## [v0.1.1] 58 | ### Changed 59 | - If `--hi_c` parameter set to true the pairs file will be created. 60 | 61 | ## [v0.1.0] 62 | ### Changed 63 | - GitHub issue templates 64 | - Nextflow minimum version 23.04.2. 65 | - `--sample_id` parameter has been changed to `--sample` for consistency. 66 | - `--summary_json` optional parameter with default set to true, to include an annotation summary json in outputs. 67 | - Remove `--params_sheet` parameter and add all per sample parameters to sample_sheet. 68 | 69 | ### Added 70 | - `--hi_c` optional parameter with default set to false, to include a `.hic` output file which is compatible with [Juice box](https://www.aidenlab.org/juicebox/). 71 | 72 | ## [v0.0.8] 73 | * Improve schema parameter explanations and output file descriptions in the README. 74 | * Add a default `--chunk_size` parameter value of 25000. 75 | * Update fastcat which removes need to index ubam. 76 | * Enum choices are enumerated in the `--help` output. 77 | * Enum choices are enumerated as part of the error message when a user has selected an invalid choice. 78 | * Bumped minimum required Nextflow version to 22.10.8. 79 | 80 | ### Fixed 81 | - Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `param.threads` 82 | 83 | ## [v0.0.7] 84 | ### Fixed 85 | - Testing for the cooler tool. 86 | 87 | ## [v0.0.6] 88 | ### Added 89 | - Configuration for running demo data in AWS 90 | 91 | ## [v0.0.5] 92 | ### Fixed 93 | - Broken heat map in the pairtools report. 94 | - Meta table repeated tabs. 95 | - Nextflow config example cmd. 96 | 97 | ### Added 98 | - Cutter parameter help text link to Restriction Enzyme options. 99 | 100 | ## [v0.0.4] 101 | ### Added 102 | - Changed LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0. 
103 | - Test for Chromunity writer 104 | 105 | ### Fixed 106 | - Use latest pore-c-py package with fix for the modified bases digest step. 107 | 108 | ## [v0.0.3] 109 | ### Fixed 110 | - Reduce time by using bamindex instead of splitting bam. 111 | 112 | ### Changed 113 | - Replace input check with fastq ingress. 114 | - Parameters to input fastq or ubam. 115 | - Output a basic report. 116 | 117 | ## [v0.0.2] 118 | ### Fixed 119 | - Create pairs report handling of missing references in pairs file. 120 | 121 | ### Changed 122 | - Update Pore-c-py package used to v2.0.1 123 | - Improved performance 124 | - Use one pipe for digest, align and annotate processes. 125 | 126 | ## [v0.0.1] 127 | * First release of Wf-Pore-C 128 | 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. "Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. 
For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. 
You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. 
If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 
323 | -------------------------------------------------------------------------------- /bin/create_pairs_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create pairs report.""" 3 | 4 | from collections import defaultdict 5 | from typing import List, Tuple 6 | 7 | import pandas as pd 8 | import panel as pn 9 | import typer 10 | import hvplot.pandas # noqa 11 | 12 | pn.extension() 13 | 14 | PAIR_TYPES = { 15 | "W": "walk", 16 | "N": "null", 17 | "X": "corrupt", 18 | "M": "multi", 19 | "R": "rescued", 20 | "U": "unique", 21 | "D": "duplicate", 22 | } 23 | # https://github.com/4dn-dcic/pairsqc/blob/master/pairsqc.py 24 | ORI_NAMES = dict(zip(["+-", "-+", "++", "--"], ["Inner", "Outer", "Right", "Left"])) 25 | 26 | 27 | # %% 28 | def _parse_totals_table(data=List[Tuple[str, str]]): 29 | """Parse totals table.""" 30 | res = [] 31 | total = 0 32 | for key, val in data: 33 | key = key.strip() 34 | if key == "total": 35 | section = "all" 36 | total = int(val) 37 | elif key in ("total_unmapped", "total_single_sided_mapped", "total_mapped"): 38 | section = "mapping" 39 | elif key in ("total_dups", "total_nodups"): 40 | section = "duplicates" 41 | elif key in ("cis", "trans"): 42 | section = "cis/trans" 43 | elif key.startswith("cis_"): 44 | section = "distance" 45 | else: 46 | raise ValueError(f"#{key}#") 47 | 48 | res.append((section, key, int(val))) 49 | df = pd.DataFrame(res, columns=["Section", "Type", "Count"]) 50 | df["Perc. of Total"] = df["Count"] / total * 100.0 51 | df["Perc. of Section"] = df.groupby("Section")["Count"].transform( 52 | lambda x: 100.0 * x / x.sum() 53 | ) 54 | return df 55 | 56 | 57 | def _parse_pair_types(data=List[Tuple[str, str]]): 58 | """Parse pair types.""" 59 | res = [] 60 | for code, val in data: 61 | left, right = code[0], code[1] 62 | label = f"{PAIR_TYPES[left]}-{PAIR_TYPES[right]}" 63 | res.append((code, left, right, label, int(val))) 64 | df = pd.DataFrame(res, columns=["code", "left", "right", "label", "pairs"]) 65 | df["perc"] = 100.0 * df["pairs"] / df["pairs"].sum() 66 | return df 67 | 68 | 69 | def _parse_chrom_freq(data=List[Tuple[str, str]]): 70 | """Parse chrom freq.""" 71 | res = [] 72 | for code, val in data: 73 | chr1, chr2 = code.split("/") 74 | res.append((chr1, chr2, int(val))) 75 | 76 | df = ( 77 | pd.DataFrame(res, columns=["chrom1", "chrom2", "count"]) 78 | .set_index(["chrom1", "chrom2"]) 79 | .sort_index() 80 | .unstack(fill_value=0) 81 | ) 82 | df = df.xs("count", axis=1) 83 | return df 84 | 85 | 86 | def _parse_summary(data=List[Tuple[str, str]]): 87 | """Parse summary.""" 88 | res = [] 89 | for key, val in data: 90 | res.append({"statistic": key, "value": float(val)}) 91 | return pd.DataFrame(res) 92 | 93 | 94 | def _parse_dist_freq(data=List[Tuple[str, str]]): 95 | """Parse dist freq.""" 96 | res = [] 97 | for key, val in data: 98 | interval, ori = key.split("/") 99 | interval = interval.strip() 100 | if interval.endswith("+"): 101 | bin_left = bin_right = interval[:-1] 102 | else: 103 | bin_left, bin_right = interval.split("-") 104 | res.append( 105 | (int(bin_left), int(bin_right), ori, ORI_NAMES[ori] + f" ({ori})", int(val)) 106 | ) 107 | res = pd.DataFrame( 108 | res, columns=["bin_left", "bin_right", "ori", "ori_name", "count"] 109 | ) 110 | return res 111 | 112 | 113 | def read_pairs_stats(path): 114 | """Read Pairs stats.""" 115 | _data = defaultdict(list) 116 | with open(path) as f: 117 | for i in f: 118 | if "/" not in i: 119 | table = "totals" 120 | 
else: 121 | table, i = i.split("/", 1) 122 | _data[table].append(tuple(i.strip().split("\t"))) 123 | totals = _parse_totals_table(_data["totals"]) 124 | pair_types = _parse_pair_types(_data["pair_types"]) 125 | chrom_freq = _parse_chrom_freq(_data["chrom_freq"]) 126 | summary = _parse_summary(_data["summary"]) 127 | dist_freq = _parse_dist_freq(_data["dist_freq"]) 128 | return totals, pair_types, chrom_freq, summary, dist_freq 129 | 130 | 131 | def main(pair_stats, report_html, show_chroms=None): 132 | """Entry point.""" 133 | totals, pair_types, chrom_freq, summary, dist_freq = read_pairs_stats( 134 | pair_stats) 135 | totals_pane = pn.Column( 136 | pn.Row( 137 | pn.pane.DataFrame(totals.set_index( 138 | ["Section", "Type"]), width=600), 139 | totals.query("Section == 'mapping'").hvplot.bar( 140 | x="Section", 141 | y="Perc. of Total", 142 | by="Type", 143 | hover_cols=["Count", "Perc. of Total"], 144 | stacked=True, 145 | width=400, 146 | title="Mapping Rate", 147 | ), 148 | ), 149 | totals.query("Section == 'distance'").hvplot.bar( 150 | x="Type", y="Perc. of Section", 151 | title="Genomic Distance Distribution" 152 | ), 153 | ) 154 | 155 | pair_type_pane = pn.Column( 156 | pair_types.hvplot.bar( 157 | x="label", y="perc", hover_cols=["pairs"], title="Pair Types" 158 | ), 159 | pn.pane.DataFrame(pair_types, width=600), 160 | ) 161 | show_chroms_columns = chrom_freq.columns 162 | show_chroms_index = chrom_freq.index 163 | if show_chroms is not None: 164 | show_chroms_columns = show_chroms_columns.intersection(show_chroms) 165 | show_chroms_index = show_chroms_index.intersection(show_chroms) 166 | chrom_freq = chrom_freq.reindex( 167 | index=show_chroms_index, columns=show_chroms_columns, fill_value=0 168 | ) 169 | chrom_contact_pane = pn.Row( 170 | chrom_freq.hvplot.heatmap( 171 | width=600, 172 | height=600, 173 | colorbar=False, 174 | rot=45, 175 | colormap="viridis", 176 | title="Contact Count", 177 | ), 178 | chrom_freq 179 | .pipe(lambda x: x.div(x.sum(axis=0), axis=1)) 180 | .hvplot.heatmap( 181 | width=600, 182 | height=600, 183 | colorbar=False, 184 | rot=45, 185 | colormap="viridis", 186 | title="Contact Proportion (normalized by Chromosome)", 187 | ), 188 | ) 189 | 190 | distance_pane = pn.Row( 191 | dist_freq.hvplot.line( 192 | x="bin_right", by="ori_name", y="count", logx=True) 193 | ) 194 | 195 | report = pn.Tabs( 196 | ("Pairs", totals_pane), 197 | ("Pair Types", pair_type_pane), 198 | ("Chrom Contacts", chrom_contact_pane), 199 | ("Distance", distance_pane), 200 | ) 201 | report.save(report_html) 202 | 203 | 204 | if __name__ == "__main__": 205 | typer.run(main) 206 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint of pseudo-package for all the code used in the workflow.""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import os 6 | import sys 7 | 8 | from .util import _log_level, get_main_logger # noqa: ABS101 9 | 10 | 11 | __version__ = "0.0.1" 12 | _package_name = "workflow_glue" 13 | 14 | 15 | def get_components(allowed_components=None): 16 | """Find a list of workflow 
command scripts.""" 17 | logger = get_main_logger(_package_name) 18 | path = os.path.dirname(os.path.abspath(__file__)) 19 | components = dict() 20 | for fname in glob.glob(os.path.join(path, "*.py")): 21 | name = os.path.splitext(os.path.basename(fname))[0] 22 | if name in ("__init__", "util"): 23 | continue 24 | if allowed_components is not None and name not in allowed_components: 25 | continue 26 | 27 | # leniently attempt to import module 28 | try: 29 | mod = importlib.import_module(f"{_package_name}.{name}") 30 | except ModuleNotFoundError as e: 31 | # if imports cannot be satisfied, refuse to add the component 32 | # rather than exploding 33 | logger.warn(f"Could not load {name} due to missing module {e.name}") 34 | continue 35 | 36 | # if there's a main() and an argparser(), that's good enough for us. 37 | try: 38 | req = "main", "argparser" 39 | if all(callable(getattr(mod, x)) for x in req): 40 | components[name] = mod 41 | except Exception: 42 | pass 43 | return components 44 | 45 | 46 | def cli(): 47 | """Run workflow entry points.""" 48 | logger = get_main_logger(_package_name) 49 | logger.info("Bootstrapping CLI.") 50 | parser = argparse.ArgumentParser( 51 | 'wf-glue', 52 | parents=[_log_level()], 53 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 54 | 55 | parser.add_argument( 56 | '-v', '--version', action='version', 57 | version='%(prog)s {}'.format(__version__)) 58 | 59 | subparsers = parser.add_subparsers( 60 | title='subcommands', description='valid commands', 61 | help='additional help', dest='command') 62 | subparsers.required = True 63 | 64 | # importing everything can take time, try to shortcut 65 | if len(sys.argv) > 1: 66 | components = get_components(allowed_components=[sys.argv[1]]) 67 | if not sys.argv[1] in components: 68 | logger.warn("Importing all modules, this may take some time.") 69 | components = get_components() 70 | else: 71 | components = get_components() 72 | 73 | # add all module parsers to main CLI 74 | for name, module in components.items(): 75 | p = subparsers.add_parser( 76 | name.split(".")[-1], parents=[module.argparser()]) 77 | p.set_defaults(func=module.main) 78 | 79 | args = parser.parse_args() 80 | 81 | logger.info("Starting entrypoint.") 82 | args.func(args) 83 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check whether `@SQ` lines are the same in all (u)BAM headers in a directory.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check the `SO` field of the `@HD` line to determine whether the file is (un)sorted.
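# For illustration (hypothetical values): each header below is reduced to a list of
# dicts such as {"SN": "chr1", "LN": 248956422, "M5": None}, so files aligned to the
# same reference still compare equal even if other @SQ tags such as UR differ.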
26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
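# For reference, the byte order marks handled here are UTF-8 0xEF 0xBB 0xBF and
# UTF-16 0xFF 0xFE / 0xFE 0xFF; `determine_codec` below peeks at the first few bytes
# of the file and, if a mark is found, returns a codec name that strips it on read.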
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check barcodes are all the same length 110 | first_length = len(barcodes[0]) 111 | for barcode in barcodes[1:]: 112 | if len(barcode) != first_length: 113 | sys.stdout.write("values in 'barcode' column are different lengths") 114 | sys.exit() 115 | 116 | # check barcode and alias values are unique 117 | if len(barcodes) > len(set(barcodes)): 118 | sys.stdout.write("values in 'barcode' column not unique") 119 | sys.exit() 120 | if len(aliases) > len(set(aliases)): 121 | sys.stdout.write("values in 'alias' column not unique") 122 | sys.exit() 123 | 124 | if sample_types: 125 | # check if "type" column has 
unexpected values 126 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 127 | 128 | if unexp_type_vals: 129 | sys.stdout.write( 130 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 131 | f"Allowed values are: {allowed_sample_types}" 132 | ) 133 | sys.exit() 134 | 135 | if args.required_sample_types: 136 | for required_type in args.required_sample_types: 137 | if required_type not in allowed_sample_types: 138 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 139 | sys.exit() 140 | if sample_types.count(required_type) < 1: 141 | sys.stdout.write( 142 | f"Sample sheet requires at least 1 of {required_type}") 143 | sys.exit() 144 | if analysis_groups: 145 | # if there was a "analysis_group" column, make sure it had values for all 146 | # samples 147 | if not all(analysis_groups): 148 | sys.stdout.write( 149 | "if an 'analysis_group' column exists, it needs values in each row" 150 | ) 151 | sys.exit() 152 | 153 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 154 | 155 | 156 | def argparser(): 157 | """Argument parser for entrypoint.""" 158 | parser = wf_parser("check_sample_sheet") 159 | parser.add_argument("sample_sheet", help="Sample sheet to check") 160 | parser.add_argument( 161 | "--required_sample_types", 162 | help="List of required sample types. Each sample type provided must " 163 | "appear at least once in the sample sheet", 164 | nargs="*" 165 | ) 166 | return parser 167 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | from itertools import zip_longest 4 | import json 5 | import sys 6 | 7 | from .util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | def parse_fnames(fofn): 11 | """Parse list with filenames and return them grouped as ref-, XAM-, or VCF-related. 
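(Illustrative example with hypothetical names: a FOFN listing `ref.fa.gz`, `ref.fa.gz.fai`, `ref.fa.gz.gzi`, `sample.bam`, `sample.bam.bai`, `sample.vcf.gz` and `sample.vcf.gz.tbi`, one per line, yields the reference dict plus one XAM/index pair and one VCF/index pair.)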
12 | 13 | :param fofn: File with list of file names (one per line) 14 | :return: dict of reference-related filenames (with keys 'ref', 'fai', and '.gzi' and 15 | `None` as default values); lists of XAM- and VCF-related filenames 16 | """ 17 | ref_extensions = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 18 | ref_dict = {} 19 | xams = [] 20 | xam_indices = [] 21 | vcfs = [] 22 | vcf_indices = [] 23 | with open(fofn, "r") as f: 24 | for line in f: 25 | fname = line.strip() 26 | if any(fname.endswith(ext) for ext in ref_extensions): 27 | ref_dict["ref"] = fname 28 | elif fname.endswith(".fai"): 29 | ref_dict["fai"] = fname 30 | elif fname.endswith(".gzi"): 31 | ref_dict["gzi"] = fname 32 | elif fname.endswith(".bam") or fname.endswith(".cram"): 33 | xams.append(fname) 34 | elif fname.endswith(".bai") or fname.endswith(".crai"): 35 | xam_indices.append(fname) 36 | elif fname.endswith(".vcf") or fname.endswith(".vcf.gz"): 37 | vcfs.append(fname) 38 | elif fname.endswith(".csi") or fname.endswith(".tbi"): 39 | vcf_indices.append(fname) 40 | # do some sanity checks 41 | if "ref" not in ref_dict: 42 | raise ValueError( 43 | "No reference file (i.e. file ending in one of " 44 | f"{ref_extensions} was found)." 45 | ) 46 | ref = ref_dict["ref"] 47 | if (gzi := ref_dict.get("gzi")) is not None: 48 | # since we got a '.gzi' index, make sure that the reference is actually 49 | # compressed 50 | if not ref_dict["ref"].endswith(".gz"): 51 | raise ValueError( 52 | f"Found GZI reference index '{gzi}', but the reference file " 53 | f"'{ref}' appears not to be compressed." 54 | ) 55 | if xam_indices: 56 | if len(xams) != len(xam_indices): 57 | raise ValueError("Got different number of XAM and XAM index files.") 58 | if vcf_indices: 59 | if len(vcfs) != len(vcf_indices): 60 | raise ValueError("Got different number of VCF and VCF index files.") 61 | if xams and vcfs: 62 | if len(xams) != len(vcfs): 63 | raise ValueError("Got different number of XAM and VCF files.") 64 | # if we got XAM or VCF indices, pair them up with their corresponding files (and 65 | # otherwise with `None`) 66 | xams_with_indices = zip_longest(xams, xam_indices) 67 | vcfs_with_indices = zip_longest(vcfs, vcf_indices) 68 | return ref_dict, xams_with_indices, vcfs_with_indices 69 | 70 | 71 | def get_reference_options(ref, fai=None, gzi=None): 72 | """Create dict with IGV reference options. 73 | 74 | :param ref: reference file name 75 | :param fai: name reference `.fai` index file 76 | :param gzi: name of `.gzi` index file for a compressed reference 77 | :return: dict with reference options 78 | """ 79 | # initialise the options dict and add the index attributes later 80 | ref_opts = { 81 | "id": "ref", 82 | "name": "ref", 83 | "wholeGenomeView": False, 84 | "fastaURL": ref, 85 | } 86 | if fai is not None: 87 | ref_opts["indexURL"] = fai 88 | if gzi is not None: 89 | ref_opts["compressedIndexURL"] = gzi 90 | return ref_opts 91 | 92 | 93 | def get_alignment_track(xam, xai=None, extra_opts=None): 94 | """Create dict with options for IGV alignment track. 
95 | 96 | :param xam: name of XAM file to be displayed 97 | :param xai: name of XAM index file 98 | :param extra_opts: dict of extra options for the alignment track 99 | :return: dict with alignment track options 100 | """ 101 | alignment_track_dict = { 102 | "name": xam, 103 | "type": "alignment", 104 | "format": xam.split(".")[-1], 105 | "url": xam, 106 | } 107 | # add the XAM index if present 108 | if xai is not None: 109 | alignment_track_dict["indexURL"] = xai 110 | alignment_track_dict.update(extra_opts or {}) 111 | return alignment_track_dict 112 | 113 | 114 | def get_variant_track(vcf, index=None, extra_opts=None): 115 | """Create dict with options for IGV variant track. 116 | 117 | :param vcf: name of VCF file to be displayed 118 | :param index: name of VCF index file (ending in `.csi` or `.tbi`) 119 | :param extra_opts: dict of extra options for the variant track 120 | :return: dict with variant track options 121 | """ 122 | variant_track_dict = { 123 | "name": vcf, 124 | "type": "variant", 125 | "format": "vcf", 126 | "url": vcf, 127 | } 128 | # add the VCF index if we got an index extension 129 | if index is not None: 130 | variant_track_dict["indexURL"] = index 131 | variant_track_dict.update(extra_opts or {}) 132 | return variant_track_dict 133 | 134 | 135 | def main(args): 136 | """Run the entry point.""" 137 | logger = get_named_logger("configIGV") 138 | 139 | # parse the FOFN 140 | ref_dict, xams_with_indices, vcfs_with_indices = parse_fnames(args.fofn) 141 | 142 | # initialise the IGV options dict with the reference options 143 | json_dict = {"reference": get_reference_options(**ref_dict)} 144 | 145 | # if we got JSON files with extra options for the alignment / variant tracks, read 146 | # them 147 | extra_alignment_opts = {} 148 | if args.extra_alignment_opts is not None: 149 | with open(args.extra_alignment_opts, "r") as f: 150 | extra_alignment_opts = json.load(f) 151 | extra_variant_opts = {} 152 | if args.extra_variant_opts is not None: 153 | with open(args.extra_variant_opts, "r") as f: 154 | extra_variant_opts = json.load(f) 155 | 156 | # now add the alignment and variant tracks 157 | json_dict["tracks"] = [] 158 | # we use `zip_longest` to make sure that variant and alignment tracks from the same 159 | # sample are added after each other 160 | for (vcf, vcf_index), (xam, xam_index) in zip_longest( 161 | vcfs_with_indices, xams_with_indices, fillvalue=(None, None) 162 | ): 163 | if vcf is not None: 164 | # add a variant track for the VCF 165 | json_dict["tracks"].append( 166 | get_variant_track(vcf, vcf_index, extra_variant_opts) 167 | ) 168 | if xam is not None: 169 | # add an alignment track for the XAM 170 | json_dict["tracks"].append( 171 | get_alignment_track(xam, xam_index, extra_alignment_opts) 172 | ) 173 | 174 | if args.locus is not None: 175 | json_dict["locus"] = args.locus 176 | 177 | json.dump(json_dict, sys.stdout, indent=4) 178 | 179 | logger.info("Printed IGV config JSON to STDOUT.") 180 | 181 | 182 | def argparser(): 183 | """Argument parser for entrypoint.""" 184 | parser = wf_parser("configure_igv") 185 | parser.add_argument( 186 | "--fofn", 187 | required=True, 188 | help=( 189 | "File with list of names of reference / XAM / VCF files and indices " 190 | "(one filename per line)" 191 | ), 192 | ) 193 | parser.add_argument( 194 | "--locus", 195 | help="Locus string to set initial genomic coordinates to display in IGV", 196 | ) 197 | parser.add_argument( 198 | "--extra-alignment-opts", 199 | help="JSON file with extra options for alignment 
tracks", 200 | ) 201 | parser.add_argument( 202 | "--extra-variant-opts", 203 | help="JSON file with extra options for variant tracks", 204 | ) 205 | return parser 206 | -------------------------------------------------------------------------------- /bin/workflow_glue/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. 
'2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /bin/workflow_glue/report.py: -------------------------------------------------------------------------------- 1 | """Create workflow report.""" 2 | import json 3 | 4 | from ezcharts.components import fastcat 5 | from ezcharts.components.reports import labs 6 | from ezcharts.layout.snippets import Tabs 7 | from ezcharts.layout.snippets.table import DataTable 8 | import pandas as pd 9 | 10 | from .util import get_named_logger, wf_parser # noqa: ABS101 11 | 12 | 13 | def main(args): 14 | """Run the entry point.""" 15 | logger = get_named_logger("Report") 16 | report = labs.LabsReport( 17 | "Workflow Pore C report", "wf-pore-c", 18 | args.params, args.versions, args.wf_version) 19 | 20 | with open(args.metadata) as metadata: 21 | sample_details = [{ 22 | 'sample': d['alias'], 23 | 'type': d['type'], 24 | 'barcode': d['barcode'] 25 | } for d in json.load(metadata)] 26 | 27 | if args.stats: 28 | with report.add_section("Read summary", "Read summary"): 29 | names = tuple(d['sample'] for d in sample_details) 30 | stats = tuple(args.stats) 31 | if len(stats) == 1: 32 | stats = stats[0] 33 | names = names[0] 34 | fastcat.SeqSummary( 35 | stats, sample_names=names, alignment_stats=False) 36 | 37 | with report.add_section("Sample Metadata", "Sample Metadata"): 38 | tabs = Tabs() 39 | for d in sample_details: 40 | with tabs.add_tab(d["sample"]): 41 | df = pd.DataFrame.from_dict(d, orient="index", columns=["Value"]) 42 | df.index.name = "Key" 43 | DataTable.from_pandas(df) 44 | 45 | report.write(args.report) 46 | logger.info(f"Report written to {args.report}.") 47 | 48 | 49 | def argparser(): 50 | """Argument parser for entrypoint.""" 51 | parser = wf_parser("report") 52 | parser.add_argument("report", help="Report output file") 53 | parser.add_argument( 54 | "--stats", nargs='+', 55 | help="Fastcat stats histogram directories, \ 56 | ordered as per entries in --metadata.") 57 | parser.add_argument( 58 | "--metadata", required=True, 59 | help="sample metadata JSON") 60 | parser.add_argument( 61 | "--versions", required=True, 62 | help="directory containing CSVs containing name,version.") 63 | parser.add_argument( 64 | "--params", required=True, 65 | help="A JSON file containing the workflow parameter key/values") 66 | parser.add_argument( 67 | "--wf_version", default='unknown', 68 | help="version of the executed workflow") 69 | return parser 70 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_test.py: -------------------------------------------------------------------------------- 1 | """A dummy test.""" 2 | 3 | import argparse 4 | 5 | from workflow_glue import report 6 | 7 | 8 | def test(): 9 | """Just showing that we can import using the workflow-glue.""" 10 | assert isinstance(report.argparser(), argparse.ArgumentParser) 11 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function.""" 2 | 3 | import argparse 4 | import logging 5 | 6 | _log_name = None 7 | 8 | 9 | def get_main_logger(name): 10 | """Create the top-level 
logger.""" 11 | global _log_name 12 | _log_name = name 13 | logging.basicConfig( 14 | format='[%(asctime)s - %(name)s] %(message)s', 15 | datefmt='%H:%M:%S', level=logging.INFO) 16 | return logging.getLogger(name) 17 | 18 | 19 | def get_named_logger(name): 20 | """Create a logger with a name. 21 | 22 | :param name: name of logger. 23 | """ 24 | name = name.ljust(10)[:10] # so logging is aligned 25 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 26 | return logger 27 | 28 | 29 | def wf_parser(name): 30 | """Make an argument parser for a workflow command.""" 31 | return argparse.ArgumentParser( 32 | name, 33 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 34 | add_help=False) 35 | 36 | 37 | def _log_level(): 38 | """Parser to set logging level and acquire software version/commit.""" 39 | parser = argparse.ArgumentParser( 40 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 41 | 42 | modify_log_level = parser.add_mutually_exclusive_group() 43 | modify_log_level.add_argument( 44 | '--debug', action='store_const', 45 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 46 | help='Verbose logging of debug information.') 47 | modify_log_level.add_argument( 48 | '--quiet', action='store_const', 49 | dest='log_level', const=logging.WARNING, default=logging.INFO, 50 | help='Minimal logging; warnings only.') 51 | 52 | return parser 53 | -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # process run { 5 | # input: 6 | # path some_data 7 | # path extra_data 8 | # script: 9 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 10 | # """ 11 | # command ${some_data} ${extra} 12 | # """ 13 | # } 14 | # 15 | # some_data = ... 16 | # extra_data = Channel.fromPath("$projectDir/data/OPTIONAL_FILE")) 17 | # run(some_data, extra_data) 18 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Workflow for analysing Pore-c data for chromatin conformation capture. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | Pore-C is an end-to-end workflow unique to Oxford Nanopore which combines chromatin conformation capture (3C) with direct, long nanopore sequencing reads. With nanopore reads, long-range, multi-way contact information can be obtained. 2 | 3 | This workflow can be used for the following: 4 | 5 | * Pre-processing a reference genome or draft assembly to generate auxiliary files used in downstream analyses. 6 | * Creating virtual digests of Pore-c reads. 7 | * Filtering the raw reads to remove any that might break downstream tools. 8 | * Align virtually digested reads against a reference genome. 9 | * Processing results to filter spurious alignments, detect ligation junctions and assign fragments. 10 | * Outputting aligned, sorted and annotated BAM files. 11 | * Generating a contact map, which shows the intensity of the physical interaction between two genome regions. 12 | * Create output files for downstream analysis in the following formats. 
13 | - [Pairs format](https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md) 14 | - [Cooler format](https://mirnylab.github.io/cooler/) 15 | - [Hic format](https://github.com/aidenlab/juicer/wiki/) -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 64 4 | + Memory = 128GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 8 9 | + Memory = 32GB 10 | 11 | Approximate run time: 12 hours for 100GB input BAM using the recommended resources, this will vary depending on number of monomers found per read. 12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://www.docker.com/products/docker-desktop) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository in to the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-pore-c --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-pore-c 38 | ``` 39 | 40 | A demo dataset is provided for testing of the workflow. 
41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz 44 | tar -xzvf wf-pore-c-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-pore-c \ 49 | --bam 'wf-pore-c-demo/porec_test.concatemers.bam' \ 50 | --chunk_size 100 \ 51 | --cutter 'NlaIII' \ 52 | --hi_c \ 53 | --mcool \ 54 | --paired_end \ 55 | --paired_end_maximum_distance 200 \ 56 | --paired_end_minimum_distance 100 \ 57 | --phased_vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz' \ 58 | --ref 'wf-pore-c-demo/porec_test.fasta' \ 59 | --vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz' \ 60 | -profile standard 61 | ``` 62 | 63 | For further information about running a workflow on 64 | the command line see https://labs.epi2me.io/wfquickstart/ 65 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 2 | 3 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | This workflow accepts either FASTQ or unaligned BAM files as input. 4 | 5 | The FASTQ and BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 6 | 7 | ``` 8 | (i) (ii) (iii) 9 | input_reads.fastq ─── input_directory ─── input_directory 10 | ├── reads0.fastq ├── barcode01 11 | └── reads1.fastq │ ├── reads0.fastq 12 | │ └── reads1.fastq 13 | ├── barcode02 14 | │ ├── reads0.fastq 15 | │ ├── reads1.fastq 16 | │ └── reads2.fastq 17 | └── barcode03 18 | └── reads0.fastq 19 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | bam | string | An unaligned BAM file containing Pore-C concatemer sequences. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | fastq | string | FASTQ files to use in the analysis. 
| This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases and optionally provide per-sample parameters. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Optionally, a `cutter` column can contain the name of the enzyme used per sample (see the `--cutter` parameter for more details) and a `vcf` column can be used to provide a phased VCF file per sample if you require haplotagged alignments. | | 8 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 9 | | analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | 10 | | ref | string | A FASTA file containing the reference genome to map against. | | | 11 | | vcf | string | An optional phased VCF file that will be used to haplotag alignments. | | | 12 | | cutter | string | The enzyme used in the restriction digest. | Any enzyme from the Biopython restriction dictionary can be used. See `https://github.com/biopython/biopython/blob/master/Bio/Restriction/Restriction_Dictionary.py`. This can also be defined per sample: see `--sample_sheet` parameter. | NlaIII | 13 | 14 | 15 | ### Output Options 16 | 17 | | Nextflow parameter name | Type | Description | Help | Default | 18 | |--------------------------|------|-------------|------|---------| 19 | | out_dir | string | Directory for output of all user-facing files. | | output | 20 | | hi_c | boolean | Output a Hi-C formatted file; will convert pairs format to a Hi-C (`.hic`) file which will be compatible with [juicer](https://github.com/aidenlab/juicer). | Load this file with [Juice box](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation. | False | 21 | | bed | boolean | Output a BED file of the paired-end BAM alignments for use with downstream tools. Setting this to true will also trigger creation of the paired-end BAM. | Will use the paired-end BAM to create a BED file compatible with downstream tools including scaffolding tool [Yahs](https://github.com/c-zhou/yahs). | False | 22 | 23 | 24 | ### Advanced Options 25 | 26 | | Nextflow parameter name | Type | Description | Help | Default | 27 | |--------------------------|------|-------------|------|---------| 28 | | chunk_size | integer | Process input in chunks of this number of reads. | To reduce per-process memory requirements for large datasets, process the inputs in chunks of reads. Set to 0 to process entire dataset in one go. | 20000 | 29 | | threads | integer | Set maximum number of threads to use for more intense processes (limited by config executor cpus). We recommend a minimum of 4, but if available 19. 
| | 4 | 30 | 31 | 32 | ### Pore-C Tools Options 33 | 34 | | Nextflow parameter name | Type | Description | Help | Default | 35 | |--------------------------|------|-------------|------|---------| 36 | | minimap2_settings | string | The minimap2 settings for mapping monomers. | | -x map-ont | 37 | | max_monomers | integer | The maximum number of monomers allowed for a read to be included in downstream analysis. | | 250 | 38 | | coverage | boolean | Calculate restriction-fragment coverage using mosdepth. | | False | 39 | | summary_json | boolean | Output pore-c-py annotation summary in json format. | | True | 40 | 41 | 42 | ### Chromunity Options 43 | 44 | | Nextflow parameter name | Type | Description | Help | Default | 45 | |--------------------------|------|-------------|------|---------| 46 | | chromunity | boolean | Create parquet files for Chromunity. | See the chromunity documentation for further details 'https://github.com/mskilab/chromunity'. | False | 47 | | chromunity_merge_distance | integer | Merge colinear alignments separated by less than this base pair distance into a single monomer. | | -1 | 48 | 49 | 50 | ### 4DN files Options 51 | 52 | | Nextflow parameter name | Type | Description | Help | Default | 53 | |--------------------------|------|-------------|------|---------| 54 | | pairs | boolean | Create a 4DN-format pairs file (also calculate stats). | Outputs a directory with a pairs stats report and a pairs file which can be used for downstream anaylsis. | False | 55 | | pairtools_chunksize | integer | Number of pairs to be processed in each chunk in the prepare_hic process which uses the pairtools dedup tool. | Reduce for lower memory footprint. Below 10,000 performance starts suffering significantly. | 100000 | 56 | | mcool | boolean | Create a multi-resolution cooler file. Will output the cooler formatted file which you can load with cooler. | See 'https://open2c.github.io/cooler' for more details. | False | 57 | | cool_bin_size | integer | The bin size of the cooler output file in base pairs. | See 'https://open2c.github.io/cooler' for more details. | 1000 | 58 | | mcool_resolutions | string | The resolutions of the mcool file in pixels (see cooler documentation for details). | Comma-separated list of target resolutions. Use suffixes B or N to specify a progression: B for binary (geometric steps of factor 2), N for nice (geometric steps of factor 10 interleaved with steps of 2 and 5). This is the equivalent of the `--resolutions` flag in cooler; see an example here 'https://cooler.readthedocs.io/en/latest/cli.html'. | 1000,2000,5000N | 59 | 60 | 61 | ### Paired-end BAM Options 62 | 63 | | Nextflow parameter name | Type | Description | Help | Default | 64 | |--------------------------|------|-------------|------|---------| 65 | | paired_end | boolean | Create mock paired-end BAM files. | | False | 66 | | filter_pairs | boolean | Filter paired-end reads using minimum and maximum distance parameters. | | False | 67 | | paired_end_minimum_distance | integer | Remove trans/cis pairs separated by a distance shorter than this. | | -1 | 68 | | paired_end_maximum_distance | integer | Remove trans/cis pairs separated by a distance greater than this. | | -1 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. 
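The `B`/`N` suffix notation accepted by the `mcool_resolutions` parameter in the 4DN options above can be hard to picture. The sketch below is an illustration only, not cooler's actual implementation: it expands a spec such as `1000,2000,5000N` into an explicit list of bin sizes, and the function name `expand_resolutions` and the `cap` argument are purely illustrative (in practice cooler chooses the upper bound itself from the data).

```
def expand_resolutions(spec, cap=1_000_000):
    """Expand a cooler-style resolutions string such as '1000,2000,5000N'.

    'B' extends the last value with a binary (x2) progression and 'N' with a
    "nice" 1-2-5 progression, as described for `mcool_resolutions`.
    """
    resolutions = []
    # the "nice" series: 1, 2, 5 interleaved with powers of ten
    nice = [m * 10 ** e for e in range(10) for m in (1, 2, 5)]
    for item in spec.split(","):
        suffix = item[-1] if item[-1] in "BN" else ""
        value = int(item.rstrip("BN"))
        resolutions.append(value)
        if suffix == "B":
            step = value * 2
            while step <= cap:
                resolutions.append(step)
                step *= 2
        elif suffix == "N":
            resolutions.extend(r for r in nice if value < r <= cap)
    return sorted(set(resolutions))


print(expand_resolutions("1000,2000,5000N", cap=100_000))
# [1000, 2000, 5000, 10000, 20000, 50000, 100000]
```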
Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | ./wf-template-report.html | Report for all samples. | aggregated | 6 | | Per file read stats | ./ingress_results/reads/fastcat_stats/per-file-stats.tsv | A TSV with per file read stats, including all samples. | aggregated | 7 | | Per read stats | ./ingress_results/reads/fastcat_stats/per-read-stats.tsv | A TSV with per read stats, including all samples. | aggregated | 8 | | Run ID's | ./ingress_results/reads/fastcat_stats/run_ids | List of run ID's present in reads. | aggregated | 9 | | Meta map json | ./ingress_results/reads/metamap.json | Meta data used in workflow presented in a JSON. | aggregated | 10 | | Concatenated sequence data | ./ingress_results/reads/{{ alias }}.fastq.gz | Per-sample reads concatenated in to one fastq file. | per-sample | 11 | | Coordinate-sorted Bam | ./bams/{{ alias }}.cs.bam | Coordinate-sorted Bam. | per-sample | 12 | | Coordinate-sorted Bam Index | ./bams/{{ alias }}.cs.bam.bai | Coordinate-sorted Bam Index. | per-sample | 13 | | Name-sorted Bam | ./bams/{{ alias }}.ns.bam | Name-sorted Bam. | per-sample | 14 | | Pairs file | ./pairs/{{ alias }}.pairs.gz | This file contains contact information in a human-readable tabular format, and can be used with downstream tools. See [Pairtools documentation](https://pairtools.readthedocs.io/en/latest/formats.html#pairs) for full specification. | per-sample | 15 | | Pairs summary stats file | ./pairs/{{ alias }}.pairs.stats.txt | Summary statistics of the pairs file. See this [overview](https://pairtools.readthedocs.io/en/latest/stats.html) for a full specification. | per-sample | 16 | | Pairs summary report | ./pairs/{{ alias }}.pairs.stats.html | Pairs html report with result including an interactive contact map and statistics. See [pairsqc documentation](https://github.com/4dn-dcic/pairsqc) for further details. | per-sample | 17 | | Multi-resolution cool file | ./cooler/{{ alias }}.mcool | Multi-resolution cool `.mcool` file which can be used with downstream tools to provide a high resolution genomic interaction matrix. See [Cool tools documentation](https://github.com/open2c/cooltools) for details on downstream analysis. | per-sample | 18 | | Paired-end BAM | ./paired_end/{{ alias }}.ns.bam | Mock paired end BAM. | per-sample | 19 | | Chromunity parquet files. | ./chromunity | Chromunity directory with parquet files which can be used with the Chromunity package. Chromunity enables the nomination and statistical evaluation of high order interactions. See [Chromunity documentation](http://mskilab.com/chromunity/tutorial.html) for further details. | per-sample | 20 | | Fragments BED | ./paireds/fragments.bed | File with the DNA fragments created from the virtual digest. | per-sample | 21 | | Hi-C for contact map | ./hi-c/{{ alias }}.hic | File which can be loaded into the [Juice box tool](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation. | per-sample | 22 | | Filtered out reads | ./filtered_out/{{ alias }}.bam | BAM file containing any reads that were filtered out at the digest step and not included in the analysis. 
| per-sample | 23 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Concatenate input files and generate per-read stats 2 | 3 | This workflow accepts FASTQ or unaligned BAM as input. The [fastcat or bamstats](https://github.com/epi2me-labs/fastcat) tools are used to concatenate multi-file samples to be processed by the workflow. They also output per-read stats, including average read lengths and qualities. 4 | 5 | ### 2. Index reference 6 | 7 | The input reference genome is indexed with [Minimap2](https://github.com/lh3/minimap2). 8 | 9 | ### 3. Split input file 10 | 11 | The input reads are split into chunks for parallel processing using the `chunk_size` parameter, which defaults to 20,000 reads. 12 | 13 | ### 4. Digest reads 14 | 15 | Chimeric Pore-C reads are digested using the [Pore-c-py](https://github.com/epi2me-labs/pore-c-py) Python package. The enzyme provided via the `cutter` parameter is used by the Pore-c-py package to find the corresponding recognition sequence using the [Biopython](https://biopython.org/) restriction enzymes library. Any read containing more than `max_monomers` monomers (default: 250) is excluded at this stage, as it is assumed to have been created in error. 16 | 17 | ### 5. Align reads 18 | 19 | The monomers are then aligned against the reference genome using Minimap2. 20 | 21 | ### 6. Annotate 22 | 23 | The Pore-c-py package is then used again to filter spurious alignments, detect ligation junctions and assign fragments. The aligned segments are used to generate a "walk", which enumerates the alignment coordinates of the monomers comprising the chimeric read; this walk is used to annotate the alignments. 24 | 25 | ### 7. Output BAMs 26 | 27 | Pore-c-py outputs the tagged alignments as both a name-sorted and a coordinate-sorted BAM. If the `paired_end` parameter is selected, a mock paired-end BAM is also output for use with downstream tools such as [Pairtools](https://github.com/open2c/pairtools). At this stage, if the `chromunity` parameter is set to true, the annotation step also outputs the parquet files required for use with the downstream [Chromunity](https://github.com/mskilab-org/chromunity) tool. 28 | 29 | ### 8. Haplotag alignments 30 | 31 | If a phased VCF is provided using the `vcf` parameter, the output BAM is haplotagged using [WhatsHap](https://github.com/whatshap/whatshap). 32 | 33 | ### 9. Merge BAMs 34 | 35 | The output BAMs from each of the split chunks are merged and sorted per sample using [Samtools](https://www.htslib.org/doc/samtools-merge.html). 36 | 37 | ### 10. Calculate coverage 38 | 39 | If the `coverage` parameter is set to true, [Mosdepth](https://github.com/brentp/mosdepth) is used to calculate coverage across the input reference genome. 40 | 41 | ### 11. Additional output formats for downstream analysis 42 | 43 | The workflow will output several formats that can be used with downstream tools. 44 | 45 | + [Pairtools](https://github.com/open2c/pairtools) is used to create a pairs-format file and an HTML report containing a contact map and other statistics. Use the `pairs` parameter to generate this output (a minimal sketch of reading the pairs file follows below). 46 | 47 | + [Cooler](https://github.com/open2c/cooler) is used to output a multi-resolution contact map in cooler (`.mcool`) format. Use the `mcool` parameter to generate this output.
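The pairs file mentioned above is plain (optionally gzipped) text following the [4DN pairs specification](https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md): header lines start with `#`, and each data row carries at least `readID chrom1 pos1 chrom2 pos2 strand1 strand2`. As an illustration only, a stdlib-only sketch for tallying cis versus trans contacts from a workflow pairs output might look like the following; the filename `sample.pairs.gz` is hypothetical, and for real analyses pairtools itself should be preferred.

```
import gzip
from collections import Counter


def count_cis_trans(pairs_path):
    """Tally cis vs trans contacts from a 4DN .pairs(.gz) file (sketch only)."""
    opener = gzip.open if str(pairs_path).endswith(".gz") else open
    counts = Counter()
    with opener(pairs_path, "rt") as handle:
        for line in handle:
            if line.startswith("#"):
                # header / column-definition lines
                continue
            fields = line.rstrip("\n").split("\t")
            # columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 ...
            chrom1, chrom2 = fields[1], fields[3]
            counts["cis" if chrom1 == chrom2 else "trans"] += 1
    return counts


print(count_cis_trans("sample.pairs.gz"))
```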
48 | 49 | + [Juicer Tools](https://github.com/aidenlab/juicer) is used to create a `.hic` format file, which can be loaded into the [Juice box tool](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation. Use the `hi_c` parameter to generate this output. -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | 2 | + If the workflow fails, please run it with the demo dataset to ensure the workflow itself is working. This will help us determine whether the issue is related to the environment, the input parameters, or a bug. 3 | + See how to interpret some common Nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). 4 | + If the workflow breaks with a memory error, try running the workflow again with a reduced `chunk_size` parameter. -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | *Does the workflow have support for a scaffolding tool?* - Currently we do not support any scaffolding tool, but you may like to try [Yahs](https://academic.oup.com/bioinformatics/article/39/1/btac808/6917071). 3 | 4 | If your question is not answered here, please report any issues or suggestions on the [GitHub issues](https://github.com/epi2me-labs/wf-pore-c/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | + [Importing third-party workflows into EPI2ME Labs](https://labs.epi2me.io/nexflow-for-epi2melabs/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boilerplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters that have default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere.
22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/NfcoreTemplate.groovy: -------------------------------------------------------------------------------- 1 | // 2 | // This file holds several functions used within the nf-core pipeline template. 3 | // 4 | 5 | // MIT License 6 | // 7 | // Copyright (c) 2018 nf-core 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | // copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 
18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | import org.yaml.snakeyaml.Yaml 29 | 30 | class NfcoreTemplate { 31 | 32 | // 33 | // Check AWS Batch related parameters have been specified correctly 34 | // 35 | public static void awsBatch(workflow, params) { 36 | if (workflow.profile.contains('awsbatch')) { 37 | // Check params.awsqueue and params.awsregion have been set if running on AWSBatch 38 | assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 39 | // Check outdir paths to be S3 buckets if running on AWSBatch 40 | assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" 41 | } 42 | } 43 | 44 | // 45 | // Check params.hostnames 46 | // 47 | public static void hostName(workflow, params, log) { 48 | Map colors = logColours(params.monochrome_logs) 49 | if (params.hostnames) { 50 | try { 51 | def hostname = "hostname".execute().text.trim() 52 | params.hostnames.each { prof, hnames -> 53 | hnames.each { hname -> 54 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 55 | log.info "=${colors.yellow}====================================================${colors.reset}=\n" + 56 | "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + 57 | " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + 58 | " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + 59 | "=${colors.yellow}====================================================${colors.reset}=" 60 | } 61 | } 62 | } 63 | } catch (Exception e) { 64 | log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 65 | } 66 | } 67 | } 68 | 69 | // 70 | // Generate version string 71 | // 72 | public static String version(workflow) { 73 | String version_string = "" 74 | 75 | if (workflow.manifest.version) { 76 | def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' 77 | version_string += "${prefix_v}${workflow.manifest.version}" 78 | } 79 | 80 | if (workflow.commitId) { 81 | def git_shortsha = workflow.commitId.substring(0, 7) 82 | version_string += "-g${git_shortsha}" 83 | } 84 | 85 | return version_string 86 | } 87 | 88 | // 89 | // Construct and send completion email 90 | // 91 | public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_mapped_reads=[:]) { 92 | 93 | // Set up the e-mail variables 94 | def subject = "[$workflow.manifest.name] Successful: $workflow.runName" 95 | if (fail_mapped_reads.size() > 0) { 96 | subject = "[$workflow.manifest.name] Partially successful (${fail_mapped_reads.size()} skipped): $workflow.runName" 97 | } 98 | if (!workflow.success) { 99 | subject = "[$workflow.manifest.name] FAILED: $workflow.runName" 100 | } 101 | 102 | def summary = [:] 103 | for (group in summary_params.keySet()) { 104 | summary << summary_params[group] 105 | } 106 | 107 | def misc_fields = [:] 108 | misc_fields['Date Started'] = workflow.start 109 | misc_fields['Date Completed'] = workflow.complete 110 | misc_fields['Pipeline script file path'] = workflow.scriptFile 111 | misc_fields['Pipeline script hash ID'] = workflow.scriptId 112 | if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository 113 | if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId 114 | if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision 115 | misc_fields['Nextflow Version'] = workflow.nextflow.version 116 | misc_fields['Nextflow Build'] = workflow.nextflow.build 117 | misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 118 | 119 | def email_fields = [:] 120 | email_fields['version'] = NfcoreTemplate.version(workflow) 121 | email_fields['runName'] = workflow.runName 122 | email_fields['success'] = workflow.success 123 | email_fields['dateComplete'] = workflow.complete 124 | email_fields['duration'] = workflow.duration 125 | email_fields['exitStatus'] = workflow.exitStatus 126 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 127 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 128 | email_fields['commandLine'] = workflow.commandLine 129 | email_fields['projectDir'] = workflow.projectDir 130 | email_fields['summary'] = summary << misc_fields 131 | email_fields['fail_mapped_reads'] = fail_mapped_reads.keySet() 132 | email_fields['min_mapped_reads'] = params.min_mapped_reads 133 | 134 | // On success try attach the multiqc report 135 | def mqc_report = null 136 | try { 137 | if (workflow.success && !params.skip_multiqc) { 138 | mqc_report = multiqc_report.getVal() 139 | if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { 140 | if (mqc_report.size() > 1) { 141 | log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" 142 | } 143 | mqc_report = mqc_report[0] 144 | } 145 | } 146 | } catch (all) { 147 | if (multiqc_report) { 148 | log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" 149 | } 150 | } 151 | 152 | // Check if we are only sending emails on failure 153 | def email_address = params.email 154 | if (!params.email && params.email_on_fail && !workflow.success) { 155 | email_address = params.email_on_fail 156 | } 157 | 158 | // Render the TXT template 159 | def engine = new groovy.text.GStringTemplateEngine() 160 | def tf = new File("$projectDir/assets/email_template.txt") 161 | def 
txt_template = engine.createTemplate(tf).make(email_fields) 162 | def email_txt = txt_template.toString() 163 | 164 | // Render the HTML template 165 | def hf = new File("$projectDir/assets/email_template.html") 166 | def html_template = engine.createTemplate(hf).make(email_fields) 167 | def email_html = html_template.toString() 168 | 169 | // Render the sendmail template 170 | def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit 171 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] 172 | def sf = new File("$projectDir/assets/sendmail_template.txt") 173 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 174 | def sendmail_html = sendmail_template.toString() 175 | 176 | // Send the HTML e-mail 177 | Map colors = logColours(params.monochrome_logs) 178 | if (email_address) { 179 | try { 180 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 181 | // Try to send HTML e-mail using sendmail 182 | [ 'sendmail', '-t' ].execute() << sendmail_html 183 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" 184 | } catch (all) { 185 | // Catch failures and try with plaintext 186 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 187 | if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { 188 | mail_cmd += [ '-A', mqc_report ] 189 | } 190 | mail_cmd.execute() << email_html 191 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" 192 | } 193 | } 194 | 195 | // Write summary e-mail HTML to a file 196 | def output_d = new File("${params.outdir}/pipeline_info/") 197 | if (!output_d.exists()) { 198 | output_d.mkdirs() 199 | } 200 | def output_hf = new File(output_d, "pipeline_report.html") 201 | output_hf.withWriter { w -> w << email_html } 202 | def output_tf = new File(output_d, "pipeline_report.txt") 203 | output_tf.withWriter { w -> w << email_txt } 204 | } 205 | 206 | // 207 | // Print pipeline summary on completion 208 | // 209 | public static void summary(workflow, params, log, fail_mapped_reads=[:], pass_mapped_reads=[:]) { 210 | Map colors = logColours(params.monochrome_logs) 211 | 212 | if (pass_mapped_reads.size() > 0) { 213 | def idx = 0 214 | def samp_aln = '' 215 | def total_aln_count = pass_mapped_reads.size() + fail_mapped_reads.size() 216 | for (samp in pass_mapped_reads) { 217 | samp_aln += " ${samp.value}: ${samp.key}\n" 218 | idx += 1 219 | if (idx > 5) { 220 | samp_aln += " ..see pipeline reports for full list\n" 221 | break; 222 | } 223 | } 224 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} ${pass_mapped_reads.size()}/$total_aln_count samples passed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 225 | } 226 | if (fail_mapped_reads.size() > 0) { 227 | def samp_aln = '' 228 | for (samp in fail_mapped_reads) { 229 | samp_aln += " ${samp.value}: ${samp.key}\n" 230 | } 231 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} ${fail_mapped_reads.size()} samples skipped since they failed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 232 | } 233 | 234 | if (workflow.success) { 235 | if (workflow.stats.ignoredCount == 0) { 236 | log.info 
"-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" 237 | } else { 238 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" 239 | } 240 | } else { 241 | hostName(workflow, params, log) 242 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" 243 | } 244 | } 245 | 246 | // 247 | // ANSII Colours used for terminal logging 248 | // 249 | public static Map logColours(Boolean monochrome_logs) { 250 | Map colorcodes = [:] 251 | 252 | // Reset / Meta 253 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 254 | colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" 255 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 256 | colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" 257 | colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" 258 | colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" 259 | colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" 260 | 261 | // Regular Colors 262 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 263 | colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" 264 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 265 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 266 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 267 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 268 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 269 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 270 | 271 | // Bold 272 | colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" 273 | colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" 274 | colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" 275 | colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" 276 | colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" 277 | colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" 278 | colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" 279 | colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" 280 | 281 | // Underline 282 | colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" 283 | colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" 284 | colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" 285 | colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" 286 | colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" 287 | colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" 288 | colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" 289 | colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" 290 | 291 | // High Intensity 292 | colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" 293 | colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" 294 | colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" 295 | colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" 296 | colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" 297 | colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" 298 | colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" 299 | colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" 300 | 301 | // Bold High Intensity 302 | colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" 303 | colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" 304 | colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" 305 | colorcodes['biyellow'] = monochrome_logs ? 
'' : "\033[1;93m" 306 | colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" 307 | colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" 308 | colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" 309 | colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" 310 | 311 | return colorcodes 312 | } 313 | 314 | // 315 | // Does what is says on the tin 316 | // 317 | public static String dashedLine(monochrome_logs) { 318 | Map colors = logColours(monochrome_logs) 319 | return "${colors.dim}--------------------------------------------------------------------------------${colors.reset}" 320 | } 321 | 322 | // epi2me-labs logo 323 | public static String logo(workflow, monochrome_logs) { 324 | Map colors = NfcoreTemplate.logColours(monochrome_logs) 325 | String workflow_name = workflow.manifest.name.split("/")[1] 326 | String workflow_version = version(workflow) 327 | String.format( 328 | """ 329 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}_____ ____ ___ ____ __ __ _____ _ _ 330 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}| ____| _ \\_ _|___ \\| \\/ | ____| | | __ _| |__ ___ 331 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| _| | |_) | | __) | |\\/| | _| _____| |/ _` | '_ \\/ __| 332 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| |___| __/| | / __/| | | | |__|_____| | (_| | |_) \\__ \\ 333 | ${colors.iblue}|||||||||| ${colors.reset}${colors.dim}|_____|_| |___|_____|_| |_|_____| |_|\\__,_|_.__/|___/ 334 | ${colors.iblue}|||||||||| ${colors.reset}${colors.bold}${workflow_name} ${workflow_version}${colors.reset} 335 | ${NfcoreTemplate.dashedLine(monochrome_logs)} 336 | """.stripIndent() 337 | ) 338 | } 339 | } 340 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // hostname 52 | def host = null 53 | try { 54 | host = InetAddress.getLocalHost().getHostName() 55 | } 56 | catch(Exception e) {} 57 | 58 | // OS 59 | // TODO check version on WSL 60 | def opsys = System.properties['os.name'].toLowerCase() 61 | def opver = System.properties['os.version'] 62 | if (opver.toLowerCase().contains("wsl")){ 63 | opsys = "wsl" 64 | } 65 | 66 | // placeholder for any future okta business 67 | // for now we'll use the guest_ sent to wf.epi2me_user 68 | def user = get_meta(params.wf, "epi2me_user") 69 | 70 | // drop cruft to save some precious bytes 71 | // affects the deep copy rather than original params 72 | clean_meta(params_data, [ 73 | "schema_ignore_params", 74 | ]) 75 | def ingress_ids = [] 76 | if (params_data.containsKey("wf")) { 77 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 78 | clean_meta(params_data.wf, [ 79 | "agent", // we send this later 80 | "epi2me_instance", // we send this later 81 | "epi2me_user", // we send this later 82 | "example_cmd", 83 | "ingress.run_ids", // we will send this elsewhere 84 | ]) 85 | } 86 | 87 | // try and get runtime information 88 | def cpus = null 89 | try { 90 | cpus = Runtime.getRuntime().availableProcessors() 91 | } 92 | catch(Exception e) {} 93 | 94 | def workflow_success = null 95 | def workflow_exitcode = null 96 | if (event != "start") { 97 | workflow_success = workflow.success 98 | workflow_exitcode = workflow.exitStatus 99 | } 100 | 101 | /// build message 102 | def body_json = new JsonBuilder() 103 | body_json \ 104 | "tracking_id": [ 105 | "msg_id": UUID.randomUUID().toString(), 106 | "version": "3.0.0" 107 | ], 108 | "source": "workflow", 109 | "event": event, 110 | "params": params_data, 111 | // data will be null on start events, as ingress has not run 112 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 113 | "workflow": [ 114 | "name": workflow.manifest.name, 115 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 116 | "run_name": workflow.runName, // required to disambiguate sessions 117 | "session": workflow.sessionId, 118 | "profile": workflow.profile, 119 | "resume": workflow.resume, 120 | "error": error_message, // null if no error 121 | "success": workflow_success, 122 | "exitcode": workflow_exitcode, 123 | ], 124 | "env": [ 125 | "user": user, // placeholder for any future okta 126 | "hostname": host, 127 | "os": [ 128 | "name": opsys, 129 | "version": opver 130 | ], 131 | "resource": [ 132 | "cpus": cpus, 133 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 134 | ], 135 | "agent": get_meta(params.wf, "agent"), // access via original params 136 | "epi2me": [ 137 | "instance": get_meta(params.wf, "epi2me_instance"), 138 | "user": user, 139 | ], 140 | "nextflow": [ 141 | "version": nextflow.version.toString(), 142 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 143 | ] 144 | ] 145 | return body_json 146 | } 147 | 148 | // Send a JSON payload to a given endpoint 149 | private static String send_ping_post(endpoint, body_json) { 150 | // Attempt to send payload and absorb any possible Exception gracefully 151 | String postResult 152 | boolean raise_exception = false 153 | try { 154 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 155 | requestMethod = 'POST' 156 | doOutput = true 157 | setConnectTimeout(5000) 158 | setReadTimeout(10000) 159 | setRequestProperty('Content-Type', 'application/json') 160 | setRequestProperty('accept', 'application/json') 161 | outputStream.withPrintWriter({printWriter -> 162 | printWriter.write(body_json.toString()) 163 | }) 164 | 165 | // Rethrow exceptions that imply we're not using this endpoint properly 166 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 167 | raise_exception = true 168 | } 169 | // Accessing inputStream.text will raise an Exception for failed requests 170 | postResult = inputStream.text 171 | }) 172 | } 173 | catch(Exception e) { 174 | if(raise_exception) { throw e } 175 | } 176 | return (postResult) 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
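//
// Usage sketch (illustrative, not part of the template itself): the entrypoint script
// calls this class once, before any workflow is invoked, to print help/version text
// and validate parameters, e.g. as wired up in main.nf:
//
//     WorkflowMain.initialise(workflow, params, log)
//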
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | cpus 1 6 | memory "2 GB" 7 | output: 8 | path "params.json" 9 | script: 10 | def paramsJSON = new JsonBuilder(params).toPrettyString() 11 | """ 12 | # Output nextflow params object to JSON 13 | echo '$paramsJSON' > params.json 14 | """ 15 | } 16 | 17 | process configure_igv { 18 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 19 | label "wf_common" 20 | cpus 1 21 | memory "2 GB" 22 | input: 23 | // the python script will work out what to do with all the files based on their 24 | // extensions 25 | path "file-names.txt" 26 | val locus_str 27 | val aln_extra_opts 28 | val var_extra_opts 29 | output: path "igv.json" 30 | script: 31 | // the locus argument just makes sure that the initial view in IGV shows something 32 | // interesting 33 | String locus_arg = locus_str ? "--locus $locus_str" : "" 34 | // extra options for alignment tracks 35 | def aln_opts_json_str = \ 36 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 37 | String aln_extra_opts_arg = \ 38 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 39 | // extra options for variant tracks 40 | def var_opts_json_str = \ 41 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 42 | String var_extra_opts_arg = \ 43 | var_extra_opts ? 
"--extra-vcf-opts extra-var-opts.json" : "" 44 | """ 45 | # write out JSON files with extra options for the alignment and variant tracks 46 | echo '$aln_opts_json_str' > extra-aln-opts.json 47 | echo '$var_opts_json_str' > extra-var-opts.json 48 | 49 | workflow-glue configure_igv \ 50 | --fofn file-names.txt \ 51 | $locus_arg \ 52 | $aln_extra_opts_arg \ 53 | $var_extra_opts_arg \ 54 | > igv.json 55 | """ 56 | } 57 | 58 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!usr/bin/env nextflow 2 | import groovy.json.JsonBuilder 3 | nextflow.enable.dsl = 2 4 | include { 5 | fastq_ingress 6 | xam_ingress 7 | } from "./lib/ingress" 8 | include { 9 | index_ref_fai 10 | decompress_ref 11 | publish_artifact 12 | merge_namesorted_bams 13 | merge_namesorted_bams as merge_paired_end_bams 14 | merge_coordsorted_bams 15 | mosdepth_coverage 16 | get_filtered_out_bam 17 | index_vcf 18 | } from './modules/local/common' 19 | 20 | include { 21 | digest_align_annotate 22 | haplotagReads as haplotag_alignments 23 | merge_parquets_to_dataset 24 | } from './modules/local/pore-c' 25 | include { 26 | to_pairs_file 27 | pairsToCooler 28 | merge_mcools 29 | merge_pairs 30 | merge_pairs_stats 31 | create_restriction_bed 32 | pair_stats_report 33 | prepare_hic 34 | createBed 35 | mergeBed 36 | } from './modules/local/4dn' 37 | 38 | 39 | include { prepare_genome } from "./subworkflows/local/prepare_genome" 40 | 41 | OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE") 42 | 43 | // bamindex will work with bam or fastq format file as input 44 | process index_bam { 45 | label "wfporec" 46 | cpus 4 47 | memory "8 GB" 48 | input: 49 | tuple val(meta), path("concatemers.bam") 50 | val chunk_size 51 | output: 52 | tuple val(meta), path("concatemers.bam"), path("concatemers.bam.bci"), path("indexed_chunks.csv") 53 | shell: 54 | args = task.ext.args ?: " " 55 | """ 56 | bamindex build -c ${params.chunk_size} -t ${task.cpus} concatemers.bam 57 | bamindex dump concatemers.bam.bci > chunks.csv 58 | awk -F' ' -v OFS=' ' 'NR == 1 {print "ID", \$0; next} {print (NR-2), \$0}' chunks.csv > indexed_chunks.csv 59 | """ 60 | } 61 | 62 | 63 | process getVersions { 64 | label "wfporec" 65 | cpus 4 66 | memory "4 GB" 67 | output: 68 | path "versions.txt" 69 | script: 70 | """ 71 | fastcat --version | sed 's/^/fastcat,/' >> versions.txt 72 | mosdepth --version | sed 's/ /,/' >> versions.txt 73 | pairtools --version | sed 's/\\//g' >> versions.txt 74 | whatshap --version | sed 's/^/whatshap,/' >> versions.txt 75 | pore-c-py --version | sed 's/ /,/' >> versions.txt 76 | samtools --version | (head -n 1 && exit 0) | sed 's/ /,/' >> versions.txt 77 | """ 78 | } 79 | 80 | 81 | process getParams { 82 | label "wfporec" 83 | cpus 1 84 | memory "4 GB" 85 | output: 86 | path "params.json" 87 | script: 88 | String paramsJSON = new JsonBuilder(params).toPrettyString() 89 | """ 90 | # Output nextflow params object to JSON 91 | echo '$paramsJSON' > params.json 92 | """ 93 | } 94 | 95 | 96 | process makeReport { 97 | label "wf_common" 98 | cpus 4 99 | memory "15 GB" 100 | input: 101 | val metadata 102 | 
path(stats, stageAs: "stats_*") 103 | path "versions/*" 104 | path "params.json" 105 | val wf_version 106 | output: 107 | path "wf-pore-c-report.html" 108 | script: 109 | String report_name = "wf-pore-c-report.html" 110 | String metadata = new JsonBuilder(metadata).toPrettyString() 111 | """ 112 | echo '${metadata}' > metadata.json 113 | workflow-glue report $report_name \ 114 | --metadata metadata.json \ 115 | --stats $stats \ 116 | --versions versions \ 117 | --params params.json \ 118 | --wf_version $wf_version 119 | """ 120 | } 121 | 122 | // Creates a new directory named after the sample alias and moves the ingress results 123 | // into it. So output folder will contain alias named folders with stats. 124 | process collectIngressResultsInDir { 125 | label "wf_common" 126 | input: 127 | // inputs might be `OPTIONAL_FILE` --> stage in different sub-directories 128 | // to avoid name collisions 129 | tuple val(meta), 130 | path(stats, stageAs: "stats/*") 131 | output: 132 | // use sub-dir to avoid name clashes (in the unlikely event of a sample alias 133 | // being `reads` or `stats`) 134 | tuple path("out/*"), val("ingress_results") 135 | script: 136 | String outdir = "out/${meta["alias"]}" 137 | String metaJson = new JsonBuilder(meta).toPrettyString() 138 | String stats = stats.fileName.name == OPTIONAL_FILE.name ? "" : stats 139 | """ 140 | mkdir -p $outdir 141 | echo '$metaJson' > metamap.json 142 | mv metamap.json $stats $outdir 143 | """ 144 | } 145 | 146 | 147 | // See https://github.com/nextflow-io/nextflow/issues/1636. This is the only way to 148 | // publish files from a workflow whilst decoupling the publish from the process steps. 149 | // The process takes a tuple containing the filename and the name of a sub-directory to 150 | // put the file into. If the latter is `null`, puts it into the top-level directory. 151 | process publish { 152 | // publish inputs to output directory 153 | label "wfporec" 154 | cpus 1 155 | memory "4 GB" 156 | publishDir ( 157 | params.out_dir, 158 | mode: "copy", 159 | saveAs: { dirname ? "$dirname/$fname" : fname } 160 | ) 161 | input: 162 | tuple path(fname), val(dirname) 163 | output: 164 | path fname 165 | """ 166 | """ 167 | } 168 | 169 | // entrypointworkflow 170 | WorkflowMain.initialise(workflow, params, log) 171 | 172 | workflow POREC { 173 | main: 174 | Pinguscript.ping_start(nextflow, workflow, params) 175 | /// PREPARE INPUTS /// 176 | 177 | if (params.fastq) { 178 | sample_data = fastq_ingress([ 179 | "input":params.fastq, 180 | "sample":params.sample, 181 | "sample_sheet":params.sample_sheet, 182 | "analyse_unclassified":params.analyse_unclassified, 183 | "stats": true, 184 | "fastcat_extra_args": "", 185 | ]) 186 | // fastq_ingress doesn't have the index; add one extra null for compatibility. 187 | // We do not use variable name as assigning variable name with a tuple 188 | // not matching (e.g. meta, bam, bai, stats <- [meta, bam, stats]) causes 189 | // the workflow to crash. 190 | sample_data = sample_data 191 | .map{ 192 | it.size() == 4 ? 
it : [it[0], it[1], null, it[2]] 193 | } 194 | } else { 195 | // if we didn't get a `--fastq`, there must have been a `--bam` (as is codified 196 | // by the schema) 197 | sample_data = xam_ingress([ 198 | "input":params.bam, 199 | "sample":params.sample, 200 | "sample_sheet":params.sample_sheet, 201 | "analyse_unclassified":params.analyse_unclassified, 202 | "keep_unaligned": true, 203 | "stats": true, 204 | ]) 205 | } 206 | 207 | // create channel of input chimeric reads 208 | input_reads = sample_data.map{meta, path, index, stats -> [meta, path]} 209 | 210 | if (params.chunk_size > 0) { 211 | chunks = index_bam(input_reads, channel.value(params.chunk_size)) 212 | // create tuple for each region 213 | reads = chunks 214 | .map{meta, bam, bai, chunk_csv -> 215 | tuple(meta, bam, bai,chunk_csv.splitCsv(header: ['index','region', 'ref'], skip: 1 , sep:' '))} 216 | .transpose() 217 | .map{ meta, bam, bai, chunk_index -> 218 | [meta, bam, bai, chunk_index.index, chunk_index.ref]} 219 | } else { 220 | // Add optional file and nulls to satisfy channel structure. 221 | // These values are ignored in digest_align_annotate 222 | reads = input_reads.combine(Channel.of(tuple(OPTIONAL_FILE, null, null))) 223 | } 224 | if (!params.sample_sheet) { 225 | if (params.vcf){ 226 | // If vcf index does not exist create index 227 | vcf_channel = Channel.of(file(params.vcf, checkExists:true)) 228 | def candidate_tbi = file("${params.vcf}.tbi") 229 | vcf_file_tmp = input_reads.combine(vcf_channel).map{ meta, path, vcf -> [meta, vcf]} 230 | if (candidate_tbi.exists()){ 231 | tbi_file = Channel.of(candidate_tbi) 232 | vcf_file = vcf_channel 233 | } else { 234 | vcf = index_vcf(vcf_file_tmp) 235 | vcf_file = vcf.map{meta, vcf, tbi -> vcf}.flatten() 236 | tbi_file = vcf.map{meta, vcf, tbi -> tbi}.flatten() 237 | } 238 | } else { 239 | vcf_file = Channel.of(OPTIONAL_FILE) 240 | tbi_file = Channel.of(OPTIONAL_FILE) 241 | } 242 | ch_chunks = reads 243 | | combine(vcf_file) 244 | | combine(tbi_file) 245 | | map{meta, bam, index, chunk_index, chunk_ref, vcf, tbi -> 246 | if (!params.vcf){ 247 | vcf = null 248 | tbi = null 249 | } 250 | [meta + [cutter: params.cutter, vcf:vcf, tbi:tbi], 251 | bam, index, chunk_index, chunk_ref]} 252 | } else { 253 | // check if vcf exists if not set to null and haplotag will be skipped 254 | // Branch to get samples with vcf 255 | sample_data 256 | | map{ 257 | meta, path, index, stats -> 258 | def vcf_file = meta["vcf"] ? file(meta["vcf"], checkExists: true) : null 259 | def tbi_file = file(meta["vcf"] + '.tbi') 260 | def tbi = vcf_file && tbi_file.exists() ? 
tbi_file : null 261 | [meta, vcf_file, tbi] 262 | } 263 | | branch{ 264 | indexed_vcf: it[1] != null && it[2] != null 265 | unindexed_vcf: it[1] != null && it[2] == null 266 | no_vcf: true 267 | } | set{vcf_fork} 268 | // Index vcfs with no existing index 269 | vcf = index_vcf(vcf_fork.unindexed_vcf.map{meta, vcf, index -> [meta, vcf]}) 270 | // Combine back with any samples that have index 271 | vcf_index = vcf_fork.indexed_vcf.mix(vcf) 272 | // Combine back with samples that have no vcf 273 | per_sample = vcf_fork.no_vcf.mix(vcf_index) 274 | | map{meta, vcf, tbi -> [meta.alias, vcf, tbi]} 275 | // combine with output of ingress 276 | combined_samples = reads 277 | .map { [it[0]["alias"], *it] } 278 | .combine(per_sample, by: 0) 279 | .map { it[1..-1] } 280 | // add tuple values to meta data 281 | pre_chunks = combined_samples.map{meta, bam, index, chunk_index, chunk_ref, vcf_file, tbi_file -> 282 | [meta + [vcf:vcf_file, tbi:tbi_file], bam, index, chunk_index, chunk_ref]} 283 | // use params.cutter if it was missing from user provided sample_sheet 284 | ch_chunks = pre_chunks.map{ meta, bam, index, chunk_index, chunk_ref -> 285 | if (meta.cutter && params.cutter){ 286 | log.warn("Using cutter: ${meta.cutter} from sample sheet column for ${meta.alias}") 287 | } 288 | cutter = meta.cutter ?: params.cutter 289 | return [ meta + ["cutter": cutter], bam, index, chunk_index, chunk_ref] 290 | } 291 | } 292 | ref = prepare_genome(params.ref, params.minimap2_settings) 293 | 294 | /// RUN PORE-C TOOLS /// 295 | chunks_refs = ch_chunks.combine(ref.mmi).combine(ref.minimap2_settings) 296 | 297 | ch_annotated_monomers = digest_align_annotate(chunks_refs) 298 | 299 | // create a fork for samples that have phase info available 300 | ch_annotated_monomers.cs_bam 301 | .branch{ 302 | to_haplotag: it[0].vcf != null 303 | no_haplotag: it[0].vcf == null 304 | } 305 | .set { haplotag_fork } 306 | // haplotag bams when we have VCF available 307 | (haplotag_fork 308 | .to_haplotag // [meta, bam bai] 309 | .combine(ref.fasta) 310 | .combine(ref.fai) 311 | .map(i -> { 312 | [ 313 | i[0], // meta 314 | i[1], // bam 315 | i[2], // bai 316 | i[3], // fasta 317 | i[4], // fai 318 | i[0].vcf, // vcf 319 | i[0].tbi, // tbi 320 | ] 321 | })) | haplotag_alignments | set {haplotagged_monomers} 322 | 323 | // merge haplotagged and non-haplotagged coord-sorted bam chunks 324 | // back to single channel 325 | haplotag_fork 326 | .no_haplotag 327 | .mix(haplotagged_monomers.cs_bam) 328 | .set { cs_bam_chunks } 329 | 330 | /// MERGE PORE-C BAMS /// 331 | 332 | // merge coord-sorted bams by alias 333 | cs_bam = merge_coordsorted_bams( 334 | cs_bam_chunks.map(i -> [i[0], i[1]]) 335 | .groupTuple() 336 | ) 337 | // merge namesorted bams by alias 338 | ns_bam = merge_namesorted_bams( 339 | ch_annotated_monomers 340 | .ns_bam 341 | .map(i -> [i[0], i[1]]) 342 | .groupTuple() 343 | ) 344 | 345 | if (params.coverage || params.pairs || params.mcool || params.hi_c) { 346 | // for each cutter a bed file of the fragments 347 | digest_ch = create_restriction_bed( 348 | ch_chunks.map{meta, bam, index, chunk_index, chunk_ref -> meta.cutter} 349 | .unique() 350 | .combine(ref.fasta) 351 | .combine(ref.fai) 352 | ) 353 | } 354 | 355 | /// COVERAGE CALCULATIONS 356 | if (params.coverage) { 357 | // calculate coverage on the merged BAM 358 | digest_ch 359 | .cross( 360 | cs_bam 361 | .map(i -> [i[0].cutter, i[0], i[1], i[2]]) // [key, meta, bam, bai] 362 | ) 363 | .map(i -> [ 364 | i[1][1], // meta 365 | i[1][2], // bam 366 | i[1][3], // bai 
367 | i[0][2], // bed 368 | ]) | mosdepth_coverage | set{ coverage } 369 | } 370 | /// 4DN file formats 371 | if (params.pairs || params.mcool || params.hi_c) { 372 | (digest_ch 373 | .cross( 374 | ch_annotated_monomers 375 | .ns_bam 376 | .map(i -> [i[0].cutter, i[0], i[1]]) // [key, meta, bam] 377 | ) 378 | .map(i -> [ 379 | i[1][1], // meta 380 | i[1][2], // bam 381 | i[0][1], // fai 382 | i[0][2], // bed 383 | ]) 384 | ) | to_pairs_file | set {pair_chunks} 385 | 386 | if (params.mcool) { 387 | mcool_chunks = pairsToCooler( 388 | pair_chunks 389 | .pairs 390 | .combine(Channel.of(params.cool_bin_size)) 391 | ) 392 | mcool = merge_mcools( 393 | mcool_chunks 394 | .groupTuple() 395 | .combine(Channel.of(params.mcool_resolutions)) 396 | ) 397 | } 398 | if (params.pairs || params.hi_c) { 399 | unsorted_pairs = merge_pairs( 400 | pair_chunks.pairs.map(i -> [i[0], i[2]]).groupTuple() 401 | ) 402 | pairs_stats = merge_pairs_stats( 403 | pair_chunks.stats.groupTuple() 404 | ) 405 | pairs_report = pair_stats_report( 406 | pairs_stats 407 | ) 408 | 409 | } 410 | } 411 | /// CHROMUNITY 412 | if (params.chromunity) { 413 | chromunity_pq = merge_parquets_to_dataset( 414 | ch_annotated_monomers 415 | .chromunity_pq 416 | .groupTuple() 417 | ) 418 | } 419 | 420 | /// Paired end bams 421 | if (params.paired_end) { 422 | pe_bam = merge_paired_end_bams( 423 | ch_annotated_monomers 424 | .paired_end_bam 425 | .map(i -> [i[0], i[1]]) 426 | .groupTuple() 427 | ) 428 | } 429 | 430 | // Make a report 431 | software_versions = getVersions() 432 | workflow_params = getParams() 433 | 434 | // get metadata and stats files, keeping them ordered (could do with transpose I suppose) 435 | sample_data.multiMap{ meta, path, index, stats -> 436 | meta: meta 437 | stats: stats 438 | }.set { for_report } 439 | metadata = for_report.meta.collect() 440 | // create a file list of the stats, and signal if its empty or not 441 | stats = for_report.stats | collect 442 | report = makeReport( 443 | metadata, stats, software_versions, workflow_params, workflow.manifest.version 444 | ) 445 | 446 | if (params.hi_c){ 447 | hi_c = prepare_hic(merge_pairs.out.merged_pairs.combine(ref.fai)) 448 | } 449 | 450 | if (params.bed){ 451 | bed_chunks = createBed(ch_annotated_monomers.paired_end_bam) 452 | mergeBed(bed_chunks.groupTuple()) 453 | 454 | } 455 | 456 | 457 | sample_data 458 | | map { 459 | meta, path, index, stats -> 460 | if (stats) [ meta, stats ] 461 | } 462 | | collectIngressResultsInDir 463 | 464 | 465 | // Group together lists of filtered reads from all the processed chunks 466 | named_filtered_read_ids = ch_annotated_monomers.filtered_read_ids.groupTuple().map{ meta, read_ids -> tuple(meta.alias, read_ids)} 467 | named_reads = input_reads.map{ meta, reads -> tuple(meta.alias, reads)} 468 | // Combine with input reads 469 | filtered_reads = named_filtered_read_ids.join(named_reads, remainder:false) 470 | // Retrieve filtered out BAM from list of filtered reads per sample 471 | filtered_out = get_filtered_out_bam(filtered_reads) 472 | 473 | 474 | emit: 475 | name_sorted_bam = ns_bam 476 | coord_sorted_bam = cs_bam 477 | report = report 478 | ingress_results = collectIngressResultsInDir.out 479 | } 480 | 481 | workflow { 482 | if (params.containsKey("params_sheet")) { 483 | error = "`--params_sheet` parameter is deprecated. Use parameter `--sample_sheet` instead." 
484 | } 485 | POREC() 486 | publish(POREC.out.ingress_results) 487 | } 488 | 489 | workflow.onComplete { 490 | Pinguscript.ping_complete(nextflow, workflow, params) 491 | } 492 | workflow.onError { 493 | Pinguscript.ping_error(nextflow, workflow, params) 494 | } 495 | -------------------------------------------------------------------------------- /modules/local/4dn.nf: -------------------------------------------------------------------------------- 1 | #!usr/bin/env nextflow 2 | nextflow.enable.dsl = 2 3 | 4 | 5 | process to_pairs_file { 6 | label 'wfporec' 7 | cpus 2 8 | memory "8 GB" 9 | input: 10 | tuple val(meta), path("monomers.mm2.ns.bam"), path("fasta.fai"), path("fragments.bed") 11 | output: 12 | tuple val(meta), path("fasta.fai"), path("${meta.alias}.pairs.gz"), emit: "pairs" 13 | tuple val(meta), path("${meta.alias}.stats.txt"), emit: "stats" 14 | shell: 15 | def args = task.ext.args ?: "--drop-sam --drop-seq --expand --add-pair-index --add-columns mapq,pos5,pos3,cigar,read_len,matched_bp,algn_ref_span,algn_read_span,dist_to_5,dist_to_3,mismatches" 16 | """ 17 | pairtools parse2 \ 18 | --output-stats "${meta.alias}.stats.txt" \ 19 | -c "fasta.fai" --single-end --readid-transform 'readID.split(":")[0]' \ 20 | $args "monomers.mm2.ns.bam" > extract_pairs.tmp 21 | pairtools restrict -f "fragments.bed" -o "${meta.alias}.pairs.gz" extract_pairs.tmp 22 | rm -rf extract_pairs.tmp 23 | """ 24 | } 25 | 26 | 27 | process prepare_hic { 28 | label 'wfporec' 29 | cpus 2 30 | memory "31 GB" 31 | input: 32 | tuple val(meta), path("input.pairs.gz"), path("fasta.fai") 33 | output: 34 | path "${meta.alias}.hic", emit: hic 35 | """ 36 | cut -f1,2 fasta.fai > sizes.genome 37 | pairtools flip input.pairs.gz -c sizes.genome > flipped.pairs.tmp 38 | pairtools sort flipped.pairs.tmp > sorted.pairs.tmp 39 | pairtools dedup --chunksize ${params.pairtools_chunksize} sorted.pairs.tmp > dedup.pairs.tmp 40 | java -jar /home/epi2melabs/juicer_tools_1.22.01.jar pre dedup.pairs.tmp "${meta.alias}.hic" sizes.genome 41 | rm -rf "*.pairs.tmp" 42 | """ 43 | } 44 | 45 | process merge_pairs { 46 | label 'wfporec' 47 | cpus 2 48 | memory "8 GB" 49 | input: 50 | tuple val(meta), path('to_merge/{?}.gz') 51 | output: 52 | tuple val(meta), path("${prefix}.pairs.gz"), emit: merged_pairs 53 | shell: 54 | prefix = task.ext.prefix ?: "${meta.alias}" 55 | def args = task.ext.args ?: "--concatenate" 56 | """ 57 | # pass a quoted glob, pairtools will do its own globbing 58 | pairtools merge -o "${prefix}.pairs.gz" $args 'to_merge/*' 59 | """ 60 | } 61 | 62 | process merge_pairs_stats { 63 | label 'wfporec' 64 | cpus 2 65 | memory "4 GB" 66 | input: 67 | tuple val(meta), path('to_merge/src*.stats.txt') 68 | output: 69 | tuple val(meta), path("${prefix}.pairs.stats.txt") 70 | shell: 71 | prefix = task.ext.prefix ?: "${meta.alias}" 72 | def args = task.ext.args ?: "--merge " 73 | """ 74 | pairtools stats -o "${prefix}.pairs.stats.txt" $args to_merge/src*.stats.txt 75 | """ 76 | } 77 | 78 | process pair_stats_report { 79 | label 'wfporec' 80 | cpus 2 81 | memory "4 GB" 82 | input: 83 | tuple val(meta), path("pairs.stats.txt") 84 | output: 85 | tuple val(meta), path("${prefix}.pairs.stats.html") 86 | shell: 87 | prefix = task.ext.prefix ?: "${meta.alias}" 88 | """ 89 | create_pairs_report.py "pairs.stats.txt" "${prefix}.pairs.stats.html" 90 | """ 91 | } 92 | 93 | process create_restriction_bed { 94 | label 'wfporec' 95 | cpus 2 96 | memory "4 GB" 97 | input: 98 | tuple val(enzyme), path("reference.fasta"), path("reference.fasta.fai") 
99 | output: 100 | tuple val(enzyme), path("reference.fasta.fai"), path("fragments.bed") 101 | shell: 102 | def args = task.ext.args ?: " " 103 | """ 104 | cooler digest -o "fragments.bed" $args "reference.fasta.fai" "reference.fasta" $enzyme 105 | """ 106 | } 107 | 108 | process pairsToCooler { 109 | label 'wfporec' 110 | cpus 2 111 | memory "4 GB" 112 | input: 113 | tuple val(meta), path(fai), path(pairs), val(min_bin_width) 114 | output: 115 | tuple val(meta), path("${pairs.baseName}.cool") 116 | shell: 117 | """ 118 | cooler cload pairs -c1 2 -p1 3 -c2 4 -p2 5 $fai:${min_bin_width} $pairs ${pairs.baseName}.cool 119 | """ 120 | } 121 | 122 | process merge_mcools { 123 | label 'wfporec' 124 | cpus 2 125 | memory "15 GB" 126 | input: 127 | tuple val(meta), path('to_merge/src*.cool'), val(resolutions) 128 | output: 129 | tuple val(meta), path("${prefix}.mcool") 130 | shell: 131 | prefix = task.ext.prefix ?: "${meta.alias}" 132 | def args = task.ext.args ?: " " 133 | """ 134 | cooler merge ${prefix}.cool $args to_merge/src*.cool 135 | cooler zoomify -r ${resolutions} -o ${prefix}.mcool ${prefix}.cool 136 | """ 137 | } 138 | 139 | 140 | process createBed { 141 | label 'wfporec' 142 | cpus 2 143 | memory "4 GB" 144 | input: 145 | tuple val(meta), path("monomers.mm2.ns.bam") 146 | output: 147 | tuple val(meta), path("${meta.alias}.${task.index}.bed") 148 | // Use Sed to remove coordinates from monomer names 149 | // as only required for pairtools. 150 | """ 151 | bedtools bamtobed -i monomers.mm2.ns.bam > tmp.out.bed 152 | sed -E 's/:[0-9]+//g' tmp.out.bed > "${meta.alias}.${task.index}.bed" 153 | rm -rf tmp* 154 | """ 155 | } 156 | 157 | 158 | process mergeBed { 159 | label 'wfporec' 160 | cpus params.threads 161 | memory "16 GB" 162 | input: 163 | tuple val(meta), path('to_merge/src*.bed') 164 | output: 165 | tuple val(meta), path("${meta.alias}.bed") 166 | // Merge and sort by the monomer ID so contacts are grouped 167 | // and remove any duplicates. 168 | """ 169 | cat to_merge/* > tmp.bed 170 | sort --parallel=${task.cpus} -S 15G -k4,4 tmp.bed | uniq > "${meta.alias}.bed" 171 | rm -rf tmp* 172 | """ 173 | } 174 | -------------------------------------------------------------------------------- /modules/local/common.nf: -------------------------------------------------------------------------------- 1 | process index_ref_fai { 2 | label 'wfporec' 3 | memory "15 GB" 4 | cpus 1 5 | input: 6 | path "reference.fasta" 7 | output: 8 | path "reference.fasta.fai", emit: reference_index 9 | """ 10 | samtools faidx "reference.fasta" 11 | """ 12 | } 13 | 14 | process index_ref_mmi { 15 | label 'wfporec' 16 | memory "15 GB" 17 | cpus 4 18 | input: 19 | path "reference.fasta" 20 | val(minimap_settings) 21 | output: 22 | path "reference.fasta.mmi" 23 | """ 24 | minimap2 ${minimap_settings} -d "reference.fasta.mmi" "reference.fasta" 25 | """ 26 | } 27 | 28 | // NOTE -f required to compress symlink 29 | process decompress_ref { 30 | label 'wfporec' 31 | memory "4 GB" 32 | cpus 1 33 | input: 34 | path compressed_ref 35 | output: 36 | path "${compressed_ref.baseName}", emit: decompressed_ref 37 | """ 38 | gzip -df "${compressed_ref}" 39 | """ 40 | } 41 | 42 | // See https://github.com/nextflow-io/nextflow/issues/1636 43 | // This is the only way to publish files from a workflow whilst 44 | // decoupling the publish from the process steps. 
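//
// A minimal sketch of that decoupled-publish pattern (channel and process names here
// are illustrative only): the calling workflow gathers whichever files it wants to
// expose and pipes them through this process, rather than attaching publishDir to
// every analysis step:
//
//     workflow {
//         results_ch = some_analysis(input_ch)   // hypothetical upstream process
//         results_ch | publish_artifact
//     }
//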
45 | process publish_artifact { 46 | cpus 1 47 | memory "4 GB" 48 | label 'wfporec' 49 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 50 | input: 51 | path fname 52 | output: 53 | path fname 54 | """ 55 | echo "Writing output files" 56 | """ 57 | } 58 | 59 | // TODO rewrite as single merge process 60 | process merge_namesorted_bams { 61 | label 'wfporec' 62 | cpus 2 63 | memory "4 GB" 64 | input: 65 | tuple val(meta), path('to_merge/src*.bam') 66 | output: 67 | tuple val(meta), path("${prefix}.${suffix}.bam") 68 | shell: 69 | suffix = task.ext.suffix ?: "ns" 70 | prefix = task.ext.prefix ?: "${meta.alias}" 71 | """ 72 | samtools cat --threads $task.cpus -o "${prefix}.${suffix}.bam" --no-PG to_merge/src*.bam 73 | """ 74 | } 75 | 76 | process merge_coordsorted_bams { 77 | label 'wfporec' 78 | memory "8 GB" 79 | cpus params.threads 80 | input: 81 | tuple val(meta), path('to_merge/src*.bam') 82 | output: 83 | tuple val(meta), path("${prefix}.bam"), path("${prefix}.bam.csi") 84 | shell: 85 | prefix = task.ext.prefix ?: "${meta.alias}.cs" 86 | """ 87 | samtools merge --threads $task.cpus -o "${prefix}.bam" -p --write-index --no-PG to_merge/src*.bam 88 | """ 89 | } 90 | 91 | process mosdepth_coverage { 92 | label 'wfporec' 93 | cpus params.threads 94 | memory "4 GB" 95 | input: 96 | tuple val(meta), 97 | path("concatemers.cs.bam"), 98 | path("concatemers.cs.bam.csi"), 99 | path("fragments.bed") 100 | output: 101 | tuple val(meta), 102 | path("${prefix}.per-base.d4"), 103 | emit: d4 104 | tuple val(meta), 105 | path("${prefix}.regions.bed.gz"), 106 | path("${prefix}.regions.bed.gz.csi"), 107 | emit: regions 108 | tuple val(meta), 109 | path("${prefix}.thresholds.bed.gz"), 110 | path("${prefix}.thresholds.bed.gz.csi"), 111 | emit: thresholds 112 | tuple val(meta), 113 | path("${prefix}.mosdepth.*"), 114 | emit: summaries 115 | shell: 116 | prefix = task.ext.prefix ?: "${meta.alias}" 117 | args = task.ext.args ?: "--thresholds 1,10,30,60,100" 118 | """ 119 | mosdepth --threads $task.cpus --d4 --by "fragments.bed" $args $prefix "concatemers.cs.bam" 120 | """ 121 | } 122 | 123 | 124 | process get_filtered_out_bam{ 125 | label "wfporec" 126 | cpus 1 127 | memory "15 GB" 128 | input: 129 | tuple val(alias), path ("filtered_files/?.txt"), path("concatemers.bam") 130 | output: 131 | path ("${alias}.filtered_out.bam") 132 | // Output the list of reads that were filtered out of the analysis in a BAM. 133 | """ 134 | find -L filtered_files -name '*.txt' -exec cat {} + > filtered.txt 135 | samtools view -N filtered.txt "concatemers.bam" > "${alias}".filtered_out.bam 136 | """ 137 | } 138 | 139 | 140 | process index_vcf { 141 | label 'wfporec' 142 | memory "4 GB" 143 | cpus 3 144 | input: 145 | tuple val(meta), path(vcf) 146 | output: 147 | tuple val(meta), path("porec.vcf.gz"), path("porec.vcf.gz.tbi") 148 | """ 149 | gzip -f -c -d "${vcf}" > "porec.vcf" 150 | bgzip --threads ${task.cpus} "porec.vcf" 151 | tabix "porec.vcf.gz" 152 | """ 153 | } 154 | -------------------------------------------------------------------------------- /modules/local/pore-c.nf: -------------------------------------------------------------------------------- 1 | process digest_align_annotate { 2 | label 'pore_c_py' 3 | errorStrategy = 'retry' 4 | memory { 15.GB * task.attempt } 5 | maxRetries 1 6 | errorStrategy { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } 7 | cpus params.threads 8 | input: 9 | tuple val(meta), 10 | path("concatemers.bam"), 11 | path("concatemers.bam.bci"), 12 | val(chunk_index), val(chunk_ref), path("reference.fasta.mmi"), 13 | val(minimap2_settings) 14 | output: 15 | tuple val(meta), 16 | path("${meta.alias}_out.ns.bam"), 17 | emit: ns_bam 18 | tuple val(meta), 19 | path("${meta.alias}.cs.bam"), 20 | path("${meta.alias}.cs.bam.csi"), 21 | emit: cs_bam 22 | tuple val(meta), 23 | path("${meta.alias}.chromunity.parquet"), 24 | emit: chromunity_pq, optional: true 25 | tuple val(meta), 26 | path("${meta.alias}.pe.bam"), 27 | emit: paired_end_bam, optional: true 28 | tuple val(meta), 29 | path("filtered_reads.txt"), 30 | emit: filtered_read_ids, optional: true 31 | script: 32 | args = task.ext.args ?: " " 33 | if (params.chromunity) { 34 | args += "--chromunity " 35 | if (params.chromunity_merge_distance != null) { 36 | args += "--chromunity_merge_distance ${params.chromunity_merge_distance} " 37 | } 38 | } 39 | if (params.paired_end | params.bed) { 40 | args += "--paired_end " 41 | if (params.filter_pairs) { 42 | args += "--filter_pairs " 43 | if (params.paired_end_minimum_distance != null) { 44 | args += "--paired_end_minimum_distance ${params.paired_end_minimum_distance} " 45 | } 46 | if (params.paired_end_maximum_distance != null) { 47 | args += "--paired_end_maximum_distance ${params.paired_end_maximum_distance} " 48 | } 49 | } 50 | } 51 | if (params.summary_json) { 52 | args += "--summary " 53 | } 54 | def chunk = task.index - 1 55 | // 2 threads are recommended for each the pore-c-py processes 56 | def digest_annotate_threads = params.threads >= 8 ? 2 : 1 57 | // if possible use 3 for samtools (--threads 2 + 1) 58 | def samtools_threads = params.threads >= 8 ? 
2 : 1 59 | // calculate the left over threads for mapping and leave one as samtools will require 3 60 | def ubam_map_threads = params.threads - (digest_annotate_threads * 2) - samtools_threads - 1 61 | if (params.chunk_size > 0){ 62 | """ 63 | echo "${chunk_ref}" 64 | bamindex fetch --chunk=${chunk_index} "concatemers.bam" | 65 | pore-c-py digest "${meta.cutter}" --max_monomers ${params.max_monomers} --excluded_list "filtered_reads.txt" \ 66 | --header "concatemers.bam" \ 67 | --threads ${digest_annotate_threads} | 68 | samtools fastq --threads 1 -T '*' | 69 | minimap2 -ay -t ${ubam_map_threads} ${minimap2_settings} --cap-kalloc 100m --cap-sw-mem 50m \ 70 | "reference.fasta.mmi" - | 71 | pore-c-py annotate - "${meta.alias}" --monomers \ 72 | --threads ${digest_annotate_threads} --stdout ${args} | \ 73 | tee "${meta.alias}_out.ns.bam" | 74 | samtools sort -m 1G --threads ${samtools_threads} -u --write-index -o "${meta.alias}.cs.bam" - 75 | """ 76 | }else{ 77 | """ 78 | pore-c-py digest "concatemers.bam" "${meta.cutter}" --max_monomers ${params.max_monomers} --excluded_list "filtered_reads.txt" \ 79 | --header "concatemers.bam" \ 80 | --threads ${digest_annotate_threads} | 81 | samtools fastq --threads 1 -T '*' | 82 | minimap2 -ay -t ${ubam_map_threads} ${minimap2_settings} --cap-kalloc 100m --cap-sw-mem 50m \ 83 | "reference.fasta.mmi" - | 84 | pore-c-py annotate - "${meta.alias}" --monomers \ 85 | --threads ${digest_annotate_threads} --stdout ${args} | \ 86 | tee "${meta.alias}_out.ns.bam" | 87 | samtools sort -m 1G --threads ${samtools_threads} -u --write-index -o "${meta.alias}.cs.bam" - 88 | """ 89 | } 90 | 91 | } 92 | 93 | process haplotagReads { 94 | label 'wfporec' 95 | cpus 2 96 | memory "15 GB" 97 | input: 98 | tuple val(meta), 99 | path("concatemers.cs.bam"), 100 | path("concatemers.cs.bam.csi"), 101 | path("reference.fasta"), 102 | path("reference.fasta.fai"), 103 | path(phased_vcf), 104 | path(phased_vcf_tbi) 105 | output: 106 | tuple val(meta), 107 | path("${meta.alias}.ht.bam"), 108 | path("${meta.alias}.ht.bam.csi"), 109 | emit: "cs_bam" 110 | tuple val(meta), 111 | path("${meta.alias}.ht.txt.gz"), 112 | emit: "haplotagged_monomers" 113 | shell: 114 | args = task.ext.args ?: "--ignore-read-groups --skip-missing-contigs " 115 | """ 116 | whatshap haplotag --reference "reference.fasta" -o "${meta.alias}.ht.bam" \ 117 | --output-haplotag-list "${meta.alias}.ht.txt.gz" $args "$phased_vcf" "concatemers.cs.bam" 118 | samtools index -c "${meta.alias}.ht.bam" 119 | """ 120 | } 121 | 122 | /// gather individual parquets into a single directory 123 | process merge_parquets_to_dataset { 124 | label 'wfporec' 125 | cpus 2 126 | memory "4 GB" 127 | input: 128 | tuple val(meta), 129 | path("to_merge/part?????.parquet") 130 | output: 131 | tuple val(meta), 132 | path("$prefix"), 133 | emit: "parquets" 134 | shell: 135 | prefix = task.ext.prefix ?: "${meta.alias}.chromunity.parquet" 136 | """ 137 | mkdir $prefix 138 | cp to_merge/part*.parquet $prefix/ 139 | """ 140 | } 141 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. 
Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file. 11 | 12 | params { 13 | help = false 14 | version = false 15 | bam = null 16 | fastq = null 17 | ref = null 18 | cutter = 'NlaIII' 19 | out_dir = 'output' 20 | chunk_size = 20000 21 | sample = null 22 | vcf = null 23 | pairs = false 24 | mcool = false 25 | mcool_resolutions = '1000,2000,5000N' // 4DN tuple default 26 | coverage = false 27 | 28 | minimap2_settings = '-x map-ont' 29 | threads = 4 30 | 31 | aws_image_prefix = null 32 | aws_queue = null 33 | disable_ping = false 34 | 35 | analyse_unclassified = false 36 | monochrome_logs = false 37 | validate_params = true 38 | show_hidden_params = false 39 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 40 | chromunity = false 41 | chromunity_merge_distance = -1 42 | cool_bin_size = 1000 43 | paired_end = false 44 | summary_json = true 45 | filter_pairs = false 46 | paired_end_minimum_distance = -1 47 | paired_end_maximum_distance = -1 48 | sample_sheet = null 49 | hi_c = false 50 | bed = false 51 | pairtools_chunksize = 100000 52 | max_monomers = 250 53 | 54 | wf { 55 | name = 'wf-pore-c' 56 | example_cmd = [ 57 | "--bam 'wf-pore-c-demo/porec_test.concatemers.bam'", 58 | "--chunk_size 100", 59 | "--cutter 'NlaIII'", 60 | "--hi_c", 61 | "--mcool", 62 | "--paired_end", 63 | "--paired_end_maximum_distance 200", 64 | "--paired_end_minimum_distance 100", 65 | "--phased_vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz'", 66 | "--ref 'wf-pore-c-demo/porec_test.fasta'", 67 | "--vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz'", 68 | ] 69 | common_sha = "shad28e55140f75a68f59bbecc74e880aeab16ab158" 70 | container_sha = 'sha3787c234c0cacf66a67fb77da223cc2e1cb0baf0' 71 | pore_c_py_sha = 'sha50378db56ddafe19f5e1d313ddb52dc70bbcc2bd' 72 | agent = null 73 | } 74 | } 75 | 76 | manifest { 77 | name = 'epi2me-labs/wf-pore-c' 78 | author = 'Oxford Nanopore Technologies' 79 | homePage = 'https://github.com/epi2me-labs/wf-pore-c' 80 | description = 'workflow for analysing pore-c data.' 81 | mainScript = 'main.nf' 82 | nextflowVersion = '>=23.04.2' 83 | version = 'v1.3.0' 84 | } 85 | 86 | epi2melabs { 87 | tags = "pore-c,contact,map,chromatin,conformation,capture" 88 | } 89 | 90 | // used by default for "standard" (docker) and singularity profiles, 91 | // other profiles may override. 
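//
// As a sketch (not shipped with this config): a user can override the resources or
// container of any labelled process from their own config supplied with
// `-c custom.config`, which Nextflow merges over the defaults below, e.g.
//
//     process {
//         withLabel:wfporec {
//             memory = '32 GB'   // hypothetical override
//         }
//     }
//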
92 | process { 93 | withLabel:wfporec { 94 | container = "ontresearch/wf-pore-c:${params.wf.container_sha}" 95 | } 96 | withLabel:pore_c_py { 97 | container = "ontresearch/pore-c-py:${params.wf.pore_c_py_sha}" 98 | } 99 | withLabel:wf_common { 100 | container = "ontresearch/wf-common:${params.wf.common_sha}" 101 | } 102 | shell = ['/bin/bash', '-euo', 'pipefail'] 103 | withName: "merge_coordsorted_bams|merge_namesorted_bams" { 104 | publishDir = [ 105 | path: { "${params.out_dir}/bams/" }, 106 | mode: "copy", 107 | pattern: '*.{bam,csi,bai}' 108 | ] 109 | } 110 | withName: "merge_pairs|merge_pairs_stats|merge_mcools|create_restriction_bed|pair_stats_report" { 111 | publishDir = [ 112 | path: { "${params.out_dir}/pairs/" }, 113 | mode: "copy", 114 | pattern: '*.{gz,stats.txt,bed,mcool,html}' 115 | ] 116 | } 117 | withName: "mosdepth_coverage" { 118 | publishDir = [ 119 | path: { "${params.out_dir}/coverage/" }, 120 | mode: "copy", 121 | pattern: '*.*' 122 | ] 123 | } 124 | withName: "merge_parquets_to_dataset" { 125 | publishDir = [ 126 | path: { "${params.out_dir}/chromunity/" }, 127 | mode: "copy", 128 | pattern: '*.*' 129 | ] 130 | } 131 | withName: "merge_paired_end_bams" { 132 | publishDir = [ 133 | path: { "${params.out_dir}/paired_end/" }, 134 | mode: "copy", 135 | pattern: '*.{bam,csi,bai}' 136 | ] 137 | } 138 | withName: "makeReport" { 139 | publishDir = [ 140 | path: { "${params.out_dir}/" }, 141 | mode: "copy", 142 | pattern: '*.{html}' 143 | ] 144 | } 145 | withName: "prepare_hic" { 146 | publishDir = [ 147 | path: { "${params.out_dir}/hi-c" }, 148 | mode: "copy", 149 | pattern: '*.{hic}' 150 | ] 151 | } 152 | withName: "mergeBed" { 153 | publishDir = [ 154 | path: { "${params.out_dir}/bed" }, 155 | mode: "copy", 156 | pattern: '*.{bed}' 157 | ] 158 | } 159 | withName: "get_filtered_out_bam" { 160 | publishDir = [ 161 | path: { "${params.out_dir}/filtered_out" }, 162 | mode: "copy", 163 | pattern: '*.{bam}' 164 | ] 165 | } 166 | } 167 | 168 | 169 | profiles { 170 | // the "standard" profile is used implicitely by nextflow 171 | // if no other profile is given on the CLI 172 | standard { 173 | docker { 174 | enabled = true 175 | // this ensures container is run as host user and group, but 176 | // also adds host user to the within-container group 177 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 178 | } 179 | } 180 | 181 | // using singularity instead of docker 182 | singularity { 183 | singularity { 184 | enabled = true 185 | autoMounts = true 186 | } 187 | } 188 | 189 | 190 | // keep stub conda profile to prevent unknown profile warning so users get a better error 191 | conda { 192 | conda.enabled = true 193 | } 194 | 195 | // Using AWS batch. 
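    // For example (illustrative placeholder values only), such a run might be launched as:
    //     nextflow run epi2me-labs/wf-pore-c -profile awsbatch \
    //         --aws_queue <your-batch-queue> --aws_image_prefix <your-ecr-prefix> \
    //         --bam <input.bam> --ref <reference.fasta>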
196 | // May need to set aws.region and aws.batch.cliPath 197 | awsbatch { 198 | process { 199 | executor = 'awsbatch' 200 | queue = "${params.aws_queue}" 201 | memory = '8G' 202 | withLabel:wfporec { 203 | container = "${params.aws_image_prefix}-wf-pore-c:${params.wf.container_sha}" 204 | } 205 | withLabel:pore_c_py { 206 | container = "${params.aws_image_prefix}-pore-c-py:${params.wf.pore_c_py_sha}" 207 | } 208 | withLabel:wf_common { 209 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 210 | } 211 | shell = ['/bin/bash', '-euo', 'pipefail'] 212 | } 213 | } 214 | aws.region = 'eu-west-1' 215 | aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws' 216 | 217 | // local profile for simplified development testing 218 | local { 219 | process.executor = 'local' 220 | } 221 | } 222 | 223 | 224 | timeline { 225 | enabled = true 226 | overwrite = true 227 | file = "${params.out_dir}/execution/timeline.html" 228 | } 229 | report { 230 | enabled = true 231 | overwrite = true 232 | file = "${params.out_dir}/execution/report.html" 233 | } 234 | trace { 235 | enabled = true 236 | overwrite = true 237 | file = "${params.out_dir}/execution/trace.txt" 238 | } 239 | 240 | env { 241 | PYTHONNOUSERSITE = 1 242 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 243 | } 244 | -------------------------------------------------------------------------------- /nextflow_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema", 3 | "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", 4 | "title": "epi2me-labs/wf-pore-c", 5 | "workflow_title": "Pore-c Workflow", 6 | "description": "Workflow for analysing Pore-c data for chromatin conformation capture.", 7 | "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz", 8 | "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo/aws.nextflow.config", 9 | "url": "https://github.com/epi2me-labs/wf-pore-c", 10 | "type": "object", 11 | "definitions": { 12 | "input_options": { 13 | "title": "Input Options", 14 | "type": "object", 15 | "fa_icon": "fas fa-terminal", 16 | "description": "Parameters for finding and handling input data for analysis.", 17 | "properties": { 18 | "bam": { 19 | "type": "string", 20 | "format": "path", 21 | "title": "Unaligned BAM", 22 | "description": "An unaligned BAM file containing Pore-C concatemer sequences.", 23 | "help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`." 24 | }, 25 | "fastq": { 26 | "type": "string", 27 | "format": "path", 28 | "title": "FASTQ", 29 | "description": "FASTQ files to use in the analysis.", 30 | "help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. 
In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`." 31 | }, 32 | "sample_sheet": { 33 | "type": "string", 34 | "format": "file-path", 35 | "title": "Sample sheet", 36 | "description": "A CSV file used to map barcodes to sample aliases and optionally provide per-sample parameters. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files.", 37 | "help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Optionally, a `cutter` column can contain the name of the enzyme used per sample (see the `--cutter` parameter for more details) and a `vcf` column can be used to provide a phased VCF file per sample if you require haplotagged alignments." 38 | }, 39 | "sample": { 40 | "type": "string", 41 | "description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files." 42 | }, 43 | "analyse_unclassified": { 44 | "type": "boolean", 45 | "default": false, 46 | "description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.", 47 | "help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory." 48 | }, 49 | "ref": { 50 | "type": "string", 51 | "title": "Reference FASTA", 52 | "format": "file-path", 53 | "description": "A FASTA file containing the reference genome to map against." 54 | }, 55 | "vcf": { 56 | "type": "string", 57 | "title": "VCF", 58 | "format": "file-path", 59 | "description": "An optional phased VCF file that will be used to haplotag alignments." 60 | }, 61 | "cutter": { 62 | "type": "string", 63 | "default": "NlaIII", 64 | "description": "The enzyme used in the restriction digest.", 65 | "help_text": "Any enzyme from the Biopython restriction dictionary can be used. See `https://github.com/biopython/biopython/blob/master/Bio/Restriction/Restriction_Dictionary.py`. This can also be defined per sample: see `--sample_sheet` parameter." 66 | } 67 | }, 68 | "allOf": [ 69 | { 70 | "required": [ 71 | "ref" 72 | ] 73 | }, 74 | { 75 | "oneOf": [ 76 | { 77 | "required": [ 78 | "fastq" 79 | ] 80 | }, 81 | { 82 | "required": [ 83 | "bam" 84 | ] 85 | } 86 | ] 87 | } 88 | ] 89 | }, 90 | "output_options": { 91 | "title": "Output Options", 92 | "type": "object", 93 | "description": "Parameters for saving and naming workflow outputs.", 94 | "default": "", 95 | "properties": { 96 | "out_dir": { 97 | "type": "string", 98 | "default": "output", 99 | "format": "directory-path", 100 | "description": "Directory for output of all user-facing files." 101 | }, 102 | "hi_c": { 103 | "type": "boolean", 104 | "title": "Hi-C", 105 | "default": false, 106 | "description": "Output a Hi-C formatted file; will convert pairs format to a Hi-C (`.hic`) file which will be compatible with [juicer](https://github.com/aidenlab/juicer).", 107 | "help_text": "Load this file with [Juice box](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation." 108 | }, 109 | "bed": { 110 | "type": "boolean", 111 | "title": "BED", 112 | "default": false, 113 | "description": "Output a BED file of the paired-end BAM alignments for use with downstream tools. 
Setting this to true will also trigger creation of the paired-end BAM.", 114 | "help_text": "Will use the paired-end BAM to create a BED file compatible with downstream tools including scaffolding tool [Yahs](https://github.com/c-zhou/yahs)." 115 | 116 | 117 | } 118 | } 119 | }, 120 | "advanced_options": { 121 | "title": "Advanced Options", 122 | "type": "object", 123 | "description": "Avanced options for configuring processes inside the workflow.", 124 | "default": "", 125 | "properties": { 126 | "chunk_size": { 127 | "type": "integer", 128 | "default": 20000, 129 | "description": "Process input in chunks of this number of reads.", 130 | "help_text": "To reduce per-process memory requirements for large datasets, process the inputs in chunks of reads. Set to 0 to process entire dataset in one go." 131 | }, 132 | "threads": { 133 | "type": "integer", 134 | "default": 4, 135 | "description": "Set maximum number of threads to use for more intense processes (limited by config executor cpus). We recommend a minimum of 4, but if available 19.", 136 | "help": "Increasing this will speed up some individual processes, but reduce the number of processes that can run in parallel, potentially increasing the time for the workflow to run overall." 137 | } 138 | }, 139 | "help_text": "These advanced options do not need to be changed for typical use, but allow fine tuning of workflows for users who want more control over the workflow." 140 | }, 141 | "pore_c_tools_options": { 142 | "title": "Pore-C Tools Options", 143 | "type": "object", 144 | "description": "Parameters to control the pore-c tools", 145 | "properties": { 146 | "minimap2_settings": { 147 | "type": "string", 148 | "default": "-x map-ont", 149 | "description": "The minimap2 settings for mapping monomers." 150 | }, 151 | "max_monomers": { 152 | "type": "integer", 153 | "title": "Maximum monomers", 154 | "default": 250, 155 | "description": "The maximum number of monomers allowed for a read to be included in downstream analysis.", 156 | "help": "Any reads that have more than this number will be filtered out, and output in a per sample filtered_bam file." 157 | }, 158 | "coverage": { 159 | "type": "boolean", 160 | "default": false, 161 | "description": "Calculate restriction-fragment coverage using mosdepth." 162 | }, 163 | "summary_json": { 164 | "type": "boolean", 165 | "title": "Summary JSON", 166 | "default": true, 167 | "description": "Output pore-c-py annotation summary in json format." 168 | } 169 | } 170 | }, 171 | "chromunity_options": { 172 | "title": "Chromunity Options", 173 | "type": "object", 174 | "description": "Create files for Chromunity analyses.", 175 | "properties": { 176 | "chromunity": { 177 | "type": "boolean", 178 | "default": false, 179 | "description": "Create parquet files for Chromunity.", 180 | "help_text": "See the chromunity documentation for further details 'https://github.com/mskilab/chromunity'." 181 | }, 182 | "chromunity_merge_distance": { 183 | "type": "integer", 184 | "default": -1, 185 | "description": "Merge colinear alignments separated by less than this base pair distance into a single monomer." 
186 | } 187 | } 188 | }, 189 | "4dn_files_options": { 190 | "title": "4DN files Options", 191 | "type": "object", 192 | "description": "Create files for the 4D nucleome toolset.", 193 | "properties": { 194 | "pairs": { 195 | "type": "boolean", 196 | "default": false, 197 | "description": "Create a 4DN-format pairs file (also calculate stats).", 198 | "help_text": "Outputs a directory with a pairs stats report and a pairs file which can be used for downstream anaylsis." 199 | }, 200 | "pairtools_chunksize": { 201 | "type": "integer", 202 | "default": 100000, 203 | "description": "Number of pairs to be processed in each chunk in the prepare_hic process which uses the pairtools dedup tool.", 204 | "help_text": "Reduce for lower memory footprint. Below 10,000 performance starts suffering significantly." 205 | }, 206 | "mcool": { 207 | "type": "boolean", 208 | "default": false, 209 | "title": "Multi-resolution cooler file (mcool)", 210 | "description": "Create a multi-resolution cooler file. Will output the cooler formatted file which you can load with cooler.", 211 | "help_text": "See 'https://open2c.github.io/cooler' for more details." 212 | }, 213 | "cool_bin_size": { 214 | "type": "integer", 215 | "title": "Cooler file bin size", 216 | "default": 1000, 217 | "description": "The bin size of the cooler output file in base pairs.", 218 | "help_text": "See 'https://open2c.github.io/cooler' for more details." 219 | }, 220 | "mcool_resolutions": { 221 | "type": "string", 222 | "default": "1000,2000,5000N", 223 | "description": "The resolutions of the mcool file in pixels (see cooler documentation for details).", 224 | "help_text": "Comma-separated list of target resolutions. Use suffixes B or N to specify a progression: B for binary (geometric steps of factor 2), N for nice (geometric steps of factor 10 interleaved with steps of 2 and 5). This is the equivalent of the `--resolutions` flag in cooler; see an example here 'https://cooler.readthedocs.io/en/latest/cli.html'." 225 | } 226 | } 227 | }, 228 | "paired_end_bam_options": { 229 | "title": "Paired-end BAM Options", 230 | "type": "object", 231 | "description": "Create mock paired-end BAM files for legacy tools.", 232 | "properties": { 233 | "paired_end": { 234 | "type": "boolean", 235 | "title": "Paired-end BAM", 236 | "description": "Create mock paired-end BAM files.", 237 | "default": false 238 | }, 239 | "filter_pairs": { 240 | "type": "boolean", 241 | "default": false, 242 | "description": "Filter paired-end reads using minimum and maximum distance parameters." 243 | }, 244 | "paired_end_minimum_distance": { 245 | "type": "integer", 246 | "default": -1, 247 | "description": "Remove trans/cis pairs separated by a distance shorter than this." 248 | }, 249 | "paired_end_maximum_distance": { 250 | "type": "integer", 251 | "default": -1, 252 | "description": "Remove trans/cis pairs separated by a distance greater than this." 253 | } 254 | } 255 | }, 256 | "misc": { 257 | "title": "Misc", 258 | "type": "object", 259 | "description": "", 260 | "default": "", 261 | "properties": { 262 | "help": { 263 | "type": "boolean", 264 | "description": "Display help text.", 265 | "fa_icon": "fas fa-question-circle", 266 | "hidden": true, 267 | "default": false 268 | }, 269 | "disable_ping": { 270 | "type": "boolean", 271 | "default": false, 272 | "description": "Enable to prevent sending a workflow ping." 
273 | }, 274 | "version": { 275 | "type": "boolean", 276 | "default": false, 277 | "hidden": true 278 | } 279 | } 280 | } 281 | }, 282 | "allOf": [ 283 | { 284 | "$ref": "#/definitions/input_options" 285 | }, 286 | { 287 | "$ref": "#/definitions/output_options" 288 | }, 289 | { 290 | "$ref": "#/definitions/advanced_options" 291 | }, 292 | { 293 | "$ref": "#/definitions/pore_c_tools_options" 294 | }, 295 | { 296 | "$ref": "#/definitions/chromunity_options" 297 | }, 298 | { 299 | "$ref": "#/definitions/4dn_files_options" 300 | }, 301 | { 302 | "$ref": "#/definitions/paired_end_bam_options" 303 | }, 304 | { 305 | "$ref": "#/definitions/misc" 306 | } 307 | ], 308 | "properties": { 309 | "aws_image_prefix": { 310 | "type": "string", 311 | "hidden": true 312 | }, 313 | "aws_queue": { 314 | "type": "string", 315 | "hidden": true 316 | }, 317 | "monochrome_logs": { 318 | "type": "boolean" 319 | }, 320 | "validate_params": { 321 | "type": "boolean", 322 | "default": true 323 | }, 324 | "show_hidden_params": { 325 | "type": "boolean" 326 | } 327 | }, 328 | "resources": { 329 | "recommended": { 330 | "cpus": 64, 331 | "memory": "128GB" 332 | }, 333 | "minimum": { 334 | "cpus": 8, 335 | "memory": "32GB" 336 | }, 337 | "run_time": "12 hours for a 100GB input BAM using the recommended resources; this will vary depending on the number of monomers found per read.", 338 | "arm_support": false 339 | } 340 | } -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "./wf-template-report.html", 5 | "title": "Workflow report", 6 | "description": "Report for all samples.", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "aggregated" 10 | }, 11 | "read-stats-per-file": { 12 | "filepath": "./ingress_results/reads/fastcat_stats/per-file-stats.tsv", 13 | "title": "Per file read stats", 14 | "description": "A TSV with per-file read stats, including all samples.", 15 | "mime-type": "text/tab-separated-values", 16 | "optional": false, 17 | "type": "aggregated" 18 | }, 19 | "read-stats-per-read": { 20 | "filepath": "./ingress_results/reads/fastcat_stats/per-read-stats.tsv", 21 | "title": "Per read stats", 22 | "description": "A TSV with per-read stats, including all samples.", 23 | "mime-type": "text/tab-separated-values", 24 | "optional": false, 25 | "type": "aggregated" 26 | }, 27 | "run-ids": { 28 | "filepath": "./ingress_results/reads/fastcat_stats/run_ids", 29 | "title": "Run IDs", 30 | "description": "List of run IDs present in the reads.", 31 | "mime-type": "text/plain", 32 | "optional": false, 33 | "type": "aggregated" 34 | }, 35 | "metamap": { 36 | "filepath": "./ingress_results/reads/metamap.json", 37 | "title": "Metamap JSON", 38 | "description": "Metadata used in the workflow, presented as JSON.", 39 | "mime-type": "text/json", 40 | "optional": false, 41 | "type": "aggregated" 42 | }, 43 | "sample-data": { 44 | "filepath": "./ingress_results/reads/{{ alias }}.fastq.gz", 45 | "title": "Concatenated sequence data", 46 | "description": "Per-sample reads concatenated into one FASTQ file.", 47 | "mime-type": "application/gzip", 48 | "optional": false, 49 | "type": "per-sample" 50 | }, 51 | "coord-sorted-bam": { 52 | "filepath": "./bams/{{ alias }}.cs.bam", 53 | "title": "Coordinate-sorted BAM", 54 | "description": "Coordinate-sorted BAM.", 55 | "mime-type": "application/gzip", 56 | "optional": false, 57 | "type":
"per-sample" 58 | }, 59 | "coord-sorted-bam-bai": { 60 | "filepath": "./bams/{{ alias }}.cs.bam.bai", 61 | "title": "Coordinate-sorted Bam Index", 62 | "description": "Coordinate-sorted Bam Index.", 63 | "mime-type": "application/octet-stream", 64 | "optional": false, 65 | "type": "per-sample" 66 | }, 67 | "name-sorted-bam": { 68 | "filepath": "./bams/{{ alias }}.ns.bam", 69 | "title": "Name-sorted Bam", 70 | "description": "Name-sorted Bam.", 71 | "mime-type": "application/octet-stream", 72 | "optional": false, 73 | "type": "per-sample" 74 | }, 75 | "pairs": { 76 | "filepath": "./pairs/{{ alias }}.pairs.gz", 77 | "title": "Pairs file", 78 | "description": "This file contains contact information in a human-readable tabular format, and can be used with downstream tools. See [Pairtools documentation](https://pairtools.readthedocs.io/en/latest/formats.html#pairs) for full specification.", 79 | "mime-type": "application/gzip", 80 | "optional": true, 81 | "type": "per-sample" 82 | }, 83 | "pairs-stats": { 84 | "filepath": "./pairs/{{ alias }}.pairs.stats.txt", 85 | "title": "Pairs summary stats file", 86 | "description": "Summary statistics of the pairs file. See this [overview](https://pairtools.readthedocs.io/en/latest/stats.html) for a full specification.", 87 | "mime-type": "text/plain", 88 | "optional": true, 89 | "type": "per-sample" 90 | }, 91 | "pairs-report": { 92 | "filepath": "./pairs/{{ alias }}.pairs.stats.html", 93 | "title": "Pairs summary report", 94 | "description": "Pairs html report with result including an interactive contact map and statistics. See [pairsqc documentation](https://github.com/4dn-dcic/pairsqc) for further details.", 95 | "mime-type": "text/html", 96 | "optional": true, 97 | "type": "per-sample" 98 | }, 99 | "mcool": { 100 | "filepath": "./cooler/{{ alias }}.mcool", 101 | "title": "Multi-resolution cool file", 102 | "description": "Multi-resolution cool `.mcool` file which can be used with downstream tools to provide a high resolution genomic interaction matrix. See [Cool tools documentation](https://github.com/open2c/cooltools) for details on downstream analysis.", 103 | "mime-type": "application/octet-stream", 104 | "optional": true, 105 | "type": "per-sample" 106 | }, 107 | "paired_end_bam": { 108 | "filepath": "./paired_end/{{ alias }}.ns.bam", 109 | "title": "Paired-end BAM", 110 | "description": "Mock paired end BAM.", 111 | "mime-type": "application/octet-stream", 112 | "optional": true, 113 | "type": "per-sample" 114 | }, 115 | "chromunity": { 116 | "filepath": "./chromunity", 117 | "title": "Chromunity parquet files.", 118 | "description": "Chromunity directory with parquet files which can be used with the Chromunity package. Chromunity enables the nomination and statistical evaluation of high order interactions. 
See [Chromunity documentation](http://mskilab.com/chromunity/tutorial.html) for further details.", 119 | "mime-type": "text/directory", 120 | "optional": true, 121 | "type": "per-sample" 122 | }, 123 | "fragments-bed": { 124 | "filepath": "./paireds/fragments.bed", 125 | "title": "Fragments BED", 126 | "description": "File with the DNA fragments created from the virtual digest.", 127 | "mime-type": "text/tab-separated-values", 128 | "optional": true, 129 | "type": "per-sample" 130 | }, 131 | "hi-c": { 132 | "filepath": "./hi-c/{{ alias }}.hic", 133 | "title": "Hi-C for contact map", 134 | "description": "File which can be loaded into the [Juice box tool](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation.", 135 | "mime-type": "application/octet-stream", 136 | "optional": true, 137 | "type": "per-sample" 138 | }, 139 | "filtered_out": { 140 | "filepath": "./filtered_out/{{ alias }}.bam", 141 | "title": "Filtered out reads", 142 | "description": "BAM file containing any reads that were filtered out at the digest step and not included in the analysis.", 143 | "mime-type": "application/octet-stream", 144 | "optional": true, 145 | "type": "per-sample" 146 | } 147 | } 148 | } -------------------------------------------------------------------------------- /subworkflows/local/prepare_genome.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | import groovy.json.JsonBuilder 3 | nextflow.enable.dsl = 2 4 | 5 | include { 6 | index_ref_fai 7 | index_ref_mmi 8 | decompress_ref 9 | } from '../../modules/local/common' 10 | 11 | 12 | 13 | workflow prepare_genome { 14 | take: 15 | ref_param 16 | minimap2_settings 17 | main: 18 | // taken from wf-human-variation 19 | // Check ref and decompress if needed 20 | ref = null 21 | ref_index_fp = null 22 | if (ref_param.toLowerCase().endsWith('gz')) { 23 | // gzipped ref not supported by some downstream tools (pyfaidx, cram_cache) 24 | // easier to just decompress and pass it around rather than confusing the user 25 | decompress_ref(file(ref_param)) 26 | ref = decompress_ref.out.decompressed_ref 27 | } 28 | else { 29 | ref = Channel.fromPath(ref_param, checkIfExists: true) 30 | ref_index_fp = file(ref_param + '.fai') 31 | } 32 | // Create ref index if required 33 | if (!ref_index_fp || !ref_index_fp.exists()) { 34 | index_ref = index_ref_fai(ref) 35 | ref_index = index_ref.reference_index 36 | } 37 | else { 38 | ref_index = Channel.of(ref_index_fp) 39 | } 40 | ref_channel = ref.concat(ref_index).buffer(size: 2) 41 | // create a minimap2 index, not strictly necessary 42 | mmi = index_ref_mmi(ref, minimap2_settings) 43 | 44 | emit: 45 | fasta = ref 46 | fai = ref_index 47 | mmi = mmi 48 | minimap2_settings = minimap2_settings 49 | } 50 | -------------------------------------------------------------------------------- /test_data/bams/barcode01/porec_test.concatemers.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams/barcode01/porec_test.concatemers.bam -------------------------------------------------------------------------------- /test_data/bams/barcode02/porec_test.concatemers.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams/barcode02/porec_test.concatemers.bam 
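A note on the prepare_genome subworkflow above: it takes a reference path and a minimap2 settings string, decompresses a gzipped reference, ensures a .fai index exists and builds a minimap2 .mmi index, emitting fasta, fai and mmi channels. Below is a minimal, hypothetical sketch of how it could be included and called from a driver script; this is illustrative only and not the workflow's actual main.nf, and params.ref plus the hard-coded settings string are assumptions made for the example.

include { prepare_genome } from './subworkflows/local/prepare_genome'

workflow {
    // Prepare the reference: decompress if gzipped, create a .fai if missing, build an .mmi.
    prepare_genome(params.ref, "-x map-ont")
    // Downstream processes would consume these channels.
    prepare_genome.out.fasta.view { "reference: $it" }
    prepare_genome.out.fai.view { "faidx index: $it" }
    prepare_genome.out.mmi.view { "minimap2 index: $it" }
}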
-------------------------------------------------------------------------------- /test_data/bams_dir/shard_0001.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0001.bam -------------------------------------------------------------------------------- /test_data/bams_dir/shard_0002.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0002.bam -------------------------------------------------------------------------------- /test_data/bams_dir/shard_0003.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0003.bam -------------------------------------------------------------------------------- /test_data/bams_dir/shard_0004.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0004.bam -------------------------------------------------------------------------------- /test_data/porec_test.concatemers.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test.concatemers.bam -------------------------------------------------------------------------------- /test_data/porec_test.fasta: -------------------------------------------------------------------------------- 1 | >chr1 2 | 
AACCTGATCGGACTACCGCAGGATAGAAGCGGTTGCTTAAAGCCACAGCCGGGCAGCTGTGTCAAAGGTCCCACATACAATGAGCGCTATTCCCAGACGGTGTTCTGACTGCGAGATACGTGAATACTAACCTCCAAGGGGAAAGGAATCAATCACATTGTGTACCGGCCTACTTTGAACTTGCACGAACGTTAGCGATTTAATTAACCAAACCGAGAGTGCAGTCGAGTGAGGTACACGCTCGGACTGCGTGAATGGCGTTATGTGTTTATCGTCACGCTTCCACAATTAGACAAGAATGCTTCCCAGCTTATGTCAGTTAAGGAGTTAACGATCTGTCTATTGACCATCTCGTGTATTTAGCGGGGCAAACCGACGATCTACTCCGCTCAATGTTTACCAAAGAATCGTAACTAGGTGCAGTCTCTAGCTGGCCAGTCAAAGTGTTTTGCGTATTAAGAAGAAGATATTGCGTTTATGAGCTGACTGATCGGCAGTGAAAAATCTTTGGCATTTATGGGATCGATTAGTCGGGTATTGTTGCAACAGGCTGCTGCGAAAGCTATCCTTTATAGGCAAATGAGTGACGCGCGAACGCATGACGTCGTCAAGCGGGTCTCATCACTTTTATCGACAGTATCTCGTTTGATAATTGAACCATTTGCCGTGCCAAGCCGAGGCCGTATAACCAAGGCGCCGGCATTGACTACTGTAGTGATTATGCGATCTGTCCCGCATAACCGTCTATACTGGTCCTGAAAGGTGTTCGTTGTCGTTCGATTCAAGCCTCTACCGTCGCTGGTTGCTGGCGACATTGCATAGCATACCCATTCGCTATATCGAGCTGACGTTATTGGCTAACGCTTAGTGTGTCAAGTCCTGGTCTGGGAATGATCGGAGCAACGGGATGCTCAACGCGCTACGAATTAGTTGGTCCGGGAGCGAGCGTGTTGCGATCTAACTTCGTTCAGAGACAGGCCTGCCAATCCAACATAGCTATGTTCACGCTTGCATTTAATCTCGTCACCCACCGCACATTAACCGGGGAACGTACACCAGAATGTAGGGCTGGCGAATCAAGAAGGGCGGGGACCCACGGCATGCCTTCGGTTTTACTAGACACGCAGCATCCCATTTGGAGCTCGGGGTAGATCTGTCGGCCCGCGGGCTCGACCGTACCAAGAATACTGCGCGATGCATAGGACCTCGGAGGACTTTGCGACTATTATTACCGAAGAAGATTTTGTTCGTGCAATACGGTGTGTTTGAGGCCGGCCAAGTAGCATCTTGGAATTTATCCACTAACTATCCGAGCCTGGTTGGGATGCTGATCGATTGTTGACACCTCACATACGATTGCGCCATTTGGAAGGACTGGATTTGCGCTGTCAACCTGACTGGTCTAAGATTTGCCCGCGCAGTCCCATCGGTTGATGGAGAAGGTCCTTGTAACTTATCTACAATCTAAAAAAATCGAACAAGTTGGGATCGTCCACGGTTTTAGATGTGCGAGATCAACTAGGAACGGCAGAGAACAGTCTGACTACACGTGTGAGTTCGGATAACACGTGCACGCTGCCTCGCACGGAGGTTCGCAGGATGGCATCTCGGTTTGATCCTATGAGAGCCCTTTATCTTGGAACTGCCTGCGGTACAAGCGCGGGGCCGTCCCAAGCCAAGCCAGGTAGGTAACACCCCAAGTCGTTAGACGCCTGTTCGGATGGTAGTCCACGCGTTGCACTGTGCAAAAGACCAAGAATACGCGAGGGGTAAACGCGCTTGCTTAGGCTATCGAGACGAACGGTTCACTGATTCAGTGTTAGATGATAGAACACGGAAGACGCTAAGAACCAAAAGTAACGCATTACTATAAGGAGGTAATTGGCCGATGCACCCCCAGAACGTGAATAACTTGCAGTCGCTGGGGTCGACCATCGAAGAGAAACCCATCATTATTACTGGCCCCAAGTATCTCATCGGCAGGCCTGCGCGTCCACGCTACTAACATAGTTCTCAGAGTACTGTCCCATCAGTTGTTGCTCGCAATTCCCCGCTGGGGGCTTCCGCGAATAGGAGGCCAGCTATCCAGTCCCTATACCAGTGCTGTTAGCGTCGCTATTCTGGCCTCCTAAGCCACACGGTTCTGAGATTGTTATATGATCGCTCCTCCACAGCACGGATGGACGACAGAGCCTTCTGAGCAATCCATAAAGCGACCAATTAATCGCACCACGAATACCCTCTAACAAGGGCCTAGCTTGTTAAGTGGAAGAGGCTAAGGCCAATCAAGAGGCCAGCTCACAGTCCGGTGCTTCCAGGGGCCCCTCGCCTGTACAGTATCTCCTACGACATTAACGTCTAACCTTGTCATAAATGGATAGGGGTTGTACGCTGCTTTATGGTTTTTTCAGGTCTCACGCCGAGATCCAGCCCTGAAGCAATCTCTACGTACGCGACATTACAGTGACACGATCCTGCGTTGGAACAATGGGAAATCTTTATGGGAGATTTAATATATGTAGTTGGAGCTGTAAGGGCGTAAATTTGGCTGAGACGTGCCAGCGACTCTGCTCTGTTCGGATCATTTGGTCATTGAAGTCTCGAATTGCGGGGGCAATAACCCGAGGATCACAGTCTTGAACGAGGGTTCCTTGCCGATTTAACAACGTGTATTTGAGGTGTCTCTGTTATAACATTCGGCGTTGCATACGGACTGGGCTCAATATAGAGTTTTACTGTGTTTGAAAATTGAAGCGTCGAGTACTTACGCTCCCACTACTCGAACATCCTCCAAGCGGGCAGTTTGTGCAAAGGTTTCTTAAATCTATCACATTTTATAGACTACTCTAACGAGGATCTTCTGCAAAAATCCCAATTAAGTGTGATACTAGGGGTCGCCGTAGAAGAATGAATGCCATTCAAGGTTAGGTATCCACGACAGAAGCCATCGTAAATAGGCCGTCGATACAGGGTCGATGGAATTGTGGGCTCCAATGGGACATGTTGCCTAACGATGGGGACGCGTTTGTAAGGAAATCTGAAATTTCGACTACCTCCAGTCATCCACTCGCGGTACTTCTCTCGCAGTAGATTTACGTGTAAAAAATGTCCGATCTGGTATCCGAAGAGGGCGGGACCGCGACTTCGAACGCCAGATATCGGATGCTCTCGGTTAATGGAGGGTACCCCATCCTGCTACTTTGCCGAAGCCGCCAAACGTAGGAGTTAAGGCAATTAGCTGACAGAGACATATTGTCCACTCCTTGCGGATTTACTCCGTACAGACCCATCTACGGAATTCATCATAGACGATGGAATTAATCCACAGCTAAGACTACACAAAATACATAACTCCATCCGGGGCGGGCCAGCCGCGCACCCATTGTGTTACCGTGTAGGCCTACCATTATAACGTTGAGGACGCAAGGATCAGTTAAGCCTCCGATGGACTGTGAAAAGCAAGCAAGACCACGGCTAGTACGGTAATACTCTCTAGATGCTTAGCTCATCCGCACGCAACCACCCCATTCTTCTGATGCGGCAGCTAGGAGGGTA
CGACCCTTCGGGGCGGTTCATG 3 | >chr2 4 | ACGAGGGTCGGGGCACTGGACTTTGGAGCCCCTCGTGACAATGCAGGTTTTCAGCATCGTTTGTGAGGTGTGTTCTGTTTTACTTGAATGTAGCGAGTCGTTATTAGGCCCTGCGGCGCCGCATTTGGGTATCGCTTCCGGACACTTTATGGCCATCGCCCCGGTGTTGGACGGATATCGATACCAACAGGGAGTATTGTAGGGGCTTAGCAACAGACCTACATCCAGCTGCGAGCGGTCTCGAAAGGAATGTTATTGCATCACCGTCCGTCTCGTGATGTCCGTAAAAGATAGACGTGGCCTTGGCGGACCGGAGGAAGGTTGGGACGCAAGTCATCTCCAGCCACCGTAGTCTCTTCAACTTCCTGCCGCAGACCAGTACCGATCAGCGCTGGATTTTTCAGAACACACGAGGCGACCATACGCCAATGTTCGTATCCTTCGCTAACCACCACCACCGTGAGTTCAGCTAGATCCATTCTGTTTAATCCCTACAGGCACTCATTTTGGGCGAGCCATACCGTGAATGGAGTCATGCTTGTAGCGTCCGAGGTCCTGACGGCGTCGTTCTTCTCCGGTCTCCCGAGGCTAAGTGGACAATCGAGTATAGTCGGTATAGCGTTGCTGAAATCATAGTGGTCGGACTCTACTATGTAGGCGGTACGTAATGGCGATGCCTCTCAGTGGGATGCCCTTGTAAAAGACAGATTCAGCGATGTTAACTCTATCGTGTAAGCGCGAGGGGACTGTGCATTGATATGGTCCTTTGATTACTCAAATGGATCCGTACTAAACCCTCCGGAACGGTTACGGGACGCTGTGGCGCTAGGGACTTCCTGAAGAGTTATATTAGGATTCGTTCCGGGCCAAGGGGCTTTGCTATCAGGGCTGTTCGTCATCGCTCTATACCTATGACCGTATATGAGGACGTCAGGTGCTAGGAACACTGAGACATTTCTGAGTGTGGCGGCCCGCGCGGAAAGGTGAAGTAATTCCAATGCACAAAGGAGTAACGGTTCGTTCACTGGTCAAACTCAAGTGGTGGACTAGGACTGATTAGTTCCTGGGTGACATCACCAGCCCGTCGCGCATAGAACGCCGGCCGAGTGCCACACGACGTGCTCAGTGATTTATTTGCACTACAGTTACAGACAGGAGTGCGTCAAAGTCCCCCCCCAATATGCGAGTTTTAAGACCTTTGCTAGGTGGTTAACAACTGTGCGTCTAGACGTTATCTGACTATGTCCCGCTTTTGTGAAGTGACGCGCAATCTGAGGTGCCCGATATTGACCCCTCCTCGGGCTTGAGCGCAAGTCGGGTACCGCTAGTAGTACAAGGAGCAACGTTGTTTATTAGGGTATACTCAAAAAAAGAGGGATCTGGAGAAGTGAGTTACCTTGTCTAAGAATTATCCGGCTACAATAATAAGCGTCAAGGCAGCGGACGTTTCGACAGTCACTCGAAGACATAGGGGTACGGCAGTATCCACCTAGGGTCGCCCGTGATAACCTTGAGCCCTGGGATAGCCCGCATCATACCAATGGAATTTACTCTGACCATAATCTAGATAGCCTAGACTAGGATCTGCCCCGAAGCCGATATTCAGTCTGATACAAGAAACGTTATATGCCCCTATGATAAGCGTTGCCACTGGTCCCTACCGTAACAAGGCTTCAGTCTTCTGACGCGCTTCAGGGCTCATCGCTTGAGGGCGCAAAATTACTAGTAATGGACTCTATCTGCAAACTACAGCGCTACGTATAGAACTCGGCAGAGGGGATAATATATAAAACTGACGTTGTTTTAGAGCACCGAGATGAGCTTTTGTCTGATGAGCTCAAGAACGTACTTCATCCTCATACAAGATTTTAGGACGACCCCGGATGGGGGGGGAGACTGTATTCGATGCCTGGCCCAGTGTGCGTGCCATCGCAAGTGGCTGTACCGCAGCCCCTGAAACGAGTGCAAGTTGCTGGGACTATACAAATAAGTGGTCGAAGCCTATTTGCGTAGCACACGTCGCCATTCGGTGTAATTAGGCCGCCGTAATGTCTAAGTATGAGCTGACGACTTCAAGGTAAATTAGCACTTTAGTAAACCCAAGTTCAAACGTAGTTAATCAAGCCACTAATAACATTTCCCTTAAGGCATCGTAATCTGAAACTTCCACTGAGGGGTCAGGCACCGATCCTAATATGTCTTTTATCATTACACTCGCTACGCTGAGCACAGACGATAATGACCTGTATCGACTTTTCGATTGATTAGATCCAAAGATTGCAGAGGGTCTCGGCCGCCTGGTTTTAAGAATACGCAATGTAGCGTTTAGCGGATGTTCTGACGCCACTGCTCTGCTGGCGTCTGGATGGCAAGACTATTAGAGTGAGTGATGGCGTCAGCTGGCACCTCGGGGGAATTAGGTTTATAGTGCGCCTTGCACGCACAACTCCCAGTGAGGCGGCTGACTCGGGATACTTGCCGGACACTACAACTCCGGGGAAGCTCAGAGTCTCTTGCAGTAAGGCGGGCGGGTTTACACTGATCAGTGCCCCTCTCGGCGGGTGGTAAACGAGGCATTCAATCGCACAGCAAGAGAAAATCATAATTCAACCGAAAGAGTTAGAAAATCCCAAGACGAGGCGGGTTGGGCAATAAACCACTCAGTAACCTACAACAATCAAGTCCTCGCGGCCCACCAAATAGTGACCGCTTCTTAAATGTTTCTAATCATCAAGCAGCCTGTTCTTGCAGTCGTTTCCATCAAATTGGTTCTCTTAGAAAAATACTCGGTACCTGGCTTGCACTAAGTCGAAAAAATGGGCACAACGTAGACGCAGGGGCGAAATCAACGGGATACGTGTTGCGTCGTTACGCCCGCTTCCAATCAACCTACGGCTGCCTATGGGCGCAATTGCGGGGCTGTAGCTTCCTGACTTTATTGGTGCGGGTTCTATATGTGGTTAAAAGACGTTCTAGCTATTTTGGAATTGTAAATTCCCGGTTGTGACGCCATCACCTCACCTACCCCCGGTACTGGATGCTTGTCATATGCGACACGAGTCGGCACCACGAGCATTACCGAGTACGTATTTCTAGAACAAACTTACTATATGAAGGTCTTTAACCGAAGGTAGGGACAGGCCGACGGCTGAAAAAGTGCCAAGCCAAGTCCCCACTGTGGTGGAACTCAAGGGTGAGTGGACCTAAAGAGCCCAAAAGAATCAAGTGTCTAGGACTTCAATAAGCGCGCGGCAGTAAGAACAGTCGCACCCAACGGACTTTCCTGGGAGGCCTGTCTATCTGCTCATTCGTGTATTATCCCCTTTTGCAAGTGCCAGTGCGGCTAACCGTGGGATAGTGAGGGCAGGGATTGCACTCTCGTTGCCCTTCCCGAAGCAAGTACAAGAGATCACTCTGGTATGGTCATACTCAGAATGCACGGGCCCTCCGGTCGGGTCTGATGCGAAGCTGCCCTCCAGCTTCCATTCCGAAAGAGTACTATGACCAGGAACTCCCTA
CGACTATCTAACCCAGTACTCGCGACTTAACTATCTAGCGTTAACCTTTTGCCGGCCGACATTAACCCAAACCTAGAGCCGCAAGACGAACCCGTCCCGCGTACTTTAGGTCTAGCCTAGTCCGGTAATATAGGTCGATGTGGGCAGGGTTCTCGAGCCTAGATGTTCACTGACCGGGAGTAGGCCGACATCAGGCCCGAAGCCGAGGCAGTGTCCAATGGTATGACCCGCAGCACAATAATACGACATCCCCGAAAACATTAGGCTGACAAGAATCGTATTGCACCAACGCGGATAGTAGACTGCTCCTTGGGAATAATGTTAGTTTGTGGCAGTAGGAGGAGATTTAGAAGTTCTCTTCTGGTATCTCCCGCAACGCGGTTCCCGGGCGAGGGGAAGCCTGCGTCCTGGCGAAATTCTGCGTACCTATGTGGGCACCACGGTTAGGCAGTAACGTCTAGTAGCGCTACGGTATCGGGATACTGGCGGCCGTAGTGAATCATACACCTGGAGCCGGGCTCGTAAAGGAGTCTTCAGGTCCATTAATTTCGAATTCAGGGCCGCTTTGCGAGATCGCCGTAATCCTAGCGGGGTTTTCCTCTAAGGAGGTAGACGTGACCATCCCACAAATCAAAAGTCTGATGCCTGGAACCATACTTCAGGCGCCGTCTGAGACCCCTTGTGCGCAGTAAAATTGCTACTTTTTACAAGATTCGTGACCGGAGAGGTGGATGAGGCCCGGGATTAGACGAACGGTTTCCCGTAGGCCGCTACACGGGGGCGGGGCCAAACACTATGTGTATTGTCCCATAACGAGACTTTGCTGGCTGCTTCCCACCAGACGCAATTTAAGTCAATTTTATACAGTTGGGCTCTTGCCAGCACAATAGGCAGGTTCCTCAAAAATAACATCCCTCGGCGCCATAGAGCCGTACACCGTAGTCTGATATCCTGCGCGTCGTGTCTACAGAGTTGTTAAAGAAGGCTTATGCCGTTTTCGCACGCCTAGCACGGAACTCTAATTTCTATTAGAAAGAATCACTGGTTTTGACAAGGTTAGGACGGCTACACTGCCAAACCGCGCGCACAGCTTATCAAAGAGAGATTGAATGGCGTACACGTACTTCTCGGGACTAGGTCCACTCGAGACCCACCACTGAGGCACCCCGTCCCAGATTTATTCCTAGGACACTCCTACTAGTCAACGGAATTCTGTGGGACTCCGGCTGCCCTGTGCACCGATCTGTTCTAGTATCTATGTACCAGGCAGATTTAGTACACTGGAGAAATACGTCCTCCCGGGCAAAGCTGTTCGTCCCTTGCTTGTGGATCAGTGGAGGACAATATCTCAAAACCAACTCTTCAGGCTCGGTGTCGCATAGCTCGCTCCCAAGAGCTATACATTGCCATTCCCTGTCCTGCGGTAGGGGCTGCCGTCAGGTGCTGGACCGGTTCGTCGCGCGAACAGTGTCACATCGTTTGCCTTGTTATTGAGACGAACATACAGGGCGGCTCGATTTGCTTAGACAGGGCGGCGCACGAAACTGAAGCGGGCCGGGCATCTTTCGGGGTGACGTCATAATACTTGGGGGCGGAAAAAACCCTCCGGTGGGACCATACCGCGGCGTGATATCGAGAAGTTAGGCGAGACCTAAACGGAACATTCAGATACCTAAGATAACTGAATCCCTTAAACCGGTTCGGTGGAGTGCCGACCACATTAGGCCTCGTCAGACGTAGCGGCTATCGCTGGCTGTCGAGGTATGATACAGTTACAGTGTTGCTTACTGTATCCTGGGGGTCCTACACTGCCAGGAAAAGCAACTCCCACAACTTATTGTAACAACGCCTGCTGGATCTGGCTACGGAAGGGCACGAGCCATTGGCTGATATGCGTCCTAAAACGATTTCCCGTTTTACCATGACGCGGGGTGTTTGATCGAGTAAACTGGAGGGCACGACTTGAATTGCAGCAAGATCTACACGCGCTCATGATCTATTGAATCTCATGAGCGCACCAGGGTGTGGAACACACCAAAAACGAGGTGAGCCAGCCTAGCAATGGGTAAAGTGGGCGTACAACGACCAATGGTTCCCTGGACCGCAAAAGAAAGTGTCAGACAGGAGCATTCTGTACGGAGATACCCTACCTTAATCGCAACACACAGTAGACTCACACCTTCAACTATCACACAGTATAAAGCTCTCAGACTGATTAAGGTACTGTGTGATACTGAAAGCACGCTTAGTGGTGACCTGGCCACTAACGCACGGCTACGGCGCTTCGGCGCCTCCTCCAGGTCTGCGTGTGATACCGACTTGCTTATGAGTATAATAGGCTAGGCTCTTATTGTGTAGAGGGCCTCAAGACCTATTTGTTGTTAAGTCGTGGGCGTTGCATCCAGGAGTTATCTATGGTTGATACTCGCCCCAGCTTAAGCACTCTGAGTCCCGTCCATAATTTATGGCTAGACAGCGCTCGGACTTCCTTACCTAGTATATCCGTTCACAGCAGGAAGAACCCCATACTGAGCTATAGATTCTTTTCCGTCCTCTCAGCCAGAAGCTTTTTCAGATATCGTGCACAGGATAAGGGGAATTACATTCGCAAGGTTCTTACTCGCCAACTCAGGTTTGGCTATTACAAGTTGCAGCACACCGGAAAAAAGGGTATCCCGGAACTTTTTTTGTAGCCAGCAGGTCTCCAACGGAAAAGGGGAAGCCCCCACTTTTCCTGCTAACTTTCGAACGTGGCCTGTGGCGTGACCCATATATGTGAATGGAGCGTAATGGCCCCTAAAGTAACTTCTTCCGCCTAACACCAATGCTGCGTGTACTTAGTTCAATACCCTTGCAATATAGCGATATCAAGTAAGTCACTTCTAACTGCACTGAGTAATCGTGCGTGGCGTTGGCTTGAGACCGTGATGTGGCATGAATAAGTCTGACTGAATGCCCACCGGCTCCACCTTAGGTCAAACTCAGACGCGTTGCTCGGACCCCTCAATACGCAACTCATGGTGCTCCTATTCACGGGTTCTCGGGCCGCTATTGCGGTAACGATAACGCCAAGGTGTCAACAAGAAAGGAGCACGCCTCTCCATAATAACAACAAAACTATGGTAGAATGAACGTTAGTCAGCTCCGCATAGCATTCAGTGACCTGCTCGACTGCCGGTCCTGCAGCATAGTCCGTTGACGACCGGAGAGATCATAGACTGCAAACAGCTTTTAGCCTGATACATGGAAACGTACAAGAAAGGATGAGGAGAGGTCACAGGATATCAATCAGTTAAGACACTGCCCTACTCTTGTAATGTCGACGTTGGCTACCACCTAGAGGAGGTAGTAGTGGGGTTGGTACTCTTATAAAGGCACTGCCATTACGCGATCGCGATGTTTTTGACAGATTTTTTCTGAGGTCCAGTGCCGCTTAGATCCATTCGACCTCGGGGCATTCGCAGTGCTTAAGGGGTAGTAAGCCTATGCTACGTATAGGCTAGCGACCTTCTGCTCACTGCTTAGGCCATTCACCTAGGCGAGTTTCACTATTTC
CCTAGCGAACGGACTTGGGGTAGGGCTGCGGCCTAGTCGTGATGCGGGTTGATAAGCCGCGCGTCGGTTCACCCGGACCGTAGATTCGATCTCGGAGACTGTGGGGAGAACCCTCTGTTGGCCGGGGAAGTCGGGCTCCTCACTAATACCGTATGGGTGGTAGAAATAACAGATGGGATACCCTACTTGGTAAGGAGGGTGCAAGCCTTACACCTTGAAATTAGCCCAGAAGTATCTTATCGAGAACTGCTAGCCAATAATTGCCCGCTTGACCGCACGTATCAGGTAATGTGTACGCCACGTTGGCGCGGAGATACCTGATACGAAAAGAATCCGTTGGTCAGTCCTACAGTGTCTCTCTCGTCATCAAGCGCCCCTATCGAGCCGACGTCTCGTGCGAGTCGGCTCTCTGTGTACAAATTGCCCGCGGGAGCATCTGCAGCCTTCCGCCTTAATAGCCAAATGAGCATCCGCCATG 5 | -------------------------------------------------------------------------------- /test_data/porec_test.fasta.fai: -------------------------------------------------------------------------------- 1 | chr1 3577 6 3577 3578 2 | chr2 7551 3590 7551 7552 3 | -------------------------------------------------------------------------------- /test_data/porec_test.params.json: -------------------------------------------------------------------------------- 1 | {"seed": 42122, "genome_size": 10000, "num_chroms": 2, "cut_rate": 0.001, "enzyme": "NlaIII", "num_concatemers": 1000, "num_haplotypes": 2, "variant_density": 0.05, "p_cis": 0.8, "mean_frags_per_concatemer": 10, "max_frags_per_concatemer": 10} -------------------------------------------------------------------------------- /test_data/porec_test.phased_variants.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/porec_test.phased_variants.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test.phased_variants.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/porec_test_no_index.phased_variants.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test_no_index.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type,vcf 2 | FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample,test_data/porec_test.phased_variants.vcf.gz 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample,test_data/porec_test.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet_cutter.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type,cutter,vcf 2 | FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample,NlaIII,test_data/porec_test.phased_variants.vcf.gz 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample,NlaIII,test_data/porec_test.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet_no_tbi.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type,vcf 2 | 
FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample,test_data/porec_test_no_index.phased_variants.vcf.gz 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample,test_data/porec_test_no_index.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet_no_vcf.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type 2 | FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample --------------------------------------------------------------------------------