├── .gitattributes ├── .github ├── .dockstore.yml ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── markdownlint.yml └── workflows │ ├── awsfulltest.yml │ ├── awstest.yml │ ├── branch.yml │ ├── ci.yml │ ├── linting.yml │ ├── linting_comment.yml │ ├── push_dockerhub_dev.yml │ └── push_dockerhub_release.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── email_template.html ├── email_template.txt ├── multiqc_config.yaml ├── nf-core-spliz_logo.png └── sendmail_template.txt ├── bin ├── ann_splices.py ├── annotator.py ├── calc_splizvd.py ├── convert_parquet.py ├── final_summary.py ├── find_SpliZ_sites.R ├── light_class_input_subcols.py ├── light_utils.py ├── markdown_to_html.py ├── parquet_to_tsv.py ├── process_CI.py ├── rijk_zscore.py ├── scrape_software_versions.py ├── svd_zscore.py └── variance_adjusted_permutations_bytiss.py ├── conf ├── base.config ├── igenomes.config ├── test.config └── test_full.config ├── docs ├── README.md ├── images │ └── nf-core-spliz_logo.png ├── output.md └── usage.md ├── environment.yml ├── lib ├── Headers.groovy ├── NfcoreSchema.groovy └── nfcore_external_java_deps.jar ├── main.nf ├── modules └── local │ ├── ann_splices.nf │ ├── calc_rijk_zscore.nf │ ├── calc_splizvd.nf │ ├── class_input_10X.nf │ ├── class_input_SS2.nf │ ├── convert_parquet.nf │ ├── convert_split_parquet.nf │ ├── find_spliz_sites.nf │ ├── preprocess_tsv.nf │ ├── process_class_input.nf │ ├── pval_permutations.nf │ └── summarize_results.nf ├── nextflow.config ├── nextflow_schema.json ├── requirements.txt ├── small_data ├── small.config ├── small.pq └── small.tsv ├── subworkflows └── local │ ├── analysis.nf │ ├── convert_bam.nf │ ├── preprocess.nf │ └── spliz.nf └── workflows └── spliz_pipeline.nf /.gitattributes: -------------------------------------------------------------------------------- 1 | *.config linguist-language=nextflow 2 | -------------------------------------------------------------------------------- /.github/.dockstore.yml: -------------------------------------------------------------------------------- 1 | # Dockstore config version, not pipeline version 2 | version: 1.2 3 | workflows: 4 | - subclass: nfl 5 | primaryDescriptorPath: /nextflow.config 6 | publish: True 7 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Contributing Guidelines 2 | 3 | Hi there! 4 | Many thanks for taking an interest in improving nf-core/spliz. 5 | 6 | We try to manage the required tasks for nf-core/spliz using GitHub issues, you probably came to this page when creating one. 7 | Please use the pre-filled template to save time. 8 | 9 | However, don't be put off by this template - other more general issues and suggestions are welcome! 10 | Contributions to the code are even more welcome ;) 11 | 12 | > If you need help using or modifying nf-core/spliz then the best place to ask is on the nf-core Slack [#spliz](https://nfcore.slack.com/channels/spliz) channel ([join our Slack here](https://nf-co.re/join/slack)). 13 | 14 | ## Contribution workflow 15 | 16 | If you'd like to write some code for nf-core/spliz, the standard workflow is as follows: 17 | 18 | 1. 
Check that there isn't already an issue about your idea in the [nf-core/spliz issues](https://github.com/nf-core/spliz/issues) to avoid duplicating work 19 | * If there isn't one already, please create one so that others know you're working on this 20 | 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/spliz repository](https://github.com/nf-core/spliz) to your GitHub account 21 | 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 22 | 4. Use `nf-core schema build .` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 23 | 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged 24 | 25 | If you're not used to this workflow with git, you can start with some [docs from GitHub](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests) or even their [excellent `git` resources](https://try.github.io/). 26 | 27 | ## Tests 28 | 29 | When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. 30 | Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. 31 | 32 | There are typically two types of tests that run: 33 | 34 | ### Lint tests 35 | 36 | `nf-core` has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. 37 | To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. 38 | 39 | If any failures or warnings are encountered, please follow the listed URL for more documentation. 40 | 41 | ### Pipeline tests 42 | 43 | Each `nf-core` pipeline should be set up with a minimal set of test-data. 44 | `GitHub Actions` then runs the pipeline on this data to ensure that it exits successfully. 45 | If there are any failures then the automated tests fail. 46 | These tests are run both with the latest available version of `Nextflow` and also the minimum required version that is stated in the pipeline code. 47 | 48 | ## Patch 49 | 50 | :warning: Only in the unlikely and regretful event of a release happening with a bug. 51 | 52 | * On your own fork, make a new branch `patch` based on `upstream/master`. 53 | * Fix the bug, and bump version (X.Y.Z+1). 54 | * A PR should be made on `master` from patch to directly this particular bug. 55 | 56 | ## Getting help 57 | 58 | For further information/help, please consult the [nf-core/spliz documentation](https://nf-co.re/spliz/usage) and don't hesitate to get in touch on the nf-core Slack [#spliz](https://nfcore.slack.com/channels/spliz) channel ([join our Slack here](https://nf-co.re/join/slack)). 59 | 60 | ## Pipeline contribution conventions 61 | 62 | To make the nf-core/spliz code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. 63 | 64 | ### Adding a new step 65 | 66 | If you wish to contribute a new step, please use the following coding standards: 67 | 68 | 1. Define the corresponding input channel into your new process from the expected previous process channel 69 | 2. 
Write the process block (see below). 70 | 3. Define the output channel if needed (see below). 71 | 4. Add any new flags/options to `nextflow.config` with a default (see below). 72 | 5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build .`). 73 | 6. Add any new flags/options to the help message (for integer/text parameters, print to help the corresponding `nextflow.config` parameter). 74 | 7. Add sanity checks for all relevant parameters. 75 | 8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`. 76 | 9. Do local tests that the new code works properly and as expected. 77 | 10. Add a new test command in `.github/workflow/ci.yaml`. 78 | 11. If applicable add a [MultiQC](https://https://multiqc.info/) module. 79 | 12. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order. 80 | 13. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`. 81 | 82 | ### Default values 83 | 84 | Parameters should be initialised / defined with default values in `nextflow.config` under the `params` scope. 85 | 86 | Once there, use `nf-core schema build .` to add to `nextflow_schema.json`. 87 | 88 | ### Default processes resource requirements 89 | 90 | Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. These should generally be specified generic with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. A nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/master/nf_core/pipeline-template/conf/base.config), which has the default process as a single core-process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels. 91 | 92 | The process resources can be passed on to the tool dynamically within the process with the `${task.cpu}` and `${task.memory}` variables in the `script:` block. 93 | 94 | ### Naming schemes 95 | 96 | Please use the following naming schemes, to make it easy to understand what is going where. 97 | 98 | * initial process channel: `ch_output_from_` 99 | * intermediate and terminal channels: `ch__for_` 100 | 101 | ### Nextflow version bumping 102 | 103 | If you are using a new feature from core Nextflow, you may bump the minimum required version of nextflow in the pipeline with: `nf-core bump-version --nextflow . [min-nf-version]` 104 | 105 | ### Software version reporting 106 | 107 | If you add a new tool to the pipeline, please ensure you add the information of the tool to the `get_software_version` process. 108 | 109 | Add to the script block of the process, something like the following: 110 | 111 | ```bash 112 | --version &> v_.txt 2>&1 || true 113 | ``` 114 | 115 | or 116 | 117 | ```bash 118 | --help | head -n 1 &> v_.txt 2>&1 || true 119 | ``` 120 | 121 | You then need to edit the script `bin/scrape_software_versions.py` to: 122 | 123 | 1. Add a Python regex for your tool's `--version` output (as in stored in the `v_.txt` file), to ensure the version is reported as a `v` and the version number e.g. `v2.1.1` 124 | 2. Add a HTML entry to the `OrderedDict` for formatting in MultiQC. 
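For illustration, here is a minimal sketch of what those two additions to `bin/scrape_software_versions.py` might look like for a hypothetical tool called `mytool` (the tool name, the `v_mytool.txt` file name, the regex, and the HTML formatting below are assumptions for the example, not the pipeline's actual entries):

```python
# Hypothetical example only: "mytool" and v_mytool.txt are placeholders.
import re
from collections import OrderedDict

# 1. Regex entry: which version file to read and how to pull the version out of it
regexes = {
    "mytool": ["v_mytool.txt", r"mytool v(\S+)"],
}

# 2. OrderedDict entry: placeholder shown in the MultiQC report if scraping fails
results = OrderedDict()
results["mytool"] = '<span style="color:#999999;">N/A</span>'

for tool, (fname, regex) in regexes.items():
    try:
        with open(fname) as fh:
            match = re.search(regex, fh.read())
        if match:
            results[tool] = "v{}".format(match.group(1))
    except IOError:
        pass  # keep the N/A placeholder if the version file is missing

# HTML definition-list entries rendered in the MultiQC software-versions section
print("\n".join("<dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v) for k, v in results.items()))
```

If the pipeline's script follows the nf-core template, the scraping loop already exists, so adding a tool usually only requires the two dictionary entries shown above.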
125 | 126 | ### Images and figures 127 | 128 | For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). 129 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report something that is broken or incorrect 4 | labels: bug 5 | --- 6 | 7 | 15 | 16 | ## Check Documentation 17 | 18 | I have checked the following places for your error: 19 | 20 | - [ ] [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) 21 | - [ ] [nf-core/spliz pipeline documentation](https://nf-co.re/nf-core/spliz/usage) 22 | 23 | ## Description of the bug 24 | 25 | 26 | 27 | ## Steps to reproduce 28 | 29 | Steps to reproduce the behaviour: 30 | 31 | 1. Command line: 32 | 2. See error: 33 | 34 | ## Expected behaviour 35 | 36 | 37 | 38 | ## Log files 39 | 40 | Have you provided the following extra information/files: 41 | 42 | - [ ] The command used to run the pipeline 43 | - [ ] The `.nextflow.log` file 44 | 45 | ## System 46 | 47 | - Hardware: 48 | - Executor: 49 | - OS: 50 | - Version 51 | 52 | ## Nextflow Installation 53 | 54 | - Version: 55 | 56 | ## Container engine 57 | 58 | - Engine: 59 | - version: 60 | - Image tag: 61 | 62 | ## Additional context 63 | 64 | 65 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Join nf-core 4 | url: https://nf-co.re/join 5 | about: Please join the nf-core community here 6 | - name: "Slack #spliz channel" 7 | url: https://nfcore.slack.com/channels/spliz 8 | about: Discussion about the nf-core/spliz pipeline 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for the nf-core/spliz pipeline 4 | labels: enhancement 5 | --- 6 | 7 | 15 | 16 | ## Is your feature request related to a problem? Please describe 17 | 18 | 19 | 20 | 21 | 22 | ## Describe the solution you'd like 23 | 24 | 25 | 26 | ## Describe alternatives you've considered 27 | 28 | 29 | 30 | ## Additional context 31 | 32 | 33 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 13 | 14 | 15 | ## PR checklist 16 | 17 | - [ ] This comment contains a description of changes (with reason). 18 | - [ ] If you've fixed a bug or added code that should be tested, add tests! 19 | - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). 20 | - [ ] Usage Documentation in `docs/usage.md` is updated. 21 | - [ ] Output Documentation in `docs/output.md` is updated. 22 | - [ ] `CHANGELOG.md` is updated. 23 | - [ ] `README.md` is updated (including new tool citations and authors/contributors). 
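The checks behind the boxes above can be reproduced locally before pushing; a minimal sketch, assuming Docker, Nextflow, `nf-core/tools`, `markdownlint-cli`, and `yaml-lint` are installed (the exact invocations CI uses live in `.github/workflows/`):

```bash
# Suggested local pre-flight; not part of the official nf-core template.
nf-core lint .                                              # nf-core guideline checks
markdownlint . --config .github/markdownlint.yml            # markdown style checks
yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")  # YAML checks
nextflow run . -profile test,docker                         # minimal test dataset
```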
24 | -------------------------------------------------------------------------------- /.github/markdownlint.yml: -------------------------------------------------------------------------------- 1 | # Markdownlint configuration file 2 | default: true 3 | line-length: false 4 | no-duplicate-header: 5 | siblings_only: true 6 | no-inline-html: 7 | allowed_elements: 8 | - img 9 | - p 10 | - kbd 11 | - details 12 | - summary 13 | -------------------------------------------------------------------------------- /.github/workflows/awsfulltest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS full size tests 2 | # This workflow is triggered on published releases. 3 | # It can be additionally triggered manually with GitHub actions workflow dispatch. 4 | # It runs the -profile 'test_full' on AWS batch 5 | 6 | on: 7 | workflow_run: 8 | workflows: ["nf-core Docker push (release)"] 9 | types: [completed] 10 | workflow_dispatch: 11 | 12 | 13 | env: 14 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 15 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 16 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 17 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 18 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 19 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 20 | 21 | 22 | jobs: 23 | run-awstest: 24 | name: Run AWS full tests 25 | if: github.repository == 'nf-core/spliz' 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Setup Miniconda 29 | uses: conda-incubator/setup-miniconda@v2 30 | with: 31 | auto-update-conda: true 32 | python-version: 3.7 33 | - name: Install awscli 34 | run: conda install -c conda-forge awscli 35 | - name: Start AWS batch job 36 | # TODO nf-core: You can customise AWS full pipeline tests as required 37 | # Add full size test data (but still relatively small datasets for few samples) 38 | # on the `test_full.config` test runs with only one set of parameters 39 | # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command 40 | run: | 41 | aws batch submit-job \ 42 | --region eu-west-1 \ 43 | --job-name nf-core-spliz \ 44 | --job-queue $AWS_JOB_QUEUE \ 45 | --job-definition $AWS_JOB_DEFINITION \ 46 | --container-overrides '{"command": ["nf-core/spliz", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/spliz/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/spliz/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 47 | -------------------------------------------------------------------------------- /.github/workflows/awstest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS test 2 | # This workflow is triggered on push to the master branch. 3 | # It can be additionally triggered manually with GitHub actions workflow dispatch. 4 | # It runs the -profile 'test' on AWS batch. 
5 | 6 | on: 7 | workflow_dispatch: 8 | 9 | 10 | env: 11 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 12 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 13 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 14 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 15 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 16 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 17 | 18 | 19 | jobs: 20 | run-awstest: 21 | name: Run AWS tests 22 | if: github.repository == 'nf-core/spliz' 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: Setup Miniconda 26 | uses: conda-incubator/setup-miniconda@v2 27 | with: 28 | auto-update-conda: true 29 | python-version: 3.7 30 | - name: Install awscli 31 | run: conda install -c conda-forge awscli 32 | - name: Start AWS batch job 33 | # TODO nf-core: You can customise CI pipeline run tests as required 34 | # For example: adding multiple test runs with different parameters 35 | # Remember that you can parallelise this by using strategy.matrix 36 | run: | 37 | aws batch submit-job \ 38 | --region eu-west-1 \ 39 | --job-name nf-core-spliz \ 40 | --job-queue $AWS_JOB_QUEUE \ 41 | --job-definition $AWS_JOB_DEFINITION \ 42 | --container-overrides '{"command": ["nf-core/spliz", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/spliz/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/spliz/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 43 | -------------------------------------------------------------------------------- /.github/workflows/branch.yml: -------------------------------------------------------------------------------- 1 | name: nf-core branch protection 2 | # This workflow is triggered on PRs to master branch on the repository 3 | # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` 4 | on: 5 | pull_request_target: 6 | branches: [master] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches 13 | - name: Check PRs 14 | if: github.repository == 'nf-core/spliz' 15 | run: | 16 | { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/spliz ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] 17 | 18 | 19 | # If the above check failed, post a comment on the PR explaining the failure 20 | # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets 21 | - name: Post PR comment 22 | if: failure() 23 | uses: mshick/add-pr-comment@v1 24 | with: 25 | message: | 26 | ## This PR is against the `master` branch :x: 27 | 28 | * Do not close this PR 29 | * Click _Edit_ and change the `base` to `dev` 30 | * This CI test will remain failed until you push a new commit 31 | 32 | --- 33 | 34 | Hi @${{ github.event.pull_request.user.login }}, 35 | 36 | It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. 37 | The `master` branch on nf-core repositories should always contain code from the latest release. 38 | Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. 
39 | 40 | You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. 41 | Note that even after this, the test will continue to show as failing until you push a new commit. 42 | 43 | Thanks again for your contribution! 44 | repo-token: ${{ secrets.GITHUB_TOKEN }} 45 | allow-repeats: false 46 | 47 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI 2 | # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | pull_request: 8 | release: 9 | types: [published] 10 | 11 | # Uncomment if we need an edge release of Nextflow again 12 | # env: NXF_EDGE: 1 13 | 14 | jobs: 15 | test: 16 | name: Run workflow tests 17 | # Only run on push if this is the nf-core dev branch (merged PRs) 18 | if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/spliz') }} 19 | runs-on: ubuntu-latest 20 | env: 21 | NXF_VER: ${{ matrix.nxf_ver }} 22 | NXF_ANSI_LOG: false 23 | strategy: 24 | matrix: 25 | # Nextflow versions: check pipeline minimum and current latest 26 | nxf_ver: ['20.04.0', ''] 27 | steps: 28 | - name: Check out pipeline code 29 | uses: actions/checkout@v2 30 | 31 | - name: Check if Dockerfile or Conda environment changed 32 | uses: technote-space/get-diff-action@v4 33 | with: 34 | FILES: | 35 | Dockerfile 36 | environment.yml 37 | 38 | - name: Build new docker image 39 | if: env.MATCHED_FILES 40 | run: docker build --no-cache . -t nfcore/spliz:dev 41 | 42 | - name: Pull docker image 43 | if: ${{ !env.MATCHED_FILES }} 44 | run: | 45 | docker pull nfcore/spliz:dev 46 | docker tag nfcore/spliz:dev nfcore/spliz:dev 47 | 48 | - name: Install Nextflow 49 | env: 50 | CAPSULE_LOG: none 51 | run: | 52 | wget -qO- get.nextflow.io | bash 53 | sudo mv nextflow /usr/local/bin/ 54 | 55 | - name: Run pipeline with test data 56 | # TODO nf-core: You can customise CI pipeline run tests as required 57 | # For example: adding multiple test runs with different parameters 58 | # Remember that you can parallelise this by using strategy.matrix 59 | run: | 60 | nextflow run ${GITHUB_WORKSPACE} -profile test,docker 61 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: nf-core linting 2 | # This workflow is triggered on pushes and PRs to the repository. 
3 | # It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines 4 | on: 5 | push: 6 | pull_request: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | Markdown: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: actions/setup-node@v1 16 | with: 17 | node-version: '10' 18 | - name: Install markdownlint 19 | run: npm install -g markdownlint-cli 20 | - name: Run Markdownlint 21 | run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml 22 | 23 | # If the above check failed, post a comment on the PR explaining the failure 24 | - name: Post PR comment 25 | if: failure() 26 | uses: mshick/add-pr-comment@v1 27 | with: 28 | message: | 29 | ## Markdown linting is failing 30 | 31 | To keep the code consistent with lots of contributors, we run automated code consistency checks. 32 | To fix this CI test, please run: 33 | 34 | * Install `markdownlint-cli` 35 | * On Mac: `brew install markdownlint-cli` 36 | * Everything else: [Install `npm`](https://www.npmjs.com/get-npm) then [install `markdownlint-cli`](https://www.npmjs.com/package/markdownlint-cli) (`npm install -g markdownlint-cli`) 37 | * Fix the markdown errors 38 | * Automatically: `markdownlint . --config .github/markdownlint.yml --fix` 39 | * Manually resolve anything left from `markdownlint . --config .github/markdownlint.yml` 40 | 41 | Once you push these changes the test should pass, and you can hide this comment :+1: 42 | 43 | We highly recommend setting up markdownlint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! 44 | 45 | Thanks again for your contribution! 46 | repo-token: ${{ secrets.GITHUB_TOKEN }} 47 | allow-repeats: false 48 | 49 | 50 | YAML: 51 | runs-on: ubuntu-latest 52 | steps: 53 | - uses: actions/checkout@v1 54 | - uses: actions/setup-node@v1 55 | with: 56 | node-version: '10' 57 | - name: Install yaml-lint 58 | run: npm install -g yaml-lint 59 | - name: Run yaml-lint 60 | run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") 61 | 62 | # If the above check failed, post a comment on the PR explaining the failure 63 | - name: Post PR comment 64 | if: failure() 65 | uses: mshick/add-pr-comment@v1 66 | with: 67 | message: | 68 | ## YAML linting is failing 69 | 70 | To keep the code consistent with lots of contributors, we run automated code consistency checks. 71 | To fix this CI test, please run: 72 | 73 | * Install `yaml-lint` 74 | * [Install `npm`](https://www.npmjs.com/get-npm) then [install `yaml-lint`](https://www.npmjs.com/package/yaml-lint) (`npm install -g yaml-lint`) 75 | * Fix the markdown errors 76 | * Run the test locally: `yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")` 77 | * Fix any reported errors in your YAML files 78 | 79 | Once you push these changes the test should pass, and you can hide this comment :+1: 80 | 81 | We highly recommend setting up yaml-lint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! 82 | 83 | Thanks again for your contribution! 
84 | repo-token: ${{ secrets.GITHUB_TOKEN }} 85 | allow-repeats: false 86 | 87 | 88 | nf-core: 89 | runs-on: ubuntu-latest 90 | steps: 91 | 92 | - name: Check out pipeline code 93 | uses: actions/checkout@v2 94 | 95 | - name: Install Nextflow 96 | env: 97 | CAPSULE_LOG: none 98 | run: | 99 | wget -qO- get.nextflow.io | bash 100 | sudo mv nextflow /usr/local/bin/ 101 | 102 | - uses: actions/setup-python@v1 103 | with: 104 | python-version: '3.6' 105 | architecture: 'x64' 106 | 107 | - name: Install dependencies 108 | run: | 109 | python -m pip install --upgrade pip 110 | pip install nf-core 111 | 112 | - name: Run nf-core lint 113 | env: 114 | GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} 115 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 116 | GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} 117 | run: nf-core -l lint_log.txt lint ${GITHUB_WORKSPACE} --markdown lint_results.md 118 | 119 | - name: Save PR number 120 | if: ${{ always() }} 121 | run: echo ${{ github.event.pull_request.number }} > PR_number.txt 122 | 123 | - name: Upload linting log file artifact 124 | if: ${{ always() }} 125 | uses: actions/upload-artifact@v2 126 | with: 127 | name: linting-logs 128 | path: | 129 | lint_log.txt 130 | lint_results.md 131 | PR_number.txt 132 | 133 | -------------------------------------------------------------------------------- /.github/workflows/linting_comment.yml: -------------------------------------------------------------------------------- 1 | 2 | name: nf-core linting comment 3 | # This workflow is triggered after the linting action is complete 4 | # It posts an automated comment to the PR, even if the PR is coming from a fork 5 | 6 | on: 7 | workflow_run: 8 | workflows: ["nf-core linting"] 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Download lint results 15 | uses: dawidd6/action-download-artifact@v2 16 | with: 17 | workflow: linting.yml 18 | 19 | - name: Get PR number 20 | id: pr_number 21 | run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" 22 | 23 | - name: Post PR comment 24 | uses: marocchino/sticky-pull-request-comment@v2 25 | with: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | number: ${{ steps.pr_number.outputs.pr_number }} 28 | path: linting-logs/lint_results.md 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/push_dockerhub_dev.yml: -------------------------------------------------------------------------------- 1 | name: nf-core Docker push (dev) 2 | # This builds the docker image and pushes it to DockerHub 3 | # Runs on nf-core repo releases and push event to 'dev' branch (PR merges) 4 | on: 5 | push: 6 | branches: 7 | - dev 8 | 9 | jobs: 10 | push_dockerhub: 11 | name: Push new Docker image to Docker Hub (dev) 12 | runs-on: ubuntu-latest 13 | # Only run for the nf-core repo, for releases and merged PRs 14 | if: ${{ github.repository == 'nf-core/spliz' }} 15 | env: 16 | DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} 17 | DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} 18 | steps: 19 | - name: Check out pipeline code 20 | uses: actions/checkout@v2 21 | 22 | - name: Build new docker image 23 | run: docker build --no-cache . 
-t nfcore/spliz:dev 24 | 25 | - name: Push Docker image to DockerHub (dev) 26 | run: | 27 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 28 | docker push nfcore/spliz:dev 29 | -------------------------------------------------------------------------------- /.github/workflows/push_dockerhub_release.yml: -------------------------------------------------------------------------------- 1 | name: nf-core Docker push (release) 2 | # This builds the docker image and pushes it to DockerHub 3 | # Runs on nf-core repo releases and push event to 'dev' branch (PR merges) 4 | on: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | push_dockerhub: 10 | name: Push new Docker image to Docker Hub (release) 11 | runs-on: ubuntu-latest 12 | # Only run for the nf-core repo, for releases and merged PRs 13 | if: ${{ github.repository == 'nf-core/spliz' }} 14 | env: 15 | DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} 16 | DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} 17 | steps: 18 | - name: Check out pipeline code 19 | uses: actions/checkout@v2 20 | 21 | - name: Build new docker image 22 | run: docker build --no-cache . -t nfcore/spliz:latest 23 | 24 | - name: Push Docker image to DockerHub (release) 25 | run: | 26 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 27 | docker push nfcore/spliz:latest 28 | docker tag nfcore/spliz:latest nfcore/spliz:${{ github.event.release.tag_name }} 29 | docker push nfcore/spliz:${{ github.event.release.tag_name }} 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .nextflow* 2 | work/ 3 | data/ 4 | results/ 5 | .DS_Store 6 | test* 7 | tests/ 8 | testing/ 9 | testing* 10 | *.pyc 11 | HLCA* 12 | original* 13 | TSP* 14 | *out 15 | *err 16 | *sbatch 17 | samplesheets/* 18 | sandbox* 19 | *mouse* 20 | s3* 21 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Changelog 2 | 3 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) 4 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 5 | 6 | ## v1.0dev - [date] 7 | 8 | Initial release of nf-core/spliz, created with the [nf-core](https://nf-co.re/) template. 
9 | 10 | ### `Added` 11 | 12 | ### `Fixed` 13 | 14 | ### `Dependencies` 15 | 16 | ### `Deprecated` 17 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct at nf-core (v1.0) 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: 6 | 7 | - Age 8 | - Body size 9 | - Familial status 10 | - Gender identity and expression 11 | - Geographical location 12 | - Level of experience 13 | - Nationality and national origins 14 | - Native language 15 | - Physical and neurological ability 16 | - Race or ethnicity 17 | - Religion 18 | - Sexual identity and orientation 19 | - Socioeconomic status 20 | 21 | Please note that the list above is alphabetised and is therefore not ranked in any order of preference or importance. 22 | 23 | ## Preamble 24 | 25 | > Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. 26 | 27 | An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. 28 | 29 | nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. 30 | 31 | We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. 32 | 33 | Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. 34 | 35 | We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. 36 | 37 | Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re 38 | 39 | ## Our Responsibilities 40 | 41 | The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. 42 | 43 | The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
44 | 45 | Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. 46 | 47 | ## When are where does this Code of Conduct apply? 48 | 49 | Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: 50 | 51 | - Communicating with an official project email address. 52 | - Communicating with community members within the nf-core Slack channel. 53 | - Participating in hackathons organised by nf-core (both online and in-person events). 54 | - Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. 55 | - Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. 56 | - Representing nf-core on social media. This includes both official and personal accounts. 57 | 58 | ## nf-core cares 😊 59 | 60 | nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): 61 | 62 | - Ask for consent before sharing another community member’s personal information (including photographs) on social media. 63 | - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. 64 | - Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) 65 | - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) 66 | - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) 67 | - Focus on what is best for the team and the community. (When in doubt, ask) 68 | - Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. 69 | - Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) 70 | - Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) 71 | - Take breaks when you feel like you need them. 72 | - Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) 73 | 74 | ## nf-core frowns on 😕 75 | 76 | The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. 
Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. 77 | 78 | - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. 79 | - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. 80 | - Spamming or trolling of individuals on social media. 81 | - Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. 82 | - Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. 83 | 84 | ### Online Trolling 85 | 86 | The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. 87 | 88 | All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. 89 | 90 | ## Procedures for Reporting CoC violations 91 | 92 | If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. 93 | 94 | You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). 95 | 96 | Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. 97 | 98 | All reports will be handled with utmost discretion and confidentially. 99 | 100 | ## Attribution and Acknowledgements 101 | 102 | - The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) 103 | - The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) 104 | - The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) 105 | - The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) 106 | 107 | ## Changelog 108 | 109 | ### v1.0 - March 12th, 2021 110 | 111 | - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. 
112 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # based on existing Docker image 2 | FROM ubuntu:20.04 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | # dependencies, some are probably unnecessary 6 | RUN apt-get update && apt-get install -y wget && apt-get install -y --no-install-recommends build-essential r-base python3.9 python3-pip python3-setuptools python3-dev 7 | RUN apt-get update -qq && apt-get -y --no-install-recommends install \ 8 | r-base-dev \ 9 | libgsl0-dev \ 10 | libxml2-dev \ 11 | libcairo2-dev \ 12 | libsqlite-dev \ 13 | libpq-dev \ 14 | libicu-dev \ 15 | libbz2-dev \ 16 | liblzma-dev \ 17 | libfontconfig1-dev \ 18 | libssl-dev \ 19 | libcurl4-openssl-dev \ 20 | libnetcdf-dev \ 21 | udunits-bin \ 22 | libopenblas-dev \ 23 | libudunits2-dev \ 24 | curl 25 | RUN apt-get update -qq && apt-get -y --no-install-recommends install \ 26 | autoconf \ 27 | automake \ 28 | g++ \ 29 | gcc \ 30 | gfortran \ 31 | make \ 32 | && apt-get clean all \ 33 | && rm -rf /var/lib/apt/lists/* 34 | 35 | # Python packages 36 | WORKDIR /app 37 | COPY requirements.txt /app/requirements.txt 38 | RUN pip3 install -r requirements.txt 39 | 40 | # R packages 41 | RUN LC_ALL=C.UTF-8 Rscript -e "install.packages('data.table')" 42 | RUN LC_ALL=C.UTF-8 Rscript -e "install.packages('logger')" 43 | RUN LC_ALL=C.UTF-8 Rscript -e "install.packages('Rfast')" 44 | 45 | 46 | COPY . /app 47 | 48 | 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Salzman Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | 4 | **salzmanlab/spliz** is a bioinformatics best-practise analysis pipeline for calculating the splicing z-score for single cell RNA-seq analysis. 5 | 6 | This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). 7 | 8 | > The nf-core framework for community-curated bioinformatics pipelines. 
9 | > 10 | > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. 11 | > 12 | > Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x. 13 | 14 | ## Quick Start 15 | 16 | 1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=20.04.0`) and [`conda`](https://docs.conda.io/en/latest/). 17 | 18 | 2. Download environment file. 19 | ```bash 20 | wget https://raw.githubusercontent.com/salzmanlab/SpliZ/main/environment.yml 21 | ``` 22 | 23 | 3. Create conda environment and activate. 24 | ```bash 25 | conda env create --name spliz_env --file=environment.yml 26 | conda activate spliz_env 27 | ``` 28 | 29 | 4. Run the pipeline on the test data set. 30 | You may need to modify the [executor scope](https://www.nextflow.io/docs/latest/executor.html) in the config file, in accordance to your compute needs. 31 | ```bash 32 | nextflow run salzmanlab/spliz \ 33 | -r main \ 34 | -latest \ 35 | -profile small_test_data 36 | ``` 37 | [Sherlock](https://www.sherlock.stanford.edu/) users should use the `sherlock` profile: 38 | 39 | 40 | nextflow run salzmanlab/spliz \ 41 | -r main \ 42 | -latest \ 43 | -profile small_test_data,sherlock 44 | 45 | 5. Run the pipeline on your own dataset. 46 | 1. Edit your config file with the parameters below. (You can use `/small_data/small.config` as a template, be sure to include any memory or time paramters.) 47 | 2. Run with your config file: 48 | ``` 49 | nextflow run salzmanlab/spliz \ 50 | -r main \ 51 | -latest \ 52 | -c YOUR_CONFIG_HERE.conf 53 | ``` 54 | 55 | 56 | See [usage docs](https://nf-co.re/spliz/usage) for all of the available options when running the pipeline. 57 | 58 | ## Pipeline Summary 59 | 60 | By default, the pipeline currently performs the following: 61 | * Calculate the SpliZ scores for: 62 | * Identifying variable splice sites 63 | * Identifying differential splicing between cell types. 64 | 65 | ## Input Parameters 66 | 67 | | Argument | Description |Example Usage | 68 | | --------------------- | ---------------- |-----------| 69 | | `dataname` | Descriptive name for SpliZ run | "Tumor_5" | 70 | | `run_analysis` | If the pipeline will perform splice site identifcation and differential splicing analysis | `true`, `false` | 71 | | `input_file` | File to be used as SpliZ input | *tumor_5_with_postprocessing.txt* | 72 | | `SICILIAN` | If `input_file` is output from [SICILIAN](https://github.com/salzmanlab/SICILIAN) | `true`, `false` | 73 | | `pin_S` | Bound splice site residuals at this quantile (e.g. values in the lower `pin_S` quantile and the upper 1 - `pin_S` quantile will be rounded to the quantile limits) | 0.1 | 74 | | `pin_z` | Bound SpliZ scores at this quantile (e.g. 
values in the lower `pin_z` quantile and the upper 1 - `pin_z` quantile will be rounded to the quantile limits) | 0 | 75 | | `bounds` | Only include cell/gene pairs that have more than this many junctional reads for the gene | 5 | 76 | | `light` | Only output the minimum number of columns | `true`, `false` | 77 | | `svd_type` | Type of SVD calculation | `normdonor`, `normgene` | 78 | | `n_perms` | Number of permutations | 100 | 79 | | `grouping_level_1` | Metadata column by which the data is intially partitioned | "tissue" | 80 | | `grouping_level_2` | Metadata column by which the partitioned data is grouped | "compartment" | 81 | | `libraryType` | Library prepration method of the input data | `10X`, `SS2` | 82 | 83 | ## Optional Parameters for non-SICILIAN Inputs (`SICILIAN` = `false`) 84 | | Argument | Description |Example Usage | 85 | | --------------------- | ---------------- |-----------| 86 | | `samplesheet` | If input files are in BAM format, this file specifies the locations of the input bam files. Samplesheet formatting is specified below. | *Tumor_5_samplesheet.csv* | 87 | | `annotator_pickle` | [Genome-specific annotation file for gene names](https://github.com/salzmanlab/SICILIAN#annotator-and-index-files-needed-for-running-sicilian) | *hg38_refseq.pkl* | 88 | | `exon_pickle` | [Genome-specific annotation file for exon boundaries](https://github.com/salzmanlab/SICILIAN#annotator-and-index-files-needed-for-running-sicilian) | *hg38_refseq_exon_bounds.pkl* | 89 | | `splice_pickle` | [Genome-specific annotation file for splice sites](https://github.com/salzmanlab/SICILIAN#annotator-and-index-files-needed-for-running-sicilian) | *hg38_refseq_splices.pkl* | 90 | | `gtf` | GTF file used as the reference annotation file for the genome assembly | *GRCh38_genomic.gtf* | 91 | | `meta` | If input files are in BAM format, this file contains per-cell annotations. This file must contain columns for `grouping_level_1` and `grouping_level_2`. | *metadata_tumor_5.tsv* | 92 | 93 | ### Samplesheets 94 | 95 | The samplesheet must be in comma-separated value(CSV) format. The file must be without a header. The sampleID must be a unique identifier for each bam file entry. 96 | 97 | For non-SICILIAN samples, samplesheets must have 2 columns: sampleID and path to the bam file. 98 | ``` 99 | Tumor_5_S1,tumor_5_S1_L001.bam 100 | Tumor_5_S2,tumor_5_S2_L002.bam 101 | Tumor_5_S3,tumor_5_S3_L003.bam 102 | ``` 103 | 104 | For SICILIAN SS2 samples, amplesheets must have 3 columns: sampleID, read 1 bam file, and read 2 bam file. 105 | ``` 106 | Tumor_5_S1,tumor_5_S1_L001_R1.bam,tumor_5_S1_L001_R2.bam 107 | Tumor_5_S2,tumor_5_S2_L002_R1.bam,tumor_5_S2_L002_R2.bam 108 | Tumor_5_S3,tumor_5_S3_L003_R1.bam,tumor_5_S3_L003_R2.bam 109 | ``` 110 | 111 | ## Credits 112 | 113 | salzmanlab/spliz was originally written by Salzman Lab. 114 | 115 | ## Contributions and Support 116 | 117 | If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 118 | 119 | 120 | ## Citations 121 | 122 | 123 | 124 | This repositiory contains code to perform the analyses in this paper: 125 | 126 | > **The SpliZ generalizes “Percent Spliced In” to reveal regulated splicing at single-cell resolution** 127 | > 128 | > Julia Eve Olivieri*, Roozbeh Dehghannasiri*, Julia Salzman. 129 | > 130 | > _Nature Methods_ 2022 Mar 3. doi: [https://www.nature.com/articles/s41592-022-01400-x](https://www.nature.com/articles/s41592-022-01400-x). 
131 | 132 | You can cite the `nf-core` publication as follows: 133 | 134 | > **The nf-core framework for community-curated bioinformatics pipelines.** 135 | > 136 | > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. 137 | > 138 | > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). 139 | 140 | 141 | -------------------------------------------------------------------------------- /assets/email_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | nf-core/spliz Pipeline Report 9 | 10 | 11 |
12 |
13 |
14 |
15 | nf-core/spliz v${version}
16 | Run Name: $runName
17 |
18 | <% if (!success){
19 |     out << """
20 |
21 |     nf-core/spliz execution completed unsuccessfully!
22 |     The exit status of the task that caused the workflow execution to fail was: $exitStatus.
23 |     The full error message was:
24 |     ${errorReport}
25 |
26 |     """
27 | } else {
28 |     out << """
29 |
30 |     nf-core/spliz execution completed successfully!
31 |
32 |     """
33 | }
34 | %>
35 |
36 | The workflow was completed at $dateComplete (duration: $duration)
37 | The command used to launch the workflow was as follows:
38 | $commandLine
39 |
40 | Pipeline Configuration:
41 |
42 |
43 | <% out << summary.collect{ k,v -> " … $k … $v … " }.join("\n") %>
44 |
45 |
46 |
47 | nf-core/spliz
48 | https://github.com/nf-core/spliz
49 |
50 |
51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /assets/email_template.txt: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------- 2 | ,--./,-. 3 | ___ __ __ __ ___ /,-._.--~\\ 4 | |\\ | |__ __ / ` / \\ |__) |__ } { 5 | | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, 6 | `._,._,' 7 | nf-core/spliz v${version} 8 | ---------------------------------------------------- 9 | 10 | Run Name: $runName 11 | 12 | <% if (success){ 13 | out << "## nf-core/spliz execution completed successfully! ##" 14 | } else { 15 | out << """#################################################### 16 | ## nf-core/spliz execution completed unsuccessfully! ## 17 | #################################################### 18 | The exit status of the task that caused the workflow execution to fail was: $exitStatus. 19 | The full error message was: 20 | 21 | ${errorReport} 22 | """ 23 | } %> 24 | 25 | 26 | The workflow was completed at $dateComplete (duration: $duration) 27 | 28 | The command used to launch the workflow was as follows: 29 | 30 | $commandLine 31 | 32 | 33 | 34 | Pipeline Configuration: 35 | ----------------------- 36 | <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> 37 | 38 | -- 39 | nf-core/spliz 40 | https://github.com/nf-core/spliz 41 | -------------------------------------------------------------------------------- /assets/multiqc_config.yaml: -------------------------------------------------------------------------------- 1 | report_comment: > 2 | This report has been generated by the nf-core/spliz 3 | analysis pipeline. For information about how to interpret these results, please see the 4 | documentation. 5 | report_section_order: 6 | software_versions: 7 | order: -1000 8 | nf-core-spliz-summary: 9 | order: -1001 10 | 11 | export_plots: true 12 | -------------------------------------------------------------------------------- /assets/nf-core-spliz_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/assets/nf-core-spliz_logo.png -------------------------------------------------------------------------------- /assets/sendmail_template.txt: -------------------------------------------------------------------------------- 1 | To: $email 2 | Subject: $subject 3 | Mime-Version: 1.0 4 | Content-Type: multipart/related;boundary="nfcoremimeboundary" 5 | 6 | --nfcoremimeboundary 7 | Content-Type: text/html; charset=utf-8 8 | 9 | $email_html 10 | 11 | --nfcoremimeboundary 12 | Content-Type: image/png;name="nf-core-spliz_logo.png" 13 | Content-Transfer-Encoding: base64 14 | Content-ID: 15 | Content-Disposition: inline; filename="nf-core-spliz_logo.png" 16 | 17 | <% out << new File("$projectDir/assets/nf-core-spliz_logo.png"). 18 | bytes. 19 | encodeBase64(). 20 | toString(). 21 | tokenize( '\n' )*. 22 | toList()*. 23 | collate( 76 )*. 24 | collect { it.join() }. 25 | flatten(). 26 | join( '\n' ) %> 27 | 28 | <% 29 | if (mqcFile){ 30 | def mqcFileObj = new File("$mqcFile") 31 | if (mqcFileObj.length() < mqcMaxSize){ 32 | out << """ 33 | --nfcoremimeboundary 34 | Content-Type: text/html; name=\"multiqc_report\" 35 | Content-Transfer-Encoding: base64 36 | Content-ID: 37 | Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" 38 | 39 | ${mqcFileObj. 40 | bytes. 41 | encodeBase64(). 42 | toString(). 43 | tokenize( '\n' )*. 44 | toList()*. 
45 | collate( 76 )*. 46 | collect { it.join() }. 47 | flatten(). 48 | join( '\n' )} 49 | """ 50 | }} 51 | %> 52 | 53 | --nfcoremimeboundary-- 54 | -------------------------------------------------------------------------------- /bin/ann_splices.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import defaultdict 5 | import pandas as pd 6 | import pickle 7 | import annotator 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser(description="add annotation columns for splicing") 12 | parser.add_argument( 13 | "-i", 14 | "--in_file", 15 | help="the file to add columns to. Must be human data, be tab separated, and have columns chrR1A, chrR1B, juncPosR1A, and juncPosR1B. Will create columns exon_annR1A, exon_annR1B, both_ann, splice_ann, and sort_junc (the last is just an artifact of computation)", 16 | ) 17 | parser.add_argument( 18 | "-o", 19 | "--out_file", 20 | help="file to save the output to. If you just want to add the columns to the original file you can pass in the same path as in_file", 21 | ) 22 | parser.add_argument( 23 | "-e", "--exon_pickle", help="the pickle file for exon annotation" 24 | ) 25 | parser.add_argument( 26 | "-s", "--splice_pickle", help="the pickle file for splice junction annotation" 27 | ) 28 | args = parser.parse_args() 29 | return args 30 | 31 | 32 | def add_exon_columns(temp_df, exon_bounds): 33 | for suffix in ["A", "B"]: 34 | temp_df["exon_annR1" + suffix] = False 35 | for name2, group in temp_df.groupby("chrR1A"): 36 | temp_df.loc[group.index, "exon_annR1" + suffix] = group[ 37 | "juncPosR1" + suffix 38 | ].isin(exon_bounds[name2]) 39 | 40 | temp_df["both_ann"] = (temp_df["exon_annR1B"] & temp_df["exon_annR1A"]).astype( 41 | "bool" 42 | ) 43 | return temp_df 44 | 45 | 46 | def add_splice_ann_column(temp_df, splices): 47 | temp_df["sort_junc"] = [ 48 | tuple(sorted([x, y])) for x, y in zip(temp_df.juncPosR1A, temp_df.juncPosR1B) 49 | ] 50 | temp_df["splice_ann"] = False 51 | 52 | for name2, group in temp_df.groupby("chrR1A"): 53 | sub_group = group[group["chrR1A"].astype(str) == group["chrR1A"].astype(str)] 54 | if name2 in splices: 55 | 56 | temp_df.loc[sub_group.index, "splice_ann"] = sub_group["sort_junc"].isin( 57 | splices[name2] 58 | ) 59 | return temp_df 60 | 61 | 62 | def main(): 63 | args = get_args() 64 | 65 | exon_bounds = pickle.load(open(args.exon_pickle, "rb")) 66 | splices = pickle.load(open(args.splice_pickle, "rb")) 67 | 68 | exon_bounds = defaultdict(set, exon_bounds) 69 | splices = defaultdict(set, splices) 70 | 71 | df = pd.read_parquet(args.in_file) 72 | df = add_exon_columns(df, exon_bounds) 73 | df = add_splice_ann_column(df, splices) 74 | print(df.head()) 75 | df.to_csv(args.out_file, sep="\t", index=False) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() -------------------------------------------------------------------------------- /bin/annotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | 5 | def get_gene_id(row): 6 | # return row["attribute"].split(";")[0].split()[1][1:-1] 7 | if "gene_name" in row["attribute"]: 8 | return row["attribute"].split("gene_name")[-1].split('"')[1] 9 | elif ";gene=" in row["attribute"]: 10 | return row["attribute"].split(";gene=")[-1].split(";")[0] 11 | 12 | def round_down(num, divisor): 13 | return num - (num%divisor) 14 | 15 | # This is a class to create an annotator object that 
you can create based on a gtf file, 16 | # that allows you to put in a chromosome and position and get all gene names in that area. 17 | 18 | # Usage example: 19 | # import sys 20 | # sys.path.insert(0, '/scratch/PI/horence/JuliaO/single_cell/scripts/') 21 | # import annotator 22 | # 23 | # ann = annotator.Annotator(/scratch/PI/horence/JuliaO/single_cell/STAR_output/mm10_files/mm10.gtf) # this step can take a while - like 2 minutes 24 | # ann.get_name_given_locus("chr1", 1003024) # returns all gene names separated by ","; if none, returns "" 25 | 26 | class Annotator: 27 | def __init__(self, gtf_file, jump = 10000): 28 | self.jump = jump 29 | self.gtf_file = gtf_file 30 | self.unknown = "unknown" 31 | self.unknown_strand = "?" 32 | self.get_gtf_dict() 33 | 34 | def get_gtf_dict(self): 35 | print("here") 36 | 37 | # load in gtf 38 | gtf_df = pd.read_csv(self.gtf_file,sep="\t",names=["seqname","source","feature","start","end","score","strand","frame","attribute"],comment="#") 39 | print(gtf_df.head()) 40 | # make gene id column 41 | gtf_df["gene_id"] = gtf_df.apply(get_gene_id, axis=1) 42 | print(gtf_df.head()) 43 | 44 | # figure out how long to make each chromosome entry 45 | seqname_len_dict = {} 46 | for seqname in gtf_df["seqname"].unique(): 47 | print(seqname) 48 | seqname_len_dict[seqname] = max(gtf_df[gtf_df["seqname"] == seqname]["end"]) 49 | if seqname_len_dict[seqname] < max(gtf_df[gtf_df["seqname"] == seqname]["start"]): 50 | print("start more than end") 51 | 52 | # set up gtf dict to have a dictionary for each chromsome with entries for every "jump" in its length 53 | gtf_dict = {s : {r : {} for r in range(0, seqname_len_dict[s],self.jump)} for s in seqname_len_dict.keys()} 54 | 55 | # assign genes to their requisite ranges 56 | for seqname in seqname_len_dict: 57 | seqname_df = gtf_df[gtf_df["seqname"] == seqname] 58 | for gene_id in seqname_df["gene_id"].unique(): 59 | if gene_id is not None: 60 | gene_df = seqname_df[seqname_df["gene_id"] == gene_id] 61 | if len(gene_df["strand"].unique()) == 1: 62 | # print("gene_df['strand'].unique(): {}".format(gene_df['strand'].unique())) 63 | # print("gene_df['strand'].unique()[0]: {}".format(gene_df["strand"].unique()[0])) 64 | strand = gene_df["strand"].unique()[0] 65 | else: 66 | strand = self.unknown_strand 67 | 68 | # assign gene to all ranges it falls within 69 | try: 70 | start = min(gene_df["start"]) 71 | except: 72 | print("gene_id",gene_id) 73 | print("start failed", gene_df) 74 | try: 75 | end = max(gene_df["end"]) 76 | except: 77 | print("gene_id",gene_id) 78 | print("end failed",gene_df) 79 | for j in range(round_down(start,self.jump),round_down(end + self.jump, self.jump),self.jump): 80 | gtf_dict[seqname][j][gene_id] = [start,end, strand] 81 | self.gtf_dict = gtf_dict 82 | 83 | def get_name_given_locus(self, seqname, position, read_strand = "", stranded_library = False): 84 | 85 | try: 86 | poss_genes = self.gtf_dict[seqname][round_down(position,self.jump)] 87 | except Exception as e: 88 | 89 | if seqname not in self.gtf_dict.keys(): 90 | if stranded_library: 91 | return self.unknown, read_strand 92 | 93 | else: 94 | return self.unknown, self.unknown_strand 95 | if position > max(self.gtf_dict[seqname].keys()): 96 | if stranded_library: 97 | return self.unknown, read_strand 98 | else: 99 | return self.unknown, self.unknown_strand 100 | else: 101 | raise e 102 | if len(poss_genes) == 0: 103 | if stranded_library: 104 | return self.unknown, read_strand 105 | else: 106 | return self.unknown, self.unknown_strand 107 | 108 | 
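# poss_genes maps each candidate gene_id to [start, end, strand] for the jump-sized bin
# containing `position` (e.g. with the default jump of 10000, the chr1:1003024 query from
# the usage example above is looked up in gtf_dict["chr1"][1000000]). The loop below keeps
# only the genes whose span actually contains the position and, for a stranded library,
# whose strand matches the read strand.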
gene_names = [] 109 | strands = [] 110 | for gene, pos in poss_genes.items(): 111 | if pos[0] <= position <= pos[1]: 112 | if stranded_library: 113 | if pos[2] == read_strand: 114 | gene_names.append(gene) 115 | strands.append(pos[2]) 116 | else: 117 | gene_names.append(gene) 118 | strands.append(pos[2]) 119 | if len(gene_names) == 0: 120 | gene_names.append(self.unknown) 121 | 122 | if len(set(strands)) == 1: 123 | strand = strands[0] 124 | elif stranded_library: 125 | strand = read_strand 126 | else: 127 | strand = self.unknown_strand 128 | return ",".join(gene_names), strand -------------------------------------------------------------------------------- /bin/convert_parquet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description="convert parquet to tsv") 8 | parser.add_argument("--tsv",help="name to save tsv") 9 | parser.add_argument("--dataname",help="Dataname/basename of the input file") 10 | args = parser.parse_args() 11 | return args 12 | 13 | def main(): 14 | args = get_args() 15 | full_df = pd.read_csv(args.tsv, sep = "\t") 16 | 17 | df = full_df[full_df['called'] == True] 18 | 19 | for i, x in df.groupby('chrR1A'): 20 | outname = "{}_{}.pq".format(i, args.dataname) 21 | x.to_parquet(outname) 22 | 23 | 24 | main() -------------------------------------------------------------------------------- /bin/final_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | import logging 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser(description="Create final summary file") 10 | parser.add_argument("--perm_pvals", help="Permutation pvalue file") 11 | parser.add_argument("--first_evec", help="First eigenvector file") 12 | parser.add_argument("--second_evec", help="Second eigenvector file") 13 | parser.add_argument("--third_evec", help="Third eigenvector file") 14 | parser.add_argument("--splizvd", help="SpliZVD file") 15 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 16 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. 
tissue, compartment)", default="dummy") 17 | parser.add_argument("--outname", help="Name of output file") 18 | parser.add_argument("--outname_log", help="Name of log file") 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | 24 | def main(): 25 | args = get_args() 26 | 27 | logging.basicConfig( 28 | filename = args.outname_log, 29 | format='%(asctime)s %(levelname)-8s %(message)s', 30 | level=logging.INFO, 31 | datefmt='%Y-%m-%d %H:%M:%S') 32 | 33 | logging.info("Starting") 34 | 35 | # load in data 36 | pval_df = pd.read_csv(args.perm_pvals, sep = "\t") 37 | 38 | splizsite_dfs = [] 39 | evec_files = [args.first_evec, args.second_evec, args.third_evec] 40 | for evec_file in evec_files: 41 | splizsite_dfs.append(pd.read_csv(evec_file, sep="\t")) 42 | splizsite_df = pd.concat(splizsite_dfs,axis=0).drop_duplicates() 43 | 44 | df = pd.read_csv(args.splizvd, sep="\t") 45 | if (args.grouping_level_1 == "tiss_comp") & (args.grouping_level_1 not in df.columns): 46 | df["tiss_comp"] = df[args.grouping_level_1] + df[args.grouping_level_2] 47 | elif args.grouping_level_1 == "dummy": 48 | df["dummy"] = "dummy" 49 | 50 | # combine outputs 51 | out_dict = {"gene" : [],"grouping_level_1" : [], "grouping_level_2" : [], "SpliZsites" : []} 52 | z_cols = ["scZ","svd_z0","svd_z1","svd_z2"] 53 | 54 | for z_col in z_cols: 55 | out_dict["{}_median".format(z_col)] = [] 56 | out_dict["{}_pval".format(z_col)] = [] 57 | 58 | for gene, gene_df in df.groupby("gene"): 59 | for tiss, tiss_df in gene_df.groupby(args.grouping_level_1): 60 | for ont, ont_df in tiss_df.groupby(args.grouping_level_2): 61 | out_dict["gene"].append(gene) 62 | out_dict["grouping_level_1"].append(tiss) 63 | out_dict["grouping_level_2"].append(ont) 64 | out_dict["SpliZsites"].append(",".join([str(x) for x in splizsite_df[splizsite_df["gene"] == gene]["end"]])) 65 | 66 | 67 | for z_col in z_cols: 68 | 69 | out_dict["{}_median".format(z_col)].append(ont_df[z_col].median()) 70 | try: 71 | pval = pval_df[(pval_df["gene"] == gene) & ((pval_df["grouping_level_1"] == tiss) | (pval_df["grouping_level_1"].isna()))]["perm_pval_adj_{}".format(z_col)].iloc[0] 72 | except: 73 | pval = np.nan 74 | out_dict["{}_pval".format(z_col)].append(pval) 75 | out_df = pd.DataFrame.from_dict(out_dict) 76 | out_df = out_df.sort_values(["gene","grouping_level_1","scZ_median"]) 77 | out_df.to_csv(args.outname, sep="\t", index=False) 78 | 79 | logging.info("Completed") 80 | 81 | main() -------------------------------------------------------------------------------- /bin/find_SpliZ_sites.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Written By : Roozbeh Dehghannasiri (rdehghan@stanford.edu) 4 | # this script takes the permutation file and finds the most variable splize sites for genes with permutation p-value <0.05 5 | # it finds up to 3 splice sites for each eigenvector (1st, 2nd, 3rd) 6 | #it creates three output files corresponding to the splice sites for each eigenvector 7 | 8 | library(data.table) 9 | library(Rfast) 10 | 11 | args <- commandArgs(trailingOnly = TRUE) 12 | p_value_file = args[1] 13 | first_evec_file = args[2] 14 | second_evec_file = args[3] 15 | third_evec_file = args[4] 16 | libraryType = args[5] 17 | mat_samplesheet = args[6] 18 | 19 | p_value = fread(p_value_file,sep="\t",header=TRUE) 20 | mat_paths = fread(mat_samplesheet,sep="\t",header=TRUE) 21 | 22 | ## I want to select the top 20 and top 50 genes with FDR < 0.05 23 | if (libraryType == "SS2") { 24 | p_value = 
p_value[perm_pval_adj_svd_z0<0.05] 25 | } 26 | 27 | 28 | 29 | topgenes = unique(p_value$gene) 30 | print(paste("number of genes to run",length(topgenes))) 31 | 32 | if (length(topgenes) == 0) { 33 | to_plot <- data.frame(matrix(ncol = 3, nrow = 0)) 34 | names(to_plot) = c("gene","let","end") 35 | write.table(to_plot, first_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 36 | write.table(to_plot, second_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 37 | write.table(to_plot, third_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 38 | 39 | } else { 40 | 41 | 42 | gene_to_plot = c() # I get these vectors to build a data table so that their dot plots can be made automatically 43 | coordinate_to_plot = c() 44 | let_to_plot = c() 45 | for (counter in 1:length(topgenes)){ 46 | gene = topgenes[counter] # name of the gene 47 | tryCatch({ 48 | # geneMat_file = paste(gene, ".geneMat", sep="") 49 | geneMat_file = mat_paths$path[mat_paths$gene == gene] 50 | 51 | loadings = fread(geneMat_file) 52 | loadings_sq = loadings[1,]^2 53 | top_site = names(loadings_sq)[loadings_sq==max(loadings_sq)] 54 | coordinate_to_plot = c(coordinate_to_plot,strsplit(top_site,split = "_")[[1]][1]) 55 | let_to_plot = c(let_to_plot,strsplit(top_site,split = "_")[[1]][2]) 56 | gene_to_plot = c(gene_to_plot,gene) 57 | 58 | # I copy for the second and third only if they have at least 10% of loadings 59 | if (Rfast::nth(as.matrix(loadings_sq), 2, descending = T) > 0.1){ 60 | second_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 2, descending = T)] 61 | coordinate_to_plot = c(coordinate_to_plot,strsplit(second_top_site,split = "_")[[1]][1]) 62 | let_to_plot = c(let_to_plot,strsplit(second_top_site,split = "_")[[1]][2]) 63 | gene_to_plot = c(gene_to_plot,gene) 64 | } 65 | if (Rfast::nth(as.matrix(loadings_sq), 3, descending = T) > 0.1){ 66 | third_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 3, descending = T)] 67 | coordinate_to_plot = c(coordinate_to_plot,strsplit(third_top_site,split = "_")[[1]][1]) 68 | let_to_plot = c(let_to_plot,strsplit(third_top_site,split = "_")[[1]][2]) 69 | gene_to_plot = c(gene_to_plot,gene) 70 | } 71 | 72 | top_site = "" 73 | second_top_site = "" 74 | third_top_site = "" 75 | },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) 76 | } 77 | to_plot = data.table(gene_to_plot,let_to_plot,coordinate_to_plot) 78 | names(to_plot) = c("gene","let","end") 79 | 80 | write.table(to_plot, first_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 81 | 82 | ############################## 83 | #### second eigen vector ##### 84 | ############################## 85 | 86 | gene_to_plot = c() # I get these vectors to build a data table so that their dot plots can be made automatically 87 | coordinate_to_plot = c() 88 | let_to_plot = c() 89 | for (counter in 1:length(topgenes)){ 90 | gene = topgenes[counter] # name of the gene 91 | tryCatch({ 92 | geneMat_file = mat_paths$path[mat_paths$gene == gene] 93 | 94 | 95 | loadings = fread(geneMat_file) 96 | loadings_sq = loadings[2,]^2 97 | top_site = names(loadings_sq)[loadings_sq==max(loadings_sq)] 98 | coordinate_to_plot = c(coordinate_to_plot,strsplit(top_site,split = "_")[[1]][1]) 99 | let_to_plot = c(let_to_plot,strsplit(top_site,split = "_")[[1]][2]) 100 | gene_to_plot = c(gene_to_plot,gene) 101 | 102 | # I copy for the second and third only if they have at least 10% of loadings 103 | if (Rfast::nth(as.matrix(loadings_sq), 2, descending = T) > 0.1){ 104 | 
second_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 2, descending = T)] 105 | coordinate_to_plot = c(coordinate_to_plot,strsplit(second_top_site,split = "_")[[1]][1]) 106 | let_to_plot = c(let_to_plot,strsplit(second_top_site,split = "_")[[1]][2]) 107 | gene_to_plot = c(gene_to_plot,gene) 108 | } 109 | if (Rfast::nth(as.matrix(loadings_sq), 3, descending = T) > 0.1){ 110 | third_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 3, descending = T)] 111 | coordinate_to_plot = c(coordinate_to_plot,strsplit(third_top_site,split = "_")[[1]][1]) 112 | let_to_plot = c(let_to_plot,strsplit(third_top_site,split = "_")[[1]][2]) 113 | gene_to_plot = c(gene_to_plot,gene) 114 | } 115 | 116 | top_site = "" 117 | second_top_site = "" 118 | third_top_site = "" 119 | },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) 120 | } 121 | to_plot = data.table(gene_to_plot,let_to_plot,coordinate_to_plot) 122 | names(to_plot) = c("gene","let","end") 123 | 124 | write.table(to_plot, second_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 125 | 126 | 127 | ############################## 128 | #### third eigen vector ##### 129 | ############################## 130 | 131 | gene_to_plot = c() # I get these vectors to build a data table so that their dot plots can be made automatically 132 | coordinate_to_plot = c() 133 | let_to_plot = c() 134 | for (counter in 1:length(topgenes)){ 135 | gene = topgenes[counter] # name of the gene 136 | tryCatch({ 137 | geneMat_file = mat_paths$path[mat_paths$gene == gene] 138 | 139 | 140 | loadings = fread(geneMat_file) 141 | loadings_sq = loadings[3,]^2 142 | top_site = names(loadings_sq)[loadings_sq==max(loadings_sq)] 143 | coordinate_to_plot = c(coordinate_to_plot,strsplit(top_site,split = "_")[[1]][1]) 144 | let_to_plot = c(let_to_plot,strsplit(top_site,split = "_")[[1]][2]) 145 | gene_to_plot = c(gene_to_plot,gene) 146 | 147 | # I copy for the second and third only if they have at least 10% of loadings 148 | if (Rfast::nth(as.matrix(loadings_sq), 2, descending = T) > 0.1){ 149 | second_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 2, descending = T)] 150 | coordinate_to_plot = c(coordinate_to_plot,strsplit(second_top_site,split = "_")[[1]][1]) 151 | let_to_plot = c(let_to_plot,strsplit(second_top_site,split = "_")[[1]][2]) 152 | gene_to_plot = c(gene_to_plot,gene) 153 | } 154 | if (Rfast::nth(as.matrix(loadings_sq), 3, descending = T) > 0.1){ 155 | third_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 3, descending = T)] 156 | coordinate_to_plot = c(coordinate_to_plot,strsplit(third_top_site,split = "_")[[1]][1]) 157 | let_to_plot = c(let_to_plot,strsplit(third_top_site,split = "_")[[1]][2]) 158 | gene_to_plot = c(gene_to_plot,gene) 159 | } 160 | 161 | 162 | },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) 163 | } 164 | to_plot = data.table(gene_to_plot,let_to_plot,coordinate_to_plot) 165 | names(to_plot) = c("gene","let","end") 166 | 167 | write.table(to_plot, third_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 168 | } -------------------------------------------------------------------------------- /bin/light_class_input_subcols.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import defaultdict 5 | import numpy as np 6 | import pandas as pd 7 | import pickle 8 | import pysam 9 | import annotator 10 | 
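# pysam is used below to stream alignments from the input BAM(s); the pickled annotator
# object provides chromosome/position -> gene-name lookups, and light_utils (star-imported
# next) is expected to provide the readObj_refname and chim_refName helpers that build the
# refName_AB junction identifiers.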
from light_utils import * 11 | from tqdm import tqdm 12 | 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--bams', nargs="+",required=True, help='bams to parse (either one or two for paired end)') 17 | parser.add_argument("--libraryType",help="Options: SS2, 10X, SLS") 18 | parser.add_argument("--annotator", required=True, help="the path to the annotator pickle file") 19 | parser.add_argument("--gtf", required=True, help="the path to the gtf file") 20 | parser.add_argument("--outname",help="Output file name") 21 | 22 | args = parser.parse_args() 23 | return args 24 | 25 | def extract_info_align(cellranger, CI_dict, bam_read, suffix, bam_file, ann, UMI_bar, stranded_library, spatial_bar, fill_char = np.nan, strand_dict={True : "-", False : "+"}): 26 | if UMI_bar: 27 | if cellranger: 28 | # print("CB",bam_read.has_tag("CB")) 29 | # print("UB",bam_read.has_tag("UB")) 30 | # print("UR",bam_read.has_tag("UR")) 31 | 32 | CI_dict["barcode"].append(bam_read.get_tag("CB")) 33 | try: 34 | CI_dict["UMI"].append(bam_read.get_tag("UB")) 35 | except: 36 | CI_dict["UMI"].append(bam_read.get_tag("UR")) 37 | 38 | else: 39 | vals = bam_read.query_name.split("_") 40 | CI_dict["barcode"].append(vals[-2]) 41 | CI_dict["UMI"].append(vals[-1]) 42 | elif spatial_bar: 43 | CI_dict["barcode"].append(bam_read.get_tag("XC")) 44 | CI_dict["UMI"].append(bam_read.get_tag("XM")) 45 | else: 46 | CI_dict["barcode"].append(fill_char) 47 | CI_dict["UMI"].append(fill_char) 48 | CI_dict["id"].append(bam_read.query_name) 49 | 50 | seqname = bam_file.get_reference_name(bam_read.tid) 51 | 52 | # if chromosome is numeric, prepend "chr" 53 | if str(seqname).isnumeric(): 54 | seqname = "chr" + str(seqname) 55 | 56 | refName, chrA, geneA, posA, chrB, geneB, posB = readObj_refname( 57 | strand_dict[bam_read.is_reverse], 58 | bam_read.cigarstring, 59 | seqname, 60 | bam_read.reference_start + 1, 61 | ann, 62 | fill_char, 63 | stranded_library 64 | ) 65 | CI_dict["refName_AB" + suffix].append(refName) 66 | CI_dict["chr{}A".format(suffix)].append(chrA) 67 | CI_dict["chr{}B".format(suffix)].append(chrB) 68 | CI_dict["gene{}A".format(suffix)].append(geneA) 69 | CI_dict["gene{}B".format(suffix)].append(geneB) 70 | CI_dict["juncPos{}A".format(suffix)].append(int(posA)) 71 | if np.isnan(posB): 72 | CI_dict["juncPos{}B".format(suffix)].append(posB) 73 | else: 74 | CI_dict["juncPos{}B".format(suffix)].append(int(posB)) 75 | strand_dict = {True : "-", False : "+"} 76 | CI_dict["read_strand{}".format(suffix)].append(strand_dict[bam_read.is_reverse]) 77 | 78 | 79 | CI_dict["primary{}".format(suffix)].append(not bam_read.is_secondary) 80 | 81 | empty_cols = [] 82 | for c in empty_cols: 83 | CI_dict[c].append(fill_char) 84 | return CI_dict 85 | 86 | def extract_info_chim(CI_dict,bam_read1,bam_read2,suffix, bam_file, ann, UMI_bar, stranded_library, fill_char = np.nan): 87 | assert bam_read1.query_name == bam_read2.query_name 88 | sec_dict = {True: 0, False: 1} 89 | if UMI_bar: 90 | vals = bam_read1.query_name.split("_") 91 | CI_dict["barcode"].append(vals[-2]) 92 | CI_dict["UMI"].append(vals[-1]) 93 | else: 94 | CI_dict["barcode"].append(fill_char) 95 | CI_dict["UMI"].append(fill_char) 96 | reads = [bam_read1,bam_read2] 97 | halves = ["A","B"] 98 | CI_dict["id"].append(bam_read1.query_name) 99 | 100 | refName, chrA, geneA, posA, chrB, geneB, posB = chim_refName([x.flag for x in reads], [x.cigarstring for x in reads], [x.reference_start + 1 for x in reads], [bam_file.get_reference_name(x.tid) for x in reads], 
ann, stranded_library) 101 | CI_dict["refName_AB" + suffix].append(refName) 102 | CI_dict["chr{}A".format(suffix)].append(chrA) 103 | CI_dict["chr{}B".format(suffix)].append(chrB) 104 | CI_dict["gene{}A".format(suffix)].append(geneA) 105 | CI_dict["gene{}B".format(suffix)].append(geneB) 106 | CI_dict["juncPos{}A".format(suffix)].append(int(posA)) 107 | CI_dict["juncPos{}B".format(suffix)].append(int(posB)) 108 | for i in range(2): 109 | 110 | CI_dict["primary{}{}".format(suffix,halves[i])].append(sec_dict[reads[i].is_secondary]) 111 | return CI_dict 112 | 113 | 114 | def get_final_df(cellranger, bam_files, j, suffixes, ann, UMI_bar, gtf, stranded_library, spatial_bar): 115 | 116 | CI_dfs = [] 117 | for i in range(len(bam_files)): 118 | if i == 1: 119 | read_ids = set(CI_dfs[0]["id"]) 120 | else: 121 | read_ids = set() 122 | suffix = suffixes[i] 123 | col_bases = [ "juncPos", "gene", "chr"] 124 | columns = ["id", "refName_AB" + suffix, "UMI", "barcode", "primary" + suffix, "read_strand" + suffix] 125 | for c in col_bases: 126 | for l in ["A", "B"]: 127 | columns.append("{}{}{}".format(c,suffix,l)) 128 | CI_dict = {c : [] for c in columns} 129 | count = 0 130 | first = False 131 | if i == 0: 132 | genomic_alignments = {} 133 | alignFile = pysam.AlignmentFile(bam_files[i]) 134 | # columns 135 | #for bam_read in tqdm(alignFile.fetch(until_eof=True)): 136 | for bam_read in (alignFile.fetch(until_eof=True)): 137 | # require CB if this is cell ranger 138 | if ((not cellranger) | ((bam_read.has_tag("CB") & (bam_read.cigarstring is not None)))): 139 | 140 | # make sure read is mapped 141 | if not bam_read.is_unmapped: 142 | if (i == 0) or (not bam_read.is_secondary and bam_read.query_name in read_ids): 143 | # it's a chimeric alignment and we need another line from it 144 | if bam_read.has_tag("ch") and not first: 145 | prev_read = bam_read 146 | first = True 147 | else: 148 | 149 | # add info from chimeric read 150 | if bam_read.has_tag("ch"): 151 | count += 1 152 | 153 | # note: removing chim for this test ONLY; uncomment after 154 | first = False 155 | 156 | # add info from align read 157 | elif "N" in bam_read.cigarstring: 158 | count += 1 159 | CI_dict = extract_info_align(cellranger, CI_dict, bam_read, suffix, alignFile, ann, UMI_bar, stranded_library, spatial_bar) 160 | 161 | # save genomic alignment information 162 | else: 163 | if i == 0: 164 | if bam_read.query_name not in genomic_alignments: 165 | genomic_alignments[bam_read.query_name] = bam_read.get_tag("AS") 166 | else: 167 | genomic_alignments[bam_read.query_name] = max(bam_read.get_tag("AS"), genomic_alignments[bam_read.query_name]) 168 | else: 169 | CI_dict = extract_info_align(cellranger, CI_dict, bam_read, suffix, alignFile, ann, UMI_bar, stranded_library, spatial_bar) 170 | 171 | CI_df = pd.DataFrame.from_dict(CI_dict) 172 | if i == 0: 173 | genomic_alignments = defaultdict(lambda: np.nan,genomic_alignments) 174 | 175 | CI_dfs.append(CI_df) 176 | if len(bam_files) == 2: 177 | final_df = pd.merge(left=CI_dfs[0],right=CI_dfs[1][[c for c in CI_dfs[1].columns if c not in ["UMI","barcode"]]],how="left",left_on="id",right_on="id") 178 | final_df["read_strand_compatible"] = 1 179 | final_df.loc[final_df["read_strandR1"] == final_df["read_strandR2"],"read_strand_compatible"] = 0 180 | final_df["location_compatible"] = final_df.apply(get_loc_flag,axis=1) 181 | else: 182 | final_df = CI_dfs[0] 183 | float_cols = ["primaryR1"] 184 | if len(bam_files) == 2: 185 | float_cols += ["juncPosR2A","juncPosR2B","primaryR2"] 186 | 187 | final_df = 
final_df[final_df["primaryR1"]] 188 | 189 | return final_df 190 | 191 | def main(): 192 | save = pysam.set_verbosity(0) 193 | 194 | args = get_args() 195 | 196 | bam_files = args.bams 197 | gtf = args.gtf 198 | 199 | annotator_path = args.annotator 200 | ann = pickle.load(open(annotator_path, "rb")) 201 | 202 | suffixes = ["R1","R2"] 203 | 204 | final_dfs = [] 205 | 206 | n_rounds = len(bam_files) 207 | 208 | if args.libraryType == '10X': 209 | UMI_bar = True 210 | stranded_library = False 211 | cellranger = True 212 | spatial_bar = False 213 | 214 | elif args.libraryType == 'SS2': 215 | UMI_bar = False 216 | stranded_library = False 217 | cellranger = False 218 | spatial_bar = False 219 | 220 | if args.libraryType == "SLS": 221 | UMI_bar = False 222 | stranded_library = False 223 | cellranger = False 224 | spatial_bar = True 225 | 226 | for j in range(n_rounds): 227 | if j == 1: 228 | bam_files.reverse() 229 | primary = get_final_df(cellranger, bam_files, j, suffixes, ann, UMI_bar, gtf, stranded_library, spatial_bar) 230 | final_dfs.append(primary) 231 | 232 | pd.concat(final_dfs, axis=0).reset_index(drop=True).to_parquet(args.outname) 233 | 234 | pysam.set_verbosity(save) 235 | 236 | 237 | 238 | main() 239 | -------------------------------------------------------------------------------- /bin/markdown_to_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import argparse 4 | import markdown 5 | import os 6 | import sys 7 | import io 8 | 9 | 10 | def convert_markdown(in_fn): 11 | input_md = io.open(in_fn, mode="r", encoding="utf-8").read() 12 | html = markdown.markdown( 13 | "[TOC]\n" + input_md, 14 | extensions=["pymdownx.extra", "pymdownx.b64", "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", "toc"], 15 | extension_configs={ 16 | "pymdownx.b64": {"base_path": os.path.dirname(in_fn)}, 17 | "pymdownx.highlight": {"noclasses": True}, 18 | "toc": {"title": "Table of Contents"}, 19 | }, 20 | ) 21 | return html 22 | 23 | 24 | def wrap_html(contents): 25 | header = """ 26 | 27 | 28 | 62 | 63 | 64 |
65 | """ 66 | footer = """ 67 |
68 | 69 | 70 | """ 71 | return header + contents + footer 72 | 73 | 74 | def parse_args(args=None): 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("mdfile", type=argparse.FileType("r"), nargs="?", help="File to convert. Defaults to stdin.") 77 | parser.add_argument( 78 | "-o", "--out", type=argparse.FileType("w"), default=sys.stdout, help="Output file name. Defaults to stdout." 79 | ) 80 | return parser.parse_args(args) 81 | 82 | 83 | def main(args=None): 84 | args = parse_args(args) 85 | converted_md = convert_markdown(args.mdfile.name) 86 | html = wrap_html(converted_md) 87 | args.out.write(html) 88 | 89 | 90 | if __name__ == "__main__": 91 | sys.exit(main()) 92 | -------------------------------------------------------------------------------- /bin/parquet_to_tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description="convert parquet to tsv") 8 | parser.add_argument("-p","--parquet",help="input parquet file") 9 | parser.add_argument("-o","--tsv",help="name to save tsv") 10 | parser.add_argument("--reverse",action="store_true",help="convert from tsv to pq instead") 11 | args = parser.parse_args() 12 | return args 13 | 14 | def main(): 15 | args = get_args() 16 | if args.reverse: 17 | df = pd.read_csv(args.tsv, sep = "\t") 18 | df.to_parquet(args.parquet) 19 | else: 20 | df = pd.read_parquet(args.parquet) 21 | df.to_csv(args.tsv, sep = "\t", index = False) 22 | 23 | main() -------------------------------------------------------------------------------- /bin/process_CI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | from pathlib import Path 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser(description="merge class input files") 9 | parser.add_argument("--input_file", help="Metadata file") 10 | parser.add_argument("--meta", help="Metadata file") 11 | parser.add_argument("--outname", help="Output file name") 12 | parser.add_argument("--libraryType") 13 | 14 | args = parser.parse_args() 15 | return args 16 | 17 | def main(): 18 | args = get_args() 19 | 20 | file_list = pd.read_csv(args.input_file, header=None, names=['sample_ID','file']) 21 | 22 | file_list['sample_ID'] = file_list['sample_ID'].map(lambda x: x.lstrip('[')) 23 | file_list['file'] = file_list['file'].map(lambda x: x.rstrip(']').lstrip(' ')) 24 | 25 | dfs = [] 26 | 27 | for index, row in file_list.iterrows(): 28 | sample_ID = row['sample_ID'] 29 | fn = Path(row['file']) 30 | df = pd.read_parquet(fn) 31 | 32 | # remove UMI duplicates by cell + junction 33 | df = df.drop_duplicates(["barcode","UMI","refName_ABR1"]) 34 | 35 | df["barcode_refName"] = df["barcode"].astype(str) + df["refName_ABR1"] 36 | 37 | # count number of lines corresponding to the junction in the cell 38 | barcode_name_vc = df["barcode_refName"].value_counts() 39 | df["numReads"] = df["barcode_refName"].map(barcode_name_vc) 40 | 41 | # deduplicate by cell + junction 42 | df = df.drop_duplicates(["refName_ABR1","barcode"]) 43 | 44 | # clean up barcode column 45 | 46 | if args.libraryType in ['10X',"SLS"]: 47 | df["barcode"] = df["barcode"].str.rstrip("-1") 48 | df["cell_id"] = sample_ID + "_" + df["barcode"].astype(str) 49 | elif args.libraryType == 'SS2': 50 | df['id'] = df['id'].str.split('.').str[0] 51 | df["cell_id"] = df["id"].astype(str) 52 | 
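# At this point each remaining row should represent one junction in one cell of this sample:
# numReads counts the deduplicated supporting reads/UMIs for that (cell, junction) pair, and
# cell_id ties the row to sample + barcode (10X/SLS) or to the read-derived id (SS2).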
53 | 54 | dfs.append(df) 55 | 56 | full_df = pd.concat(dfs) 57 | full_df["called"] = 1 58 | full_df["refName_newR1"] = full_df["refName_ABR1"] 59 | full_df.rename(columns={"geneR1A" : "geneR1A_uniq", "geneR1B" : "geneR1B_uniq"}, inplace=True) 60 | 61 | final_df = full_df[["refName_newR1","geneR1A_uniq","geneR1B_uniq", "juncPosR1A","juncPosR1B","chrR1A","chrR1B","numReads","cell_id"]] 62 | 63 | meta = pd.read_csv(args.meta, sep="\t") 64 | final_df.drop([x for x in final_df.columns if x in meta.columns and x != "cell_id"], inplace=True, axis=1) 65 | 66 | merged = final_df.merge(meta, left_on="cell_id", right_on="cell_id", how = "left") 67 | 68 | merged.rename(columns={'cell_id': 'cell'}, inplace=True) 69 | merged.to_parquet(args.outname) 70 | 71 | 72 | main() 73 | -------------------------------------------------------------------------------- /bin/rijk_zscore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import warnings 10 | import logging 11 | warnings.filterwarnings("ignore") 12 | 13 | def get_args(): 14 | parser = argparse.ArgumentParser(description="calculate splicing scores per gene/cell") 15 | parser.add_argument("--parquet", help="input parquet file") 16 | parser.add_argument("--pinning_S", type=float, help="pinning level for S_ijks") 17 | parser.add_argument("--pinning_z", type=float, help="pinning level for zs") 18 | parser.add_argument("--lower_bound", type=int, help="only include cell/gene pairs the have more than this many junctional reads for the gene") 19 | parser.add_argument("--isLight", help="if included, don't calculate extra columns (saves time)") 20 | parser.add_argument("--isSICILIAN", help="Is SICILIAN input file") 21 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 22 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. 
tissue, compartment)", default="dummy") 23 | parser.add_argument("--outname_pq", help="Name of output file") 24 | parser.add_argument("--outname_tsv", help="Name of output file") 25 | parser.add_argument("--outname_log", help="Name of log file") 26 | args = parser.parse_args() 27 | return args 28 | 29 | def prepare_df(df, let, rank_by_donor, rev_let, let_dict): 30 | 31 | # create donor identifier 32 | df["pos{}_group".format(let)] = df["junc{}".format(let)].astype(str) + df["gene"] 33 | df["rank_" + let_dict[let]] = df.groupby("pos{}_group".format(let))["junc{}".format(rev_let[let])].rank(method="dense") 34 | 35 | # remove consitutive splicing 36 | df["max_rank"] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["rank_" + let_dict[let]].max()) 37 | df = df[df["max_rank"] > 1] 38 | 39 | if not rank_by_donor: 40 | df["rank_" + let_dict[let]] = df.groupby("gene")["juncEnd"].rank(method="dense") 41 | 42 | return df 43 | 44 | def calc_Sijk(df,let, pinning_S, let_dict): 45 | # calculate the average rank calculation per gene 46 | # same as this calculation (this one's slower): df["rank_mean"] = df.groupby("pos{}_group".format(let)).apply(lambda x: (x["numReads"] * x["rank_acc"])/x["numReads"].sum()).reset_index(level=0,drop=True) 47 | 48 | # number of reads with this donor across all cells 49 | df["sum_reads_group"] = df.groupby("pos{}_group".format(let))["numReads"].transform("sum") 50 | 51 | df["read_x_" + let_dict[let]] = df["numReads"] * df["rank_" + let_dict[let]] 52 | 53 | # the sum of acceptors for all reads in all cells with this donor 54 | df["num"] = df.groupby("pos{}_group".format(let))["read_x_" + let_dict[let]].transform("sum") 55 | 56 | # average acceptor for a read with this donor (donor has one value for this) 57 | df["rank_mean"]= df["num"] / df["sum_reads_group"] 58 | 59 | # sum squared difference in rank for ever read 60 | df["sq_diff"] = df["numReads"] * (df["rank_" + let_dict[let]] - df["rank_mean"])**2 61 | 62 | # Get the sum of these squared differences for each donor 63 | df["don_num"] = df.groupby("pos{}_group".format(let))["sq_diff"].transform("sum") 64 | 65 | # sum of squared differences normalized by total number of reads 66 | # changed to make it the sample standard deviation (added minus 1) 67 | df["don_sigma"] = df["don_num"] / (df["sum_reads_group"]) 68 | 69 | # this is the S_ijk value (difference normalized by sd) - should be normal 0/1 70 | df["S_ijk_{}".format(let)] = (df["rank_" + let_dict[let]] - df["rank_mean"])/np.sqrt(df["don_sigma"]) 71 | 72 | # round outlying S values 73 | low_quant = df["S_ijk_{}".format(let)].quantile(q=pinning_S) 74 | high_quant = df["S_ijk_{}".format(let)].quantile(q=1 - pinning_S) 75 | df["S_ijk_{}_unpinned".format(let)] = df["S_ijk_{}".format(let)] 76 | 77 | df.loc[df["S_ijk_{}".format(let)] < low_quant,"S_ijk_{}".format(let)] = low_quant 78 | df.loc[df["S_ijk_{}".format(let)] > high_quant,"S_ijk_{}".format(let)] = high_quant 79 | 80 | # correct for those with no variance 81 | df.loc[df["don_sigma"] == 0, "S_ijk_{}".format(let)] = 0 82 | df["n_sijk"] = df["numReads"] 83 | df.loc[df["don_sigma"] == 0,"n_sijk"] = 0 84 | 85 | return df 86 | 87 | def normalize_Sijks(df,let): 88 | 89 | # calculate mean of SijkA's per gene 90 | df["n_s"] = df["numReads"] * df["S_ijk_" + let] 91 | df["num"] = df.groupby("gene")["n_s"].transform("sum") 92 | df["n_gene"] = df.groupby("gene")["numReads"].transform("sum") 93 | df["sijk{}_mean".format(let)] = df["num"] / df["n_gene"] 94 | 95 | # calculate standard deviation of SijkA's 
per gene 96 | df["sd_num"] = df["numReads"] * (df["S_ijk_" + let] - df["sijk{}_mean".format(let)])**2 97 | df["num"] = df.groupby("gene")["sd_num"].transform("sum") 98 | df["sijk{}_var".format(let)] = df["num"] / df["n_gene"] 99 | 100 | return df 101 | 102 | def contains_required_cols(df, required_cols, grouping_level_2, grouping_level_1): 103 | 104 | # Function to check if the input file contains the required columns for processing 105 | 106 | required_cols.append(grouping_level_2) 107 | if grouping_level_1.lower() != "dummy": 108 | required_cols.append(grouping_level_1) 109 | 110 | set_req = set(required_cols) 111 | set_df = set(list(df.columns)) 112 | 113 | print(set_req) 114 | print(set_df) 115 | 116 | if set_df.issuperset(set_req): 117 | return True, required_cols 118 | else: 119 | return False, required_cols 120 | 121 | def main(): 122 | args = get_args() 123 | light = bool(int(args.isLight)) 124 | SICILIAN = bool(int(args.isSICILIAN)) 125 | 126 | logging.basicConfig( 127 | filename = args.outname_log, 128 | format='%(asctime)s %(levelname)-8s %(message)s', 129 | level=logging.INFO, 130 | datefmt='%Y-%m-%d %H:%M:%S') 131 | 132 | logging.info("Starting") 133 | 134 | let_dict = {"Start" : "acc", "End" : "don"} 135 | 136 | logging.info("Begin reading in parquet") 137 | 138 | df = pd.read_parquet(args.parquet) 139 | 140 | logging.info("Finished reading in parquet") 141 | 142 | logging.info("Input column check") 143 | 144 | if not SICILIAN: 145 | df["called"] = 1 146 | 147 | base_required_cols = ["juncPosR1A", "geneR1A_uniq", "juncPosR1B", "numReads", "cell", "splice_ann", "refName_newR1", "called", "chrR1A"] 148 | passes_input_check, required_cols = contains_required_cols(df, base_required_cols, args.grouping_level_2, args.grouping_level_1) 149 | if passes_input_check: 150 | logging.info("Passed input column check") 151 | else: 152 | logging.exception("Failed input column check! 
Exiting") 153 | sys.exit(1) 154 | 155 | df = df[required_cols] 156 | 157 | logging.info("Rename SICILIAN columns") 158 | 159 | cols_dict = { 160 | "geneR1A_uniq": "gene", 161 | "juncPosR1A": "juncStart", 162 | "juncPosR1B": "juncEnd" 163 | } 164 | df.rename(columns=cols_dict, inplace=True) 165 | 166 | if "missing_domains" in df.columns and not light: 167 | domain_breakdown = True 168 | else: 169 | domain_breakdown = False 170 | 171 | df.reset_index(drop=True,inplace=True) 172 | rank_by_donor = True 173 | 174 | if SICILIAN: 175 | df = df[df["called"] == 1] 176 | else: 177 | # only include junctions with more than 1 read in the dataset 178 | df["numReads_tot"] = df.groupby("refName_newR1")["numReads"].transform("sum") 179 | df = df[df["numReads_tot"] > 1] 180 | 181 | # use second location gene name if first is unknown 182 | 183 | df["geneR1B_uniq"] = df["refName_newR1"].str.split("|").str[1].str.split(":").str[1] 184 | idx = df[(df["gene"].isin(["unknown",""])) | (df["gene"].isna())].index 185 | df.loc[idx,"gene"] = df.loc[idx,"geneR1B_uniq"] 186 | 187 | bin_size = 100000 188 | # bin unknown genes 189 | idx = df[(df["gene"] == "") | (df["gene"] == "unknown") | (df["gene"].isna())].index 190 | df.loc[idx,"gene"] = "unknown_" + df["chrR1A"].astype(str) + "_" + (df.loc[idx]["juncStart"] - df.loc[idx]["juncStart"] % bin_size).astype(str) 191 | 192 | logging.info("Replace with geneR1B") 193 | 194 | # get sign of gene to adjust z score 195 | sign_df = df.drop_duplicates("gene") 196 | sign_df["strandA"] = sign_df["refName_newR1"].str.split("|").str[0].str.split(":").str[3] 197 | sign_df["strandB"] = sign_df["refName_newR1"].str.split("|").str[1].str.split(":").str[3] 198 | idx = sign_df[sign_df["strandA"] == "?"].index 199 | sign_df.loc[idx,"strandA"] = sign_df.loc[idx,"strandB"] 200 | sign_df["sign"] = 1 201 | sign_df.loc[sign_df["strandA"] == "-","sign"] = -1 202 | sign_df[["gene","strandA","sign"]] 203 | sign_dict = pd.Series(sign_df.sign.values,index=sign_df.gene).to_dict() 204 | df["sign"] = df["gene"].map(sign_dict).fillna(1) 205 | 206 | logging.info("Get sign") 207 | 208 | df["cell_gene"] = df["cell"] + df["gene"] 209 | 210 | rev_let = {"Start" : "End", "End" : "Start"} 211 | 212 | if domain_breakdown: 213 | split_dict = {True : ["ann", "dom_ch"], False : ["unann", "dom_unch"]} 214 | else: 215 | split_dict = {True : ["ann"], False : ["unann"]} 216 | 217 | # remove constitutive splicing 218 | df["posA_group"] = df["juncStart"].astype(str) + df["gene"] 219 | df["posB_group"] = df["juncEnd"].astype(str) + df["gene"] 220 | 221 | df["rank_acc"] = df.groupby("posA_group")["juncEnd"].rank(method="dense") 222 | df["rank_don"] = df.groupby("posB_group")["juncStart"].rank(method="dense") 223 | 224 | df["max_rank_acc"] = df["posA_group"].map(df.groupby("posA_group")["rank_acc"].max()) 225 | df["max_rank_don"] = df["posB_group"].map(df.groupby("posB_group")["rank_don"].max()) 226 | 227 | # add domain columns 228 | letters = ["Start", "End"] 229 | for let in letters: 230 | 231 | if domain_breakdown: 232 | df["num_missing_" + let] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["missing_domains"].nunique()) 233 | df["num_inserted_" + let] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["domain_insertions"].nunique()) 234 | df["domain_changed_" + let] = (df["num_missing_" + let] + df["num_inserted_" + let]) > 0 235 | 236 | 237 | df = df[(df["max_rank_don"] > 1) | (df["max_rank_acc"] > 1)] 238 | 239 | logging.info("Remove constitutive") 240 | 241 | # 
require at least args.lower_bound nonconstitutive spliced reads 242 | df["noncon_count"] = df.groupby("cell_gene")["numReads"].transform("sum") 243 | df = df[df["noncon_count"] > args.lower_bound] 244 | 245 | full_df = df.copy() 246 | 247 | calc_dfs = {} 248 | 249 | for let in tqdm(letters): 250 | df = full_df 251 | # create donor identifier 252 | df = prepare_df(df, let, rank_by_donor, rev_let, let_dict) 253 | 254 | logging.info("Prepare df") 255 | df = calc_Sijk(df,let,args.pinning_S, let_dict) 256 | 257 | logging.info("Calculate Sijk") 258 | 259 | df = normalize_Sijks(df,let) 260 | 261 | logging.info("Normalize Sijk") 262 | 263 | # remove those with variance == 0 264 | df = df[df["sijk{}_var".format(let)] != 0] 265 | 266 | # calculate z score 267 | df["n.g_" + let] = df.groupby("cell_gene")["numReads"].transform("sum") 268 | 269 | df["nSijk" + let] = (df["S_ijk_" + let] - df["sijk{}_mean".format(let)]) / np.sqrt(df["sijk{}_var".format(let)]) 270 | df["mult"] = df["numReads"] * df["nSijk" + let] / np.sqrt(df["n.g_" + let]) 271 | df["z_" + let] = df["sign"] * df.groupby("cell_gene")["mult"].transform("sum") 272 | df["scaled_z_" + let] = df["z_" + let] / np.sqrt(df["n.g_" + let]) 273 | 274 | logging.info("Calc z") 275 | 276 | ############## end modify Sijk #################### 277 | df["cell_gene_junc"] = df["cell_gene"] + df["refName_newR1"] 278 | 279 | if not light: 280 | # calculate the z score 281 | df["x_sijk"] = df["S_ijk_{}".format(let)] * df["n_sijk"] 282 | 283 | df["num"] = df.groupby("cell_gene")["x_sijk"].transform("sum") 284 | df["denom_sq"] = df.groupby("cell_gene")["n_sijk"].transform("sum") 285 | 286 | # get junction that "contributes the most" to the z score 287 | df["temp"] = df["x_sijk"] / np.sqrt(df["denom_sq"]) 288 | df["temp_mag"] = abs(df["temp"]) 289 | df["idxmax_z"] = df["cell_gene"].map(df.groupby("cell_gene")["temp_mag"].idxmax()) 290 | map_df = df.loc[df["idxmax_z"],["cell_gene","refName_newR1","temp"]] 291 | df["junc_max_{}".format(let)] = df["cell_gene"].map(pd.Series(map_df.refName_newR1.values,index=map_df.cell_gene).to_dict()) 292 | df["max_don_z_{}".format(let)] = df["cell_gene"].map(pd.Series(map_df.temp.values,index=map_df.cell_gene).to_dict()) 293 | 294 | if args.pinning_z != 0: 295 | # round outlying z values 296 | low_quant = df["z_{}".format(let)].quantile(q=args.pinning_z) 297 | high_quant = df["z_{}".format(let)].quantile(q=1 - args.pinning_z) 298 | 299 | df.loc[df["z_{}".format(let)] < low_quant,"z_{}".format(let)] = low_quant 300 | df.loc[df["z_{}".format(let)] > high_quant,"z_{}".format(let)] = high_quant 301 | 302 | if not light: 303 | # break down z score by annotation 304 | for k,v in split_dict.items(): 305 | df["num_{}".format(v[0])] = df["cell_gene"].map(df[df["splice_ann"] == k].groupby("cell_gene")["x_sijk"].sum()) 306 | 307 | if domain_breakdown: 308 | df["num_{}".format(v[1])] = df["cell_gene"].map(df[df["domain_changed_" + let] == k].groupby("cell_gene")["x_sijk"].sum()) 309 | 310 | for y in v: 311 | 312 | df["z_{}_{}".format(let,y)] = df["sign"] * df["num_{}".format(y)]/np.sqrt(df["denom_sq"]) 313 | 314 | # round outlying z values 315 | low_quant = df["z_{}_{}".format(let,y)].quantile(q=args.pinning_z) 316 | high_quant = df["z_{}_{}".format(let,y)].quantile(q=1 - args.pinning_z) 317 | 318 | df.loc[df["z_{}_{}".format(let,y)] < low_quant,"z_{}_{}".format(let,y)] = low_quant 319 | df.loc[df["z_{}_{}".format(let,y)] > high_quant,"z_{}_{}".format(let,y)] = high_quant 320 | 321 | calc_dfs[let] = df 322 | 323 | df = 
calc_dfs["Start"].merge(calc_dfs["End"],on="cell_gene_junc",how="outer",suffixes=("","_x")) 324 | 325 | logging.info("Merged") 326 | 327 | for cx in [x for x in df.columns if x.endswith("_x")]: 328 | c = cx[:-2] 329 | df.loc[df[c].isna(),c] = df.loc[df[c].isna(),cx] 330 | 331 | df.drop([x for x in df.columns if x.endswith("_x")],inplace=True,axis=1) 332 | 333 | # average two scores (negate one of them) 334 | 335 | grouped = df.groupby('gene') 336 | for let in letters: 337 | z_dict = pd.Series(calc_dfs[let]["z_" + let].values,index=calc_dfs[let].cell_gene).to_dict() 338 | df["z_" + let] = df["cell_gene"].map(z_dict) 339 | scz_dict = pd.Series(calc_dfs[let]["scaled_z_" + let].values,index=calc_dfs[let].cell_gene).to_dict() 340 | df["scaled_z_" + let] = df["cell_gene"].map(scz_dict) 341 | 342 | df["cov"] = df["gene"].map(grouped.apply(lambda x: x['z_Start'].cov(x['z_End']))) 343 | 344 | idx = df[df["z_Start"].isna()].index 345 | df.loc[idx,"z"] = -df.loc[idx,"z_End"] 346 | df.loc[idx,"scZ"] = -df.loc[idx,"scaled_z_End"] 347 | 348 | idx = df[df["z_End"].isna()].index 349 | df.loc[idx,"z"] = df.loc[idx,"z_Start"] 350 | df.loc[idx,"scZ"] = df.loc[idx,"scaled_z_Start"] 351 | 352 | idx = df[(~df["z_Start"].isna()) & (~df["z_End"].isna())].index 353 | df.loc[idx,"z"] = (df.loc[idx,"z_Start"] - df.loc[idx,"z_End"])/np.sqrt(2 ) 354 | df.loc[idx,"scZ"] = (df.loc[idx,"scaled_z_Start"] - df.loc[idx,"scaled_z_End"])/np.sqrt(2 ) 355 | 356 | logging.info("Avg z") 357 | 358 | if not light: 359 | # average two scores for split z 360 | for v in split_dict.values(): 361 | for y in v: 362 | grouped = df.groupby('gene') 363 | df["cov_{}".format(y)] = df["gene"].map(grouped.apply(lambda x: x['z_Start_{}'.format(y)].cov(x['z_End_{}'.format(y)]))) 364 | 365 | idx = df[df["z_Start_{}".format(y)].isna()].index 366 | df.loc[idx,"z_{}".format(y)] = -df.loc[idx,"z_End_{}".format(y)] 367 | 368 | idx = df[df["z_End_{}".format(y)].isna()].index 369 | df.loc[idx,"z_{}".format(y)] = df.loc[idx,"z_Start_{}".format(y)] 370 | 371 | idx = df[(~df["z_Start_{}".format(y)].isna()) & (~df["z_End_{}".format(y)].isna())].index 372 | df.loc[idx,"z_{}".format(y)] = (df.loc[idx,"z_Start_{}".format(y)] - df.loc[idx,"z_End_{}".format(y)])/np.sqrt(2) - df["cov_{}".format(y)] 373 | 374 | df["ontology"] = df[args.grouping_level_1] + df[args.grouping_level_2] 375 | 376 | df["n.g"] = df.groupby("cell_gene")["numReads"].transform("sum") 377 | df["scaled_z"] = df["z"] / np.sqrt(df["n.g"]) 378 | 379 | for let in letters: 380 | df["zcontrib" + let] = df["numReads"] * df["nSijk" + let] / np.sqrt(df["n.g"]) 381 | 382 | sub_cols = ["cell", "gene", "ontology", "scZ", "n.g_Start", "n.g_End"] 383 | sub_cols.append(args.grouping_level_2) 384 | if args.grouping_level_1.lower() != "dummy": 385 | sub_cols.append(args.grouping_level_1) 386 | 387 | df.drop_duplicates("cell_gene")[sub_cols].to_csv(args.outname_tsv, index=False, sep="\t") 388 | df.to_parquet(args.outname_pq) 389 | 390 | logging.info("Wrote files") 391 | 392 | logging.info("Completed") 393 | 394 | try: 395 | exit(main()) 396 | except Exception: 397 | logging.exception("Exception in main(): ") 398 | exit(1) -------------------------------------------------------------------------------- /bin/scrape_software_versions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from collections import OrderedDict 4 | import re 5 | 6 | # TODO nf-core: Add additional regexes for new tools in process 
get_software_versions 7 | regexes = { 8 | "nf-core/spliz": ["v_pipeline.txt", r"(\S+)"], 9 | "Nextflow": ["v_nextflow.txt", r"(\S+)"], 10 | "FastQC": ["v_fastqc.txt", r"FastQC v(\S+)"], 11 | "MultiQC": ["v_multiqc.txt", r"multiqc, version (\S+)"], 12 | } 13 | results = OrderedDict() 14 | results["nf-core/spliz"] = 'N/A' 15 | results["Nextflow"] = 'N/A' 16 | results["FastQC"] = 'N/A' 17 | results["MultiQC"] = 'N/A' 18 | 19 | # Search each file using its regex 20 | for k, v in regexes.items(): 21 | try: 22 | with open(v[0]) as x: 23 | versions = x.read() 24 | match = re.search(v[1], versions) 25 | if match: 26 | results[k] = "v{}".format(match.group(1)) 27 | except IOError: 28 | results[k] = False 29 | 30 | # Remove software set to false in results 31 | for k in list(results): 32 | if not results[k]: 33 | del results[k] 34 | 35 | # Dump to YAML 36 | print( 37 | """ 38 | id: 'software_versions' 39 | section_name: 'nf-core/spliz Software Versions' 40 | section_href: 'https://github.com/nf-core/spliz' 41 | plot_type: 'html' 42 | description: 'are collected at run time from the software output.' 43 | data: | 44 |
<dl class="dl-horizontal"> 45 | """ 46 | ) 47 | for k, v in results.items(): 48 | print("        <dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v)) 49 | print("    </dl>
") 50 | 51 | # Write out regexes as csv file: 52 | with open("software_versions.csv", "w") as f: 53 | for k, v in results.items(): 54 | f.write("{}\t{}\n".format(k, v)) 55 | -------------------------------------------------------------------------------- /bin/svd_zscore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import linalg 7 | from tqdm import tqdm 8 | import os 9 | import logging 10 | 11 | def get_args(): 12 | parser = argparse.ArgumentParser(description="calculate splicing scores per gene/cell") 13 | parser.add_argument("--input", help="Name of the input file from rijk_zscore") 14 | parser.add_argument("--svd_type", choices=["normgene","normdonor"], help="Method of calculating matrix before SVD") 15 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 16 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. tissue, compartment)", default="dummy") 17 | parser.add_argument("--outname_pq", help="Name of output file") 18 | parser.add_argument("--outname_tsv", help="Name of output File") 19 | parser.add_argument("--outname_log", help="Name of output File") 20 | args = parser.parse_args() 21 | return args 22 | 23 | def main(): 24 | args = get_args() 25 | 26 | logging.basicConfig( 27 | filename = args.outname_log, 28 | format='%(asctime)s %(levelname)-8s %(message)s', 29 | level=logging.INFO, 30 | datefmt='%Y-%m-%d %H:%M:%S') 31 | 32 | logging.info("Beginning calculation") 33 | logging.info("Read in parquet file") 34 | 35 | df = pd.read_parquet(args.input) 36 | 37 | ##### PERFORM SVD ZSCORE CALCULATION ##### 38 | 39 | logging.info("Perform SVD zscore calculation") 40 | 41 | letters = ["Start", "End"] 42 | 43 | if args.svd_type == "normgene": 44 | zcontrib_col = "zcontrib" 45 | elif args.svd_type == "normdonor": 46 | 47 | for let in letters: 48 | # find number of reads per donor (or acceptor) per cell 49 | df["cell_gene_pos" + let] = df["cell_gene"] + df["junc" + let].astype(str) 50 | df["n.g_pos" + let] = df.groupby("cell_gene_pos" + let)["numReads"].transform("sum") 51 | # normalize on a donor/acceptor rather than a gene basis 52 | # TRY OUT NOT SQRT-ING denominator as normalization 53 | df["zcontrib_posnorm" + let] = df["numReads"] * df["nSijk" + let] / df["n.g_pos" + let] 54 | 55 | zcontrib_col = "zcontrib_posnorm" 56 | 57 | for let in letters: 58 | 59 | # replace NANs with zeros 60 | df["zcontrib{}_rep".format(let)] = df[zcontrib_col + let].fillna(0) 61 | 62 | # create label for each junction + donor/acceptor 63 | df["str_junc" + let] = df["junc" + let].astype(int).astype(str) + "_" + let 64 | df["cell_gene_pos" + let] = df["cell"] + df["gene"] + df["junc" + let].astype(str) 65 | 66 | # get sum of zcontribs for the given cell and splice site 67 | df["summed_zcontrib" + let] = df.groupby("cell_gene_pos" + let)["zcontrib{}_rep".format(let)].transform('sum') 68 | 69 | k = 3 # number of components to include 70 | loads = {"f{}".format(i) : {} for i in range(k)} 71 | zs = {"svd_z{}".format(i) : {} for i in range(k)} 72 | 73 | logging.info("Iterate over each gene") 74 | for gene, gene_df in tqdm(df.groupby("gene")): 75 | 76 | # get zcontrib matrix 77 | gene_mats = [] 78 | for let in letters: 79 | gene_mat = gene_df.drop_duplicates("cell_gene_pos" + 
let).pivot_table(index="cell_gene",columns="str_junc{}".format(let),values="summed_zcontrib" + let,fill_value=0) 80 | 81 | gene_mats.append(gene_mat) 82 | gene_mat = gene_mats[0].merge(gene_mats[1],on="cell_gene") 83 | 84 | # mean-normalize the rows 85 | gene_mat = gene_mat.subtract(gene_mat.mean(axis=1),axis=0) 86 | 87 | # calculate svd 88 | u, s, vh = linalg.svd(gene_mat,check_finite=False,full_matrices=False) 89 | 90 | if len(s) >= k: 91 | # calculate new z scores based on svd 92 | new_zs = gene_mat.dot(np.transpose(vh[:k,:])) 93 | 94 | # calculate load on each component 95 | load = np.square(s)/sum(np.square(s)) 96 | 97 | # save new zs and fs in dictionaries to save later 98 | for i in range(k): 99 | loads["f{}".format(i)][gene] = load[i] 100 | zs["svd_z{}".format(i)].update(pd.Series(new_zs[i].values,index=new_zs.index).to_dict()) 101 | 102 | # save loadings 103 | v_out = pd.DataFrame(vh,columns=gene_mat.columns) 104 | #gene_mat_name = "{}_{}_{}.geneMat".format(gene, args.dataname, args.param_stem) 105 | gene_mat_name = "{}.geneMat".format(gene) 106 | v_out.to_csv(gene_mat_name, index=False, sep = "\t") 107 | 108 | for i in range(k): 109 | df["f{}".format(i)] = df["gene"].map(loads["f{}".format(i)]) 110 | df["svd_z{}".format(i)] = df["cell_gene"].map(zs["svd_z{}".format(i)]) 111 | 112 | df["svd_z_sumsq"] = (df[["svd_z{}".format(i) for i in range(k)]]**2).sum(axis=1) 113 | 114 | sub_cols = ["cell","gene","scZ","svd_z_sumsq","n.g_Start","n.g_End"] + ["f{}".format(i) for i in range(k)] + ["svd_z{}".format(i) for i in range(k)] #+ velocity_cols 115 | if "ontology" in df.columns: 116 | sub_cols = sub_cols + [args.grouping_level_1, args.grouping_level_2, "ontology"] 117 | 118 | logging.info("Write out files") 119 | 120 | df.drop_duplicates("cell_gene")[sub_cols].to_csv(args.outname_tsv, index=False, sep="\t") 121 | df.to_parquet(args.outname_pq) 122 | 123 | logging.info("Completed") 124 | 125 | try: 126 | exit(main()) 127 | except Exception: 128 | logging.exception("Exception in main(): ") 129 | exit(1) 130 | -------------------------------------------------------------------------------- /bin/variance_adjusted_permutations_bytiss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | from tqdm import tqdm 8 | import logging 9 | from statsmodels.stats.multitest import multipletests 10 | 11 | def get_args(): 12 | parser = argparse.ArgumentParser(description="calculate p values based on Romano method") 13 | parser.add_argument("--input", help="Name of the input file from svd_zscore") 14 | parser.add_argument("--num_perms", type=int,help="number of permutations to run for") 15 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 16 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. 
tissue, compartment)", default="dummy") 17 | parser.add_argument("--outname_all_pvals", help="Name of output file") 18 | parser.add_argument("--outname_perm_pvals", help="Name of output File") 19 | parser.add_argument("--outname_log", help="Name of log File") 20 | args = parser.parse_args() 21 | return args 22 | 23 | 24 | def calc_pval(var_df): 25 | 26 | # calculate the inner sum that's subtracted 27 | num = 0 28 | denom = 0 29 | for index, row in var_df.iterrows(): 30 | num += row["num_cells_ont"]*row["ont_median"]/row["ont_var"] 31 | denom += row["num_cells_ont"]/row["ont_var"] 32 | const = num/denom 33 | 34 | # calculate the outer sum 35 | sum_vals = 0 36 | for index, row in var_df.iterrows(): 37 | sum_vals += (row["num_cells_ont"]/row["ont_var"])*(row["ont_median"] - const)**2 38 | 39 | # return the chi^2 p value and the chi^2 statistic 40 | return 1 - stats.chi2.cdf(sum_vals , var_df.shape[0] - 1), sum_vals 41 | 42 | def get_var_df(sub_df, z_col, adj_var, grouping_level_2): 43 | 44 | sub_df["num_cells_ont"] = sub_df[grouping_level_2].map(sub_df.groupby(grouping_level_2)["cell"].nunique()) 45 | sub_df["ont_median"] = sub_df[grouping_level_2].map(sub_df.groupby(grouping_level_2)[z_col].median()) 46 | sub_df["ont_var"] = sub_df[grouping_level_2].map(sub_df.groupby(grouping_level_2)[z_col].var()) 47 | 48 | var_df = sub_df.drop_duplicates(grouping_level_2)[[grouping_level_2,"ont_median","num_cells_ont","ont_var"]] 49 | 50 | # don't need to remove cell types with variance 0 when we're adjusting variance 51 | if not adj_var: 52 | 53 | # remove ontologies with zero variance 54 | var_df = var_df[var_df["ont_var"] > 0] 55 | return var_df 56 | 57 | def main(): 58 | np.random.seed(123) 59 | alpha = 0.05 60 | 61 | args = get_args() 62 | 63 | logging.basicConfig( 64 | filename = args.outname_log, 65 | format='%(asctime)s %(levelname)-8s %(message)s', 66 | level=logging.INFO, 67 | datefmt='%Y-%m-%d %H:%M:%S') 68 | 69 | logging.info("Starting") 70 | 71 | df_cols = ["gene", "cell", "scZ", "svd_z0", "svd_z1", "svd_z2", "cell_gene", "f0", "f1", "f2"] 72 | 73 | if args.grouping_level_1.lower() != "dummy": 74 | df_cols.append(args.grouping_level_2) 75 | df_cols.append(args.grouping_level_1) 76 | else: 77 | df_cols.append(args.grouping_level_2) 78 | 79 | df = pd.read_parquet( 80 | args.input, 81 | columns=df_cols 82 | ) 83 | df = df.drop_duplicates("cell_gene") 84 | 85 | if args.grouping_level_1 == "dummy": 86 | df["dummy"] = "null" 87 | df["tiss_comp"] = df[args.grouping_level_1].astype(str) + df[args.grouping_level_2].astype(str) 88 | 89 | # subset to ontologies with > 20 cells 90 | df["ontology_gene"] = df[args.grouping_level_2].astype(str) + df["gene"] 91 | df["num_ont_gene"] = df["ontology_gene"].map(df.groupby("ontology_gene")["cell_gene"].nunique()) 92 | df = df[df["num_ont_gene"] > 10] 93 | 94 | z_cols = ["scZ","svd_z0","svd_z1","svd_z2"] 95 | out = {"pval" : [], "gene" : [], "num_onts" : [],"z_col" : [],"max_abs_median" : [], "Tn1" : [], "grouping_level_1" : []} 96 | 97 | var_adj = 0.1 98 | adj_var = True 99 | 100 | perm_pval = True 101 | 102 | if perm_pval: 103 | out["perm_pval"] = [] 104 | 105 | df["dummy"] = "null" 106 | for tiss, tiss_df in df.groupby(args.grouping_level_1): 107 | for gene, sub_df in tqdm(tiss_df.groupby("gene")): 108 | 109 | for z_col in z_cols: 110 | 111 | var_df = get_var_df(sub_df, z_col, adj_var, args.grouping_level_2) 112 | 113 | if var_df.shape[0] > 1: 114 | if adj_var: 115 | var_df["ont_var"] = var_df["ont_var"] + var_adj 116 | pval, Tn1 = calc_pval(var_df) 117 | 
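# Tn1 is the variance-adjusted heterogeneity statistic computed in calc_pval above:
#   Tn1 = sum_k (n_k / var_k) * (median_k - c)^2, with c = sum_k (n_k * median_k / var_k) / sum_k (n_k / var_k),
# where k runs over the grouping_level_2 groups for this gene; pval compares Tn1 to a
# chi-square distribution with (number of groups - 1) degrees of freedom.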
out["pval"].append(pval) 118 | out["Tn1"].append(Tn1) 119 | 120 | out["gene"].append(gene) 121 | out["num_onts"].append(var_df.shape[0]) 122 | out["z_col"].append(z_col) 123 | out["max_abs_median"].append((var_df["ont_median"].abs()).max()) 124 | out["grouping_level_1"].append(tiss) 125 | 126 | if perm_pval: 127 | sub_df_perm = sub_df.copy() 128 | if (pval < alpha): 129 | Tn1_dist = [] 130 | # for i in range(args.num_perms): 131 | while len(Tn1_dist) < args.num_perms: 132 | sub_df_perm[args.grouping_level_2] = np.random.permutation(sub_df_perm[args.grouping_level_2]) 133 | var_df = get_var_df(sub_df_perm, z_col, adj_var, args.grouping_level_2) 134 | if var_df.shape[0] > 1: 135 | if adj_var: 136 | var_df["ont_var"] = var_df["ont_var"] + var_adj 137 | pval, Tn1_perm = calc_pval(var_df) 138 | Tn1_dist.append(Tn1_perm) 139 | out["perm_pval"].append(len([x for x in Tn1_dist if x < Tn1])/args.num_perms) 140 | else: 141 | out["perm_pval"].append(np.nan) 142 | out_df = pd.DataFrame.from_dict(out) 143 | 144 | out_df["perm_pval_inv"] = 1 - out_df["perm_pval"] 145 | out_df["perm_pval2"] = 2*out_df[["perm_pval","perm_pval_inv"]].min(axis=1) 146 | 147 | # adjust p values all together 148 | 149 | # Try if old fails: if any na values, don't include in adjustment 150 | #out_df.loc[~out_df["pval"].isna(),"pval_adj"] = multipletests(out_df.loc[~out_df["pval"].isna(),"pval"], alpha, method = "fdr_bh")[1] 151 | #out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2_adj"] = multipletests(out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2"], alpha, method = "fdr_bh")[1] 152 | 153 | # OLD 154 | try: 155 | out_df["pval_adj"] = multipletests(out_df["pval"],alpha, method="fdr_bh")[1] 156 | out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2_adj"] = multipletests(out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2"], alpha, method = "fdr_bh")[1] 157 | except: 158 | out_df["pval_adj"] = np.nan 159 | out_df["perm_pval2_adj"] = np.nan 160 | 161 | out_df.to_csv(args.outname_all_pvals, sep="\t", index=False) 162 | 163 | out_df["gene_grouping_level_1"] = out_df["gene"] + out_df["grouping_level_1"].astype(str) 164 | 165 | # reformat output 166 | new_out = {"gene" : [], "num_onts" : [], "grouping_level_1" : []} 167 | for z_col in z_cols: 168 | new_out["chi2_pval_adj_" + z_col] = [] 169 | new_out["perm_pval_adj_" + z_col] = [] 170 | new_out["max_abs_median_" + z_col] = [] 171 | new_out["perm_cdf_" + z_col] = [] 172 | for gene_sub, gene_df in out_df.groupby("gene_grouping_level_1"): 173 | new_out["gene"].append(gene_df["gene"].iloc[0]) 174 | new_out["grouping_level_1"].append(gene_df["grouping_level_1"].iloc[0]) 175 | new_out["num_onts"].append(gene_df["num_onts"].iloc[0]) 176 | temp_z_cols = [] 177 | for z_col, z_df in gene_df.groupby("z_col"): 178 | new_out["chi2_pval_adj_" + z_col].append(z_df["pval_adj"].iloc[0]) 179 | new_out["perm_pval_adj_" + z_col].append(z_df["perm_pval2_adj"].iloc[0]) 180 | new_out["max_abs_median_" + z_col].append(z_df["max_abs_median"].iloc[0]) 181 | new_out["perm_cdf_" + z_col].append(z_df["perm_pval"].iloc[0]) 182 | temp_z_cols.append(z_col) 183 | for z_col in [x for x in z_cols if x not in temp_z_cols]: 184 | new_out["chi2_pval_adj_" + z_col].append(np.nan) 185 | new_out["perm_pval_adj_" + z_col].append(np.nan) 186 | new_out["max_abs_median_" + z_col].append(np.nan) 187 | new_out["perm_cdf_" + z_col].append(np.nan) 188 | new_out_df = pd.DataFrame.from_dict(new_out).sort_values("perm_pval_adj_scZ") 189 | 190 | # add frac from SVD for each gene 191 | df = df.drop_duplicates("gene") 192 | for i in 
range(3): 193 | frac_dict = pd.Series(df["f" + str(i)].values,index=df.gene).to_dict() 194 | new_out_df["f" + str(i)] = new_out_df["gene"].map(frac_dict) 195 | 196 | new_out_df.to_csv(args.outname_perm_pvals, sep="\t", index=False) 197 | 198 | logging.info("Completed") 199 | 200 | main() 201 | -------------------------------------------------------------------------------- /conf/base.config: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | nf-core/rnaseq Nextflow base config file 4 | ======================================================================================== 5 | A 'blank slate' config file, appropriate for general use on most high performance 6 | compute environments. Assumes that all software is installed and available on 7 | the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. 8 | ---------------------------------------------------------------------------------------- 9 | */ 10 | 11 | process { 12 | 13 | cpus = { check_max( 1 * task.attempt, 'cpus' ) } 14 | memory = { check_max( 6.GB * task.attempt, 'memory' ) } 15 | time = { check_max( 4.h * task.attempt, 'time' ) } 16 | 17 | errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } 18 | maxRetries = 3 19 | maxErrors = '-1' 20 | 21 | // Process-specific resource requirements 22 | withLabel:process_low { 23 | cpus = { check_max( 2 * task.attempt, 'cpus' ) } 24 | memory = { check_max( 20.GB * task.attempt, 'memory' ) } 25 | time = { check_max( 1.h * task.attempt, 'time' ) } 26 | } 27 | withLabel:process_medium { 28 | cpus = { check_max( 6 * task.attempt, 'cpus' ) } 29 | memory = { check_max( 100.GB * task.attempt, 'memory' ) } 30 | time = { check_max( 2.h * task.attempt, 'time' ) } 31 | } 32 | withLabel:process_high { 33 | cpus = { check_max( 12 * task.attempt, 'cpus' ) } 34 | memory = { check_max( 200.GB * task.attempt, 'memory' ) } 35 | time = { check_max( 2.h * task.attempt, 'time' ) } 36 | } 37 | withLabel:process_long { 38 | time = { check_max( 20.h * task.attempt, 'time' ) } 39 | } 40 | withLabel:process_high_memory { 41 | memory = { check_max( 400.GB * task.attempt, 'memory' ) } 42 | } 43 | withLabel:error_ignore { 44 | errorStrategy = 'ignore' 45 | } 46 | withLabel:error_retry { 47 | errorStrategy = 'retry' 48 | maxRetries = 2 49 | } 50 | } -------------------------------------------------------------------------------- /conf/test.config: -------------------------------------------------------------------------------- 1 | process { 2 | executor = 'slurm' 3 | clusterOptions = '-p quake,horence,owners' 4 | 5 | memory = { 1.GB * task.attempt } 6 | time = { 1.h * task.attempt } 7 | errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } 8 | maxRetries = 3 9 | } 10 | 11 | params { 12 | dataname = "test" 13 | input_file = "/scratch/groups/horence/kaitlin/spliz_nextflow/nf-core-spliz/test_data/test.tsv" 14 | SICILIAN = true 15 | grouping_level_2 = "compartment" 16 | grouping_level_1 = "tissue" 17 | libraryType = "10X" 18 | run_analysis = true 19 | } 20 | 21 | params.outdir = "./results/${params.dataname}" 22 | params.tracedir = "./results/${params.dataname}/pipeline_info" 23 | params.schema_ignore_params = "input,single_end,show_hidden_params,validate_params,igenomes_ignore,tracedir,igenomes_base,help,monochrome_logs,plaintext_email,max_multiqc_email_size,email_on_fail,email,multiqc_config,publish_dir_mode,genome,genomes" 24 | 25 | tower { 26 | enabled = true 27 | } -------------------------------------------------------------------------------- /conf/test_full.config: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | Nextflow config file for running full-size tests 4 | ======================================================================================== 5 | Defines input files and everything required to run a full size pipeline test. 6 | 7 | Use as follows: 8 | nextflow run nf-core/rnaseq -profile test_full, 9 | 10 | ---------------------------------------------------------------------------------------- 11 | */ 12 | 13 | params { 14 | config_profile_name = 'Full test profile' 15 | config_profile_description = 'Full test dataset to check pipeline function' 16 | 17 | // Parameters for full-size test 18 | input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.1/samplesheet_full.csv' 19 | genome = 'GRCh37' 20 | pseudo_aligner = 'salmon' 21 | } 22 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Documentation 2 | 3 | The nf-core/spliz documentation is split into the following pages: 4 | 5 | * [Usage](usage.md) 6 | * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. 7 | * [Output](output.md) 8 | * An overview of the different results produced by the pipeline and how to interpret them. 9 | 10 | You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) 11 | -------------------------------------------------------------------------------- /docs/images/nf-core-spliz_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/docs/images/nf-core-spliz_logo.png -------------------------------------------------------------------------------- /docs/output.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Output 2 | 3 | ## Introduction 4 | 5 | This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. 6 | 7 | The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
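
As a rough orientation, the SpliZ-specific modules in this pipeline publish into the following subdirectories of the results directory (names taken from the `publishDir` directives in `modules/local/`; exact file names depend on `--dataname` and the chosen parameters, so treat this as a sketch rather than an exhaustive listing):

```
results/<dataname>/
├── SpliZ_values/                     # per-cell SpliZ and SpliZVD scores (.tsv/.pq)
├── SpliZ_sites/                      # splice sites ranked by the SpliZVD eigenvectors
├── variance_adjusted_permutations/   # per-gene permutation p values
├── class_input/                      # processed junction input (when starting from BAM files)
├── logs/                             # per-step log files
├── pipeline_info/                    # Nextflow execution reports (see below)
└── summary_<dataname>_*.tsv          # final summary table
```
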
8 | 9 | 10 | 11 | ## Pipeline overview 12 | 13 | The pipeline is built using [Nextflow](https://www.nextflow.io/) 14 | and processes data using the following steps: 15 | 16 | * [FastQC](#fastqc) - Read quality control 17 | * [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline 18 | * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution 19 | 20 | ## FastQC 21 | 22 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. 23 | 24 | For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). 25 | 26 | **Output files:** 27 | 28 | * `fastqc/` 29 | * `*_fastqc.html`: FastQC report containing quality metrics for your untrimmed raw fastq files. 30 | * `fastqc/zips/` 31 | * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. 32 | 33 | > **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. 34 | 35 | ## MultiQC 36 | 37 | [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. 38 | 39 | The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. 40 | 41 | For more information about how to use MultiQC reports, see [https://multiqc.info](https://multiqc.info). 42 | 43 | **Output files:** 44 | 45 | * `multiqc/` 46 | * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. 47 | * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. 48 | * `multiqc_plots/`: directory containing static images from the report in various formats. 49 | 50 | ## Pipeline information 51 | 52 | [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. 53 | 54 | **Output files:** 55 | 56 | * `pipeline_info/` 57 | * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. 58 | * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. 59 | * Documentation for interpretation of results in HTML format: `results_description.html`. 
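
These reports are switched on in this pipeline's `nextflow.config` via the `timeline`, `report`, `trace` and `dag` scopes, which write into `${params.tracedir}` (by default `<outdir>/pipeline_info`) with a timestamp suffix; the relevant block looks roughly like this:

```nextflow
def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
timeline {
    enabled = true
    file    = "${params.tracedir}/execution_timeline_${trace_timestamp}.html"
}
report {
    enabled = true
    file    = "${params.tracedir}/execution_report_${trace_timestamp}.html"
}
trace {
    enabled = true
    file    = "${params.tracedir}/execution_trace_${trace_timestamp}.txt"
}
dag {
    enabled = true
    file    = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg"
}
```
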
60 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # salzmanlab/spliz: Usage 2 | 3 | > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ 4 | 5 | ## Introduction 6 | 7 | 8 | 9 | ## Running the pipeline 10 | 11 | The typical command for running the pipeline is as follows: 12 | 13 | ```bash 14 | nextflow run salzmanlab/spliz --input '*_R{1,2}.fastq.gz' -profile docker 15 | ``` 16 | 17 | This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 18 | 19 | Note that the pipeline will create the following files in your working directory: 20 | 21 | ```bash 22 | work # Directory containing the nextflow working files 23 | results # Finished results (configurable, see below) 24 | .nextflow_log # Log file from Nextflow 25 | # Other nextflow hidden files, eg. history of pipeline runs and old logs. 26 | ``` 27 | 28 | ### Updating the pipeline 29 | 30 | When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: 31 | 32 | ```bash 33 | nextflow pull salzmanlab/spliz 34 | ``` 35 | 36 | ### Reproducibility 37 | 38 | It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. 39 | 40 | First, go to the [salzmanlab/spliz releases page](https://github.com/salzmanlab/spliz/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. 41 | 42 | This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. 43 | 44 | ## Core Nextflow arguments 45 | 46 | > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). 47 | 48 | ### `-profile` 49 | 50 | Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. 51 | 52 | Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. 53 | 54 | > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. 55 | 56 | The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). 
57 | 58 | Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! 59 | They are loaded in sequence, so later profiles can overwrite earlier profiles. 60 | 61 | If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. 62 | 63 | * `docker` 64 | * A generic configuration profile to be used with [Docker](https://docker.com/) 65 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 66 | * `singularity` 67 | * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) 68 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 69 | * `podman` 70 | * A generic configuration profile to be used with [Podman](https://podman.io/) 71 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 72 | * `shifter` 73 | * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) 74 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 75 | * `charliecloud` 76 | * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) 77 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 78 | * `conda` 79 | * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. 80 | * A generic configuration profile to be used with [Conda](https://conda.io/docs/) 81 | * Pulls most software from [Bioconda](https://bioconda.github.io/) 82 | * `test` 83 | * A profile with a complete configuration for automated testing 84 | * Includes links to test data so needs no other parameters 85 | 86 | ### `-resume` 87 | 88 | Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. 89 | 90 | You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. 91 | 92 | ### `-c` 93 | 94 | Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. 95 | 96 | #### Custom resource requests 97 | 98 | Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. 99 | 100 | Whilst these default requirements will hopefully work for most people with most data, you may find that you want to customise the compute resources that the pipeline requests. You can do this by creating a custom config file. 
For example, to give the workflow process `star` 32GB of memory, you could use the following config: 101 | 102 | ```nextflow 103 | process { 104 | withName: star { 105 | memory = 32.GB 106 | } 107 | } 108 | ``` 109 | 110 | To find the exact name of a process you wish to modify the compute resources, check the live-status of a nextflow run displayed on your terminal or check the nextflow error for a line like so: `Error executing process > 'bwa'`. In this case the name to specify in the custom config file is `bwa`. 111 | 112 | See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information. 113 | 114 | If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition above). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. 115 | 116 | If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). 117 | 118 | ### Running in the background 119 | 120 | Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. 121 | 122 | The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. 123 | 124 | Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. 125 | Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). 126 | 127 | #### Nextflow memory requirements 128 | 129 | In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. 130 | We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): 131 | 132 | ```bash 133 | NXF_OPTS='-Xms1g -Xmx4g' 134 | ``` 135 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # You can use this file to create a conda environment for this pipeline: 2 | # conda env create -f environment.yml 3 | name: nf-core-spliz-1.0dev 4 | channels: 5 | - conda-forge 6 | - bioconda 7 | - defaults 8 | dependencies: 9 | - python=3.9.6 10 | - pandas=1.3.1 11 | - tqdm=4.62.0 12 | - numpy=1.21.1 13 | - pyarrow=5.0.0 14 | - pysam=0.16.0.1 15 | - r-base=4.1.1 16 | - r-data.table=1.14.0 17 | - r-logger=0.2.1 18 | - r-rfast=2.0.3 19 | - scipy=1.7.1 20 | - statsmodels=0.12.2 21 | - nextflow=21.04.0 22 | -------------------------------------------------------------------------------- /lib/Headers.groovy: -------------------------------------------------------------------------------- 1 | /* 2 | * This file holds several functions used to render the nf-core ANSI header. 
3 | */ 4 | 5 | class Headers { 6 | 7 | private static Map log_colours(Boolean monochrome_logs) { 8 | Map colorcodes = [:] 9 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 10 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 11 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 12 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 13 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 14 | colorcodes['yellow_bold'] = monochrome_logs ? '' : "\033[1;93m" 15 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 16 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 17 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 18 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 19 | colorcodes['red'] = monochrome_logs ? '' : "\033[1;91m" 20 | return colorcodes 21 | } 22 | 23 | static String dashed_line(monochrome_logs) { 24 | Map colors = log_colours(monochrome_logs) 25 | return "-${colors.dim}----------------------------------------------------${colors.reset}-" 26 | } 27 | 28 | static String nf_core(workflow, monochrome_logs) { 29 | Map colors = log_colours(monochrome_logs) 30 | String.format( 31 | """\n 32 | ${dashed_line(monochrome_logs)} 33 | ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} 34 | ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} 35 | ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} 36 | ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} 37 | ${colors.green}`._,._,\'${colors.reset} 38 | ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} 39 | ${dashed_line(monochrome_logs)} 40 | """.stripIndent() 41 | ) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | /* 3 | ======================================================================================== 4 | nf-core/spliz 5 | ======================================================================================== 6 | nf-core/spliz Analysis Pipeline. 
7 | #### Homepage / Documentation 8 | https://github.com/salzmanlab/SpliZ 9 | ---------------------------------------------------------------------------------------- 10 | */ 11 | nextflow.enable.dsl=2 12 | 13 | log.info Headers.nf_core(workflow, params.monochrome_logs) 14 | 15 | //////////////////////////////////////////////////// 16 | /* -- PRINT HELP -- */ 17 | ////////////////////////////////////////////////////+ 18 | def json_schema = "$projectDir/nextflow_schema.json" 19 | if (params.help) { 20 | def command = "nextflow run nf-core/spliz -c conf/test.config" 21 | log.info NfcoreSchema.params_help(workflow, params, json_schema, command) 22 | exit 0 23 | } 24 | 25 | //////////////////////////////////////////////////// 26 | /* -- VALIDATE PARAMETERS -- */ 27 | ////////////////////////////////////////////////////+ 28 | if (params.validate_params) { 29 | NfcoreSchema.validateParameters(params, json_schema, log) 30 | } 31 | 32 | //////////////////////////////////////////////////// 33 | /* -- Collect configuration parameters -- */ 34 | //////////////////////////////////////////////////// 35 | 36 | // Check if genome exists in the config file 37 | if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { 38 | exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(', ')}" 39 | } 40 | 41 | // TODO nf-core: Add any reference files that are needed 42 | // Configurable reference genomes 43 | // 44 | // NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY 45 | // If you want to use the channel below in a process, define the following: 46 | // input: 47 | // file fasta from ch_fasta 48 | // 49 | //params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false 50 | //if (params.fasta) { ch_fasta = file(params.fasta, checkIfExists: true) } 51 | 52 | // Check AWS batch settings 53 | if (workflow.profile.contains('awsbatch')) { 54 | // AWSBatch sanity checking 55 | if (!params.awsqueue || !params.awsregion) exit 1, 'Specify correct --awsqueue and --awsregion parameters on AWSBatch!' 56 | // Check outdir paths to be S3 buckets if running on AWSBatch 57 | // related: https://github.com/nextflow-io/nextflow/issues/813 58 | if (!params.outdir.startsWith('s3:')) exit 1, 'Outdir not on S3 - specify S3 Bucket to run on AWSBatch!' 59 | // Prevent trace files to be stored on S3 since S3 does not support rolling files. 60 | if (params.tracedir.startsWith('s3:')) exit 1, 'Specify a local tracedir or run without trace! S3 cannot be used for tracefiles.' 61 | } 62 | 63 | // Stage config files 64 | ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) 65 | ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() 66 | ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) 67 | ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) 68 | 69 | 70 | //////////////////////////////////////////////////// 71 | /* -- PRINT PARAMETER SUMMARY -- */ 72 | //////////////////////////////////////////////////// 73 | log.info NfcoreSchema.params_summary_log(workflow, params, json_schema) 74 | 75 | // Header log info 76 | def summary = [:] 77 | if (workflow.revision) summary['Pipeline Release'] = workflow.revision 78 | summary['Run Name'] = workflow.runName 79 | // TODO nf-core: Report custom parameters here 80 | //summary['Input'] = params.input 81 | //summary['Fasta Ref'] = params.fasta 82 | //summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' 83 | summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" 84 | if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" 85 | summary['Output dir'] = params.outdir 86 | summary['Launch dir'] = workflow.launchDir 87 | summary['Working dir'] = workflow.workDir 88 | summary['Script dir'] = workflow.projectDir 89 | summary['User'] = workflow.userName 90 | if (workflow.profile.contains('awsbatch')) { 91 | summary['AWS Region'] = params.awsregion 92 | summary['AWS Queue'] = params.awsqueue 93 | summary['AWS CLI'] = params.awscli 94 | } 95 | summary['Config Profile'] = workflow.profile 96 | if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description 97 | if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact 98 | if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url 99 | summary['Config Files'] = workflow.configFiles.join(', ') 100 | if (params.email || params.email_on_fail) { 101 | summary['E-mail Address'] = params.email 102 | summary['E-mail on failure'] = params.email_on_fail 103 | summary['MultiQC maxsize'] = params.max_multiqc_email_size 104 | } 105 | 106 | // Check the hostnames against configured profiles 107 | checkHostname() 108 | 109 | Channel.from(summary.collect{ [it.key, it.value] }) 110 | .map { k,v -> "
<dt>$k</dt><dd><samp>${v ?: 'N/A'}</samp></dd>
" } 111 | .reduce { a, b -> return [a, b].join("\n ") } 112 | .map { x -> """ 113 | id: 'nf-core-spliz-summary' 114 | description: " - this information is collected when the pipeline is started." 115 | section_name: 'nf-core/spliz Workflow Summary' 116 | section_href: 'https://github.com/nf-core/spliz' 117 | plot_type: 'html' 118 | data: | 119 |
<dl class=\"dl-horizontal\"> 120 | $x 121 | </dl>
122 | """.stripIndent() } 123 | .set { ch_workflow_summary } 124 | 125 | /* 126 | * Parse software version numbers 127 | */ 128 | process get_software_versions { 129 | publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, 130 | saveAs: { filename -> 131 | if (filename.indexOf('.csv') > 0) filename 132 | else null 133 | } 134 | 135 | output: 136 | file 'software_versions_mqc.yaml' into ch_software_versions_yaml 137 | file 'software_versions.csv' 138 | 139 | script: 140 | // TODO nf-core: Get all tools to print their version number here 141 | """ 142 | echo $workflow.manifest.version > v_pipeline.txt 143 | echo $workflow.nextflow.version > v_nextflow.txt 144 | fastqc --version > v_fastqc.txt 145 | multiqc --version > v_multiqc.txt 146 | scrape_software_versions.py &> software_versions_mqc.yaml 147 | """ 148 | } 149 | 150 | /* 151 | * STEP 1 - FastQC 152 | */ 153 | process fastqc { 154 | tag "$name" 155 | label 'process_medium' 156 | publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode, 157 | saveAs: { filename -> 158 | filename.indexOf('.zip') > 0 ? "zips/$filename" : "$filename" 159 | } 160 | 161 | input: 162 | set val(name), file(reads) from ch_read_files_fastqc 163 | 164 | output: 165 | file '*_fastqc.{zip,html}' into ch_fastqc_results 166 | 167 | script: 168 | """ 169 | fastqc --quiet --threads $task.cpus $reads 170 | """ 171 | } 172 | 173 | /* 174 | * STEP 2 - MultiQC 175 | */ 176 | process multiqc { 177 | publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode 178 | 179 | input: 180 | file (multiqc_config) from ch_multiqc_config 181 | file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) 182 | // TODO nf-core: Add in log files from your new processes for MultiQC to find! 183 | file ('fastqc/*') from ch_fastqc_results.collect().ifEmpty([]) 184 | file ('software_versions/*') from ch_software_versions_yaml.collect() 185 | file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") 186 | 187 | output: 188 | file "*multiqc_report.html" into ch_multiqc_report 189 | file "*_data" 190 | file "multiqc_plots" 191 | 192 | script: 193 | rtitle = '' 194 | rfilename = '' 195 | if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { 196 | rtitle = "--title \"${workflow.runName}\"" 197 | rfilename = "--filename " + workflow.runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" 198 | } 199 | custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' 200 | // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time 201 | """ 202 | multiqc -f $rtitle $rfilename $custom_config_file . 
203 | """ 204 | } 205 | 206 | /* 207 | * STEP 3 - Output Description HTML 208 | */ 209 | process output_documentation { 210 | publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode 211 | 212 | input: 213 | file output_docs from ch_output_docs 214 | file images from ch_output_docs_images 215 | 216 | output: 217 | file 'results_description.html' 218 | 219 | script: 220 | """ 221 | markdown_to_html.py $output_docs -o results_description.html 222 | """ 223 | } 224 | 225 | /* 226 | ======================================================================================== 227 | IMPORT LOCAL MODULES/SUBWORKFLOWS 228 | ======================================================================================== 229 | */ 230 | include { SPLIZ_PIPELINE } from './workflows/spliz_pipeline' 231 | 232 | /* 233 | ======================================================================================== 234 | MAIN WORKFLOW 235 | ======================================================================================== 236 | */ 237 | workflow NFCORE_SPLIZ { 238 | SPLIZ_PIPELINE () 239 | } 240 | 241 | workflow { 242 | NFCORE_SPLIZ () 243 | } 244 | 245 | /* 246 | * Completion e-mail notification 247 | */ 248 | workflow.onComplete { 249 | 250 | // Set up the e-mail variables 251 | def subject = "[nf-core/spliz] Successful: $workflow.runName" 252 | if (!workflow.success) { 253 | subject = "[nf-core/spliz] FAILED: $workflow.runName" 254 | } 255 | def email_fields = [:] 256 | email_fields['version'] = workflow.manifest.version 257 | email_fields['runName'] = workflow.runName 258 | email_fields['success'] = workflow.success 259 | email_fields['dateComplete'] = workflow.complete 260 | email_fields['duration'] = workflow.duration 261 | email_fields['exitStatus'] = workflow.exitStatus 262 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 263 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 264 | email_fields['commandLine'] = workflow.commandLine 265 | email_fields['projectDir'] = workflow.projectDir 266 | email_fields['summary'] = summary 267 | email_fields['summary']['Date Started'] = workflow.start 268 | email_fields['summary']['Date Completed'] = workflow.complete 269 | email_fields['summary']['Pipeline script file path'] = workflow.scriptFile 270 | email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId 271 | if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository 272 | if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId 273 | if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision 274 | email_fields['summary']['Nextflow Version'] = workflow.nextflow.version 275 | email_fields['summary']['Nextflow Build'] = workflow.nextflow.build 276 | email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 277 | 278 | /* 279 | // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size) 280 | // On success try attach the multiqc report 281 | def mqc_report = null 282 | try { 283 | if (workflow.success) { 284 | mqc_report = ch_multiqc_report.getVal() 285 | if (mqc_report.getClass() == ArrayList) { 286 | log.warn "[nf-core/spliz] Found multiple reports from process 'multiqc', will use only one" 287 | mqc_report = mqc_report[0] 288 | } 289 | } 290 | } catch (all) { 291 | log.warn "[nf-core/spliz] Could not attach MultiQC report to summary email" 292 | } 293 | */ 294 | 295 | // 
Check if we are only sending emails on failure 296 | email_address = params.email 297 | if (!params.email && params.email_on_fail && !workflow.success) { 298 | email_address = params.email_on_fail 299 | } 300 | 301 | // Render the TXT template 302 | def engine = new groovy.text.GStringTemplateEngine() 303 | def tf = new File("$projectDir/assets/email_template.txt") 304 | def txt_template = engine.createTemplate(tf).make(email_fields) 305 | def email_txt = txt_template.toString() 306 | 307 | // Render the HTML template 308 | def hf = new File("$projectDir/assets/email_template.html") 309 | def html_template = engine.createTemplate(hf).make(email_fields) 310 | def email_html = html_template.toString() 311 | 312 | // Render the sendmail template 313 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] 314 | def sf = new File("$projectDir/assets/sendmail_template.txt") 315 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 316 | def sendmail_html = sendmail_template.toString() 317 | 318 | // Send the HTML e-mail 319 | if (email_address) { 320 | try { 321 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 322 | // Try to send HTML e-mail using sendmail 323 | [ 'sendmail', '-t' ].execute() << sendmail_html 324 | log.info "[nf-core/spliz] Sent summary e-mail to $email_address (sendmail)" 325 | } catch (all) { 326 | // Catch failures and try with plaintext 327 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 328 | if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { 329 | mail_cmd += [ '-A', mqc_report ] 330 | } 331 | mail_cmd.execute() << email_html 332 | log.info "[nf-core/spliz] Sent summary e-mail to $email_address (mail)" 333 | } 334 | } 335 | 336 | // Write summary e-mail HTML to a file 337 | def output_d = new File("${params.outdir}/pipeline_info/") 338 | if (!output_d.exists()) { 339 | output_d.mkdirs() 340 | } 341 | def output_hf = new File(output_d, "pipeline_report.html") 342 | output_hf.withWriter { w -> w << email_html } 343 | def output_tf = new File(output_d, "pipeline_report.txt") 344 | output_tf.withWriter { w -> w << email_txt } 345 | 346 | c_green = params.monochrome_logs ? '' : "\033[0;32m"; 347 | c_purple = params.monochrome_logs ? '' : "\033[0;35m"; 348 | c_red = params.monochrome_logs ? '' : "\033[0;31m"; 349 | c_reset = params.monochrome_logs ? '' : "\033[0m"; 350 | 351 | if (workflow.stats.ignoredCount > 0 && workflow.success) { 352 | log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" 353 | log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" 354 | log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" 355 | } 356 | 357 | if (workflow.success) { 358 | log.info "-${c_purple}[nf-core/spliz]${c_green} Pipeline completed successfully${c_reset}-" 359 | log.info "Results can be found in ${params.outdir}." 
360 | } else { 361 | checkHostname() 362 | log.info "-${c_purple}[nf-core/spliz]${c_red} Pipeline completed with errors${c_reset}-" 363 | } 364 | 365 | } 366 | 367 | workflow.onError { 368 | // Print unexpected parameters - easiest is to just rerun validation 369 | NfcoreSchema.validateParameters(params, json_schema, log) 370 | } 371 | 372 | def checkHostname() { 373 | def c_reset = params.monochrome_logs ? '' : "\033[0m" 374 | def c_white = params.monochrome_logs ? '' : "\033[0;37m" 375 | def c_red = params.monochrome_logs ? '' : "\033[1;91m" 376 | def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" 377 | if (params.hostnames) { 378 | def hostname = 'hostname'.execute().text.trim() 379 | params.hostnames.each { prof, hnames -> 380 | hnames.each { hname -> 381 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 382 | log.error "${c_red}====================================================${c_reset}\n" + 383 | " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + 384 | " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + 385 | " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + 386 | "${c_red}====================================================${c_reset}\n" 387 | } 388 | } 389 | } 390 | } 391 | } 392 | -------------------------------------------------------------------------------- /modules/local/ann_splices.nf: -------------------------------------------------------------------------------- 1 | process ANN_SPLICES { 2 | tag "${params.dataname}" 3 | 4 | label 'process_medium' 5 | 6 | input: 7 | path pq 8 | path exon_pickle 9 | path splice_pickle 10 | 11 | output: 12 | path outname, emit: tsv 13 | 14 | script: 15 | outname = "${params.dataname}_ann_splices.tsv" 16 | """ 17 | ann_splices.py \\ 18 | --in_file ${pq} \\ 19 | --out_file ${outname} \\ 20 | --exon_pickle ${exon_pickle} \\ 21 | --splice_pickle ${splice_pickle} 22 | """ 23 | } -------------------------------------------------------------------------------- /modules/local/calc_rijk_zscore.nf: -------------------------------------------------------------------------------- 1 | process CALC_RIJK_ZSCORE { 2 | tag "${params.dataname}" 3 | //label 'process_high_memory' 4 | publishDir "${params.outdir}/SpliZ_values", 5 | mode: 'copy', 6 | pattern: '*.tsv' 7 | publishDir "${params.outdir}/SpliZ_values", 8 | mode: 'copy', 9 | pattern: '*.pq' 10 | publishDir "${params.outdir}/logs", 11 | mode: 'copy', 12 | pattern: '*.log' 13 | 14 | input: 15 | val dataname 16 | path pq 17 | val pin_S 18 | val pin_z 19 | val bounds 20 | val light 21 | val SICILIAN 22 | val grouping_level_2 23 | val grouping_level_1 24 | val convert_parquet 25 | 26 | output: 27 | tuple val(dataname), val(param_stem), path("*.pq") , emit: pq 28 | path "*.tsv" , emit: tsv 29 | path "*.log" , emit: log 30 | 31 | script: 32 | def suff_light = light ? "_light" : "" 33 | def suff_SICILIAN = SICILIAN ? "_SICILIAN" : "" 34 | 35 | def isLight = light ? "1" : "0" 36 | def isSICILIAN = SICILIAN ? 
"1" : "0" 37 | 38 | param_stem = "S_${pin_S}_z_${pin_z}_b_${bounds}${suff_light}${suff_SICILIAN}" 39 | 40 | outname_pq = "${dataname}_sym_${param_stem}.pq" 41 | outname_tsv = "${dataname}_sym_${param_stem}_subcol.tsv" 42 | outname_log = "calc_rijk_zscore.log" 43 | 44 | if (convert_parquet): 45 | """ 46 | rijk_zscore.py \\ 47 | --parquet ${pq} \\ 48 | --pinning_S ${pin_S} \\ 49 | --pinning_z ${pin_z} \\ 50 | --lower_bound ${bounds} \\ 51 | --isLight ${isLight} \\ 52 | --isSICILIAN ${isSICILIAN} \\ 53 | --grouping_level_2 ${grouping_level_2} \\ 54 | --grouping_level_1 ${grouping_level_1} \\ 55 | --outname_pq ${outname_pq} \\ 56 | --outname_tsv ${outname_tsv} \\ 57 | --outname_log ${outname_log} \\ 58 | --convert_parquet 59 | """ 60 | else: 61 | """ 62 | rijk_zscore.py \\ 63 | --parquet ${pq} \\ 64 | --pinning_S ${pin_S} \\ 65 | --pinning_z ${pin_z} \\ 66 | --lower_bound ${bounds} \\ 67 | --isLight ${isLight} \\ 68 | --isSICILIAN ${isSICILIAN} \\ 69 | --grouping_level_2 ${grouping_level_2} \\ 70 | --grouping_level_1 ${grouping_level_1} \\ 71 | --outname_pq ${outname_pq} \\ 72 | --outname_tsv ${outname_tsv} \\ 73 | --outname_log ${outname_log} 74 | """ 75 | } -------------------------------------------------------------------------------- /modules/local/calc_splizvd.nf: -------------------------------------------------------------------------------- 1 | process CALC_SPLIZVD { 2 | tag "${params.dataname}" 3 | publishDir "${params.outdir}/SpliZ_values", 4 | mode: "copy", 5 | pattern: "*.tsv" 6 | publishDir "${params.outdir}/SpliZ_values", 7 | mode: "copy", 8 | pattern: "*.pq" 9 | publishDir "${params.outdir}/logs", 10 | mode: 'copy', 11 | pattern: '*.log' 12 | 13 | label 'process_medium' 14 | 15 | input: 16 | path input 17 | val param_stem 18 | val dataname 19 | val pin_S 20 | val pin_z 21 | val bounds 22 | val svd_type 23 | val grouping_level_1 24 | val grouping_level_2 25 | val isLight 26 | val isSICILIAN 27 | val rank_quant 28 | 29 | output: 30 | path outname_pq , emit: pq 31 | path outname_tsv , emit: tsv 32 | path "*.log" , emit: log 33 | path "mat_samplesheet.tsv" , emit: matSheet 34 | 35 | script: 36 | outname_pq = "${dataname}_sym_SVD_${svd_type}_${param_stem}.pq" 37 | outname_tsv = "${dataname}_sym_SVD_${svd_type}_${param_stem}_subcol.tsv" 38 | outname_log = "calc_splizvd.log" 39 | 40 | """ 41 | calc_splizvd.py \\ 42 | --input ${input} \\ 43 | --pinning_S ${pin_S} \\ 44 | --pinning_z ${pin_z} \\ 45 | --lower_bound ${bounds} \\ 46 | --isLight ${isLight} \\ 47 | --isSICILIAN ${isSICILIAN} \\ 48 | --svd_type ${svd_type} \\ 49 | --grouping_level_1 ${grouping_level_1} \\ 50 | --grouping_level_2 ${grouping_level_2} \\ 51 | --outname_pq ${outname_pq} \\ 52 | --outname_tsv ${outname_tsv} \\ 53 | --outname_log ${outname_log} \\ 54 | --workdir \$PWD \\ 55 | --rank_quant ${rank_quant} 56 | 57 | """ 58 | 59 | } 60 | -------------------------------------------------------------------------------- /modules/local/class_input_10X.nf: -------------------------------------------------------------------------------- 1 | process CLASS_INPUT_10X { 2 | tag "${params.dataname}" 3 | 4 | label 'process_high' 5 | 6 | input: 7 | tuple val(sample_ID), file(bam) 8 | val dataname 9 | val libraryType 10 | path annotator_pickle 11 | path gtf 12 | 13 | output: 14 | tuple val(sample_ID), path(outname), emit: class_input 15 | 16 | script: 17 | outname = "${sample_ID}.class_input" 18 | 19 | """ 20 | light_class_input_subcols.py \\ 21 | --bams ${bam} \\ 22 | --libraryType ${libraryType} \\ 23 | --annotator 
${annotator_pickle} \\ 24 | --gtf ${gtf} \\ 25 | --outname ${outname} 26 | """ 27 | 28 | } -------------------------------------------------------------------------------- /modules/local/class_input_SS2.nf: -------------------------------------------------------------------------------- 1 | process CLASS_INPUT_SS2 { 2 | tag "${params.dataname}" 3 | 4 | label 'process_high' 5 | 6 | input: 7 | tuple val(sample_ID), file(bam_R1), file(bam_R2) 8 | val dataname 9 | val libraryType 10 | path annotator_pickle 11 | path gtf 12 | 13 | output: 14 | tuple val(sample_ID), path(outname), emit: class_input 15 | 16 | script: 17 | outname = "${sample_ID}.class_input" 18 | 19 | """ 20 | light_class_input_subcols.py \\ 21 | --bams ${bam_R1} ${bam_R2} \\ 22 | --libraryType ${libraryType} \\ 23 | --annotator ${annotator_pickle} \\ 24 | --gtf ${gtf} \\ 25 | --outname ${outname} 26 | """ 27 | 28 | } -------------------------------------------------------------------------------- /modules/local/convert_parquet.nf: -------------------------------------------------------------------------------- 1 | process CONVERT_PARQUET { 2 | tag "${params.dataname}" 3 | 4 | input: 5 | path tsv 6 | 7 | output: 8 | path "*.pq", emit: pq 9 | 10 | script: 11 | pq = "${tsv.baseName}.pq" 12 | """ 13 | parquet_to_tsv.py \\ 14 | --parquet ${pq} \\ 15 | --tsv ${tsv} \\ 16 | --reverse 17 | """ 18 | } -------------------------------------------------------------------------------- /modules/local/convert_split_parquet.nf: -------------------------------------------------------------------------------- 1 | process CONVERT_SPLIT_PARQUET { 2 | tag "${params.dataname}" 3 | //label 'process_high_memory' 4 | 5 | input: 6 | path tsv 7 | 8 | output: 9 | path "*.pq", emit: pq 10 | 11 | script: 12 | basename = tsv.baseName 13 | """ 14 | convert_tsv_to_parquet.py \\ 15 | --tsv ${tsv} \\ 16 | --splitChr \\ 17 | --basename ${basename} 18 | """ 19 | } -------------------------------------------------------------------------------- /modules/local/find_spliz_sites.nf: -------------------------------------------------------------------------------- 1 | process FIND_SPLIZ_SITES { 2 | tag "${params.dataname}" 3 | //label 'process_high_memory' 4 | publishDir "${params.outdir}/SpliZ_sites", 5 | mode: "copy", 6 | pattern: "*.tsv" 7 | 8 | label 'process_medium' 9 | 10 | input: 11 | path perm_pvals 12 | val libraryType 13 | path geneMat_samplesheet 14 | 15 | output: 16 | path first_evec , emit: first_evec 17 | path second_evec , emit: second_evec 18 | path third_evec , emit: third_evec 19 | 20 | script: 21 | param_stem = perm_pvals.baseName 22 | 23 | first_evec = "first_evec_${param_stem}.tsv" 24 | second_evec = "second_evec_${param_stem}.tsv" 25 | third_evec = "third_evec_${param_stem}.tsv" 26 | 27 | """ 28 | find_SpliZ_sites.R \\ 29 | ${perm_pvals} \\ 30 | ${first_evec} \\ 31 | ${second_evec} \\ 32 | ${third_evec} \\ 33 | ${libraryType} \\ 34 | ${geneMat_samplesheet} 35 | """ 36 | 37 | } 38 | -------------------------------------------------------------------------------- /modules/local/preprocess_tsv.nf: -------------------------------------------------------------------------------- 1 | include { CONVERT_PARQUET } from '../../modules/local/convert_parquet' 2 | 3 | workflow PREPROCESS_TSV { 4 | take: 5 | ch_input 6 | 7 | main: 8 | CONVERT_PARQUET ( 9 | ch_input, 10 | params.dataname 11 | ) 12 | 13 | emit: 14 | pq = CONVERT_PARQUET.out.pq 15 | } -------------------------------------------------------------------------------- 
/modules/local/process_class_input.nf: -------------------------------------------------------------------------------- 1 | process PROCESS_CLASS_INPUT { 2 | 3 | publishDir "${params.outdir}/class_input", 4 | mode: 'copy', 5 | pattern: '*.pq' 6 | 7 | label 'process_medium' 8 | 9 | input: 10 | path class_input 11 | val dataname 12 | val libraryType 13 | path meta 14 | 15 | output: 16 | path "*.pq", emit: pq 17 | 18 | script: 19 | outname = "${dataname}.pq" 20 | """ 21 | process_CI.py \\ 22 | --input_file ${class_input} \\ 23 | --meta ${meta} \\ 24 | --libraryType ${libraryType} \\ 25 | --outname ${outname} 26 | """ 27 | } -------------------------------------------------------------------------------- /modules/local/pval_permutations.nf: -------------------------------------------------------------------------------- 1 | process PVAL_PERMUTATIONS { 2 | tag "${params.dataname}" 3 | 4 | publishDir "${params.outdir}/variance_adjusted_permutations", 5 | mode: "copy", 6 | pattern: "*.tsv" 7 | publishDir "${params.outdir}/logs", 8 | mode: 'copy', 9 | pattern: '*.log' 10 | 11 | label 'process_medium' 12 | 13 | input: 14 | val splizvd_pq 15 | val param_stem 16 | val dataname 17 | val n_perms 18 | val grouping_level_2 19 | val grouping_level_1 20 | 21 | output: 22 | path outname_all_pvals , emit: all_pvals 23 | path outname_perm_pvals , emit: perm_pvals 24 | path outname_log , emit: log 25 | 26 | script: 27 | outname_all_pvals = "${dataname}_outdf_${grouping_level_2}-${grouping_level_1}_${n_perms}_${param_stem}.tsv" 28 | outname_perm_pvals = "${dataname}_pvals_${grouping_level_2}-${grouping_level_1}_${n_perms}_${param_stem}.tsv" 29 | outname_log = "pval_permutations.log" 30 | 31 | """ 32 | variance_adjusted_permutations_bytiss.py \\ 33 | --input ${splizvd_pq} \\ 34 | --num_perms ${n_perms} \\ 35 | --grouping_level_2 ${grouping_level_2} \\ 36 | --grouping_level_1 ${grouping_level_1} \\ 37 | --outname_all_pvals ${outname_all_pvals} \\ 38 | --outname_perm_pvals ${outname_perm_pvals} \\ 39 | --outname_log ${outname_log} 40 | """ 41 | } -------------------------------------------------------------------------------- /modules/local/summarize_results.nf: -------------------------------------------------------------------------------- 1 | process SUMMARIZE_RESULTS { 2 | tag "${params.dataname}" 3 | 4 | publishDir "${params.outdir}", 5 | mode: "copy", 6 | pattern: "*.tsv" 7 | publishDir "${params.outdir}/logs", 8 | mode: 'copy', 9 | pattern: '*.log' 10 | 11 | label 'process_medium' 12 | 13 | input: 14 | path perm_pvals 15 | val param_stem 16 | val dataname 17 | path first_evec 18 | path second_evec 19 | path third_evec 20 | path splizvd_tsv 21 | val grouping_level_2 22 | val grouping_level_1 23 | 24 | output: 25 | path outname , emit: summary 26 | path outname_log , emit: log 27 | 28 | script: 29 | outname = "summary_${dataname}_${grouping_level_2}-${grouping_level_1}_${param_stem}.tsv" 30 | outname_log = "summarize_results.log" 31 | 32 | """ 33 | final_summary.py \\ 34 | --perm_pvals ${perm_pvals} \\ 35 | --first_evec ${first_evec} \\ 36 | --second_evec ${second_evec} \\ 37 | --third_evec ${third_evec} \\ 38 | --splizvd ${splizvd_tsv} \\ 39 | --grouping_level_2 ${grouping_level_2} \\ 40 | --grouping_level_1 ${grouping_level_1} \\ 41 | --outname ${outname} \\ 42 | --outname_log ${outname_log} 43 | """ 44 | } -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | /* 2 | * 
------------------------------------------------- 3 | * nf-core/spliz Nextflow config file 4 | * ------------------------------------------------- 5 | * Default config options for all environments. 6 | */ 7 | 8 | // Global default params, used in configs 9 | params { 10 | // Workflow flags for SpliZ 11 | // TODO nf-core: Specify your pipeline's command line flags 12 | dataname = null 13 | input_file = null 14 | SICILIAN = false 15 | pin_S = 0.01 16 | pin_z = 0.0 17 | bounds = 5 18 | light = false 19 | svd_type = "normdonor" 20 | n_perms = 100 21 | grouping_level_1 = null 22 | grouping_level_2 = null 23 | libraryType = null 24 | run_analysis = false 25 | samplesheet = null 26 | annotator_pickle = null 27 | exon_pickle = null 28 | splice_pickle = null 29 | meta = null 30 | gtf = null 31 | rank_quant = 0 32 | 33 | outdir = './results/${params.dataname}' 34 | publish_dir_mode = 'copy' 35 | 36 | // Boilerplate options 37 | genome = false 38 | genomes = false 39 | multiqc_config = false 40 | email = false 41 | email_on_fail = false 42 | max_multiqc_email_size = 25.MB 43 | plaintext_email = false 44 | monochrome_logs = false 45 | help = false 46 | igenomes_base = 's3://ngi-igenomes/igenomes' 47 | tracedir = "${params.outdir}/pipeline_info" 48 | igenomes_ignore = false 49 | custom_config_version = 'master' 50 | custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" 51 | hostnames = false 52 | config_profile_name = null 53 | config_profile_description = false 54 | config_profile_contact = false 55 | config_profile_url = false 56 | validate_params = true 57 | show_hidden_params = false 58 | schema_ignore_params = 'genomes' 59 | 60 | // Defaults only, expecting to be overwritten 61 | max_memory = 800.GB 62 | max_cpus = 16 63 | max_time = 240.h 64 | 65 | } 66 | 67 | // Container slug. Stable releases should specify release tag! 68 | // Developmental code should specify :dev 69 | process.container = 'kaitlinchaung/spliz:dev' 70 | 71 | // Load base.config by default for all pipelines 72 | includeConfig 'conf/base.config' 73 | 74 | // Load nf-core custom profiles from different Institutions 75 | try { 76 | includeConfig "${params.custom_config_base}/nfcore_custom.config" 77 | } catch (Exception e) { 78 | System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") 79 | } 80 | 81 | profiles { 82 | sherlock { 83 | process.executor = 'slurm' 84 | process.clusterOptions = '-p owners' 85 | 86 | process.memory = { 20.GB * task.attempt } 87 | process.time = { 1.h * task.attempt } 88 | process.errorStrategy = { task.exitStatus in [1,130,143,137,104,134,139] ? 
'retry' : 'finish' } 89 | process.maxRetries = 3 90 | } 91 | conda { 92 | docker.enabled = false 93 | singularity.enabled = false 94 | podman.enabled = false 95 | shifter.enabled = false 96 | charliecloud.enabled = false 97 | process.conda = "$projectDir/environment.yml" 98 | createTimeout = "2 h" 99 | } 100 | debug { process.beforeScript = 'echo $HOSTNAME' } 101 | docker { 102 | process.container = 'kaitlinchaung/spliz:v0.8' 103 | docker.enabled = true 104 | docker.userEmulation = true 105 | singularity.enabled = false 106 | podman.enabled = false 107 | shifter.enabled = false 108 | charliecloud.enabled = false 109 | } 110 | singularity { 111 | process.container = 'kaitlinchaung/spliz:v0.8' 112 | singularity.enabled = true 113 | singularity.autoMounts = true 114 | docker.enabled = false 115 | podman.enabled = false 116 | shifter.enabled = false 117 | charliecloud.enabled = false 118 | } 119 | podman { 120 | singularity.enabled = false 121 | docker.enabled = false 122 | podman.enabled = true 123 | shifter.enabled = false 124 | charliecloud.enabled = false 125 | } 126 | shifter { 127 | singularity.enabled = false 128 | docker.enabled = false 129 | podman.enabled = false 130 | shifter.enabled = true 131 | charliecloud.enabled = false 132 | } 133 | charliecloud { 134 | singularity.enabled = false 135 | docker.enabled = false 136 | podman.enabled = false 137 | shifter.enabled = false 138 | charliecloud.enabled = true 139 | } 140 | test { includeConfig 'conf/test.config' } 141 | test_full { includeConfig 'conf/test_full.config' } 142 | small_test_data { includeConfig 'small_data/small.config'} 143 | } 144 | 145 | // Export these variables to prevent local Python/R libraries from conflicting with those in the container 146 | env { 147 | PYTHONNOUSERSITE = 1 148 | R_PROFILE_USER = "/.Rprofile" 149 | R_ENVIRON_USER = "/.Renviron" 150 | } 151 | 152 | // Capture exit codes from upstream processes when piping 153 | process.shell = ['/bin/bash', '-euo', 'pipefail'] 154 | 155 | def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') 156 | timeline { 157 | enabled = true 158 | file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" 159 | } 160 | report { 161 | enabled = true 162 | file = "${params.tracedir}/execution_report_${trace_timestamp}.html" 163 | } 164 | trace { 165 | enabled = true 166 | file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" 167 | } 168 | dag { 169 | enabled = true 170 | file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" 171 | } 172 | 173 | manifest { 174 | name = 'salzmanlab/spliz' 175 | author = 'Salzman Lab' 176 | homePage = 'https://github.com/salzmanlab/SpliZ' 177 | description = 'Code to calculate the Splicing Z Score (SZS) for single cell RNA-seq splicing analysis' 178 | mainScript = 'main.nf' 179 | nextflowVersion = '>=20.04.0' 180 | version = '1.0dev' 181 | } 182 | 183 | // Function to ensure that resource requirements don't go beyond 184 | // a maximum limit 185 | def check_max(obj, type) { 186 | if (type == 'memory') { 187 | try { 188 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 189 | return params.max_memory as nextflow.util.MemoryUnit 190 | else 191 | return obj 192 | } catch (all) { 193 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! 
Using default value: $obj" 194 | return obj 195 | } 196 | } else if (type == 'time') { 197 | try { 198 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 199 | return params.max_time as nextflow.util.Duration 200 | else 201 | return obj 202 | } catch (all) { 203 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 204 | return obj 205 | } 206 | } else if (type == 'cpus') { 207 | try { 208 | return Math.min( obj, params.max_cpus as int ) 209 | } catch (all) { 210 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" 211 | return obj 212 | } 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /nextflow_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema", 3 | "$id": "https://raw.githubusercontent.com/nf-core/spliz/master/nextflow_schema.json", 4 | "title": "nf-core/spliz pipeline parameters", 5 | "description": "Code to calculate the Splicing Z Score (SZS) for single cell RNA-seq splicing analysis", 6 | "type": "object", 7 | "definitions": { 8 | "input_output_options": { 9 | "title": "Input/output options", 10 | "type": "object", 11 | "fa_icon": "fas fa-terminal", 12 | "description": "Define where the pipeline should find input data and save output data.", 13 | "properties": { 14 | "dataname": { 15 | "type": "string", 16 | "description": "Name identifier of the SpliZ run" 17 | }, 18 | "input_file": { 19 | "type": "string", 20 | "fa_icon": "fas fa-dna", 21 | "description": "Input parquet or tsv file" 22 | }, 23 | "outdir": { 24 | "type": "string", 25 | "description": "Output directory for results", 26 | "default": "./results/${params.dataname}", 27 | "hidden": true 28 | }, 29 | "SICILIAN": { 30 | "type": "boolean", 31 | "description": "Is the input file SICILIAN output?" 32 | }, 33 | "pin_S": { 34 | "type": "number", 35 | "description": "Bound splice site residuals at this quantile (e.g. values in the lower pin_S quantile and the upper 1 - pin_S quantile will be rounded to the quantile limits)" 36 | }, 37 | "pin_z": { 38 | "type": "number", 39 | "description": "Bound SpliZ scores at this quantile (e.g. 
values in the lower pin_z quantile and the upper 1 - pin_z quantile will be rounded to the quantile limits)" 40 | }, 41 | "bounds": { 42 | "type": "integer", 43 | "description": "Only include cell/gene pairs that have more than this many junctional reads for the gene" 44 | }, 45 | "light": { 46 | "type": "boolean", 47 | "description": "Output the minimum number of columns", 48 | "default": true 49 | }, 50 | "svd_type": { 51 | "type": "string", 52 | "description": "Type of SVD calculation" 53 | }, 54 | "grouping_level_1": { 55 | "type": "string", 56 | "description": "Column to partition data by" 57 | }, 58 | "grouping_level_2": { 59 | "type": "string", 60 | "description": "Column to group data by" 61 | }, 62 | "n_perms": { 63 | "type": "integer", 64 | "description": "Number of permutations" 65 | }, 66 | "annotator_pickle": { 67 | "type": "string", 68 | "description": "Annotator pickle file" 69 | }, 70 | "exon_pickle": { 71 | "type": "string", 72 | "description": "Exon pickle file" 73 | }, 74 | "splice_pickle": { 75 | "type": "string", 76 | "description": "Splice pickle file" 77 | }, 78 | "libraryType": { 79 | "type": "string", 80 | "description": "Options: 10X (for 10X chromium), SS2 (for Smart-seq2), and SLS (for Slide-seq or Slide-seq2)" 81 | }, 82 | "gtf": { 83 | "type": "string", 84 | "description": "GTF annotation file" 85 | }, 86 | "rank_quant": { 87 | "type": "number", 88 | "description": "Bound SpliZ ranks for each donor/acceptor at this quantile (e.g. values in the lower rank_quant quantile and the upper 1 - rank_quant quantile will be rounded to the quantile limits)" 89 | }, 90 | "help": { 91 | "type": "boolean", 92 | "description": "Display help text.", 93 | "hidden": true, 94 | "fa_icon": "fas fa-question-circle" 95 | }, 96 | "run_analysis": { 97 | "type": "boolean", 98 | "description": "Run analysis steps?" 99 | }, 100 | "samplesheet": { 101 | "type": "string" 102 | }, 103 | "meta": { 104 | "type": "string", 105 | "description": "Metadata file containing entries for each barcode/grouping_col_1/grouping_col_2 combination" 106 | } 107 | }, 108 | "required": [ 109 | "dataname" 110 | ] 111 | }, 112 | "max_job_request_options": { 113 | "title": "Max job request options", 114 | "type": "object", 115 | "fa_icon": "fab fa-acquisitions-incorporated", 116 | "description": "Set the top limit for requested resources for any single job.", 117 | "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", 118 | "properties": { 119 | "max_cpus": { 120 | "type": "integer", 121 | "description": "Maximum number of CPUs that can be requested for any single job.", 122 | "default": 16, 123 | "fa_icon": "fas fa-microchip", 124 | "hidden": true, 125 | "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. 
`--max_cpus 1`" 126 | }, 127 | "max_memory": { 128 | "type": "string", 129 | "description": "Maximum amount of memory that can be requested for any single job.", 130 | "default": "128.GB", 131 | "fa_icon": "fas fa-memory", 132 | "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", 133 | "hidden": true, 134 | "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" 135 | }, 136 | "max_time": { 137 | "type": "string", 138 | "description": "Maximum amount of time that can be requested for any single job.", 139 | "default": "240.h", 140 | "fa_icon": "far fa-clock", 141 | "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", 142 | "hidden": true, 143 | "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" 144 | } 145 | } 146 | }, 147 | "institutional_config_options": { 148 | "title": "Institutional config options", 149 | "type": "object", 150 | "fa_icon": "fas fa-university", 151 | "description": "Parameters used to describe centralised config profiles. These should not be edited.", 152 | "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", 153 | "properties": { 154 | "custom_config_version": { 155 | "type": "string", 156 | "description": "Git commit id for Institutional configs.", 157 | "default": "master", 158 | "hidden": true, 159 | "fa_icon": "fas fa-users-cog", 160 | "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" 161 | }, 162 | "custom_config_base": { 163 | "type": "string", 164 | "description": "Base directory for Institutional configs.", 165 | "default": "https://raw.githubusercontent.com/nf-core/configs/master", 166 | "hidden": true, 167 | "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", 168 | "fa_icon": "fas fa-users-cog" 169 | }, 170 | "hostnames": { 171 | "type": "string", 172 | "description": "Institutional configs hostname.", 173 | "hidden": true, 174 | "fa_icon": "fas fa-users-cog" 175 | }, 176 | "config_profile_name": { 177 | "type": "string", 178 | "description": "Institutional config name.", 179 | "hidden": true, 180 | "fa_icon": "fas fa-users-cog" 181 | }, 182 | "config_profile_description": { 183 | "type": "string", 184 | "description": "Institutional config description.", 185 | "hidden": true, 186 | "fa_icon": "fas fa-users-cog" 187 | }, 188 | "config_profile_contact": { 189 | "type": "string", 190 | "description": "Institutional config contact information.", 191 | "hidden": true, 192 | "fa_icon": "fas fa-users-cog" 193 | }, 194 | "config_profile_url": { 195 | "type": "string", 196 | "description": "Institutional config URL link.", 197 | "hidden": true, 198 | "fa_icon": "fas fa-users-cog" 199 | } 200 | } 201 | } 202 | }, 203 | "allOf": [ 204 | { 205 | "$ref": "#/definitions/input_output_options" 206 | }, 207 | { 208 | "$ref": "#/definitions/max_job_request_options" 209 | }, 210 | { 211 | "$ref": "#/definitions/institutional_config_options" 212 | } 213 | ], 214 | "properties": { 215 | "publish_dir_mode": { 216 | "type": "string", 217 | "default": "copy", 218 | "hidden": true, 219 | "description": "Method used to save pipeline results to output directory.", 220 | "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", 221 | "fa_icon": "fas fa-copy", 222 | "enum": [ 223 | "symlink", 224 | "rellink", 225 | "link", 226 | "copy", 227 | "copyNoFollow", 228 | "move" 229 | ] 230 | }, 231 | "validate_params": { 232 | "type": "boolean", 233 | "description": "Boolean whether to validate parameters against the schema at runtime", 234 | "default": true, 235 | "fa_icon": "fas fa-check-square", 236 | "hidden": true 237 | }, 238 | "email_on_fail": { 239 | "type": "string", 240 | "description": "Email address for completion summary, only when pipeline fails.", 241 | "fa_icon": "fas fa-exclamation-triangle", 242 | "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", 243 | "hidden": true, 244 | "help_text": "This works exactly as with `--email`, except emails are only sent if the workflow is not successful." 245 | }, 246 | "plaintext_email": { 247 | "type": "boolean", 248 | "description": "Send plain-text email instead of HTML.", 249 | "fa_icon": "fas fa-remove-format", 250 | "hidden": true, 251 | "help_text": "Set to receive plain-text e-mails instead of HTML formatted." 
252 | }, 253 | "max_multiqc_email_size": { 254 | "type": "string", 255 | "description": "File size limit when attaching MultiQC reports to summary emails.", 256 | "default": "25.MB", 257 | "fa_icon": "fas fa-file-upload", 258 | "hidden": true, 259 | "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." 260 | }, 261 | "monochrome_logs": { 262 | "type": "boolean", 263 | "description": "Do not use coloured log outputs.", 264 | "fa_icon": "fas fa-palette", 265 | "hidden": true, 266 | "help_text": "Set to disable colourful command line output and live life in monochrome." 267 | }, 268 | "multiqc_config": { 269 | "type": "string", 270 | "description": "Custom config file to supply to MultiQC.", 271 | "fa_icon": "fas fa-cog", 272 | "hidden": true 273 | }, 274 | "tracedir": { 275 | "type": "string", 276 | "description": "Directory to keep pipeline Nextflow logs and reports.", 277 | "default": "${params.outdir}/pipeline_info", 278 | "fa_icon": "fas fa-cogs", 279 | "hidden": true 280 | }, 281 | "email": { 282 | "type": "string" 283 | }, 284 | "igenomes_base": { 285 | "type": "string", 286 | "default": "s3://ngi-igenomes/igenomes" 287 | }, 288 | "igenomes_ignore": { 289 | "type": "string" 290 | }, 291 | "show_hidden_params": { 292 | "type": "string" 293 | }, 294 | "genome": { 295 | "type": "string" 296 | } 297 | } 298 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ###### Requirements with Version Specifiers ###### 2 | 3 | argparse == 1.4.0 4 | datetime == 4.4 5 | numpy == 1.22.3 6 | pandas == 1.4.1 7 | pysam == 0.16.0.1 8 | utils == 1.0.1 9 | tqdm == 4.62.0 10 | pyarrow == 5.0.0 11 | scipy == 1.7.1 12 | statsmodels == 0.12.2 -------------------------------------------------------------------------------- /small_data/small.config: -------------------------------------------------------------------------------- 1 | params { 2 | dataname = "test" 3 | input_file = "https://raw.githubusercontent.com/salzmanlab/SpliZ/main/small_data/small.pq" 4 | SICILIAN = true 5 | pin_S = 0.1 6 | pin_z = 0.0 7 | bounds = 5 8 | light = false 9 | svd_type = "normdonor" 10 | n_perms = 100 11 | grouping_level_2 = "compartment" 12 | grouping_level_1 = "tissue" 13 | libraryType = "10X" 14 | run_analysis = true 15 | } 16 | 17 | params.outdir = "./results/${params.dataname}" 18 | params.tracedir = "./results/${params.dataname}/pipeline_info" 19 | params.schema_ignore_params = "input,single_end,show_hidden_params,validate_params,igenomes_ignore,tracedir,igenomes_base,help,monochrome_logs,plaintext_email,max_multiqc_email_size,email_on_fail,email,multiqc_config,publish_dir_mode,genome,genomes" 20 | -------------------------------------------------------------------------------- /small_data/small.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/small_data/small.pq -------------------------------------------------------------------------------- /subworkflows/local/analysis.nf: -------------------------------------------------------------------------------- 1 | include { PVAL_PERMUTATIONS } from '../../modules/local/pval_permutations' 2 | include { FIND_SPLIZ_SITES } from '../../modules/local/find_spliz_sites' 3 | include { SUMMARIZE_RESULTS } from '../../modules/local/summarize_results' 4 | 5 | workflow ANALYSIS { 6 | take: 7 | 
splizvd_tsv 8 | splizvd_pq 9 | param_stem 10 | geneMat_samplesheet 11 | 12 | main: 13 | // Step 1: Calculate variance adjusted permutations 14 | PVAL_PERMUTATIONS ( 15 | splizvd_pq, 16 | param_stem, 17 | params.dataname, 18 | params.n_perms, 19 | params.grouping_level_2, 20 | params.grouping_level_1 21 | ) 22 | 23 | PVAL_PERMUTATIONS.out.perm_pvals 24 | .set{ pval_permutations } 25 | 26 | // Step 2: Find SpliZ sites 27 | FIND_SPLIZ_SITES ( 28 | pval_permutations, 29 | params.libraryType, 30 | geneMat_samplesheet 31 | ) 32 | 33 | // Step 3: Summarize results 34 | SUMMARIZE_RESULTS ( 35 | pval_permutations, 36 | param_stem, 37 | params.dataname, 38 | FIND_SPLIZ_SITES.out.first_evec, 39 | FIND_SPLIZ_SITES.out.second_evec, 40 | FIND_SPLIZ_SITES.out.third_evec, 41 | splizvd_tsv, 42 | params.grouping_level_2, 43 | params.grouping_level_1 44 | ) 45 | } 46 | -------------------------------------------------------------------------------- /subworkflows/local/convert_bam.nf: -------------------------------------------------------------------------------- 1 | include { CLASS_INPUT_10X } from '../../modules/local/class_input_10X' 2 | include { CLASS_INPUT_SS2 } from '../../modules/local/class_input_SS2' 3 | include { PROCESS_CLASS_INPUT } from '../../modules/local/process_class_input' 4 | include { ANN_SPLICES } from '../../modules/local/ann_splices' 5 | 6 | workflow CONVERT_BAM { 7 | take: 8 | ch_bam 9 | 10 | main: 11 | 12 | if ((params.libraryType == '10X') || (params.libraryType == "SLS")) { 13 | CLASS_INPUT_10X ( 14 | ch_bam, 15 | params.dataname, 16 | params.libraryType, 17 | params.annotator_pickle, 18 | params.gtf 19 | ) 20 | ch_light_class_input = CLASS_INPUT_10X.out.class_input 21 | } else if (params.libraryType == 'SS2') { 22 | CLASS_INPUT_SS2 ( 23 | ch_bam, 24 | params.dataname, 25 | params.libraryType, 26 | params.annotator_pickle, 27 | params.gtf 28 | ) 29 | ch_light_class_input = CLASS_INPUT_SS2.out.class_input 30 | } 31 | 32 | ch_light_class_input 33 | .collectFile(newLine: true) { files -> 34 | files.toString() 35 | } 36 | .set { ch_class_input } 37 | 38 | ch_class_input.view() 39 | 40 | PROCESS_CLASS_INPUT ( 41 | ch_class_input, 42 | params.dataname, 43 | params.libraryType, 44 | params.meta 45 | ) 46 | 47 | ANN_SPLICES ( 48 | PROCESS_CLASS_INPUT.out.pq, 49 | params.exon_pickle, 50 | params.splice_pickle 51 | ) 52 | 53 | emit: 54 | tsv = ANN_SPLICES.out.tsv 55 | } -------------------------------------------------------------------------------- /subworkflows/local/preprocess.nf: -------------------------------------------------------------------------------- 1 | include { CONVERT_PARQUET } from './../../modules/local/convert_parquet' 2 | include { CONVERT_BAM } from './convert_bam' 3 | 4 | workflow PREPROCESS { 5 | 6 | main: 7 | 8 | convert_bam = false 9 | 10 | if (params.input_file && params.samplesheet) { 11 | exit 1, "Invalid input, provide either input_file or samplesheet but not both." 12 | } else if (params.samplesheet) { 13 | if (params.SICILIAN) { 14 | exit 1, "Invalid input, SICILIAN inputs must be provided as input_file." 
15 | } else { 16 | if ((params.libraryType == '10X') || (params.libraryType == "SLS")) { 17 | ch_bam = Channel.fromPath(params.samplesheet) 18 | .splitCsv(header:false) 19 | .map { row -> 20 | tuple( 21 | row[0], // bam file sample_ID 22 | file(row[1]) // bam file path 23 | ) 24 | } 25 | convert_bam = true 26 | } else if (params.libraryType == 'SS2') { 27 | ch_bam = Channel.fromPath(params.samplesheet) 28 | .splitCsv(header:false) 29 | .map { row -> 30 | tuple( 31 | row[0], // bam file sample_ID 32 | file(row[1]), // R1 bam file path 33 | file(row[2]) // R2 bam file path 34 | ) 35 | } 36 | convert_bam = true 37 | } 38 | } 39 | } else if (params.input_file) { 40 | input_file = file(params.input_file) 41 | def is_valid_input_file = input_file.extension in ["tsv", "pq", "txt", "bam"] 42 | if (!is_valid_input_file) { 43 | exit 1, "Invalid input file type supplied, options are *.bam, *.pq, *.txt, or *.tsv." 44 | } 45 | if (params.SICILIAN) { 46 | if (input_file.extension == "bam") { 47 | exit 1, "Invalid input, SICILIAN input must be a tsv, pq, or txt file." 48 | } else { 49 | ch_input = Channel.fromPath(params.input_file) 50 | } 51 | } else { 52 | if (input_file.extension == "bam") { 53 | if (!params.dataname) { 54 | exit 1, "Must provide dataname for bam file." 55 | } 56 | ch_bam = Channel.fromPath(params.input_file) 57 | .map { it -> 58 | tuple( 59 | params.dataname, 60 | file(it) 61 | ) 62 | } 63 | convert_bam = true 64 | } else { 65 | ch_input = Channel.fromPath(params.input_file) 66 | } 67 | } 68 | } else { 69 | exit 1, "No input_file or samplesheet provided." 70 | } 71 | 72 | if (convert_bam) { 73 | CONVERT_BAM ( 74 | ch_bam 75 | ) 76 | ch_input = CONVERT_BAM.out.tsv 77 | } 78 | 79 | emit: 80 | input = ch_input 81 | 82 | } -------------------------------------------------------------------------------- /subworkflows/local/spliz.nf: -------------------------------------------------------------------------------- 1 | include { CALC_SPLIZVD } from '../../modules/local/calc_splizvd' 2 | 3 | workflow SPLIZ { 4 | take: 5 | ch_input 6 | 7 | main: 8 | 9 | def suff_light = params.light ? "_light" : "" 10 | def suff_SICILIAN = params.SICILIAN ? "_SICILIAN" : "" 11 | def suff_rank_quant = params.rank_quant == 0 ? "" : "_r_${params.rank_quant}" 12 | 13 | def isLight = params.light ? "1" : "0" 14 | def isSICILIAN = params.SICILIAN ? 
"1" : "0" 15 | 16 | param_stem = "S_${params.pin_S}_z_${params.pin_z}_b_${params.bounds}${suff_rank_quant}${suff_light}${suff_SICILIAN}" 17 | 18 | // Step 1: Calculate RIJK zscore 19 | CALC_SPLIZVD ( 20 | ch_input, 21 | param_stem, 22 | params.dataname, 23 | params.pin_S, 24 | params.pin_z, 25 | params.bounds, 26 | params.svd_type, 27 | params.grouping_level_1, 28 | params.grouping_level_2, 29 | isLight, 30 | isSICILIAN, 31 | params.rank_quant 32 | ) 33 | 34 | emit: 35 | geneMat_samplesheet = CALC_SPLIZVD.out.matSheet 36 | splizvd_tsv = CALC_SPLIZVD.out.tsv 37 | splizvd_pq = CALC_SPLIZVD.out.pq 38 | param_stem = param_stem 39 | } 40 | -------------------------------------------------------------------------------- /workflows/spliz_pipeline.nf: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | VALIDATE INPUTS 4 | ======================================================================================== 5 | */ 6 | 7 | // Check params with defined inputs 8 | def is_valid_svd_type = params.svd_type in ["normgene", "normdonor"] 9 | if (!is_valid_svd_type) { 10 | exit 1, "Invalid svd_type; options are 'normgene' and 'normdonor'." 11 | } 12 | 13 | def is_valid_libraryType = params.libraryType in ["SS2", "10X", "SLS"] 14 | if (!is_valid_libraryType) { 15 | exit 1, "Invalid libraryType; options are 'SS2', '10X', and 'SLS'." 16 | } 17 | 18 | /* 19 | ======================================================================================== 20 | IMPORT LOCAL MODULES/SUBWORKFLOWS 21 | ======================================================================================== 22 | */ 23 | include { PREPROCESS } from './../subworkflows/local/preprocess' 24 | include { SPLIZ } from './../subworkflows/local/spliz' 25 | include { ANALYSIS } from './../subworkflows/local/analysis' 26 | 27 | /* 28 | ======================================================================================== 29 | RUN MAIN WORKFLOW 30 | ======================================================================================== 31 | */ 32 | 33 | workflow SPLIZ_PIPELINE { 34 | 35 | PREPROCESS () 36 | 37 | 38 | SPLIZ ( 39 | PREPROCESS.out.input 40 | ) 41 | 42 | if (params.run_analysis) { 43 | ANALYSIS ( 44 | SPLIZ.out.splizvd_tsv, 45 | SPLIZ.out.splizvd_pq, 46 | SPLIZ.out.param_stem, 47 | SPLIZ.out.geneMat_samplesheet 48 | ) 49 | } 50 | } 51 | 52 | /* 53 | ======================================================================================== 54 | THE END 55 | ======================================================================================== 56 | */ 57 | --------------------------------------------------------------------------------