├── .gitattributes ├── .github ├── .dockstore.yml ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md ├── markdownlint.yml └── workflows │ ├── awsfulltest.yml │ ├── awstest.yml │ ├── branch.yml │ ├── ci.yml │ ├── linting.yml │ ├── linting_comment.yml │ ├── push_dockerhub_dev.yml │ └── push_dockerhub_release.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── Dockerfile ├── LICENSE ├── README.md ├── assets ├── email_template.html ├── email_template.txt ├── multiqc_config.yaml ├── nf-core-spliz_logo.png └── sendmail_template.txt ├── bin ├── ann_splices.py ├── annotator.py ├── calc_splizvd.py ├── convert_parquet.py ├── final_summary.py ├── find_SpliZ_sites.R ├── light_class_input_subcols.py ├── light_utils.py ├── markdown_to_html.py ├── parquet_to_tsv.py ├── process_CI.py ├── rijk_zscore.py ├── scrape_software_versions.py ├── svd_zscore.py └── variance_adjusted_permutations_bytiss.py ├── conf ├── base.config ├── igenomes.config ├── test.config └── test_full.config ├── docs ├── README.md ├── images │ └── nf-core-spliz_logo.png ├── output.md └── usage.md ├── environment.yml ├── lib ├── Headers.groovy ├── NfcoreSchema.groovy └── nfcore_external_java_deps.jar ├── main.nf ├── modules └── local │ ├── ann_splices.nf │ ├── calc_rijk_zscore.nf │ ├── calc_splizvd.nf │ ├── class_input_10X.nf │ ├── class_input_SS2.nf │ ├── convert_parquet.nf │ ├── convert_split_parquet.nf │ ├── find_spliz_sites.nf │ ├── preprocess_tsv.nf │ ├── process_class_input.nf │ ├── pval_permutations.nf │ └── summarize_results.nf ├── nextflow.config ├── nextflow_schema.json ├── requirements.txt ├── small_data ├── small.config ├── small.pq └── small.tsv ├── subworkflows └── local │ ├── analysis.nf │ ├── convert_bam.nf │ ├── preprocess.nf │ └── spliz.nf └── workflows └── spliz_pipeline.nf /.gitattributes: -------------------------------------------------------------------------------- 1 | *.config linguist-language=nextflow 2 | -------------------------------------------------------------------------------- /.github/.dockstore.yml: -------------------------------------------------------------------------------- 1 | # Dockstore config version, not pipeline version 2 | version: 1.2 3 | workflows: 4 | - subclass: nfl 5 | primaryDescriptorPath: /nextflow.config 6 | publish: True 7 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Contributing Guidelines 2 | 3 | Hi there! 4 | Many thanks for taking an interest in improving nf-core/spliz. 5 | 6 | We try to manage the required tasks for nf-core/spliz using GitHub issues, you probably came to this page when creating one. 7 | Please use the pre-filled template to save time. 8 | 9 | However, don't be put off by this template - other more general issues and suggestions are welcome! 10 | Contributions to the code are even more welcome ;) 11 | 12 | > If you need help using or modifying nf-core/spliz then the best place to ask is on the nf-core Slack [#spliz](https://nfcore.slack.com/channels/spliz) channel ([join our Slack here](https://nf-co.re/join/slack)). 13 | 14 | ## Contribution workflow 15 | 16 | If you'd like to write some code for nf-core/spliz, the standard workflow is as follows: 17 | 18 | 1. 
Check that there isn't already an issue about your idea in the [nf-core/spliz issues](https://github.com/nf-core/spliz/issues) to avoid duplicating work 19 | * If there isn't one already, please create one so that others know you're working on this 20 | 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/spliz repository](https://github.com/nf-core/spliz) to your GitHub account 21 | 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 22 | 4. Use `nf-core schema build .` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 23 | 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged 24 | 25 | If you're not used to this workflow with git, you can start with some [docs from GitHub](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests) or even their [excellent `git` resources](https://try.github.io/). 26 | 27 | ## Tests 28 | 29 | When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. 30 | Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. 31 | 32 | There are typically two types of tests that run: 33 | 34 | ### Lint tests 35 | 36 | `nf-core` has a [set of guidelines](https://nf-co.re/developers/guidelines) which all pipelines must adhere to. 37 | To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. 38 | 39 | If any failures or warnings are encountered, please follow the listed URL for more documentation. 40 | 41 | ### Pipeline tests 42 | 43 | Each `nf-core` pipeline should be set up with a minimal set of test-data. 44 | `GitHub Actions` then runs the pipeline on this data to ensure that it exits successfully. 45 | If there are any failures then the automated tests fail. 46 | These tests are run both with the latest available version of `Nextflow` and also the minimum required version that is stated in the pipeline code. 47 | 48 | ## Patch 49 | 50 | :warning: Only in the unlikely and regretful event of a release happening with a bug. 51 | 52 | * On your own fork, make a new branch `patch` based on `upstream/master`. 53 | * Fix the bug, and bump version (X.Y.Z+1). 54 | * A PR should be made on `master` from patch to directly this particular bug. 55 | 56 | ## Getting help 57 | 58 | For further information/help, please consult the [nf-core/spliz documentation](https://nf-co.re/spliz/usage) and don't hesitate to get in touch on the nf-core Slack [#spliz](https://nfcore.slack.com/channels/spliz) channel ([join our Slack here](https://nf-co.re/join/slack)). 59 | 60 | ## Pipeline contribution conventions 61 | 62 | To make the nf-core/spliz code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. 63 | 64 | ### Adding a new step 65 | 66 | If you wish to contribute a new step, please use the following coding standards: 67 | 68 | 1. Define the corresponding input channel into your new process from the expected previous process channel 69 | 2. 
Write the process block (see below). 70 | 3. Define the output channel if needed (see below). 71 | 4. Add any new flags/options to `nextflow.config` with a default (see below). 72 | 5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build .`). 73 | 6. Add any new flags/options to the help message (for integer/text parameters, print to help the corresponding `nextflow.config` parameter). 74 | 7. Add sanity checks for all relevant parameters. 75 | 8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`. 76 | 9. Do local tests that the new code works properly and as expected. 77 | 10. Add a new test command in `.github/workflow/ci.yaml`. 78 | 11. If applicable add a [MultiQC](https://https://multiqc.info/) module. 79 | 12. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order. 80 | 13. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`. 81 | 82 | ### Default values 83 | 84 | Parameters should be initialised / defined with default values in `nextflow.config` under the `params` scope. 85 | 86 | Once there, use `nf-core schema build .` to add to `nextflow_schema.json`. 87 | 88 | ### Default processes resource requirements 89 | 90 | Sensible defaults for process resource requirements (CPUs / memory / time) for a process should be defined in `conf/base.config`. These should generally be specified generic with `withLabel:` selectors so they can be shared across multiple processes/steps of the pipeline. A nf-core standard set of labels that should be followed where possible can be seen in the [nf-core pipeline template](https://github.com/nf-core/tools/blob/master/nf_core/pipeline-template/conf/base.config), which has the default process as a single core-process, and then different levels of multi-core configurations for increasingly large memory requirements defined with standardised labels. 91 | 92 | The process resources can be passed on to the tool dynamically within the process with the `${task.cpu}` and `${task.memory}` variables in the `script:` block. 93 | 94 | ### Naming schemes 95 | 96 | Please use the following naming schemes, to make it easy to understand what is going where. 97 | 98 | * initial process channel: `ch_output_from_` 99 | * intermediate and terminal channels: `ch__for_` 100 | 101 | ### Nextflow version bumping 102 | 103 | If you are using a new feature from core Nextflow, you may bump the minimum required version of nextflow in the pipeline with: `nf-core bump-version --nextflow . [min-nf-version]` 104 | 105 | ### Software version reporting 106 | 107 | If you add a new tool to the pipeline, please ensure you add the information of the tool to the `get_software_version` process. 108 | 109 | Add to the script block of the process, something like the following: 110 | 111 | ```bash 112 | --version &> v_.txt 2>&1 || true 113 | ``` 114 | 115 | or 116 | 117 | ```bash 118 | --help | head -n 1 &> v_.txt 2>&1 || true 119 | ``` 120 | 121 | You then need to edit the script `bin/scrape_software_versions.py` to: 122 | 123 | 1. Add a Python regex for your tool's `--version` output (as in stored in the `v_.txt` file), to ensure the version is reported as a `v` and the version number e.g. `v2.1.1` 124 | 2. Add a HTML entry to the `OrderedDict` for formatting in MultiQC. 
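For illustration, here is a minimal sketch of what those two additions to `bin/scrape_software_versions.py` might look like for a hypothetical tool called `mytool` (the tool name, the `v_mytool.txt` file name, the regex, and the HTML formatting below are assumptions for the example, not the pipeline's actual entries):

```python
# Hypothetical example only: "mytool" and v_mytool.txt are placeholders.
import re
from collections import OrderedDict

# 1. Regex entry: which version file to read and how to pull the version out of it
regexes = {
    "mytool": ["v_mytool.txt", r"mytool v(\S+)"],
}

# 2. OrderedDict entry: placeholder shown in the MultiQC report if scraping fails
results = OrderedDict()
results["mytool"] = '<span style="color:#999999;">N/A</span>'

for tool, (fname, regex) in regexes.items():
    try:
        with open(fname) as fh:
            match = re.search(regex, fh.read())
        if match:
            results[tool] = "v{}".format(match.group(1))
    except IOError:
        pass  # keep the N/A placeholder if the version file is missing

# HTML definition-list entries rendered in the MultiQC software-versions section
print("\n".join("<dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v) for k, v in results.items()))
```

If the pipeline's script follows the nf-core template, the scraping loop already exists, so adding a tool usually only requires the two dictionary entries shown above.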
125 | 126 | ### Images and figures 127 | 128 | For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). 129 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report something that is broken or incorrect 4 | labels: bug 5 | --- 6 | 7 | 15 | 16 | ## Check Documentation 17 | 18 | I have checked the following places for your error: 19 | 20 | - [ ] [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) 21 | - [ ] [nf-core/spliz pipeline documentation](https://nf-co.re/nf-core/spliz/usage) 22 | 23 | ## Description of the bug 24 | 25 | 26 | 27 | ## Steps to reproduce 28 | 29 | Steps to reproduce the behaviour: 30 | 31 | 1. Command line: 32 | 2. See error: 33 | 34 | ## Expected behaviour 35 | 36 | 37 | 38 | ## Log files 39 | 40 | Have you provided the following extra information/files: 41 | 42 | - [ ] The command used to run the pipeline 43 | - [ ] The `.nextflow.log` file 44 | 45 | ## System 46 | 47 | - Hardware: 48 | - Executor: 49 | - OS: 50 | - Version 51 | 52 | ## Nextflow Installation 53 | 54 | - Version: 55 | 56 | ## Container engine 57 | 58 | - Engine: 59 | - version: 60 | - Image tag: 61 | 62 | ## Additional context 63 | 64 | 65 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Join nf-core 4 | url: https://nf-co.re/join 5 | about: Please join the nf-core community here 6 | - name: "Slack #spliz channel" 7 | url: https://nfcore.slack.com/channels/spliz 8 | about: Discussion about the nf-core/spliz pipeline 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for the nf-core/spliz pipeline 4 | labels: enhancement 5 | --- 6 | 7 | 15 | 16 | ## Is your feature request related to a problem? Please describe 17 | 18 | 19 | 20 | 21 | 22 | ## Describe the solution you'd like 23 | 24 | 25 | 26 | ## Describe alternatives you've considered 27 | 28 | 29 | 30 | ## Additional context 31 | 32 | 33 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 13 | 14 | 15 | ## PR checklist 16 | 17 | - [ ] This comment contains a description of changes (with reason). 18 | - [ ] If you've fixed a bug or added code that should be tested, add tests! 19 | - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). 20 | - [ ] Usage Documentation in `docs/usage.md` is updated. 21 | - [ ] Output Documentation in `docs/output.md` is updated. 22 | - [ ] `CHANGELOG.md` is updated. 23 | - [ ] `README.md` is updated (including new tool citations and authors/contributors). 
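The checks behind the boxes above can be reproduced locally before pushing; a minimal sketch, assuming Docker, Nextflow, `nf-core/tools`, `markdownlint-cli`, and `yaml-lint` are installed (the exact invocations CI uses live in `.github/workflows/`):

```bash
# Suggested local pre-flight; not part of the official nf-core template.
nf-core lint .                                              # nf-core guideline checks
markdownlint . --config .github/markdownlint.yml            # markdown style checks
yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")  # YAML checks
nextflow run . -profile test,docker                         # minimal test dataset
```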
24 | -------------------------------------------------------------------------------- /.github/markdownlint.yml: -------------------------------------------------------------------------------- 1 | # Markdownlint configuration file 2 | default: true 3 | line-length: false 4 | no-duplicate-header: 5 | siblings_only: true 6 | no-inline-html: 7 | allowed_elements: 8 | - img 9 | - p 10 | - kbd 11 | - details 12 | - summary 13 | -------------------------------------------------------------------------------- /.github/workflows/awsfulltest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS full size tests 2 | # This workflow is triggered on published releases. 3 | # It can be additionally triggered manually with GitHub actions workflow dispatch. 4 | # It runs the -profile 'test_full' on AWS batch 5 | 6 | on: 7 | workflow_run: 8 | workflows: ["nf-core Docker push (release)"] 9 | types: [completed] 10 | workflow_dispatch: 11 | 12 | 13 | env: 14 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 15 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 16 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 17 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 18 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 19 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 20 | 21 | 22 | jobs: 23 | run-awstest: 24 | name: Run AWS full tests 25 | if: github.repository == 'nf-core/spliz' 26 | runs-on: ubuntu-latest 27 | steps: 28 | - name: Setup Miniconda 29 | uses: conda-incubator/setup-miniconda@v2 30 | with: 31 | auto-update-conda: true 32 | python-version: 3.7 33 | - name: Install awscli 34 | run: conda install -c conda-forge awscli 35 | - name: Start AWS batch job 36 | # TODO nf-core: You can customise AWS full pipeline tests as required 37 | # Add full size test data (but still relatively small datasets for few samples) 38 | # on the `test_full.config` test runs with only one set of parameters 39 | # Then specify `-profile test_full` instead of `-profile test` on the AWS batch command 40 | run: | 41 | aws batch submit-job \ 42 | --region eu-west-1 \ 43 | --job-name nf-core-spliz \ 44 | --job-queue $AWS_JOB_QUEUE \ 45 | --job-definition $AWS_JOB_DEFINITION \ 46 | --container-overrides '{"command": ["nf-core/spliz", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/spliz/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/spliz/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 47 | -------------------------------------------------------------------------------- /.github/workflows/awstest.yml: -------------------------------------------------------------------------------- 1 | name: nf-core AWS test 2 | # This workflow is triggered on push to the master branch. 3 | # It can be additionally triggered manually with GitHub actions workflow dispatch. 4 | # It runs the -profile 'test' on AWS batch. 
5 | 6 | on: 7 | workflow_dispatch: 8 | 9 | 10 | env: 11 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 12 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 13 | TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }} 14 | AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }} 15 | AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }} 16 | AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }} 17 | 18 | 19 | jobs: 20 | run-awstest: 21 | name: Run AWS tests 22 | if: github.repository == 'nf-core/spliz' 23 | runs-on: ubuntu-latest 24 | steps: 25 | - name: Setup Miniconda 26 | uses: conda-incubator/setup-miniconda@v2 27 | with: 28 | auto-update-conda: true 29 | python-version: 3.7 30 | - name: Install awscli 31 | run: conda install -c conda-forge awscli 32 | - name: Start AWS batch job 33 | # TODO nf-core: You can customise CI pipeline run tests as required 34 | # For example: adding multiple test runs with different parameters 35 | # Remember that you can parallelise this by using strategy.matrix 36 | run: | 37 | aws batch submit-job \ 38 | --region eu-west-1 \ 39 | --job-name nf-core-spliz \ 40 | --job-queue $AWS_JOB_QUEUE \ 41 | --job-definition $AWS_JOB_DEFINITION \ 42 | --container-overrides '{"command": ["nf-core/spliz", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/spliz/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/spliz/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}' 43 | -------------------------------------------------------------------------------- /.github/workflows/branch.yml: -------------------------------------------------------------------------------- 1 | name: nf-core branch protection 2 | # This workflow is triggered on PRs to master branch on the repository 3 | # It fails when someone tries to make a PR against the nf-core `master` branch instead of `dev` 4 | on: 5 | pull_request_target: 6 | branches: [master] 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | steps: 12 | # PRs to the nf-core repo master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches 13 | - name: Check PRs 14 | if: github.repository == 'nf-core/spliz' 15 | run: | 16 | { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/spliz ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] 17 | 18 | 19 | # If the above check failed, post a comment on the PR explaining the failure 20 | # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets 21 | - name: Post PR comment 22 | if: failure() 23 | uses: mshick/add-pr-comment@v1 24 | with: 25 | message: | 26 | ## This PR is against the `master` branch :x: 27 | 28 | * Do not close this PR 29 | * Click _Edit_ and change the `base` to `dev` 30 | * This CI test will remain failed until you push a new commit 31 | 32 | --- 33 | 34 | Hi @${{ github.event.pull_request.user.login }}, 35 | 36 | It looks like this pull-request is has been made against the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `master` branch. 37 | The `master` branch on nf-core repositories should always contain code from the latest release. 38 | Because of this, PRs to `master` are only allowed if they come from the [${{github.event.pull_request.head.repo.full_name }}](https://github.com/${{github.event.pull_request.head.repo.full_name }}) `dev` branch. 
39 | 40 | You do not need to close this PR, you can change the target branch to `dev` by clicking the _"Edit"_ button at the top of this page. 41 | Note that even after this, the test will continue to show as failing until you push a new commit. 42 | 43 | Thanks again for your contribution! 44 | repo-token: ${{ secrets.GITHUB_TOKEN }} 45 | allow-repeats: false 46 | 47 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: nf-core CI 2 | # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | pull_request: 8 | release: 9 | types: [published] 10 | 11 | # Uncomment if we need an edge release of Nextflow again 12 | # env: NXF_EDGE: 1 13 | 14 | jobs: 15 | test: 16 | name: Run workflow tests 17 | # Only run on push if this is the nf-core dev branch (merged PRs) 18 | if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/spliz') }} 19 | runs-on: ubuntu-latest 20 | env: 21 | NXF_VER: ${{ matrix.nxf_ver }} 22 | NXF_ANSI_LOG: false 23 | strategy: 24 | matrix: 25 | # Nextflow versions: check pipeline minimum and current latest 26 | nxf_ver: ['20.04.0', ''] 27 | steps: 28 | - name: Check out pipeline code 29 | uses: actions/checkout@v2 30 | 31 | - name: Check if Dockerfile or Conda environment changed 32 | uses: technote-space/get-diff-action@v4 33 | with: 34 | FILES: | 35 | Dockerfile 36 | environment.yml 37 | 38 | - name: Build new docker image 39 | if: env.MATCHED_FILES 40 | run: docker build --no-cache . -t nfcore/spliz:dev 41 | 42 | - name: Pull docker image 43 | if: ${{ !env.MATCHED_FILES }} 44 | run: | 45 | docker pull nfcore/spliz:dev 46 | docker tag nfcore/spliz:dev nfcore/spliz:dev 47 | 48 | - name: Install Nextflow 49 | env: 50 | CAPSULE_LOG: none 51 | run: | 52 | wget -qO- get.nextflow.io | bash 53 | sudo mv nextflow /usr/local/bin/ 54 | 55 | - name: Run pipeline with test data 56 | # TODO nf-core: You can customise CI pipeline run tests as required 57 | # For example: adding multiple test runs with different parameters 58 | # Remember that you can parallelise this by using strategy.matrix 59 | run: | 60 | nextflow run ${GITHUB_WORKSPACE} -profile test,docker 61 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: nf-core linting 2 | # This workflow is triggered on pushes and PRs to the repository. 
3 | # It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines 4 | on: 5 | push: 6 | pull_request: 7 | release: 8 | types: [published] 9 | 10 | jobs: 11 | Markdown: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - uses: actions/setup-node@v1 16 | with: 17 | node-version: '10' 18 | - name: Install markdownlint 19 | run: npm install -g markdownlint-cli 20 | - name: Run Markdownlint 21 | run: markdownlint ${GITHUB_WORKSPACE} -c ${GITHUB_WORKSPACE}/.github/markdownlint.yml 22 | 23 | # If the above check failed, post a comment on the PR explaining the failure 24 | - name: Post PR comment 25 | if: failure() 26 | uses: mshick/add-pr-comment@v1 27 | with: 28 | message: | 29 | ## Markdown linting is failing 30 | 31 | To keep the code consistent with lots of contributors, we run automated code consistency checks. 32 | To fix this CI test, please run: 33 | 34 | * Install `markdownlint-cli` 35 | * On Mac: `brew install markdownlint-cli` 36 | * Everything else: [Install `npm`](https://www.npmjs.com/get-npm) then [install `markdownlint-cli`](https://www.npmjs.com/package/markdownlint-cli) (`npm install -g markdownlint-cli`) 37 | * Fix the markdown errors 38 | * Automatically: `markdownlint . --config .github/markdownlint.yml --fix` 39 | * Manually resolve anything left from `markdownlint . --config .github/markdownlint.yml` 40 | 41 | Once you push these changes the test should pass, and you can hide this comment :+1: 42 | 43 | We highly recommend setting up markdownlint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! 44 | 45 | Thanks again for your contribution! 46 | repo-token: ${{ secrets.GITHUB_TOKEN }} 47 | allow-repeats: false 48 | 49 | 50 | YAML: 51 | runs-on: ubuntu-latest 52 | steps: 53 | - uses: actions/checkout@v1 54 | - uses: actions/setup-node@v1 55 | with: 56 | node-version: '10' 57 | - name: Install yaml-lint 58 | run: npm install -g yaml-lint 59 | - name: Run yaml-lint 60 | run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") 61 | 62 | # If the above check failed, post a comment on the PR explaining the failure 63 | - name: Post PR comment 64 | if: failure() 65 | uses: mshick/add-pr-comment@v1 66 | with: 67 | message: | 68 | ## YAML linting is failing 69 | 70 | To keep the code consistent with lots of contributors, we run automated code consistency checks. 71 | To fix this CI test, please run: 72 | 73 | * Install `yaml-lint` 74 | * [Install `npm`](https://www.npmjs.com/get-npm) then [install `yaml-lint`](https://www.npmjs.com/package/yaml-lint) (`npm install -g yaml-lint`) 75 | * Fix the markdown errors 76 | * Run the test locally: `yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")` 77 | * Fix any reported errors in your YAML files 78 | 79 | Once you push these changes the test should pass, and you can hide this comment :+1: 80 | 81 | We highly recommend setting up yaml-lint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! 82 | 83 | Thanks again for your contribution! 
84 | repo-token: ${{ secrets.GITHUB_TOKEN }} 85 | allow-repeats: false 86 | 87 | 88 | nf-core: 89 | runs-on: ubuntu-latest 90 | steps: 91 | 92 | - name: Check out pipeline code 93 | uses: actions/checkout@v2 94 | 95 | - name: Install Nextflow 96 | env: 97 | CAPSULE_LOG: none 98 | run: | 99 | wget -qO- get.nextflow.io | bash 100 | sudo mv nextflow /usr/local/bin/ 101 | 102 | - uses: actions/setup-python@v1 103 | with: 104 | python-version: '3.6' 105 | architecture: 'x64' 106 | 107 | - name: Install dependencies 108 | run: | 109 | python -m pip install --upgrade pip 110 | pip install nf-core 111 | 112 | - name: Run nf-core lint 113 | env: 114 | GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }} 115 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 116 | GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }} 117 | run: nf-core -l lint_log.txt lint ${GITHUB_WORKSPACE} --markdown lint_results.md 118 | 119 | - name: Save PR number 120 | if: ${{ always() }} 121 | run: echo ${{ github.event.pull_request.number }} > PR_number.txt 122 | 123 | - name: Upload linting log file artifact 124 | if: ${{ always() }} 125 | uses: actions/upload-artifact@v2 126 | with: 127 | name: linting-logs 128 | path: | 129 | lint_log.txt 130 | lint_results.md 131 | PR_number.txt 132 | 133 | -------------------------------------------------------------------------------- /.github/workflows/linting_comment.yml: -------------------------------------------------------------------------------- 1 | 2 | name: nf-core linting comment 3 | # This workflow is triggered after the linting action is complete 4 | # It posts an automated comment to the PR, even if the PR is coming from a fork 5 | 6 | on: 7 | workflow_run: 8 | workflows: ["nf-core linting"] 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Download lint results 15 | uses: dawidd6/action-download-artifact@v2 16 | with: 17 | workflow: linting.yml 18 | 19 | - name: Get PR number 20 | id: pr_number 21 | run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" 22 | 23 | - name: Post PR comment 24 | uses: marocchino/sticky-pull-request-comment@v2 25 | with: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | number: ${{ steps.pr_number.outputs.pr_number }} 28 | path: linting-logs/lint_results.md 29 | 30 | -------------------------------------------------------------------------------- /.github/workflows/push_dockerhub_dev.yml: -------------------------------------------------------------------------------- 1 | name: nf-core Docker push (dev) 2 | # This builds the docker image and pushes it to DockerHub 3 | # Runs on nf-core repo releases and push event to 'dev' branch (PR merges) 4 | on: 5 | push: 6 | branches: 7 | - dev 8 | 9 | jobs: 10 | push_dockerhub: 11 | name: Push new Docker image to Docker Hub (dev) 12 | runs-on: ubuntu-latest 13 | # Only run for the nf-core repo, for releases and merged PRs 14 | if: ${{ github.repository == 'nf-core/spliz' }} 15 | env: 16 | DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} 17 | DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} 18 | steps: 19 | - name: Check out pipeline code 20 | uses: actions/checkout@v2 21 | 22 | - name: Build new docker image 23 | run: docker build --no-cache . 
-t nfcore/spliz:dev 24 | 25 | - name: Push Docker image to DockerHub (dev) 26 | run: | 27 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 28 | docker push nfcore/spliz:dev 29 | -------------------------------------------------------------------------------- /.github/workflows/push_dockerhub_release.yml: -------------------------------------------------------------------------------- 1 | name: nf-core Docker push (release) 2 | # This builds the docker image and pushes it to DockerHub 3 | # Runs on nf-core repo releases and push event to 'dev' branch (PR merges) 4 | on: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | push_dockerhub: 10 | name: Push new Docker image to Docker Hub (release) 11 | runs-on: ubuntu-latest 12 | # Only run for the nf-core repo, for releases and merged PRs 13 | if: ${{ github.repository == 'nf-core/spliz' }} 14 | env: 15 | DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} 16 | DOCKERHUB_PASS: ${{ secrets.DOCKERHUB_PASS }} 17 | steps: 18 | - name: Check out pipeline code 19 | uses: actions/checkout@v2 20 | 21 | - name: Build new docker image 22 | run: docker build --no-cache . -t nfcore/spliz:latest 23 | 24 | - name: Push Docker image to DockerHub (release) 25 | run: | 26 | echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin 27 | docker push nfcore/spliz:latest 28 | docker tag nfcore/spliz:latest nfcore/spliz:${{ github.event.release.tag_name }} 29 | docker push nfcore/spliz:${{ github.event.release.tag_name }} 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .nextflow* 2 | work/ 3 | data/ 4 | results/ 5 | .DS_Store 6 | test* 7 | tests/ 8 | testing/ 9 | testing* 10 | *.pyc 11 | HLCA* 12 | original* 13 | TSP* 14 | *out 15 | *err 16 | *sbatch 17 | samplesheets/* 18 | sandbox* 19 | *mouse* 20 | s3* 21 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Changelog 2 | 3 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) 4 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 5 | 6 | ## v1.0dev - [date] 7 | 8 | Initial release of nf-core/spliz, created with the [nf-core](https://nf-co.re/) template. 
9 | 10 | ### `Added` 11 | 12 | ### `Fixed` 13 | 14 | ### `Dependencies` 15 | 16 | ### `Deprecated` 17 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct at nf-core (v1.0) 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: 6 | 7 | - Age 8 | - Body size 9 | - Familial status 10 | - Gender identity and expression 11 | - Geographical location 12 | - Level of experience 13 | - Nationality and national origins 14 | - Native language 15 | - Physical and neurological ability 16 | - Race or ethnicity 17 | - Religion 18 | - Sexual identity and orientation 19 | - Socioeconomic status 20 | 21 | Please note that the list above is alphabetised and is therefore not ranked in any order of preference or importance. 22 | 23 | ## Preamble 24 | 25 | > Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. 26 | 27 | An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. 28 | 29 | nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. 30 | 31 | We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. 32 | 33 | Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. 34 | 35 | We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. 36 | 37 | Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re 38 | 39 | ## Our Responsibilities 40 | 41 | The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. 42 | 43 | The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
44 | 45 | Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. 46 | 47 | ## When are where does this Code of Conduct apply? 48 | 49 | Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: 50 | 51 | - Communicating with an official project email address. 52 | - Communicating with community members within the nf-core Slack channel. 53 | - Participating in hackathons organised by nf-core (both online and in-person events). 54 | - Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. 55 | - Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. 56 | - Representing nf-core on social media. This includes both official and personal accounts. 57 | 58 | ## nf-core cares 😊 59 | 60 | nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): 61 | 62 | - Ask for consent before sharing another community member’s personal information (including photographs) on social media. 63 | - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. 64 | - Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) 65 | - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) 66 | - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) 67 | - Focus on what is best for the team and the community. (When in doubt, ask) 68 | - Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. 69 | - Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) 70 | - Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) 71 | - Take breaks when you feel like you need them. 72 | - Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) 73 | 74 | ## nf-core frowns on 😕 75 | 76 | The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. 
Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. 77 | 78 | - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. 79 | - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. 80 | - Spamming or trolling of individuals on social media. 81 | - Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. 82 | - Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. 83 | 84 | ### Online Trolling 85 | 86 | The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. 87 | 88 | All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. 89 | 90 | ## Procedures for Reporting CoC violations 91 | 92 | If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. 93 | 94 | You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). 95 | 96 | Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. 97 | 98 | All reports will be handled with utmost discretion and confidentially. 99 | 100 | ## Attribution and Acknowledgements 101 | 102 | - The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) 103 | - The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) 104 | - The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) 105 | - The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) 106 | 107 | ## Changelog 108 | 109 | ### v1.0 - March 12th, 2021 110 | 111 | - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. 
112 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # based on existing Docker image 2 | FROM ubuntu:20.04 3 | ENV DEBIAN_FRONTEND=noninteractive 4 | 5 | # dependencies, some are probably unnecessary 6 | RUN apt-get update && apt-get install -y wget && apt-get install -y --no-install-recommends build-essential r-base python3.9 python3-pip python3-setuptools python3-dev 7 | RUN apt-get update -qq && apt-get -y --no-install-recommends install \ 8 | r-base-dev \ 9 | libgsl0-dev \ 10 | libxml2-dev \ 11 | libcairo2-dev \ 12 | libsqlite-dev \ 13 | libpq-dev \ 14 | libicu-dev \ 15 | libbz2-dev \ 16 | liblzma-dev \ 17 | libfontconfig1-dev \ 18 | libssl-dev \ 19 | libcurl4-openssl-dev \ 20 | libnetcdf-dev \ 21 | udunits-bin \ 22 | libopenblas-dev \ 23 | libudunits2-dev \ 24 | curl 25 | RUN apt-get update -qq && apt-get -y --no-install-recommends install \ 26 | autoconf \ 27 | automake \ 28 | g++ \ 29 | gcc \ 30 | gfortran \ 31 | make \ 32 | && apt-get clean all \ 33 | && rm -rf /var/lib/apt/lists/* 34 | 35 | # Python packages 36 | WORKDIR /app 37 | COPY requirements.txt /app/requirements.txt 38 | RUN pip3 install -r requirements.txt 39 | 40 | # R packages 41 | RUN LC_ALL=C.UTF-8 Rscript -e "install.packages('data.table')" 42 | RUN LC_ALL=C.UTF-8 Rscript -e "install.packages('logger')" 43 | RUN LC_ALL=C.UTF-8 Rscript -e "install.packages('Rfast')" 44 | 45 | 46 | COPY . /app 47 | 48 | 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Salzman Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | 4 | **salzmanlab/spliz** is a bioinformatics best-practise analysis pipeline for calculating the splicing z-score for single cell RNA-seq analysis. 5 | 6 | This pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/master/LICENSE). 7 | 8 | > The nf-core framework for community-curated bioinformatics pipelines. 
9 | > 10 | > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. 11 | > 12 | > Nat Biotechnol. 2020 Feb 13. doi: 10.1038/s41587-020-0439-x. 13 | 14 | ## Quick Start 15 | 16 | 1. Install [`nextflow`](https://nf-co.re/usage/installation) (`>=20.04.0`) and [`conda`](https://docs.conda.io/en/latest/). 17 | 18 | 2. Download environment file. 19 | ```bash 20 | wget https://raw.githubusercontent.com/salzmanlab/SpliZ/main/environment.yml 21 | ``` 22 | 23 | 3. Create conda environment and activate. 24 | ```bash 25 | conda env create --name spliz_env --file=environment.yml 26 | conda activate spliz_env 27 | ``` 28 | 29 | 4. Run the pipeline on the test data set. 30 | You may need to modify the [executor scope](https://www.nextflow.io/docs/latest/executor.html) in the config file, in accordance to your compute needs. 31 | ```bash 32 | nextflow run salzmanlab/spliz \ 33 | -r main \ 34 | -latest \ 35 | -profile small_test_data 36 | ``` 37 | [Sherlock](https://www.sherlock.stanford.edu/) users should use the `sherlock` profile: 38 | 39 | 40 | nextflow run salzmanlab/spliz \ 41 | -r main \ 42 | -latest \ 43 | -profile small_test_data,sherlock 44 | 45 | 5. Run the pipeline on your own dataset. 46 | 1. Edit your config file with the parameters below. (You can use `/small_data/small.config` as a template, be sure to include any memory or time paramters.) 47 | 2. Run with your config file: 48 | ``` 49 | nextflow run salzmanlab/spliz \ 50 | -r main \ 51 | -latest \ 52 | -c YOUR_CONFIG_HERE.conf 53 | ``` 54 | 55 | 56 | See [usage docs](https://nf-co.re/spliz/usage) for all of the available options when running the pipeline. 57 | 58 | ## Pipeline Summary 59 | 60 | By default, the pipeline currently performs the following: 61 | * Calculate the SpliZ scores for: 62 | * Identifying variable splice sites 63 | * Identifying differential splicing between cell types. 64 | 65 | ## Input Parameters 66 | 67 | | Argument | Description |Example Usage | 68 | | --------------------- | ---------------- |-----------| 69 | | `dataname` | Descriptive name for SpliZ run | "Tumor_5" | 70 | | `run_analysis` | If the pipeline will perform splice site identifcation and differential splicing analysis | `true`, `false` | 71 | | `input_file` | File to be used as SpliZ input | *tumor_5_with_postprocessing.txt* | 72 | | `SICILIAN` | If `input_file` is output from [SICILIAN](https://github.com/salzmanlab/SICILIAN) | `true`, `false` | 73 | | `pin_S` | Bound splice site residuals at this quantile (e.g. values in the lower `pin_S` quantile and the upper 1 - `pin_S` quantile will be rounded to the quantile limits) | 0.1 | 74 | | `pin_z` | Bound SpliZ scores at this quantile (e.g. 
values in the lower `pin_z` quantile and the upper 1 - `pin_z` quantile will be rounded to the quantile limits) | 0 | 75 | | `bounds` | Only include cell/gene pairs that have more than this many junctional reads for the gene | 5 | 76 | | `light` | Only output the minimum number of columns | `true`, `false` | 77 | | `svd_type` | Type of SVD calculation | `normdonor`, `normgene` | 78 | | `n_perms` | Number of permutations | 100 | 79 | | `grouping_level_1` | Metadata column by which the data is intially partitioned | "tissue" | 80 | | `grouping_level_2` | Metadata column by which the partitioned data is grouped | "compartment" | 81 | | `libraryType` | Library prepration method of the input data | `10X`, `SS2` | 82 | 83 | ## Optional Parameters for non-SICILIAN Inputs (`SICILIAN` = `false`) 84 | | Argument | Description |Example Usage | 85 | | --------------------- | ---------------- |-----------| 86 | | `samplesheet` | If input files are in BAM format, this file specifies the locations of the input bam files. Samplesheet formatting is specified below. | *Tumor_5_samplesheet.csv* | 87 | | `annotator_pickle` | [Genome-specific annotation file for gene names](https://github.com/salzmanlab/SICILIAN#annotator-and-index-files-needed-for-running-sicilian) | *hg38_refseq.pkl* | 88 | | `exon_pickle` | [Genome-specific annotation file for exon boundaries](https://github.com/salzmanlab/SICILIAN#annotator-and-index-files-needed-for-running-sicilian) | *hg38_refseq_exon_bounds.pkl* | 89 | | `splice_pickle` | [Genome-specific annotation file for splice sites](https://github.com/salzmanlab/SICILIAN#annotator-and-index-files-needed-for-running-sicilian) | *hg38_refseq_splices.pkl* | 90 | | `gtf` | GTF file used as the reference annotation file for the genome assembly | *GRCh38_genomic.gtf* | 91 | | `meta` | If input files are in BAM format, this file contains per-cell annotations. This file must contain columns for `grouping_level_1` and `grouping_level_2`. | *metadata_tumor_5.tsv* | 92 | 93 | ### Samplesheets 94 | 95 | The samplesheet must be in comma-separated value(CSV) format. The file must be without a header. The sampleID must be a unique identifier for each bam file entry. 96 | 97 | For non-SICILIAN samples, samplesheets must have 2 columns: sampleID and path to the bam file. 98 | ``` 99 | Tumor_5_S1,tumor_5_S1_L001.bam 100 | Tumor_5_S2,tumor_5_S2_L002.bam 101 | Tumor_5_S3,tumor_5_S3_L003.bam 102 | ``` 103 | 104 | For SICILIAN SS2 samples, amplesheets must have 3 columns: sampleID, read 1 bam file, and read 2 bam file. 105 | ``` 106 | Tumor_5_S1,tumor_5_S1_L001_R1.bam,tumor_5_S1_L001_R2.bam 107 | Tumor_5_S2,tumor_5_S2_L002_R1.bam,tumor_5_S2_L002_R2.bam 108 | Tumor_5_S3,tumor_5_S3_L003_R1.bam,tumor_5_S3_L003_R2.bam 109 | ``` 110 | 111 | ## Credits 112 | 113 | salzmanlab/spliz was originally written by Salzman Lab. 114 | 115 | ## Contributions and Support 116 | 117 | If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). 118 | 119 | 120 | ## Citations 121 | 122 | 123 | 124 | This repositiory contains code to perform the analyses in this paper: 125 | 126 | > **The SpliZ generalizes “Percent Spliced In” to reveal regulated splicing at single-cell resolution** 127 | > 128 | > Julia Eve Olivieri*, Roozbeh Dehghannasiri*, Julia Salzman. 129 | > 130 | > _Nature Methods_ 2022 Mar 3. doi: [https://www.nature.com/articles/s41592-022-01400-x](https://www.nature.com/articles/s41592-022-01400-x). 
131 | 132 | You can cite the `nf-core` publication as follows: 133 | 134 | > **The nf-core framework for community-curated bioinformatics pipelines.** 135 | > 136 | > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. 137 | > 138 | > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). 139 | 140 | 141 | -------------------------------------------------------------------------------- /assets/email_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | nf-core/spliz Pipeline Report 9 | 10 | 11 |
12 |
13 |
14 |
15 | nf-core/spliz v${version}
16 | Run Name: $runName
17 |
18 | <% if (!success){
19 |     out << """
20 |
21 |     nf-core/spliz execution completed unsuccessfully!
22 |     The exit status of the task that caused the workflow execution to fail was: $exitStatus.
23 |     The full error message was:
24 |     ${errorReport}
25 |
26 |     """
27 | } else {
28 |     out << """
29 |
30 |     nf-core/spliz execution completed successfully!
31 |
32 |     """
33 | }
34 | %>
35 |
36 | The workflow was completed at $dateComplete (duration: $duration)
37 | The command used to launch the workflow was as follows:
38 | $commandLine
39 |
40 | Pipeline Configuration:
41 |
42 |
43 | <% out << summary.collect{ k,v -> " … $k … $v … " }.join("\n") %>
44 |
45 |
46 |
47 | nf-core/spliz
48 | https://github.com/nf-core/spliz
49 |
50 |
51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /assets/email_template.txt: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------- 2 | ,--./,-. 3 | ___ __ __ __ ___ /,-._.--~\\ 4 | |\\ | |__ __ / ` / \\ |__) |__ } { 5 | | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, 6 | `._,._,' 7 | nf-core/spliz v${version} 8 | ---------------------------------------------------- 9 | 10 | Run Name: $runName 11 | 12 | <% if (success){ 13 | out << "## nf-core/spliz execution completed successfully! ##" 14 | } else { 15 | out << """#################################################### 16 | ## nf-core/spliz execution completed unsuccessfully! ## 17 | #################################################### 18 | The exit status of the task that caused the workflow execution to fail was: $exitStatus. 19 | The full error message was: 20 | 21 | ${errorReport} 22 | """ 23 | } %> 24 | 25 | 26 | The workflow was completed at $dateComplete (duration: $duration) 27 | 28 | The command used to launch the workflow was as follows: 29 | 30 | $commandLine 31 | 32 | 33 | 34 | Pipeline Configuration: 35 | ----------------------- 36 | <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> 37 | 38 | -- 39 | nf-core/spliz 40 | https://github.com/nf-core/spliz 41 | -------------------------------------------------------------------------------- /assets/multiqc_config.yaml: -------------------------------------------------------------------------------- 1 | report_comment: > 2 | This report has been generated by the nf-core/spliz 3 | analysis pipeline. For information about how to interpret these results, please see the 4 | documentation. 5 | report_section_order: 6 | software_versions: 7 | order: -1000 8 | nf-core-spliz-summary: 9 | order: -1001 10 | 11 | export_plots: true 12 | -------------------------------------------------------------------------------- /assets/nf-core-spliz_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/assets/nf-core-spliz_logo.png -------------------------------------------------------------------------------- /assets/sendmail_template.txt: -------------------------------------------------------------------------------- 1 | To: $email 2 | Subject: $subject 3 | Mime-Version: 1.0 4 | Content-Type: multipart/related;boundary="nfcoremimeboundary" 5 | 6 | --nfcoremimeboundary 7 | Content-Type: text/html; charset=utf-8 8 | 9 | $email_html 10 | 11 | --nfcoremimeboundary 12 | Content-Type: image/png;name="nf-core-spliz_logo.png" 13 | Content-Transfer-Encoding: base64 14 | Content-ID: 15 | Content-Disposition: inline; filename="nf-core-spliz_logo.png" 16 | 17 | <% out << new File("$projectDir/assets/nf-core-spliz_logo.png"). 18 | bytes. 19 | encodeBase64(). 20 | toString(). 21 | tokenize( '\n' )*. 22 | toList()*. 23 | collate( 76 )*. 24 | collect { it.join() }. 25 | flatten(). 26 | join( '\n' ) %> 27 | 28 | <% 29 | if (mqcFile){ 30 | def mqcFileObj = new File("$mqcFile") 31 | if (mqcFileObj.length() < mqcMaxSize){ 32 | out << """ 33 | --nfcoremimeboundary 34 | Content-Type: text/html; name=\"multiqc_report\" 35 | Content-Transfer-Encoding: base64 36 | Content-ID: 37 | Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" 38 | 39 | ${mqcFileObj. 40 | bytes. 41 | encodeBase64(). 42 | toString(). 43 | tokenize( '\n' )*. 44 | toList()*. 
45 | collate( 76 )*. 46 | collect { it.join() }. 47 | flatten(). 48 | join( '\n' )} 49 | """ 50 | }} 51 | %> 52 | 53 | --nfcoremimeboundary-- 54 | -------------------------------------------------------------------------------- /bin/ann_splices.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import defaultdict 5 | import pandas as pd 6 | import pickle 7 | import annotator 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser(description="add annotation columns for splicing") 12 | parser.add_argument( 13 | "-i", 14 | "--in_file", 15 | help="the file to add columns to. Must be human data, be tab separated, and have columns chrR1A, chrR1B, juncPosR1A, and juncPosR1B. Will create columns exon_annR1A, exon_annR1B, both_ann, splice_ann, and sort_junc (the last is just an artifact of computation)", 16 | ) 17 | parser.add_argument( 18 | "-o", 19 | "--out_file", 20 | help="file to save the output to. If you just want to add the columns to the original file you can pass in the same path as in_file", 21 | ) 22 | parser.add_argument( 23 | "-e", "--exon_pickle", help="the pickle file for exon annotation" 24 | ) 25 | parser.add_argument( 26 | "-s", "--splice_pickle", help="the pickle file for splice junction annotation" 27 | ) 28 | args = parser.parse_args() 29 | return args 30 | 31 | 32 | def add_exon_columns(temp_df, exon_bounds): 33 | for suffix in ["A", "B"]: 34 | temp_df["exon_annR1" + suffix] = False 35 | for name2, group in temp_df.groupby("chrR1A"): 36 | temp_df.loc[group.index, "exon_annR1" + suffix] = group[ 37 | "juncPosR1" + suffix 38 | ].isin(exon_bounds[name2]) 39 | 40 | temp_df["both_ann"] = (temp_df["exon_annR1B"] & temp_df["exon_annR1A"]).astype( 41 | "bool" 42 | ) 43 | return temp_df 44 | 45 | 46 | def add_splice_ann_column(temp_df, splices): 47 | temp_df["sort_junc"] = [ 48 | tuple(sorted([x, y])) for x, y in zip(temp_df.juncPosR1A, temp_df.juncPosR1B) 49 | ] 50 | temp_df["splice_ann"] = False 51 | 52 | for name2, group in temp_df.groupby("chrR1A"): 53 | sub_group = group[group["chrR1A"].astype(str) == group["chrR1A"].astype(str)] 54 | if name2 in splices: 55 | 56 | temp_df.loc[sub_group.index, "splice_ann"] = sub_group["sort_junc"].isin( 57 | splices[name2] 58 | ) 59 | return temp_df 60 | 61 | 62 | def main(): 63 | args = get_args() 64 | 65 | exon_bounds = pickle.load(open(args.exon_pickle, "rb")) 66 | splices = pickle.load(open(args.splice_pickle, "rb")) 67 | 68 | exon_bounds = defaultdict(set, exon_bounds) 69 | splices = defaultdict(set, splices) 70 | 71 | df = pd.read_parquet(args.in_file) 72 | df = add_exon_columns(df, exon_bounds) 73 | df = add_splice_ann_column(df, splices) 74 | print(df.head()) 75 | df.to_csv(args.out_file, sep="\t", index=False) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() -------------------------------------------------------------------------------- /bin/annotator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import pandas as pd 4 | 5 | def get_gene_id(row): 6 | # return row["attribute"].split(";")[0].split()[1][1:-1] 7 | if "gene_name" in row["attribute"]: 8 | return row["attribute"].split("gene_name")[-1].split('"')[1] 9 | elif ";gene=" in row["attribute"]: 10 | return row["attribute"].split(";gene=")[-1].split(";")[0] 11 | 12 | def round_down(num, divisor): 13 | return num - (num%divisor) 14 | 15 | # This is a class to create an annotator object that 
you can create based on a gtf file, 16 | # that allows you to put in a chromosome and position and get all gene names in that area. 17 | 18 | # Usage example: 19 | # import sys 20 | # sys.path.insert(0, '/scratch/PI/horence/JuliaO/single_cell/scripts/') 21 | # import annotator 22 | # 23 | # ann = annotator.Annotator(/scratch/PI/horence/JuliaO/single_cell/STAR_output/mm10_files/mm10.gtf) # this step can take a while - like 2 minutes 24 | # ann.get_name_given_locus("chr1", 1003024) # returns all gene names separated by ","; if none, returns "" 25 | 26 | class Annotator: 27 | def __init__(self, gtf_file, jump = 10000): 28 | self.jump = jump 29 | self.gtf_file = gtf_file 30 | self.unknown = "unknown" 31 | self.unknown_strand = "?" 32 | self.get_gtf_dict() 33 | 34 | def get_gtf_dict(self): 35 | print("here") 36 | 37 | # load in gtf 38 | gtf_df = pd.read_csv(self.gtf_file,sep="\t",names=["seqname","source","feature","start","end","score","strand","frame","attribute"],comment="#") 39 | print(gtf_df.head()) 40 | # make gene id column 41 | gtf_df["gene_id"] = gtf_df.apply(get_gene_id, axis=1) 42 | print(gtf_df.head()) 43 | 44 | # figure out how long to make each chromosome entry 45 | seqname_len_dict = {} 46 | for seqname in gtf_df["seqname"].unique(): 47 | print(seqname) 48 | seqname_len_dict[seqname] = max(gtf_df[gtf_df["seqname"] == seqname]["end"]) 49 | if seqname_len_dict[seqname] < max(gtf_df[gtf_df["seqname"] == seqname]["start"]): 50 | print("start more than end") 51 | 52 | # set up gtf dict to have a dictionary for each chromsome with entries for every "jump" in its length 53 | gtf_dict = {s : {r : {} for r in range(0, seqname_len_dict[s],self.jump)} for s in seqname_len_dict.keys()} 54 | 55 | # assign genes to their requisite ranges 56 | for seqname in seqname_len_dict: 57 | seqname_df = gtf_df[gtf_df["seqname"] == seqname] 58 | for gene_id in seqname_df["gene_id"].unique(): 59 | if gene_id is not None: 60 | gene_df = seqname_df[seqname_df["gene_id"] == gene_id] 61 | if len(gene_df["strand"].unique()) == 1: 62 | # print("gene_df['strand'].unique(): {}".format(gene_df['strand'].unique())) 63 | # print("gene_df['strand'].unique()[0]: {}".format(gene_df["strand"].unique()[0])) 64 | strand = gene_df["strand"].unique()[0] 65 | else: 66 | strand = self.unknown_strand 67 | 68 | # assign gene to all ranges it falls within 69 | try: 70 | start = min(gene_df["start"]) 71 | except: 72 | print("gene_id",gene_id) 73 | print("start failed", gene_df) 74 | try: 75 | end = max(gene_df["end"]) 76 | except: 77 | print("gene_id",gene_id) 78 | print("end failed",gene_df) 79 | for j in range(round_down(start,self.jump),round_down(end + self.jump, self.jump),self.jump): 80 | gtf_dict[seqname][j][gene_id] = [start,end, strand] 81 | self.gtf_dict = gtf_dict 82 | 83 | def get_name_given_locus(self, seqname, position, read_strand = "", stranded_library = False): 84 | 85 | try: 86 | poss_genes = self.gtf_dict[seqname][round_down(position,self.jump)] 87 | except Exception as e: 88 | 89 | if seqname not in self.gtf_dict.keys(): 90 | if stranded_library: 91 | return self.unknown, read_strand 92 | 93 | else: 94 | return self.unknown, self.unknown_strand 95 | if position > max(self.gtf_dict[seqname].keys()): 96 | if stranded_library: 97 | return self.unknown, read_strand 98 | else: 99 | return self.unknown, self.unknown_strand 100 | else: 101 | raise e 102 | if len(poss_genes) == 0: 103 | if stranded_library: 104 | return self.unknown, read_strand 105 | else: 106 | return self.unknown, self.unknown_strand 107 | 108 | 
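# poss_genes maps each candidate gene_id to [start, end, strand] for the jump-sized bin
# containing `position` (e.g. with the default jump of 10000, the chr1:1003024 query from
# the usage example above is looked up in gtf_dict["chr1"][1000000]). The loop below keeps
# only the genes whose span actually contains the position and, for a stranded library,
# whose strand matches the read strand.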
gene_names = [] 109 | strands = [] 110 | for gene, pos in poss_genes.items(): 111 | if pos[0] <= position <= pos[1]: 112 | if stranded_library: 113 | if pos[2] == read_strand: 114 | gene_names.append(gene) 115 | strands.append(pos[2]) 116 | else: 117 | gene_names.append(gene) 118 | strands.append(pos[2]) 119 | if len(gene_names) == 0: 120 | gene_names.append(self.unknown) 121 | 122 | if len(set(strands)) == 1: 123 | strand = strands[0] 124 | elif stranded_library: 125 | strand = read_strand 126 | else: 127 | strand = self.unknown_strand 128 | return ",".join(gene_names), strand -------------------------------------------------------------------------------- /bin/convert_parquet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description="convert parquet to tsv") 8 | parser.add_argument("--tsv",help="name to save tsv") 9 | parser.add_argument("--dataname",help="Dataname/basename of the input file") 10 | args = parser.parse_args() 11 | return args 12 | 13 | def main(): 14 | args = get_args() 15 | full_df = pd.read_csv(args.tsv, sep = "\t") 16 | 17 | df = full_df[full_df['called'] == True] 18 | 19 | for i, x in df.groupby('chrR1A'): 20 | outname = "{}_{}.pq".format(i, args.dataname) 21 | x.to_parquet(outname) 22 | 23 | 24 | main() -------------------------------------------------------------------------------- /bin/final_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | import logging 7 | 8 | def get_args(): 9 | parser = argparse.ArgumentParser(description="Create final summary file") 10 | parser.add_argument("--perm_pvals", help="Permutation pvalue file") 11 | parser.add_argument("--first_evec", help="First eigenvector file") 12 | parser.add_argument("--second_evec", help="Second eigenvector file") 13 | parser.add_argument("--third_evec", help="Third eigenvector file") 14 | parser.add_argument("--splizvd", help="SpliZVD file") 15 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 16 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. 
tissue, compartment)", default="dummy") 17 | parser.add_argument("--outname", help="Name of output file") 18 | parser.add_argument("--outname_log", help="Name of log file") 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | 24 | def main(): 25 | args = get_args() 26 | 27 | logging.basicConfig( 28 | filename = args.outname_log, 29 | format='%(asctime)s %(levelname)-8s %(message)s', 30 | level=logging.INFO, 31 | datefmt='%Y-%m-%d %H:%M:%S') 32 | 33 | logging.info("Starting") 34 | 35 | # load in data 36 | pval_df = pd.read_csv(args.perm_pvals, sep = "\t") 37 | 38 | splizsite_dfs = [] 39 | evec_files = [args.first_evec, args.second_evec, args.third_evec] 40 | for evec_file in evec_files: 41 | splizsite_dfs.append(pd.read_csv(evec_file, sep="\t")) 42 | splizsite_df = pd.concat(splizsite_dfs,axis=0).drop_duplicates() 43 | 44 | df = pd.read_csv(args.splizvd, sep="\t") 45 | if (args.grouping_level_1 == "tiss_comp") & (args.grouping_level_1 not in df.columns): 46 | df["tiss_comp"] = df[args.grouping_level_1] + df[args.grouping_level_2] 47 | elif args.grouping_level_1 == "dummy": 48 | df["dummy"] = "dummy" 49 | 50 | # combine outputs 51 | out_dict = {"gene" : [],"grouping_level_1" : [], "grouping_level_2" : [], "SpliZsites" : []} 52 | z_cols = ["scZ","svd_z0","svd_z1","svd_z2"] 53 | 54 | for z_col in z_cols: 55 | out_dict["{}_median".format(z_col)] = [] 56 | out_dict["{}_pval".format(z_col)] = [] 57 | 58 | for gene, gene_df in df.groupby("gene"): 59 | for tiss, tiss_df in gene_df.groupby(args.grouping_level_1): 60 | for ont, ont_df in tiss_df.groupby(args.grouping_level_2): 61 | out_dict["gene"].append(gene) 62 | out_dict["grouping_level_1"].append(tiss) 63 | out_dict["grouping_level_2"].append(ont) 64 | out_dict["SpliZsites"].append(",".join([str(x) for x in splizsite_df[splizsite_df["gene"] == gene]["end"]])) 65 | 66 | 67 | for z_col in z_cols: 68 | 69 | out_dict["{}_median".format(z_col)].append(ont_df[z_col].median()) 70 | try: 71 | pval = pval_df[(pval_df["gene"] == gene) & ((pval_df["grouping_level_1"] == tiss) | (pval_df["grouping_level_1"].isna()))]["perm_pval_adj_{}".format(z_col)].iloc[0] 72 | except: 73 | pval = np.nan 74 | out_dict["{}_pval".format(z_col)].append(pval) 75 | out_df = pd.DataFrame.from_dict(out_dict) 76 | out_df = out_df.sort_values(["gene","grouping_level_1","scZ_median"]) 77 | out_df.to_csv(args.outname, sep="\t", index=False) 78 | 79 | logging.info("Completed") 80 | 81 | main() -------------------------------------------------------------------------------- /bin/find_SpliZ_sites.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # Written By : Roozbeh Dehghannasiri (rdehghan@stanford.edu) 4 | # this script takes the permutation file and finds the most variable splize sites for genes with permutation p-value <0.05 5 | # it finds up to 3 splice sites for each eigenvector (1st, 2nd, 3rd) 6 | #it creates three output files corresponding to the splice sites for each eigenvector 7 | 8 | library(data.table) 9 | library(Rfast) 10 | 11 | args <- commandArgs(trailingOnly = TRUE) 12 | p_value_file = args[1] 13 | first_evec_file = args[2] 14 | second_evec_file = args[3] 15 | third_evec_file = args[4] 16 | libraryType = args[5] 17 | mat_samplesheet = args[6] 18 | 19 | p_value = fread(p_value_file,sep="\t",header=TRUE) 20 | mat_paths = fread(mat_samplesheet,sep="\t",header=TRUE) 21 | 22 | ## I want to select the top 20 and top 50 genes with FDR < 0.05 23 | if (libraryType == "SS2") { 24 | p_value = 
p_value[perm_pval_adj_svd_z0<0.05] 25 | } 26 | 27 | 28 | 29 | topgenes = unique(p_value$gene) 30 | print(paste("number of genes to run",length(topgenes))) 31 | 32 | if (length(topgenes) == 0) { 33 | to_plot <- data.frame(matrix(ncol = 3, nrow = 0)) 34 | names(to_plot) = c("gene","let","end") 35 | write.table(to_plot, first_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 36 | write.table(to_plot, second_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 37 | write.table(to_plot, third_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 38 | 39 | } else { 40 | 41 | 42 | gene_to_plot = c() # I get these vectors to build a data table so that their dot plots can be made automatically 43 | coordinate_to_plot = c() 44 | let_to_plot = c() 45 | for (counter in 1:length(topgenes)){ 46 | gene = topgenes[counter] # name of the gene 47 | tryCatch({ 48 | # geneMat_file = paste(gene, ".geneMat", sep="") 49 | geneMat_file = mat_paths$path[mat_paths$gene == gene] 50 | 51 | loadings = fread(geneMat_file) 52 | loadings_sq = loadings[1,]^2 53 | top_site = names(loadings_sq)[loadings_sq==max(loadings_sq)] 54 | coordinate_to_plot = c(coordinate_to_plot,strsplit(top_site,split = "_")[[1]][1]) 55 | let_to_plot = c(let_to_plot,strsplit(top_site,split = "_")[[1]][2]) 56 | gene_to_plot = c(gene_to_plot,gene) 57 | 58 | # I copy for the second and third only if they have at least 10% of loadings 59 | if (Rfast::nth(as.matrix(loadings_sq), 2, descending = T) > 0.1){ 60 | second_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 2, descending = T)] 61 | coordinate_to_plot = c(coordinate_to_plot,strsplit(second_top_site,split = "_")[[1]][1]) 62 | let_to_plot = c(let_to_plot,strsplit(second_top_site,split = "_")[[1]][2]) 63 | gene_to_plot = c(gene_to_plot,gene) 64 | } 65 | if (Rfast::nth(as.matrix(loadings_sq), 3, descending = T) > 0.1){ 66 | third_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 3, descending = T)] 67 | coordinate_to_plot = c(coordinate_to_plot,strsplit(third_top_site,split = "_")[[1]][1]) 68 | let_to_plot = c(let_to_plot,strsplit(third_top_site,split = "_")[[1]][2]) 69 | gene_to_plot = c(gene_to_plot,gene) 70 | } 71 | 72 | top_site = "" 73 | second_top_site = "" 74 | third_top_site = "" 75 | },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) 76 | } 77 | to_plot = data.table(gene_to_plot,let_to_plot,coordinate_to_plot) 78 | names(to_plot) = c("gene","let","end") 79 | 80 | write.table(to_plot, first_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 81 | 82 | ############################## 83 | #### second eigen vector ##### 84 | ############################## 85 | 86 | gene_to_plot = c() # I get these vectors to build a data table so that their dot plots can be made automatically 87 | coordinate_to_plot = c() 88 | let_to_plot = c() 89 | for (counter in 1:length(topgenes)){ 90 | gene = topgenes[counter] # name of the gene 91 | tryCatch({ 92 | geneMat_file = mat_paths$path[mat_paths$gene == gene] 93 | 94 | 95 | loadings = fread(geneMat_file) 96 | loadings_sq = loadings[2,]^2 97 | top_site = names(loadings_sq)[loadings_sq==max(loadings_sq)] 98 | coordinate_to_plot = c(coordinate_to_plot,strsplit(top_site,split = "_")[[1]][1]) 99 | let_to_plot = c(let_to_plot,strsplit(top_site,split = "_")[[1]][2]) 100 | gene_to_plot = c(gene_to_plot,gene) 101 | 102 | # I copy for the second and third only if they have at least 10% of loadings 103 | if (Rfast::nth(as.matrix(loadings_sq), 2, descending = T) > 0.1){ 104 | 
second_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 2, descending = T)] 105 | coordinate_to_plot = c(coordinate_to_plot,strsplit(second_top_site,split = "_")[[1]][1]) 106 | let_to_plot = c(let_to_plot,strsplit(second_top_site,split = "_")[[1]][2]) 107 | gene_to_plot = c(gene_to_plot,gene) 108 | } 109 | if (Rfast::nth(as.matrix(loadings_sq), 3, descending = T) > 0.1){ 110 | third_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 3, descending = T)] 111 | coordinate_to_plot = c(coordinate_to_plot,strsplit(third_top_site,split = "_")[[1]][1]) 112 | let_to_plot = c(let_to_plot,strsplit(third_top_site,split = "_")[[1]][2]) 113 | gene_to_plot = c(gene_to_plot,gene) 114 | } 115 | 116 | top_site = "" 117 | second_top_site = "" 118 | third_top_site = "" 119 | },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) 120 | } 121 | to_plot = data.table(gene_to_plot,let_to_plot,coordinate_to_plot) 122 | names(to_plot) = c("gene","let","end") 123 | 124 | write.table(to_plot, second_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 125 | 126 | 127 | ############################## 128 | #### third eigen vector ##### 129 | ############################## 130 | 131 | gene_to_plot = c() # I get these vectors to build a data table so that their dot plots can be made automatically 132 | coordinate_to_plot = c() 133 | let_to_plot = c() 134 | for (counter in 1:length(topgenes)){ 135 | gene = topgenes[counter] # name of the gene 136 | tryCatch({ 137 | geneMat_file = mat_paths$path[mat_paths$gene == gene] 138 | 139 | 140 | loadings = fread(geneMat_file) 141 | loadings_sq = loadings[3,]^2 142 | top_site = names(loadings_sq)[loadings_sq==max(loadings_sq)] 143 | coordinate_to_plot = c(coordinate_to_plot,strsplit(top_site,split = "_")[[1]][1]) 144 | let_to_plot = c(let_to_plot,strsplit(top_site,split = "_")[[1]][2]) 145 | gene_to_plot = c(gene_to_plot,gene) 146 | 147 | # I copy for the second and third only if they have at least 10% of loadings 148 | if (Rfast::nth(as.matrix(loadings_sq), 2, descending = T) > 0.1){ 149 | second_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 2, descending = T)] 150 | coordinate_to_plot = c(coordinate_to_plot,strsplit(second_top_site,split = "_")[[1]][1]) 151 | let_to_plot = c(let_to_plot,strsplit(second_top_site,split = "_")[[1]][2]) 152 | gene_to_plot = c(gene_to_plot,gene) 153 | } 154 | if (Rfast::nth(as.matrix(loadings_sq), 3, descending = T) > 0.1){ 155 | third_top_site = names(loadings_sq)[loadings_sq == Rfast::nth(as.matrix(loadings_sq), 3, descending = T)] 156 | coordinate_to_plot = c(coordinate_to_plot,strsplit(third_top_site,split = "_")[[1]][1]) 157 | let_to_plot = c(let_to_plot,strsplit(third_top_site,split = "_")[[1]][2]) 158 | gene_to_plot = c(gene_to_plot,gene) 159 | } 160 | 161 | 162 | },error=function(e){cat("ERROR :",conditionMessage(e), "\n")}) 163 | } 164 | to_plot = data.table(gene_to_plot,let_to_plot,coordinate_to_plot) 165 | names(to_plot) = c("gene","let","end") 166 | 167 | write.table(to_plot, third_evec_file, sep = "\t", row.names = FALSE, quote = FALSE) 168 | } -------------------------------------------------------------------------------- /bin/light_class_input_subcols.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from collections import defaultdict 5 | import numpy as np 6 | import pandas as pd 7 | import pickle 8 | import pysam 9 | import annotator 10 | 
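# pysam is used below to stream alignments from the input BAM(s); the pickled annotator
# object provides chromosome/position -> gene-name lookups, and light_utils (star-imported
# next) is expected to provide the readObj_refname and chim_refName helpers that build the
# refName_AB junction identifiers.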
from light_utils import * 11 | from tqdm import tqdm 12 | 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--bams', nargs="+",required=True, help='bams to parse (either one or two for paired end)') 17 | parser.add_argument("--libraryType",help="Options: SS2, 10X, SLS") 18 | parser.add_argument("--annotator", required=True, help="the path to the annotator pickle file") 19 | parser.add_argument("--gtf", required=True, help="the path to the gtf file") 20 | parser.add_argument("--outname",help="Output file name") 21 | 22 | args = parser.parse_args() 23 | return args 24 | 25 | def extract_info_align(cellranger, CI_dict, bam_read, suffix, bam_file, ann, UMI_bar, stranded_library, spatial_bar, fill_char = np.nan, strand_dict={True : "-", False : "+"}): 26 | if UMI_bar: 27 | if cellranger: 28 | # print("CB",bam_read.has_tag("CB")) 29 | # print("UB",bam_read.has_tag("UB")) 30 | # print("UR",bam_read.has_tag("UR")) 31 | 32 | CI_dict["barcode"].append(bam_read.get_tag("CB")) 33 | try: 34 | CI_dict["UMI"].append(bam_read.get_tag("UB")) 35 | except: 36 | CI_dict["UMI"].append(bam_read.get_tag("UR")) 37 | 38 | else: 39 | vals = bam_read.query_name.split("_") 40 | CI_dict["barcode"].append(vals[-2]) 41 | CI_dict["UMI"].append(vals[-1]) 42 | elif spatial_bar: 43 | CI_dict["barcode"].append(bam_read.get_tag("XC")) 44 | CI_dict["UMI"].append(bam_read.get_tag("XM")) 45 | else: 46 | CI_dict["barcode"].append(fill_char) 47 | CI_dict["UMI"].append(fill_char) 48 | CI_dict["id"].append(bam_read.query_name) 49 | 50 | seqname = bam_file.get_reference_name(bam_read.tid) 51 | 52 | # if chromosome is numeric, prepend "chr" 53 | if str(seqname).isnumeric(): 54 | seqname = "chr" + str(seqname) 55 | 56 | refName, chrA, geneA, posA, chrB, geneB, posB = readObj_refname( 57 | strand_dict[bam_read.is_reverse], 58 | bam_read.cigarstring, 59 | seqname, 60 | bam_read.reference_start + 1, 61 | ann, 62 | fill_char, 63 | stranded_library 64 | ) 65 | CI_dict["refName_AB" + suffix].append(refName) 66 | CI_dict["chr{}A".format(suffix)].append(chrA) 67 | CI_dict["chr{}B".format(suffix)].append(chrB) 68 | CI_dict["gene{}A".format(suffix)].append(geneA) 69 | CI_dict["gene{}B".format(suffix)].append(geneB) 70 | CI_dict["juncPos{}A".format(suffix)].append(int(posA)) 71 | if np.isnan(posB): 72 | CI_dict["juncPos{}B".format(suffix)].append(posB) 73 | else: 74 | CI_dict["juncPos{}B".format(suffix)].append(int(posB)) 75 | strand_dict = {True : "-", False : "+"} 76 | CI_dict["read_strand{}".format(suffix)].append(strand_dict[bam_read.is_reverse]) 77 | 78 | 79 | CI_dict["primary{}".format(suffix)].append(not bam_read.is_secondary) 80 | 81 | empty_cols = [] 82 | for c in empty_cols: 83 | CI_dict[c].append(fill_char) 84 | return CI_dict 85 | 86 | def extract_info_chim(CI_dict,bam_read1,bam_read2,suffix, bam_file, ann, UMI_bar, stranded_library, fill_char = np.nan): 87 | assert bam_read1.query_name == bam_read2.query_name 88 | sec_dict = {True: 0, False: 1} 89 | if UMI_bar: 90 | vals = bam_read1.query_name.split("_") 91 | CI_dict["barcode"].append(vals[-2]) 92 | CI_dict["UMI"].append(vals[-1]) 93 | else: 94 | CI_dict["barcode"].append(fill_char) 95 | CI_dict["UMI"].append(fill_char) 96 | reads = [bam_read1,bam_read2] 97 | halves = ["A","B"] 98 | CI_dict["id"].append(bam_read1.query_name) 99 | 100 | refName, chrA, geneA, posA, chrB, geneB, posB = chim_refName([x.flag for x in reads], [x.cigarstring for x in reads], [x.reference_start + 1 for x in reads], [bam_file.get_reference_name(x.tid) for x in reads], 
ann, stranded_library) 101 | CI_dict["refName_AB" + suffix].append(refName) 102 | CI_dict["chr{}A".format(suffix)].append(chrA) 103 | CI_dict["chr{}B".format(suffix)].append(chrB) 104 | CI_dict["gene{}A".format(suffix)].append(geneA) 105 | CI_dict["gene{}B".format(suffix)].append(geneB) 106 | CI_dict["juncPos{}A".format(suffix)].append(int(posA)) 107 | CI_dict["juncPos{}B".format(suffix)].append(int(posB)) 108 | for i in range(2): 109 | 110 | CI_dict["primary{}{}".format(suffix,halves[i])].append(sec_dict[reads[i].is_secondary]) 111 | return CI_dict 112 | 113 | 114 | def get_final_df(cellranger, bam_files, j, suffixes, ann, UMI_bar, gtf, stranded_library, spatial_bar): 115 | 116 | CI_dfs = [] 117 | for i in range(len(bam_files)): 118 | if i == 1: 119 | read_ids = set(CI_dfs[0]["id"]) 120 | else: 121 | read_ids = set() 122 | suffix = suffixes[i] 123 | col_bases = [ "juncPos", "gene", "chr"] 124 | columns = ["id", "refName_AB" + suffix, "UMI", "barcode", "primary" + suffix, "read_strand" + suffix] 125 | for c in col_bases: 126 | for l in ["A", "B"]: 127 | columns.append("{}{}{}".format(c,suffix,l)) 128 | CI_dict = {c : [] for c in columns} 129 | count = 0 130 | first = False 131 | if i == 0: 132 | genomic_alignments = {} 133 | alignFile = pysam.AlignmentFile(bam_files[i]) 134 | # columns 135 | #for bam_read in tqdm(alignFile.fetch(until_eof=True)): 136 | for bam_read in (alignFile.fetch(until_eof=True)): 137 | # require CB if this is cell ranger 138 | if ((not cellranger) | ((bam_read.has_tag("CB") & (bam_read.cigarstring is not None)))): 139 | 140 | # make sure read is mapped 141 | if not bam_read.is_unmapped: 142 | if (i == 0) or (not bam_read.is_secondary and bam_read.query_name in read_ids): 143 | # it's a chimeric alignment and we need another line from it 144 | if bam_read.has_tag("ch") and not first: 145 | prev_read = bam_read 146 | first = True 147 | else: 148 | 149 | # add info from chimeric read 150 | if bam_read.has_tag("ch"): 151 | count += 1 152 | 153 | # note: removing chim for this test ONLY; uncomment after 154 | first = False 155 | 156 | # add info from align read 157 | elif "N" in bam_read.cigarstring: 158 | count += 1 159 | CI_dict = extract_info_align(cellranger, CI_dict, bam_read, suffix, alignFile, ann, UMI_bar, stranded_library, spatial_bar) 160 | 161 | # save genomic alignment information 162 | else: 163 | if i == 0: 164 | if bam_read.query_name not in genomic_alignments: 165 | genomic_alignments[bam_read.query_name] = bam_read.get_tag("AS") 166 | else: 167 | genomic_alignments[bam_read.query_name] = max(bam_read.get_tag("AS"), genomic_alignments[bam_read.query_name]) 168 | else: 169 | CI_dict = extract_info_align(cellranger, CI_dict, bam_read, suffix, alignFile, ann, UMI_bar, stranded_library, spatial_bar) 170 | 171 | CI_df = pd.DataFrame.from_dict(CI_dict) 172 | if i == 0: 173 | genomic_alignments = defaultdict(lambda: np.nan,genomic_alignments) 174 | 175 | CI_dfs.append(CI_df) 176 | if len(bam_files) == 2: 177 | final_df = pd.merge(left=CI_dfs[0],right=CI_dfs[1][[c for c in CI_dfs[1].columns if c not in ["UMI","barcode"]]],how="left",left_on="id",right_on="id") 178 | final_df["read_strand_compatible"] = 1 179 | final_df.loc[final_df["read_strandR1"] == final_df["read_strandR2"],"read_strand_compatible"] = 0 180 | final_df["location_compatible"] = final_df.apply(get_loc_flag,axis=1) 181 | else: 182 | final_df = CI_dfs[0] 183 | float_cols = ["primaryR1"] 184 | if len(bam_files) == 2: 185 | float_cols += ["juncPosR2A","juncPosR2B","primaryR2"] 186 | 187 | final_df = 
final_df[final_df["primaryR1"]] 188 | 189 | return final_df 190 | 191 | def main(): 192 | save = pysam.set_verbosity(0) 193 | 194 | args = get_args() 195 | 196 | bam_files = args.bams 197 | gtf = args.gtf 198 | 199 | annotator_path = args.annotator 200 | ann = pickle.load(open(annotator_path, "rb")) 201 | 202 | suffixes = ["R1","R2"] 203 | 204 | final_dfs = [] 205 | 206 | n_rounds = len(bam_files) 207 | 208 | if args.libraryType == '10X': 209 | UMI_bar = True 210 | stranded_library = False 211 | cellranger = True 212 | spatial_bar = False 213 | 214 | elif args.libraryType == 'SS2': 215 | UMI_bar = False 216 | stranded_library = False 217 | cellranger = False 218 | spatial_bar = False 219 | 220 | if args.libraryType == "SLS": 221 | UMI_bar = False 222 | stranded_library = False 223 | cellranger = False 224 | spatial_bar = True 225 | 226 | for j in range(n_rounds): 227 | if j == 1: 228 | bam_files.reverse() 229 | primary = get_final_df(cellranger, bam_files, j, suffixes, ann, UMI_bar, gtf, stranded_library, spatial_bar) 230 | final_dfs.append(primary) 231 | 232 | pd.concat(final_dfs, axis=0).reset_index(drop=True).to_parquet(args.outname) 233 | 234 | pysam.set_verbosity(save) 235 | 236 | 237 | 238 | main() 239 | -------------------------------------------------------------------------------- /bin/markdown_to_html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import print_function 3 | import argparse 4 | import markdown 5 | import os 6 | import sys 7 | import io 8 | 9 | 10 | def convert_markdown(in_fn): 11 | input_md = io.open(in_fn, mode="r", encoding="utf-8").read() 12 | html = markdown.markdown( 13 | "[TOC]\n" + input_md, 14 | extensions=["pymdownx.extra", "pymdownx.b64", "pymdownx.highlight", "pymdownx.emoji", "pymdownx.tilde", "toc"], 15 | extension_configs={ 16 | "pymdownx.b64": {"base_path": os.path.dirname(in_fn)}, 17 | "pymdownx.highlight": {"noclasses": True}, 18 | "toc": {"title": "Table of Contents"}, 19 | }, 20 | ) 21 | return html 22 | 23 | 24 | def wrap_html(contents): 25 | header = """ 26 | 27 | 28 | 62 | 63 | 64 |
65 | """ 66 | footer = """ 67 |
68 | 69 | 70 | """ 71 | return header + contents + footer 72 | 73 | 74 | def parse_args(args=None): 75 | parser = argparse.ArgumentParser() 76 | parser.add_argument("mdfile", type=argparse.FileType("r"), nargs="?", help="File to convert. Defaults to stdin.") 77 | parser.add_argument( 78 | "-o", "--out", type=argparse.FileType("w"), default=sys.stdout, help="Output file name. Defaults to stdout." 79 | ) 80 | return parser.parse_args(args) 81 | 82 | 83 | def main(args=None): 84 | args = parse_args(args) 85 | converted_md = convert_markdown(args.mdfile.name) 86 | html = wrap_html(converted_md) 87 | args.out.write(html) 88 | 89 | 90 | if __name__ == "__main__": 91 | sys.exit(main()) 92 | -------------------------------------------------------------------------------- /bin/parquet_to_tsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | 6 | def get_args(): 7 | parser = argparse.ArgumentParser(description="convert parquet to tsv") 8 | parser.add_argument("-p","--parquet",help="input parquet file") 9 | parser.add_argument("-o","--tsv",help="name to save tsv") 10 | parser.add_argument("--reverse",action="store_true",help="convert from tsv to pq instead") 11 | args = parser.parse_args() 12 | return args 13 | 14 | def main(): 15 | args = get_args() 16 | if args.reverse: 17 | df = pd.read_csv(args.tsv, sep = "\t") 18 | df.to_parquet(args.parquet) 19 | else: 20 | df = pd.read_parquet(args.parquet) 21 | df.to_csv(args.tsv, sep = "\t", index = False) 22 | 23 | main() -------------------------------------------------------------------------------- /bin/process_CI.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import pandas as pd 5 | from pathlib import Path 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser(description="merge class input files") 9 | parser.add_argument("--input_file", help="Metadata file") 10 | parser.add_argument("--meta", help="Metadata file") 11 | parser.add_argument("--outname", help="Output file name") 12 | parser.add_argument("--libraryType") 13 | 14 | args = parser.parse_args() 15 | return args 16 | 17 | def main(): 18 | args = get_args() 19 | 20 | file_list = pd.read_csv(args.input_file, header=None, names=['sample_ID','file']) 21 | 22 | file_list['sample_ID'] = file_list['sample_ID'].map(lambda x: x.lstrip('[')) 23 | file_list['file'] = file_list['file'].map(lambda x: x.rstrip(']').lstrip(' ')) 24 | 25 | dfs = [] 26 | 27 | for index, row in file_list.iterrows(): 28 | sample_ID = row['sample_ID'] 29 | fn = Path(row['file']) 30 | df = pd.read_parquet(fn) 31 | 32 | # remove UMI duplicates by cell + junction 33 | df = df.drop_duplicates(["barcode","UMI","refName_ABR1"]) 34 | 35 | df["barcode_refName"] = df["barcode"].astype(str) + df["refName_ABR1"] 36 | 37 | # count number of lines corresponding to the junction in the cell 38 | barcode_name_vc = df["barcode_refName"].value_counts() 39 | df["numReads"] = df["barcode_refName"].map(barcode_name_vc) 40 | 41 | # deduplicate by cell + junction 42 | df = df.drop_duplicates(["refName_ABR1","barcode"]) 43 | 44 | # clean up barcode column 45 | 46 | if args.libraryType in ['10X',"SLS"]: 47 | df["barcode"] = df["barcode"].str.rstrip("-1") 48 | df["cell_id"] = sample_ID + "_" + df["barcode"].astype(str) 49 | elif args.libraryType == 'SS2': 50 | df['id'] = df['id'].str.split('.').str[0] 51 | df["cell_id"] = df["id"].astype(str) 52 | 
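# At this point each remaining row should represent one junction in one cell of this sample:
# numReads counts the deduplicated supporting reads/UMIs for that (cell, junction) pair, and
# cell_id ties the row to sample + barcode (10X/SLS) or to the read-derived id (SS2).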
53 | 54 | dfs.append(df) 55 | 56 | full_df = pd.concat(dfs) 57 | full_df["called"] = 1 58 | full_df["refName_newR1"] = full_df["refName_ABR1"] 59 | full_df.rename(columns={"geneR1A" : "geneR1A_uniq", "geneR1B" : "geneR1B_uniq"}, inplace=True) 60 | 61 | final_df = full_df[["refName_newR1","geneR1A_uniq","geneR1B_uniq", "juncPosR1A","juncPosR1B","chrR1A","chrR1B","numReads","cell_id"]] 62 | 63 | meta = pd.read_csv(args.meta, sep="\t") 64 | final_df.drop([x for x in final_df.columns if x in meta.columns and x != "cell_id"], inplace=True, axis=1) 65 | 66 | merged = final_df.merge(meta, left_on="cell_id", right_on="cell_id", how = "left") 67 | 68 | merged.rename(columns={'cell_id': 'cell'}, inplace=True) 69 | merged.to_parquet(args.outname) 70 | 71 | 72 | main() 73 | -------------------------------------------------------------------------------- /bin/rijk_zscore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | import os 5 | import argparse 6 | import numpy as np 7 | import pandas as pd 8 | from tqdm import tqdm 9 | import warnings 10 | import logging 11 | warnings.filterwarnings("ignore") 12 | 13 | def get_args(): 14 | parser = argparse.ArgumentParser(description="calculate splicing scores per gene/cell") 15 | parser.add_argument("--parquet", help="input parquet file") 16 | parser.add_argument("--pinning_S", type=float, help="pinning level for S_ijks") 17 | parser.add_argument("--pinning_z", type=float, help="pinning level for zs") 18 | parser.add_argument("--lower_bound", type=int, help="only include cell/gene pairs the have more than this many junctional reads for the gene") 19 | parser.add_argument("--isLight", help="if included, don't calculate extra columns (saves time)") 20 | parser.add_argument("--isSICILIAN", help="Is SICILIAN input file") 21 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 22 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. 
tissue, compartment)", default="dummy") 23 | parser.add_argument("--outname_pq", help="Name of output file") 24 | parser.add_argument("--outname_tsv", help="Name of output file") 25 | parser.add_argument("--outname_log", help="Name of log file") 26 | args = parser.parse_args() 27 | return args 28 | 29 | def prepare_df(df, let, rank_by_donor, rev_let, let_dict): 30 | 31 | # create donor identifier 32 | df["pos{}_group".format(let)] = df["junc{}".format(let)].astype(str) + df["gene"] 33 | df["rank_" + let_dict[let]] = df.groupby("pos{}_group".format(let))["junc{}".format(rev_let[let])].rank(method="dense") 34 | 35 | # remove consitutive splicing 36 | df["max_rank"] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["rank_" + let_dict[let]].max()) 37 | df = df[df["max_rank"] > 1] 38 | 39 | if not rank_by_donor: 40 | df["rank_" + let_dict[let]] = df.groupby("gene")["juncEnd"].rank(method="dense") 41 | 42 | return df 43 | 44 | def calc_Sijk(df,let, pinning_S, let_dict): 45 | # calculate the average rank calculation per gene 46 | # same as this calculation (this one's slower): df["rank_mean"] = df.groupby("pos{}_group".format(let)).apply(lambda x: (x["numReads"] * x["rank_acc"])/x["numReads"].sum()).reset_index(level=0,drop=True) 47 | 48 | # number of reads with this donor across all cells 49 | df["sum_reads_group"] = df.groupby("pos{}_group".format(let))["numReads"].transform("sum") 50 | 51 | df["read_x_" + let_dict[let]] = df["numReads"] * df["rank_" + let_dict[let]] 52 | 53 | # the sum of acceptors for all reads in all cells with this donor 54 | df["num"] = df.groupby("pos{}_group".format(let))["read_x_" + let_dict[let]].transform("sum") 55 | 56 | # average acceptor for a read with this donor (donor has one value for this) 57 | df["rank_mean"]= df["num"] / df["sum_reads_group"] 58 | 59 | # sum squared difference in rank for ever read 60 | df["sq_diff"] = df["numReads"] * (df["rank_" + let_dict[let]] - df["rank_mean"])**2 61 | 62 | # Get the sum of these squared differences for each donor 63 | df["don_num"] = df.groupby("pos{}_group".format(let))["sq_diff"].transform("sum") 64 | 65 | # sum of squared differences normalized by total number of reads 66 | # changed to make it the sample standard deviation (added minus 1) 67 | df["don_sigma"] = df["don_num"] / (df["sum_reads_group"]) 68 | 69 | # this is the S_ijk value (difference normalized by sd) - should be normal 0/1 70 | df["S_ijk_{}".format(let)] = (df["rank_" + let_dict[let]] - df["rank_mean"])/np.sqrt(df["don_sigma"]) 71 | 72 | # round outlying S values 73 | low_quant = df["S_ijk_{}".format(let)].quantile(q=pinning_S) 74 | high_quant = df["S_ijk_{}".format(let)].quantile(q=1 - pinning_S) 75 | df["S_ijk_{}_unpinned".format(let)] = df["S_ijk_{}".format(let)] 76 | 77 | df.loc[df["S_ijk_{}".format(let)] < low_quant,"S_ijk_{}".format(let)] = low_quant 78 | df.loc[df["S_ijk_{}".format(let)] > high_quant,"S_ijk_{}".format(let)] = high_quant 79 | 80 | # correct for those with no variance 81 | df.loc[df["don_sigma"] == 0, "S_ijk_{}".format(let)] = 0 82 | df["n_sijk"] = df["numReads"] 83 | df.loc[df["don_sigma"] == 0,"n_sijk"] = 0 84 | 85 | return df 86 | 87 | def normalize_Sijks(df,let): 88 | 89 | # calculate mean of SijkA's per gene 90 | df["n_s"] = df["numReads"] * df["S_ijk_" + let] 91 | df["num"] = df.groupby("gene")["n_s"].transform("sum") 92 | df["n_gene"] = df.groupby("gene")["numReads"].transform("sum") 93 | df["sijk{}_mean".format(let)] = df["num"] / df["n_gene"] 94 | 95 | # calculate standard deviation of SijkA's 
per gene 96 | df["sd_num"] = df["numReads"] * (df["S_ijk_" + let] - df["sijk{}_mean".format(let)])**2 97 | df["num"] = df.groupby("gene")["sd_num"].transform("sum") 98 | df["sijk{}_var".format(let)] = df["num"] / df["n_gene"] 99 | 100 | return df 101 | 102 | def contains_required_cols(df, required_cols, grouping_level_2, grouping_level_1): 103 | 104 | # Function to check if the input file contains the required columns for processing 105 | 106 | required_cols.append(grouping_level_2) 107 | if grouping_level_1.lower() != "dummy": 108 | required_cols.append(grouping_level_1) 109 | 110 | set_req = set(required_cols) 111 | set_df = set(list(df.columns)) 112 | 113 | print(set_req) 114 | print(set_df) 115 | 116 | if set_df.issuperset(set_req): 117 | return True, required_cols 118 | else: 119 | return False, required_cols 120 | 121 | def main(): 122 | args = get_args() 123 | light = bool(int(args.isLight)) 124 | SICILIAN = bool(int(args.isSICILIAN)) 125 | 126 | logging.basicConfig( 127 | filename = args.outname_log, 128 | format='%(asctime)s %(levelname)-8s %(message)s', 129 | level=logging.INFO, 130 | datefmt='%Y-%m-%d %H:%M:%S') 131 | 132 | logging.info("Starting") 133 | 134 | let_dict = {"Start" : "acc", "End" : "don"} 135 | 136 | logging.info("Begin reading in parquet") 137 | 138 | df = pd.read_parquet(args.parquet) 139 | 140 | logging.info("Finished reading in parquet") 141 | 142 | logging.info("Input column check") 143 | 144 | if not SICILIAN: 145 | df["called"] = 1 146 | 147 | base_required_cols = ["juncPosR1A", "geneR1A_uniq", "juncPosR1B", "numReads", "cell", "splice_ann", "refName_newR1", "called", "chrR1A"] 148 | passes_input_check, required_cols = contains_required_cols(df, base_required_cols, args.grouping_level_2, args.grouping_level_1) 149 | if passes_input_check: 150 | logging.info("Passed input column check") 151 | else: 152 | logging.exception("Failed input column check! 
Exiting") 153 | sys.exit(1) 154 | 155 | df = df[required_cols] 156 | 157 | logging.info("Rename SICILIAN columns") 158 | 159 | cols_dict = { 160 | "geneR1A_uniq": "gene", 161 | "juncPosR1A": "juncStart", 162 | "juncPosR1B": "juncEnd" 163 | } 164 | df.rename(columns=cols_dict, inplace=True) 165 | 166 | if "missing_domains" in df.columns and not light: 167 | domain_breakdown = True 168 | else: 169 | domain_breakdown = False 170 | 171 | df.reset_index(drop=True,inplace=True) 172 | rank_by_donor = True 173 | 174 | if SICILIAN: 175 | df = df[df["called"] == 1] 176 | else: 177 | # only include junctions with more than 1 read in the dataset 178 | df["numReads_tot"] = df.groupby("refName_newR1")["numReads"].transform("sum") 179 | df = df[df["numReads_tot"] > 1] 180 | 181 | # use second location gene name if first is unknown 182 | 183 | df["geneR1B_uniq"] = df["refName_newR1"].str.split("|").str[1].str.split(":").str[1] 184 | idx = df[(df["gene"].isin(["unknown",""])) | (df["gene"].isna())].index 185 | df.loc[idx,"gene"] = df.loc[idx,"geneR1B_uniq"] 186 | 187 | bin_size = 100000 188 | # bin unknown genes 189 | idx = df[(df["gene"] == "") | (df["gene"] == "unknown") | (df["gene"].isna())].index 190 | df.loc[idx,"gene"] = "unknown_" + df["chrR1A"].astype(str) + "_" + (df.loc[idx]["juncStart"] - df.loc[idx]["juncStart"] % bin_size).astype(str) 191 | 192 | logging.info("Replace with geneR1B") 193 | 194 | # get sign of gene to adjust z score 195 | sign_df = df.drop_duplicates("gene") 196 | sign_df["strandA"] = sign_df["refName_newR1"].str.split("|").str[0].str.split(":").str[3] 197 | sign_df["strandB"] = sign_df["refName_newR1"].str.split("|").str[1].str.split(":").str[3] 198 | idx = sign_df[sign_df["strandA"] == "?"].index 199 | sign_df.loc[idx,"strandA"] = sign_df.loc[idx,"strandB"] 200 | sign_df["sign"] = 1 201 | sign_df.loc[sign_df["strandA"] == "-","sign"] = -1 202 | sign_df[["gene","strandA","sign"]] 203 | sign_dict = pd.Series(sign_df.sign.values,index=sign_df.gene).to_dict() 204 | df["sign"] = df["gene"].map(sign_dict).fillna(1) 205 | 206 | logging.info("Get sign") 207 | 208 | df["cell_gene"] = df["cell"] + df["gene"] 209 | 210 | rev_let = {"Start" : "End", "End" : "Start"} 211 | 212 | if domain_breakdown: 213 | split_dict = {True : ["ann", "dom_ch"], False : ["unann", "dom_unch"]} 214 | else: 215 | split_dict = {True : ["ann"], False : ["unann"]} 216 | 217 | # remove constitutive splicing 218 | df["posA_group"] = df["juncStart"].astype(str) + df["gene"] 219 | df["posB_group"] = df["juncEnd"].astype(str) + df["gene"] 220 | 221 | df["rank_acc"] = df.groupby("posA_group")["juncEnd"].rank(method="dense") 222 | df["rank_don"] = df.groupby("posB_group")["juncStart"].rank(method="dense") 223 | 224 | df["max_rank_acc"] = df["posA_group"].map(df.groupby("posA_group")["rank_acc"].max()) 225 | df["max_rank_don"] = df["posB_group"].map(df.groupby("posB_group")["rank_don"].max()) 226 | 227 | # add domain columns 228 | letters = ["Start", "End"] 229 | for let in letters: 230 | 231 | if domain_breakdown: 232 | df["num_missing_" + let] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["missing_domains"].nunique()) 233 | df["num_inserted_" + let] = df["pos{}_group".format(let)].map(df.groupby("pos{}_group".format(let))["domain_insertions"].nunique()) 234 | df["domain_changed_" + let] = (df["num_missing_" + let] + df["num_inserted_" + let]) > 0 235 | 236 | 237 | df = df[(df["max_rank_don"] > 1) | (df["max_rank_acc"] > 1)] 238 | 239 | logging.info("Remove constitutive") 240 | 241 | # 
require at least args.lower_bound nonconstitutive spliced reads 242 | df["noncon_count"] = df.groupby("cell_gene")["numReads"].transform("sum") 243 | df = df[df["noncon_count"] > args.lower_bound] 244 | 245 | full_df = df.copy() 246 | 247 | calc_dfs = {} 248 | 249 | for let in tqdm(letters): 250 | df = full_df 251 | # create donor identifier 252 | df = prepare_df(df, let, rank_by_donor, rev_let, let_dict) 253 | 254 | logging.info("Prepare df") 255 | df = calc_Sijk(df,let,args.pinning_S, let_dict) 256 | 257 | logging.info("Calculate Sijk") 258 | 259 | df = normalize_Sijks(df,let) 260 | 261 | logging.info("Normalize Sijk") 262 | 263 | # remove those with variance == 0 264 | df = df[df["sijk{}_var".format(let)] != 0] 265 | 266 | # calculate z score 267 | df["n.g_" + let] = df.groupby("cell_gene")["numReads"].transform("sum") 268 | 269 | df["nSijk" + let] = (df["S_ijk_" + let] - df["sijk{}_mean".format(let)]) / np.sqrt(df["sijk{}_var".format(let)]) 270 | df["mult"] = df["numReads"] * df["nSijk" + let] / np.sqrt(df["n.g_" + let]) 271 | df["z_" + let] = df["sign"] * df.groupby("cell_gene")["mult"].transform("sum") 272 | df["scaled_z_" + let] = df["z_" + let] / np.sqrt(df["n.g_" + let]) 273 | 274 | logging.info("Calc z") 275 | 276 | ############## end modify Sijk #################### 277 | df["cell_gene_junc"] = df["cell_gene"] + df["refName_newR1"] 278 | 279 | if not light: 280 | # calculate the z score 281 | df["x_sijk"] = df["S_ijk_{}".format(let)] * df["n_sijk"] 282 | 283 | df["num"] = df.groupby("cell_gene")["x_sijk"].transform("sum") 284 | df["denom_sq"] = df.groupby("cell_gene")["n_sijk"].transform("sum") 285 | 286 | # get junction that "contributes the most" to the z score 287 | df["temp"] = df["x_sijk"] / np.sqrt(df["denom_sq"]) 288 | df["temp_mag"] = abs(df["temp"]) 289 | df["idxmax_z"] = df["cell_gene"].map(df.groupby("cell_gene")["temp_mag"].idxmax()) 290 | map_df = df.loc[df["idxmax_z"],["cell_gene","refName_newR1","temp"]] 291 | df["junc_max_{}".format(let)] = df["cell_gene"].map(pd.Series(map_df.refName_newR1.values,index=map_df.cell_gene).to_dict()) 292 | df["max_don_z_{}".format(let)] = df["cell_gene"].map(pd.Series(map_df.temp.values,index=map_df.cell_gene).to_dict()) 293 | 294 | if args.pinning_z != 0: 295 | # round outlying z values 296 | low_quant = df["z_{}".format(let)].quantile(q=args.pinning_z) 297 | high_quant = df["z_{}".format(let)].quantile(q=1 - args.pinning_z) 298 | 299 | df.loc[df["z_{}".format(let)] < low_quant,"z_{}".format(let)] = low_quant 300 | df.loc[df["z_{}".format(let)] > high_quant,"z_{}".format(let)] = high_quant 301 | 302 | if not light: 303 | # break down z score by annotation 304 | for k,v in split_dict.items(): 305 | df["num_{}".format(v[0])] = df["cell_gene"].map(df[df["splice_ann"] == k].groupby("cell_gene")["x_sijk"].sum()) 306 | 307 | if domain_breakdown: 308 | df["num_{}".format(v[1])] = df["cell_gene"].map(df[df["domain_changed_" + let] == k].groupby("cell_gene")["x_sijk"].sum()) 309 | 310 | for y in v: 311 | 312 | df["z_{}_{}".format(let,y)] = df["sign"] * df["num_{}".format(y)]/np.sqrt(df["denom_sq"]) 313 | 314 | # round outlying z values 315 | low_quant = df["z_{}_{}".format(let,y)].quantile(q=args.pinning_z) 316 | high_quant = df["z_{}_{}".format(let,y)].quantile(q=1 - args.pinning_z) 317 | 318 | df.loc[df["z_{}_{}".format(let,y)] < low_quant,"z_{}_{}".format(let,y)] = low_quant 319 | df.loc[df["z_{}_{}".format(let,y)] > high_quant,"z_{}_{}".format(let,y)] = high_quant 320 | 321 | calc_dfs[let] = df 322 | 323 | df = 
calc_dfs["Start"].merge(calc_dfs["End"],on="cell_gene_junc",how="outer",suffixes=("","_x")) 324 | 325 | logging.info("Merged") 326 | 327 | for cx in [x for x in df.columns if x.endswith("_x")]: 328 | c = cx[:-2] 329 | df.loc[df[c].isna(),c] = df.loc[df[c].isna(),cx] 330 | 331 | df.drop([x for x in df.columns if x.endswith("_x")],inplace=True,axis=1) 332 | 333 | # average two scores (negate one of them) 334 | 335 | grouped = df.groupby('gene') 336 | for let in letters: 337 | z_dict = pd.Series(calc_dfs[let]["z_" + let].values,index=calc_dfs[let].cell_gene).to_dict() 338 | df["z_" + let] = df["cell_gene"].map(z_dict) 339 | scz_dict = pd.Series(calc_dfs[let]["scaled_z_" + let].values,index=calc_dfs[let].cell_gene).to_dict() 340 | df["scaled_z_" + let] = df["cell_gene"].map(scz_dict) 341 | 342 | df["cov"] = df["gene"].map(grouped.apply(lambda x: x['z_Start'].cov(x['z_End']))) 343 | 344 | idx = df[df["z_Start"].isna()].index 345 | df.loc[idx,"z"] = -df.loc[idx,"z_End"] 346 | df.loc[idx,"scZ"] = -df.loc[idx,"scaled_z_End"] 347 | 348 | idx = df[df["z_End"].isna()].index 349 | df.loc[idx,"z"] = df.loc[idx,"z_Start"] 350 | df.loc[idx,"scZ"] = df.loc[idx,"scaled_z_Start"] 351 | 352 | idx = df[(~df["z_Start"].isna()) & (~df["z_End"].isna())].index 353 | df.loc[idx,"z"] = (df.loc[idx,"z_Start"] - df.loc[idx,"z_End"])/np.sqrt(2 ) 354 | df.loc[idx,"scZ"] = (df.loc[idx,"scaled_z_Start"] - df.loc[idx,"scaled_z_End"])/np.sqrt(2 ) 355 | 356 | logging.info("Avg z") 357 | 358 | if not light: 359 | # average two scores for split z 360 | for v in split_dict.values(): 361 | for y in v: 362 | grouped = df.groupby('gene') 363 | df["cov_{}".format(y)] = df["gene"].map(grouped.apply(lambda x: x['z_Start_{}'.format(y)].cov(x['z_End_{}'.format(y)]))) 364 | 365 | idx = df[df["z_Start_{}".format(y)].isna()].index 366 | df.loc[idx,"z_{}".format(y)] = -df.loc[idx,"z_End_{}".format(y)] 367 | 368 | idx = df[df["z_End_{}".format(y)].isna()].index 369 | df.loc[idx,"z_{}".format(y)] = df.loc[idx,"z_Start_{}".format(y)] 370 | 371 | idx = df[(~df["z_Start_{}".format(y)].isna()) & (~df["z_End_{}".format(y)].isna())].index 372 | df.loc[idx,"z_{}".format(y)] = (df.loc[idx,"z_Start_{}".format(y)] - df.loc[idx,"z_End_{}".format(y)])/np.sqrt(2) - df["cov_{}".format(y)] 373 | 374 | df["ontology"] = df[args.grouping_level_1] + df[args.grouping_level_2] 375 | 376 | df["n.g"] = df.groupby("cell_gene")["numReads"].transform("sum") 377 | df["scaled_z"] = df["z"] / np.sqrt(df["n.g"]) 378 | 379 | for let in letters: 380 | df["zcontrib" + let] = df["numReads"] * df["nSijk" + let] / np.sqrt(df["n.g"]) 381 | 382 | sub_cols = ["cell", "gene", "ontology", "scZ", "n.g_Start", "n.g_End"] 383 | sub_cols.append(args.grouping_level_2) 384 | if args.grouping_level_1.lower() != "dummy": 385 | sub_cols.append(args.grouping_level_1) 386 | 387 | df.drop_duplicates("cell_gene")[sub_cols].to_csv(args.outname_tsv, index=False, sep="\t") 388 | df.to_parquet(args.outname_pq) 389 | 390 | logging.info("Wrote files") 391 | 392 | logging.info("Completed") 393 | 394 | try: 395 | exit(main()) 396 | except Exception: 397 | logging.exception("Exception in main(): ") 398 | exit(1) -------------------------------------------------------------------------------- /bin/scrape_software_versions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import print_function 3 | from collections import OrderedDict 4 | import re 5 | 6 | # TODO nf-core: Add additional regexes for new tools in process 
get_software_versions 7 | regexes = { 8 | "nf-core/spliz": ["v_pipeline.txt", r"(\S+)"], 9 | "Nextflow": ["v_nextflow.txt", r"(\S+)"], 10 | "FastQC": ["v_fastqc.txt", r"FastQC v(\S+)"], 11 | "MultiQC": ["v_multiqc.txt", r"multiqc, version (\S+)"], 12 | } 13 | results = OrderedDict() 14 | results["nf-core/spliz"] = 'N/A' 15 | results["Nextflow"] = 'N/A' 16 | results["FastQC"] = 'N/A' 17 | results["MultiQC"] = 'N/A' 18 | 19 | # Search each file using its regex 20 | for k, v in regexes.items(): 21 | try: 22 | with open(v[0]) as x: 23 | versions = x.read() 24 | match = re.search(v[1], versions) 25 | if match: 26 | results[k] = "v{}".format(match.group(1)) 27 | except IOError: 28 | results[k] = False 29 | 30 | # Remove software set to false in results 31 | for k in list(results): 32 | if not results[k]: 33 | del results[k] 34 | 35 | # Dump to YAML 36 | print( 37 | """ 38 | id: 'software_versions' 39 | section_name: 'nf-core/spliz Software Versions' 40 | section_href: 'https://github.com/nf-core/spliz' 41 | plot_type: 'html' 42 | description: 'are collected at run time from the software output.' 43 | data: | 44 |
<dl class="dl-horizontal"> 45 | """ 46 | ) 47 | for k, v in results.items(): 48 | print("        <dt>{}</dt><dd><samp>{}</samp></dd>".format(k, v)) 49 | print("    </dl>
") 50 | 51 | # Write out regexes as csv file: 52 | with open("software_versions.csv", "w") as f: 53 | for k, v in results.items(): 54 | f.write("{}\t{}\n".format(k, v)) 55 | -------------------------------------------------------------------------------- /bin/svd_zscore.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import linalg 7 | from tqdm import tqdm 8 | import os 9 | import logging 10 | 11 | def get_args(): 12 | parser = argparse.ArgumentParser(description="calculate splicing scores per gene/cell") 13 | parser.add_argument("--input", help="Name of the input file from rijk_zscore") 14 | parser.add_argument("--svd_type", choices=["normgene","normdonor"], help="Method of calculating matrix before SVD") 15 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 16 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. tissue, compartment)", default="dummy") 17 | parser.add_argument("--outname_pq", help="Name of output file") 18 | parser.add_argument("--outname_tsv", help="Name of output File") 19 | parser.add_argument("--outname_log", help="Name of output File") 20 | args = parser.parse_args() 21 | return args 22 | 23 | def main(): 24 | args = get_args() 25 | 26 | logging.basicConfig( 27 | filename = args.outname_log, 28 | format='%(asctime)s %(levelname)-8s %(message)s', 29 | level=logging.INFO, 30 | datefmt='%Y-%m-%d %H:%M:%S') 31 | 32 | logging.info("Beginning calculation") 33 | logging.info("Read in parquet file") 34 | 35 | df = pd.read_parquet(args.input) 36 | 37 | ##### PERFORM SVD ZSCORE CALCULATION ##### 38 | 39 | logging.info("Perform SVD zscore calculation") 40 | 41 | letters = ["Start", "End"] 42 | 43 | if args.svd_type == "normgene": 44 | zcontrib_col = "zcontrib" 45 | elif args.svd_type == "normdonor": 46 | 47 | for let in letters: 48 | # find number of reads per donor (or acceptor) per cell 49 | df["cell_gene_pos" + let] = df["cell_gene"] + df["junc" + let].astype(str) 50 | df["n.g_pos" + let] = df.groupby("cell_gene_pos" + let)["numReads"].transform("sum") 51 | # normalize on a donor/acceptor rather than a gene basis 52 | # TRY OUT NOT SQRT-ING denominator as normalization 53 | df["zcontrib_posnorm" + let] = df["numReads"] * df["nSijk" + let] / df["n.g_pos" + let] 54 | 55 | zcontrib_col = "zcontrib_posnorm" 56 | 57 | for let in letters: 58 | 59 | # replace NANs with zeros 60 | df["zcontrib{}_rep".format(let)] = df[zcontrib_col + let].fillna(0) 61 | 62 | # create label for each junction + donor/acceptor 63 | df["str_junc" + let] = df["junc" + let].astype(int).astype(str) + "_" + let 64 | df["cell_gene_pos" + let] = df["cell"] + df["gene"] + df["junc" + let].astype(str) 65 | 66 | # get sum of zcontribs for the given cell and splice site 67 | df["summed_zcontrib" + let] = df.groupby("cell_gene_pos" + let)["zcontrib{}_rep".format(let)].transform('sum') 68 | 69 | k = 3 # number of components to include 70 | loads = {"f{}".format(i) : {} for i in range(k)} 71 | zs = {"svd_z{}".format(i) : {} for i in range(k)} 72 | 73 | logging.info("Iterate over each gene") 74 | for gene, gene_df in tqdm(df.groupby("gene")): 75 | 76 | # get zcontrib matrix 77 | gene_mats = [] 78 | for let in letters: 79 | gene_mat = gene_df.drop_duplicates("cell_gene_pos" + 
let).pivot_table(index="cell_gene",columns="str_junc{}".format(let),values="summed_zcontrib" + let,fill_value=0) 80 | 81 | gene_mats.append(gene_mat) 82 | gene_mat = gene_mats[0].merge(gene_mats[1],on="cell_gene") 83 | 84 | # mean-normalize the rows 85 | gene_mat = gene_mat.subtract(gene_mat.mean(axis=1),axis=0) 86 | 87 | # calculate svd 88 | u, s, vh = linalg.svd(gene_mat,check_finite=False,full_matrices=False) 89 | 90 | if len(s) >= k: 91 | # calculate new z scores based on svd 92 | new_zs = gene_mat.dot(np.transpose(vh[:k,:])) 93 | 94 | # calculate load on each component 95 | load = np.square(s)/sum(np.square(s)) 96 | 97 | # save new zs and fs in dictionaries to save later 98 | for i in range(k): 99 | loads["f{}".format(i)][gene] = load[i] 100 | zs["svd_z{}".format(i)].update(pd.Series(new_zs[i].values,index=new_zs.index).to_dict()) 101 | 102 | # save loadings 103 | v_out = pd.DataFrame(vh,columns=gene_mat.columns) 104 | #gene_mat_name = "{}_{}_{}.geneMat".format(gene, args.dataname, args.param_stem) 105 | gene_mat_name = "{}.geneMat".format(gene) 106 | v_out.to_csv(gene_mat_name, index=False, sep = "\t") 107 | 108 | for i in range(k): 109 | df["f{}".format(i)] = df["gene"].map(loads["f{}".format(i)]) 110 | df["svd_z{}".format(i)] = df["cell_gene"].map(zs["svd_z{}".format(i)]) 111 | 112 | df["svd_z_sumsq"] = (df[["svd_z{}".format(i) for i in range(k)]]**2).sum(axis=1) 113 | 114 | sub_cols = ["cell","gene","scZ","svd_z_sumsq","n.g_Start","n.g_End"] + ["f{}".format(i) for i in range(k)] + ["svd_z{}".format(i) for i in range(k)] #+ velocity_cols 115 | if "ontology" in df.columns: 116 | sub_cols = sub_cols + [args.grouping_level_1, args.grouping_level_2, "ontology"] 117 | 118 | logging.info("Write out files") 119 | 120 | df.drop_duplicates("cell_gene")[sub_cols].to_csv(args.outname_tsv, index=False, sep="\t") 121 | df.to_parquet(args.outname_pq) 122 | 123 | logging.info("Completed") 124 | 125 | try: 126 | exit(main()) 127 | except Exception: 128 | logging.exception("Exception in main(): ") 129 | exit(1) 130 | -------------------------------------------------------------------------------- /bin/variance_adjusted_permutations_bytiss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | from tqdm import tqdm 8 | import logging 9 | from statsmodels.stats.multitest import multipletests 10 | 11 | def get_args(): 12 | parser = argparse.ArgumentParser(description="calculate p values based on Romano method") 13 | parser.add_argument("--input", help="Name of the input file from svd_zscore") 14 | parser.add_argument("--num_perms", type=int,help="number of permutations to run for") 15 | parser.add_argument("--grouping_level_2", help="column to group the data by (e.g. ontology, compartment, tissue)", default="ontology") 16 | parser.add_argument("--grouping_level_1", help="subset data by this column before checking for differences (e.g. 
tissue, compartment)", default="dummy") 17 | parser.add_argument("--outname_all_pvals", help="Name of output file") 18 | parser.add_argument("--outname_perm_pvals", help="Name of output File") 19 | parser.add_argument("--outname_log", help="Name of log File") 20 | args = parser.parse_args() 21 | return args 22 | 23 | 24 | def calc_pval(var_df): 25 | 26 | # calculate the inner sum that's subtracted 27 | num = 0 28 | denom = 0 29 | for index, row in var_df.iterrows(): 30 | num += row["num_cells_ont"]*row["ont_median"]/row["ont_var"] 31 | denom += row["num_cells_ont"]/row["ont_var"] 32 | const = num/denom 33 | 34 | # calculate the outer sum 35 | sum_vals = 0 36 | for index, row in var_df.iterrows(): 37 | sum_vals += (row["num_cells_ont"]/row["ont_var"])*(row["ont_median"] - const)**2 38 | 39 | # return the chi^2 p value and the chi^2 statistic 40 | return 1 - stats.chi2.cdf(sum_vals , var_df.shape[0] - 1), sum_vals 41 | 42 | def get_var_df(sub_df, z_col, adj_var, grouping_level_2): 43 | 44 | sub_df["num_cells_ont"] = sub_df[grouping_level_2].map(sub_df.groupby(grouping_level_2)["cell"].nunique()) 45 | sub_df["ont_median"] = sub_df[grouping_level_2].map(sub_df.groupby(grouping_level_2)[z_col].median()) 46 | sub_df["ont_var"] = sub_df[grouping_level_2].map(sub_df.groupby(grouping_level_2)[z_col].var()) 47 | 48 | var_df = sub_df.drop_duplicates(grouping_level_2)[[grouping_level_2,"ont_median","num_cells_ont","ont_var"]] 49 | 50 | # don't need to remove cell types with variance 0 when we're adjusting variance 51 | if not adj_var: 52 | 53 | # remove ontologies with zero variance 54 | var_df = var_df[var_df["ont_var"] > 0] 55 | return var_df 56 | 57 | def main(): 58 | np.random.seed(123) 59 | alpha = 0.05 60 | 61 | args = get_args() 62 | 63 | logging.basicConfig( 64 | filename = args.outname_log, 65 | format='%(asctime)s %(levelname)-8s %(message)s', 66 | level=logging.INFO, 67 | datefmt='%Y-%m-%d %H:%M:%S') 68 | 69 | logging.info("Starting") 70 | 71 | df_cols = ["gene", "cell", "scZ", "svd_z0", "svd_z1", "svd_z2", "cell_gene", "f0", "f1", "f2"] 72 | 73 | if args.grouping_level_1.lower() != "dummy": 74 | df_cols.append(args.grouping_level_2) 75 | df_cols.append(args.grouping_level_1) 76 | else: 77 | df_cols.append(args.grouping_level_2) 78 | 79 | df = pd.read_parquet( 80 | args.input, 81 | columns=df_cols 82 | ) 83 | df = df.drop_duplicates("cell_gene") 84 | 85 | if args.grouping_level_1 == "dummy": 86 | df["dummy"] = "null" 87 | df["tiss_comp"] = df[args.grouping_level_1].astype(str) + df[args.grouping_level_2].astype(str) 88 | 89 | # subset to ontologies with > 20 cells 90 | df["ontology_gene"] = df[args.grouping_level_2].astype(str) + df["gene"] 91 | df["num_ont_gene"] = df["ontology_gene"].map(df.groupby("ontology_gene")["cell_gene"].nunique()) 92 | df = df[df["num_ont_gene"] > 10] 93 | 94 | z_cols = ["scZ","svd_z0","svd_z1","svd_z2"] 95 | out = {"pval" : [], "gene" : [], "num_onts" : [],"z_col" : [],"max_abs_median" : [], "Tn1" : [], "grouping_level_1" : []} 96 | 97 | var_adj = 0.1 98 | adj_var = True 99 | 100 | perm_pval = True 101 | 102 | if perm_pval: 103 | out["perm_pval"] = [] 104 | 105 | df["dummy"] = "null" 106 | for tiss, tiss_df in df.groupby(args.grouping_level_1): 107 | for gene, sub_df in tqdm(tiss_df.groupby("gene")): 108 | 109 | for z_col in z_cols: 110 | 111 | var_df = get_var_df(sub_df, z_col, adj_var, args.grouping_level_2) 112 | 113 | if var_df.shape[0] > 1: 114 | if adj_var: 115 | var_df["ont_var"] = var_df["ont_var"] + var_adj 116 | pval, Tn1 = calc_pval(var_df) 117 | 
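# Tn1 is the variance-adjusted heterogeneity statistic computed in calc_pval above:
#   Tn1 = sum_k (n_k / var_k) * (median_k - c)^2, with c = sum_k (n_k * median_k / var_k) / sum_k (n_k / var_k),
# where k runs over the grouping_level_2 groups for this gene; pval compares Tn1 to a
# chi-square distribution with (number of groups - 1) degrees of freedom.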
out["pval"].append(pval) 118 | out["Tn1"].append(Tn1) 119 | 120 | out["gene"].append(gene) 121 | out["num_onts"].append(var_df.shape[0]) 122 | out["z_col"].append(z_col) 123 | out["max_abs_median"].append((var_df["ont_median"].abs()).max()) 124 | out["grouping_level_1"].append(tiss) 125 | 126 | if perm_pval: 127 | sub_df_perm = sub_df.copy() 128 | if (pval < alpha): 129 | Tn1_dist = [] 130 | # for i in range(args.num_perms): 131 | while len(Tn1_dist) < args.num_perms: 132 | sub_df_perm[args.grouping_level_2] = np.random.permutation(sub_df_perm[args.grouping_level_2]) 133 | var_df = get_var_df(sub_df_perm, z_col, adj_var, args.grouping_level_2) 134 | if var_df.shape[0] > 1: 135 | if adj_var: 136 | var_df["ont_var"] = var_df["ont_var"] + var_adj 137 | pval, Tn1_perm = calc_pval(var_df) 138 | Tn1_dist.append(Tn1_perm) 139 | out["perm_pval"].append(len([x for x in Tn1_dist if x < Tn1])/args.num_perms) 140 | else: 141 | out["perm_pval"].append(np.nan) 142 | out_df = pd.DataFrame.from_dict(out) 143 | 144 | out_df["perm_pval_inv"] = 1 - out_df["perm_pval"] 145 | out_df["perm_pval2"] = 2*out_df[["perm_pval","perm_pval_inv"]].min(axis=1) 146 | 147 | # adjust p values all together 148 | 149 | # Try if old fails: if any na values, don't include in adjustment 150 | #out_df.loc[~out_df["pval"].isna(),"pval_adj"] = multipletests(out_df.loc[~out_df["pval"].isna(),"pval"], alpha, method = "fdr_bh")[1] 151 | #out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2_adj"] = multipletests(out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2"], alpha, method = "fdr_bh")[1] 152 | 153 | # OLD 154 | try: 155 | out_df["pval_adj"] = multipletests(out_df["pval"],alpha, method="fdr_bh")[1] 156 | out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2_adj"] = multipletests(out_df.loc[~out_df["perm_pval2"].isna(),"perm_pval2"], alpha, method = "fdr_bh")[1] 157 | except: 158 | out_df["pval_adj"] = np.nan 159 | out_df["perm_pval2_adj"] = np.nan 160 | 161 | out_df.to_csv(args.outname_all_pvals, sep="\t", index=False) 162 | 163 | out_df["gene_grouping_level_1"] = out_df["gene"] + out_df["grouping_level_1"].astype(str) 164 | 165 | # reformat output 166 | new_out = {"gene" : [], "num_onts" : [], "grouping_level_1" : []} 167 | for z_col in z_cols: 168 | new_out["chi2_pval_adj_" + z_col] = [] 169 | new_out["perm_pval_adj_" + z_col] = [] 170 | new_out["max_abs_median_" + z_col] = [] 171 | new_out["perm_cdf_" + z_col] = [] 172 | for gene_sub, gene_df in out_df.groupby("gene_grouping_level_1"): 173 | new_out["gene"].append(gene_df["gene"].iloc[0]) 174 | new_out["grouping_level_1"].append(gene_df["grouping_level_1"].iloc[0]) 175 | new_out["num_onts"].append(gene_df["num_onts"].iloc[0]) 176 | temp_z_cols = [] 177 | for z_col, z_df in gene_df.groupby("z_col"): 178 | new_out["chi2_pval_adj_" + z_col].append(z_df["pval_adj"].iloc[0]) 179 | new_out["perm_pval_adj_" + z_col].append(z_df["perm_pval2_adj"].iloc[0]) 180 | new_out["max_abs_median_" + z_col].append(z_df["max_abs_median"].iloc[0]) 181 | new_out["perm_cdf_" + z_col].append(z_df["perm_pval"].iloc[0]) 182 | temp_z_cols.append(z_col) 183 | for z_col in [x for x in z_cols if x not in temp_z_cols]: 184 | new_out["chi2_pval_adj_" + z_col].append(np.nan) 185 | new_out["perm_pval_adj_" + z_col].append(np.nan) 186 | new_out["max_abs_median_" + z_col].append(np.nan) 187 | new_out["perm_cdf_" + z_col].append(np.nan) 188 | new_out_df = pd.DataFrame.from_dict(new_out).sort_values("perm_pval_adj_scZ") 189 | 190 | # add frac from SVD for each gene 191 | df = df.drop_duplicates("gene") 192 | for i in 
range(3): 193 | frac_dict = pd.Series(df["f" + str(i)].values,index=df.gene).to_dict() 194 | new_out_df["f" + str(i)] = new_out_df["gene"].map(frac_dict) 195 | 196 | new_out_df.to_csv(args.outname_perm_pvals, sep="\t", index=False) 197 | 198 | logging.info("Completed") 199 | 200 | main() 201 | -------------------------------------------------------------------------------- /conf/base.config: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | nf-core/rnaseq Nextflow base config file 4 | ======================================================================================== 5 | A 'blank slate' config file, appropriate for general use on most high performance 6 | compute environments. Assumes that all software is installed and available on 7 | the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. 8 | ---------------------------------------------------------------------------------------- 9 | */ 10 | 11 | process { 12 | 13 | cpus = { check_max( 1 * task.attempt, 'cpus' ) } 14 | memory = { check_max( 6.GB * task.attempt, 'memory' ) } 15 | time = { check_max( 4.h * task.attempt, 'time' ) } 16 | 17 | errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } 18 | maxRetries = 3 19 | maxErrors = '-1' 20 | 21 | // Process-specific resource requirements 22 | withLabel:process_low { 23 | cpus = { check_max( 2 * task.attempt, 'cpus' ) } 24 | memory = { check_max( 20.GB * task.attempt, 'memory' ) } 25 | time = { check_max( 1.h * task.attempt, 'time' ) } 26 | } 27 | withLabel:process_medium { 28 | cpus = { check_max( 6 * task.attempt, 'cpus' ) } 29 | memory = { check_max( 100.GB * task.attempt, 'memory' ) } 30 | time = { check_max( 2.h * task.attempt, 'time' ) } 31 | } 32 | withLabel:process_high { 33 | cpus = { check_max( 12 * task.attempt, 'cpus' ) } 34 | memory = { check_max( 200.GB * task.attempt, 'memory' ) } 35 | time = { check_max( 2.h * task.attempt, 'time' ) } 36 | } 37 | withLabel:process_long { 38 | time = { check_max( 20.h * task.attempt, 'time' ) } 39 | } 40 | withLabel:process_high_memory { 41 | memory = { check_max( 400.GB * task.attempt, 'memory' ) } 42 | } 43 | withLabel:error_ignore { 44 | errorStrategy = 'ignore' 45 | } 46 | withLabel:error_retry { 47 | errorStrategy = 'retry' 48 | maxRetries = 2 49 | } 50 | } -------------------------------------------------------------------------------- /conf/test.config: -------------------------------------------------------------------------------- 1 | process { 2 | executor = 'slurm' 3 | clusterOptions = '-p quake,horence,owners' 4 | 5 | memory = { 1.GB * task.attempt } 6 | time = { 1.h * task.attempt } 7 | errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } 8 | maxRetries = 3 9 | } 10 | 11 | params { 12 | dataname = "test" 13 | input_file = "/scratch/groups/horence/kaitlin/spliz_nextflow/nf-core-spliz/test_data/test.tsv" 14 | SICILIAN = true 15 | grouping_level_2 = "compartment" 16 | grouping_level_1 = "tissue" 17 | libraryType = "10X" 18 | run_analysis = true 19 | } 20 | 21 | params.outdir = "./results/${params.dataname}" 22 | params.tracedir = "./results/${params.dataname}/pipeline_info" 23 | params.schema_ignore_params = "input,single_end,show_hidden_params,validate_params,igenomes_ignore,tracedir,igenomes_base,help,monochrome_logs,plaintext_email,max_multiqc_email_size,email_on_fail,email,multiqc_config,publish_dir_mode,genome,genomes" 24 | 25 | tower { 26 | enabled = true 27 | } -------------------------------------------------------------------------------- /conf/test_full.config: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | Nextflow config file for running full-size tests 4 | ======================================================================================== 5 | Defines input files and everything required to run a full size pipeline test. 6 | 7 | Use as follows: 8 | nextflow run nf-core/rnaseq -profile test_full, 9 | 10 | ---------------------------------------------------------------------------------------- 11 | */ 12 | 13 | params { 14 | config_profile_name = 'Full test profile' 15 | config_profile_description = 'Full test dataset to check pipeline function' 16 | 17 | // Parameters for full-size test 18 | input = 'https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/samplesheet/v3.1/samplesheet_full.csv' 19 | genome = 'GRCh37' 20 | pseudo_aligner = 'salmon' 21 | } 22 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Documentation 2 | 3 | The nf-core/spliz documentation is split into the following pages: 4 | 5 | * [Usage](usage.md) 6 | * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. 7 | * [Output](output.md) 8 | * An overview of the different results produced by the pipeline and how to interpret them. 9 | 10 | You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) 11 | -------------------------------------------------------------------------------- /docs/images/nf-core-spliz_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/docs/images/nf-core-spliz_logo.png -------------------------------------------------------------------------------- /docs/output.md: -------------------------------------------------------------------------------- 1 | # nf-core/spliz: Output 2 | 3 | ## Introduction 4 | 5 | This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. 6 | 7 | The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
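
As a rough orientation, the SpliZ-specific modules in this pipeline publish into the following subdirectories of the results directory (names taken from the `publishDir` directives in `modules/local/`; exact file names depend on `--dataname` and the chosen parameters, so treat this as a sketch rather than an exhaustive listing):

```
results/<dataname>/
├── SpliZ_values/                     # per-cell SpliZ and SpliZVD scores (.tsv/.pq)
├── SpliZ_sites/                      # splice sites ranked by the SpliZVD eigenvectors
├── variance_adjusted_permutations/   # per-gene permutation p values
├── class_input/                      # processed junction input (when starting from BAM files)
├── logs/                             # per-step log files
├── pipeline_info/                    # Nextflow execution reports (see below)
└── summary_<dataname>_*.tsv          # final summary table
```
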
8 | 9 | 10 | 11 | ## Pipeline overview 12 | 13 | The pipeline is built using [Nextflow](https://www.nextflow.io/) 14 | and processes data using the following steps: 15 | 16 | * [FastQC](#fastqc) - Read quality control 17 | * [MultiQC](#multiqc) - Aggregate report describing results from the whole pipeline 18 | * [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution 19 | 20 | ## FastQC 21 | 22 | [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. 23 | 24 | For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). 25 | 26 | **Output files:** 27 | 28 | * `fastqc/` 29 | * `*_fastqc.html`: FastQC report containing quality metrics for your untrimmed raw fastq files. 30 | * `fastqc/zips/` 31 | * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. 32 | 33 | > **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. 34 | 35 | ## MultiQC 36 | 37 | [MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. 38 | 39 | The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. 40 | 41 | For more information about how to use MultiQC reports, see [https://multiqc.info](https://multiqc.info). 42 | 43 | **Output files:** 44 | 45 | * `multiqc/` 46 | * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. 47 | * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. 48 | * `multiqc_plots/`: directory containing static images from the report in various formats. 49 | 50 | ## Pipeline information 51 | 52 | [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. 53 | 54 | **Output files:** 55 | 56 | * `pipeline_info/` 57 | * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. 58 | * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.csv`. 59 | * Documentation for interpretation of results in HTML format: `results_description.html`. 
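
These reports are switched on in this pipeline's `nextflow.config` via the `timeline`, `report`, `trace` and `dag` scopes, which write into `${params.tracedir}` (by default `<outdir>/pipeline_info`) with a timestamp suffix; the relevant block looks roughly like this:

```nextflow
def trace_timestamp = new java.util.Date().format('yyyy-MM-dd_HH-mm-ss')
timeline {
    enabled = true
    file    = "${params.tracedir}/execution_timeline_${trace_timestamp}.html"
}
report {
    enabled = true
    file    = "${params.tracedir}/execution_report_${trace_timestamp}.html"
}
trace {
    enabled = true
    file    = "${params.tracedir}/execution_trace_${trace_timestamp}.txt"
}
dag {
    enabled = true
    file    = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg"
}
```
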
60 | -------------------------------------------------------------------------------- /docs/usage.md: -------------------------------------------------------------------------------- 1 | # salzmanlab/spliz: Usage 2 | 3 | > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ 4 | 5 | ## Introduction 6 | 7 | 8 | 9 | ## Running the pipeline 10 | 11 | The typical command for running the pipeline is as follows: 12 | 13 | ```bash 14 | nextflow run salzmanlab/spliz --input '*_R{1,2}.fastq.gz' -profile docker 15 | ``` 16 | 17 | This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. 18 | 19 | Note that the pipeline will create the following files in your working directory: 20 | 21 | ```bash 22 | work # Directory containing the nextflow working files 23 | results # Finished results (configurable, see below) 24 | .nextflow_log # Log file from Nextflow 25 | # Other nextflow hidden files, eg. history of pipeline runs and old logs. 26 | ``` 27 | 28 | ### Updating the pipeline 29 | 30 | When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: 31 | 32 | ```bash 33 | nextflow pull salzmanlab/spliz 34 | ``` 35 | 36 | ### Reproducibility 37 | 38 | It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. 39 | 40 | First, go to the [salzmanlab/spliz releases page](https://github.com/salzmanlab/spliz/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. 41 | 42 | This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. 43 | 44 | ## Core Nextflow arguments 45 | 46 | > **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). 47 | 48 | ### `-profile` 49 | 50 | Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. 51 | 52 | Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. 53 | 54 | > We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. 55 | 56 | The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). 
57 | 58 | Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! 59 | They are loaded in sequence, so later profiles can overwrite earlier profiles. 60 | 61 | If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. 62 | 63 | * `docker` 64 | * A generic configuration profile to be used with [Docker](https://docker.com/) 65 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 66 | * `singularity` 67 | * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) 68 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 69 | * `podman` 70 | * A generic configuration profile to be used with [Podman](https://podman.io/) 71 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 72 | * `shifter` 73 | * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) 74 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 75 | * `charliecloud` 76 | * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) 77 | * Pulls software from Docker Hub: [`nfcore/spliz`](https://hub.docker.com/r/nfcore/spliz/) 78 | * `conda` 79 | * Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. 80 | * A generic configuration profile to be used with [Conda](https://conda.io/docs/) 81 | * Pulls most software from [Bioconda](https://bioconda.github.io/) 82 | * `test` 83 | * A profile with a complete configuration for automated testing 84 | * Includes links to test data so needs no other parameters 85 | 86 | ### `-resume` 87 | 88 | Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. 89 | 90 | You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. 91 | 92 | ### `-c` 93 | 94 | Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. 95 | 96 | #### Custom resource requests 97 | 98 | Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. 99 | 100 | Whilst these default requirements will hopefully work for most people with most data, you may find that you want to customise the compute resources that the pipeline requests. You can do this by creating a custom config file. 
For example, to give the workflow process `star` 32GB of memory, you could use the following config: 101 | 102 | ```nextflow 103 | process { 104 | withName: star { 105 | memory = 32.GB 106 | } 107 | } 108 | ``` 109 | 110 | To find the exact name of a process you wish to modify the compute resources, check the live-status of a nextflow run displayed on your terminal or check the nextflow error for a line like so: `Error executing process > 'bwa'`. In this case the name to specify in the custom config file is `bwa`. 111 | 112 | See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more information. 113 | 114 | If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition above). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. 115 | 116 | If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). 117 | 118 | ### Running in the background 119 | 120 | Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. 121 | 122 | The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file. 123 | 124 | Alternatively, you can use `screen` / `tmux` or similar tool to create a detached session which you can log back into at a later time. 125 | Some HPC setups also allow you to run nextflow within a cluster job submitted your job scheduler (from where it submits more jobs). 126 | 127 | #### Nextflow memory requirements 128 | 129 | In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. 130 | We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): 131 | 132 | ```bash 133 | NXF_OPTS='-Xms1g -Xmx4g' 134 | ``` 135 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # You can use this file to create a conda environment for this pipeline: 2 | # conda env create -f environment.yml 3 | name: nf-core-spliz-1.0dev 4 | channels: 5 | - conda-forge 6 | - bioconda 7 | - defaults 8 | dependencies: 9 | - python=3.9.6 10 | - pandas=1.3.1 11 | - tqdm=4.62.0 12 | - numpy=1.21.1 13 | - pyarrow=5.0.0 14 | - pysam=0.16.0.1 15 | - r-base=4.1.1 16 | - r-data.table=1.14.0 17 | - r-logger=0.2.1 18 | - r-rfast=2.0.3 19 | - scipy=1.7.1 20 | - statsmodels=0.12.2 21 | - nextflow=21.04.0 22 | -------------------------------------------------------------------------------- /lib/Headers.groovy: -------------------------------------------------------------------------------- 1 | /* 2 | * This file holds several functions used to render the nf-core ANSI header. 
3 | */ 4 | 5 | class Headers { 6 | 7 | private static Map log_colours(Boolean monochrome_logs) { 8 | Map colorcodes = [:] 9 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 10 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 11 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 12 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 13 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 14 | colorcodes['yellow_bold'] = monochrome_logs ? '' : "\033[1;93m" 15 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 16 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 17 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 18 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 19 | colorcodes['red'] = monochrome_logs ? '' : "\033[1;91m" 20 | return colorcodes 21 | } 22 | 23 | static String dashed_line(monochrome_logs) { 24 | Map colors = log_colours(monochrome_logs) 25 | return "-${colors.dim}----------------------------------------------------${colors.reset}-" 26 | } 27 | 28 | static String nf_core(workflow, monochrome_logs) { 29 | Map colors = log_colours(monochrome_logs) 30 | String.format( 31 | """\n 32 | ${dashed_line(monochrome_logs)} 33 | ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} 34 | ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} 35 | ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} 36 | ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} 37 | ${colors.green}`._,._,\'${colors.reset} 38 | ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} 39 | ${dashed_line(monochrome_logs)} 40 | """.stripIndent() 41 | ) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | /* 3 | ======================================================================================== 4 | nf-core/spliz 5 | ======================================================================================== 6 | nf-core/spliz Analysis Pipeline. 
7 | #### Homepage / Documentation 8 | https://github.com/salzmanlab/SpliZ 9 | ---------------------------------------------------------------------------------------- 10 | */ 11 | nextflow.enable.dsl=2 12 | 13 | log.info Headers.nf_core(workflow, params.monochrome_logs) 14 | 15 | //////////////////////////////////////////////////// 16 | /* -- PRINT HELP -- */ 17 | ////////////////////////////////////////////////////+ 18 | def json_schema = "$projectDir/nextflow_schema.json" 19 | if (params.help) { 20 | def command = "nextflow run nf-core/spliz -c conf/test.config" 21 | log.info NfcoreSchema.params_help(workflow, params, json_schema, command) 22 | exit 0 23 | } 24 | 25 | //////////////////////////////////////////////////// 26 | /* -- VALIDATE PARAMETERS -- */ 27 | ////////////////////////////////////////////////////+ 28 | if (params.validate_params) { 29 | NfcoreSchema.validateParameters(params, json_schema, log) 30 | } 31 | 32 | //////////////////////////////////////////////////// 33 | /* -- Collect configuration parameters -- */ 34 | //////////////////////////////////////////////////// 35 | 36 | // Check if genome exists in the config file 37 | if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { 38 | exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(', ')}" 39 | } 40 | 41 | // TODO nf-core: Add any reference files that are needed 42 | // Configurable reference genomes 43 | // 44 | // NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY 45 | // If you want to use the channel below in a process, define the following: 46 | // input: 47 | // file fasta from ch_fasta 48 | // 49 | //params.fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false 50 | //if (params.fasta) { ch_fasta = file(params.fasta, checkIfExists: true) } 51 | 52 | // Check AWS batch settings 53 | if (workflow.profile.contains('awsbatch')) { 54 | // AWSBatch sanity checking 55 | if (!params.awsqueue || !params.awsregion) exit 1, 'Specify correct --awsqueue and --awsregion parameters on AWSBatch!' 56 | // Check outdir paths to be S3 buckets if running on AWSBatch 57 | // related: https://github.com/nextflow-io/nextflow/issues/813 58 | if (!params.outdir.startsWith('s3:')) exit 1, 'Outdir not on S3 - specify S3 Bucket to run on AWSBatch!' 59 | // Prevent trace files to be stored on S3 since S3 does not support rolling files. 60 | if (params.tracedir.startsWith('s3:')) exit 1, 'Specify a local tracedir or run without trace! S3 cannot be used for tracefiles.' 61 | } 62 | 63 | // Stage config files 64 | ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) 65 | ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() 66 | ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) 67 | ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) 68 | 69 | 70 | //////////////////////////////////////////////////// 71 | /* -- PRINT PARAMETER SUMMARY -- */ 72 | //////////////////////////////////////////////////// 73 | log.info NfcoreSchema.params_summary_log(workflow, params, json_schema) 74 | 75 | // Header log info 76 | def summary = [:] 77 | if (workflow.revision) summary['Pipeline Release'] = workflow.revision 78 | summary['Run Name'] = workflow.runName 79 | // TODO nf-core: Report custom parameters here 80 | //summary['Input'] = params.input 81 | //summary['Fasta Ref'] = params.fasta 82 | //summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' 83 | summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" 84 | if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" 85 | summary['Output dir'] = params.outdir 86 | summary['Launch dir'] = workflow.launchDir 87 | summary['Working dir'] = workflow.workDir 88 | summary['Script dir'] = workflow.projectDir 89 | summary['User'] = workflow.userName 90 | if (workflow.profile.contains('awsbatch')) { 91 | summary['AWS Region'] = params.awsregion 92 | summary['AWS Queue'] = params.awsqueue 93 | summary['AWS CLI'] = params.awscli 94 | } 95 | summary['Config Profile'] = workflow.profile 96 | if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description 97 | if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact 98 | if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url 99 | summary['Config Files'] = workflow.configFiles.join(', ') 100 | if (params.email || params.email_on_fail) { 101 | summary['E-mail Address'] = params.email 102 | summary['E-mail on failure'] = params.email_on_fail 103 | summary['MultiQC maxsize'] = params.max_multiqc_email_size 104 | } 105 | 106 | // Check the hostnames against configured profiles 107 | checkHostname() 108 | 109 | Channel.from(summary.collect{ [it.key, it.value] }) 110 | .map { k,v -> "
<dt>$k</dt><dd><samp>${v ?: 'N/A'}</samp></dd>
" } 111 | .reduce { a, b -> return [a, b].join("\n ") } 112 | .map { x -> """ 113 | id: 'nf-core-spliz-summary' 114 | description: " - this information is collected when the pipeline is started." 115 | section_name: 'nf-core/spliz Workflow Summary' 116 | section_href: 'https://github.com/nf-core/spliz' 117 | plot_type: 'html' 118 | data: | 119 |
<dl class=\"dl-horizontal\"> 120 | $x 121 | </dl>
122 | """.stripIndent() } 123 | .set { ch_workflow_summary } 124 | 125 | /* 126 | * Parse software version numbers 127 | */ 128 | process get_software_versions { 129 | publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, 130 | saveAs: { filename -> 131 | if (filename.indexOf('.csv') > 0) filename 132 | else null 133 | } 134 | 135 | output: 136 | file 'software_versions_mqc.yaml' into ch_software_versions_yaml 137 | file 'software_versions.csv' 138 | 139 | script: 140 | // TODO nf-core: Get all tools to print their version number here 141 | """ 142 | echo $workflow.manifest.version > v_pipeline.txt 143 | echo $workflow.nextflow.version > v_nextflow.txt 144 | fastqc --version > v_fastqc.txt 145 | multiqc --version > v_multiqc.txt 146 | scrape_software_versions.py &> software_versions_mqc.yaml 147 | """ 148 | } 149 | 150 | /* 151 | * STEP 1 - FastQC 152 | */ 153 | process fastqc { 154 | tag "$name" 155 | label 'process_medium' 156 | publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode, 157 | saveAs: { filename -> 158 | filename.indexOf('.zip') > 0 ? "zips/$filename" : "$filename" 159 | } 160 | 161 | input: 162 | set val(name), file(reads) from ch_read_files_fastqc 163 | 164 | output: 165 | file '*_fastqc.{zip,html}' into ch_fastqc_results 166 | 167 | script: 168 | """ 169 | fastqc --quiet --threads $task.cpus $reads 170 | """ 171 | } 172 | 173 | /* 174 | * STEP 2 - MultiQC 175 | */ 176 | process multiqc { 177 | publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode 178 | 179 | input: 180 | file (multiqc_config) from ch_multiqc_config 181 | file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) 182 | // TODO nf-core: Add in log files from your new processes for MultiQC to find! 183 | file ('fastqc/*') from ch_fastqc_results.collect().ifEmpty([]) 184 | file ('software_versions/*') from ch_software_versions_yaml.collect() 185 | file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") 186 | 187 | output: 188 | file "*multiqc_report.html" into ch_multiqc_report 189 | file "*_data" 190 | file "multiqc_plots" 191 | 192 | script: 193 | rtitle = '' 194 | rfilename = '' 195 | if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { 196 | rtitle = "--title \"${workflow.runName}\"" 197 | rfilename = "--filename " + workflow.runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" 198 | } 199 | custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' 200 | // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time 201 | """ 202 | multiqc -f $rtitle $rfilename $custom_config_file . 
203 | """ 204 | } 205 | 206 | /* 207 | * STEP 3 - Output Description HTML 208 | */ 209 | process output_documentation { 210 | publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode 211 | 212 | input: 213 | file output_docs from ch_output_docs 214 | file images from ch_output_docs_images 215 | 216 | output: 217 | file 'results_description.html' 218 | 219 | script: 220 | """ 221 | markdown_to_html.py $output_docs -o results_description.html 222 | """ 223 | } 224 | 225 | /* 226 | ======================================================================================== 227 | IMPORT LOCAL MODULES/SUBWORKFLOWS 228 | ======================================================================================== 229 | */ 230 | include { SPLIZ_PIPELINE } from './workflows/spliz_pipeline' 231 | 232 | /* 233 | ======================================================================================== 234 | MAIN WORKFLOW 235 | ======================================================================================== 236 | */ 237 | workflow NFCORE_SPLIZ { 238 | SPLIZ_PIPELINE () 239 | } 240 | 241 | workflow { 242 | NFCORE_SPLIZ () 243 | } 244 | 245 | /* 246 | * Completion e-mail notification 247 | */ 248 | workflow.onComplete { 249 | 250 | // Set up the e-mail variables 251 | def subject = "[nf-core/spliz] Successful: $workflow.runName" 252 | if (!workflow.success) { 253 | subject = "[nf-core/spliz] FAILED: $workflow.runName" 254 | } 255 | def email_fields = [:] 256 | email_fields['version'] = workflow.manifest.version 257 | email_fields['runName'] = workflow.runName 258 | email_fields['success'] = workflow.success 259 | email_fields['dateComplete'] = workflow.complete 260 | email_fields['duration'] = workflow.duration 261 | email_fields['exitStatus'] = workflow.exitStatus 262 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 263 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 264 | email_fields['commandLine'] = workflow.commandLine 265 | email_fields['projectDir'] = workflow.projectDir 266 | email_fields['summary'] = summary 267 | email_fields['summary']['Date Started'] = workflow.start 268 | email_fields['summary']['Date Completed'] = workflow.complete 269 | email_fields['summary']['Pipeline script file path'] = workflow.scriptFile 270 | email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId 271 | if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository 272 | if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId 273 | if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision 274 | email_fields['summary']['Nextflow Version'] = workflow.nextflow.version 275 | email_fields['summary']['Nextflow Build'] = workflow.nextflow.build 276 | email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 277 | 278 | /* 279 | // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size) 280 | // On success try attach the multiqc report 281 | def mqc_report = null 282 | try { 283 | if (workflow.success) { 284 | mqc_report = ch_multiqc_report.getVal() 285 | if (mqc_report.getClass() == ArrayList) { 286 | log.warn "[nf-core/spliz] Found multiple reports from process 'multiqc', will use only one" 287 | mqc_report = mqc_report[0] 288 | } 289 | } 290 | } catch (all) { 291 | log.warn "[nf-core/spliz] Could not attach MultiQC report to summary email" 292 | } 293 | */ 294 | 295 | // 
Check if we are only sending emails on failure 296 | email_address = params.email 297 | if (!params.email && params.email_on_fail && !workflow.success) { 298 | email_address = params.email_on_fail 299 | } 300 | 301 | // Render the TXT template 302 | def engine = new groovy.text.GStringTemplateEngine() 303 | def tf = new File("$projectDir/assets/email_template.txt") 304 | def txt_template = engine.createTemplate(tf).make(email_fields) 305 | def email_txt = txt_template.toString() 306 | 307 | // Render the HTML template 308 | def hf = new File("$projectDir/assets/email_template.html") 309 | def html_template = engine.createTemplate(hf).make(email_fields) 310 | def email_html = html_template.toString() 311 | 312 | // Render the sendmail template 313 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] 314 | def sf = new File("$projectDir/assets/sendmail_template.txt") 315 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 316 | def sendmail_html = sendmail_template.toString() 317 | 318 | // Send the HTML e-mail 319 | if (email_address) { 320 | try { 321 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 322 | // Try to send HTML e-mail using sendmail 323 | [ 'sendmail', '-t' ].execute() << sendmail_html 324 | log.info "[nf-core/spliz] Sent summary e-mail to $email_address (sendmail)" 325 | } catch (all) { 326 | // Catch failures and try with plaintext 327 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 328 | if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { 329 | mail_cmd += [ '-A', mqc_report ] 330 | } 331 | mail_cmd.execute() << email_html 332 | log.info "[nf-core/spliz] Sent summary e-mail to $email_address (mail)" 333 | } 334 | } 335 | 336 | // Write summary e-mail HTML to a file 337 | def output_d = new File("${params.outdir}/pipeline_info/") 338 | if (!output_d.exists()) { 339 | output_d.mkdirs() 340 | } 341 | def output_hf = new File(output_d, "pipeline_report.html") 342 | output_hf.withWriter { w -> w << email_html } 343 | def output_tf = new File(output_d, "pipeline_report.txt") 344 | output_tf.withWriter { w -> w << email_txt } 345 | 346 | c_green = params.monochrome_logs ? '' : "\033[0;32m"; 347 | c_purple = params.monochrome_logs ? '' : "\033[0;35m"; 348 | c_red = params.monochrome_logs ? '' : "\033[0;31m"; 349 | c_reset = params.monochrome_logs ? '' : "\033[0m"; 350 | 351 | if (workflow.stats.ignoredCount > 0 && workflow.success) { 352 | log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" 353 | log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" 354 | log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" 355 | } 356 | 357 | if (workflow.success) { 358 | log.info "-${c_purple}[nf-core/spliz]${c_green} Pipeline completed successfully${c_reset}-" 359 | log.info "Results can be found in ${params.outdir}." 
360 | } else { 361 | checkHostname() 362 | log.info "-${c_purple}[nf-core/spliz]${c_red} Pipeline completed with errors${c_reset}-" 363 | } 364 | 365 | } 366 | 367 | workflow.onError { 368 | // Print unexpected parameters - easiest is to just rerun validation 369 | NfcoreSchema.validateParameters(params, json_schema, log) 370 | } 371 | 372 | def checkHostname() { 373 | def c_reset = params.monochrome_logs ? '' : "\033[0m" 374 | def c_white = params.monochrome_logs ? '' : "\033[0;37m" 375 | def c_red = params.monochrome_logs ? '' : "\033[1;91m" 376 | def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" 377 | if (params.hostnames) { 378 | def hostname = 'hostname'.execute().text.trim() 379 | params.hostnames.each { prof, hnames -> 380 | hnames.each { hname -> 381 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 382 | log.error "${c_red}====================================================${c_reset}\n" + 383 | " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + 384 | " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + 385 | " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + 386 | "${c_red}====================================================${c_reset}\n" 387 | } 388 | } 389 | } 390 | } 391 | } 392 | -------------------------------------------------------------------------------- /modules/local/ann_splices.nf: -------------------------------------------------------------------------------- 1 | process ANN_SPLICES { 2 | tag "${params.dataname}" 3 | 4 | label 'process_medium' 5 | 6 | input: 7 | path pq 8 | path exon_pickle 9 | path splice_pickle 10 | 11 | output: 12 | path outname, emit: tsv 13 | 14 | script: 15 | outname = "${params.dataname}_ann_splices.tsv" 16 | """ 17 | ann_splices.py \\ 18 | --in_file ${pq} \\ 19 | --out_file ${outname} \\ 20 | --exon_pickle ${exon_pickle} \\ 21 | --splice_pickle ${splice_pickle} 22 | """ 23 | } -------------------------------------------------------------------------------- /modules/local/calc_rijk_zscore.nf: -------------------------------------------------------------------------------- 1 | process CALC_RIJK_ZSCORE { 2 | tag "${params.dataname}" 3 | //label 'process_high_memory' 4 | publishDir "${params.outdir}/SpliZ_values", 5 | mode: 'copy', 6 | pattern: '*.tsv' 7 | publishDir "${params.outdir}/SpliZ_values", 8 | mode: 'copy', 9 | pattern: '*.pq' 10 | publishDir "${params.outdir}/logs", 11 | mode: 'copy', 12 | pattern: '*.log' 13 | 14 | input: 15 | val dataname 16 | path pq 17 | val pin_S 18 | val pin_z 19 | val bounds 20 | val light 21 | val SICILIAN 22 | val grouping_level_2 23 | val grouping_level_1 24 | val convert_parquet 25 | 26 | output: 27 | tuple val(dataname), val(param_stem), path("*.pq") , emit: pq 28 | path "*.tsv" , emit: tsv 29 | path "*.log" , emit: log 30 | 31 | script: 32 | def suff_light = light ? "_light" : "" 33 | def suff_SICILIAN = SICILIAN ? "_SICILIAN" : "" 34 | 35 | def isLight = light ? "1" : "0" 36 | def isSICILIAN = SICILIAN ? 
"1" : "0" 37 | 38 | param_stem = "S_${pin_S}_z_${pin_z}_b_${bounds}${suff_light}${suff_SICILIAN}" 39 | 40 | outname_pq = "${dataname}_sym_${param_stem}.pq" 41 | outname_tsv = "${dataname}_sym_${param_stem}_subcol.tsv" 42 | outname_log = "calc_rijk_zscore.log" 43 | 44 | if (convert_parquet): 45 | """ 46 | rijk_zscore.py \\ 47 | --parquet ${pq} \\ 48 | --pinning_S ${pin_S} \\ 49 | --pinning_z ${pin_z} \\ 50 | --lower_bound ${bounds} \\ 51 | --isLight ${isLight} \\ 52 | --isSICILIAN ${isSICILIAN} \\ 53 | --grouping_level_2 ${grouping_level_2} \\ 54 | --grouping_level_1 ${grouping_level_1} \\ 55 | --outname_pq ${outname_pq} \\ 56 | --outname_tsv ${outname_tsv} \\ 57 | --outname_log ${outname_log} \\ 58 | --convert_parquet 59 | """ 60 | else: 61 | """ 62 | rijk_zscore.py \\ 63 | --parquet ${pq} \\ 64 | --pinning_S ${pin_S} \\ 65 | --pinning_z ${pin_z} \\ 66 | --lower_bound ${bounds} \\ 67 | --isLight ${isLight} \\ 68 | --isSICILIAN ${isSICILIAN} \\ 69 | --grouping_level_2 ${grouping_level_2} \\ 70 | --grouping_level_1 ${grouping_level_1} \\ 71 | --outname_pq ${outname_pq} \\ 72 | --outname_tsv ${outname_tsv} \\ 73 | --outname_log ${outname_log} 74 | """ 75 | } -------------------------------------------------------------------------------- /modules/local/calc_splizvd.nf: -------------------------------------------------------------------------------- 1 | process CALC_SPLIZVD { 2 | tag "${params.dataname}" 3 | publishDir "${params.outdir}/SpliZ_values", 4 | mode: "copy", 5 | pattern: "*.tsv" 6 | publishDir "${params.outdir}/SpliZ_values", 7 | mode: "copy", 8 | pattern: "*.pq" 9 | publishDir "${params.outdir}/logs", 10 | mode: 'copy', 11 | pattern: '*.log' 12 | 13 | label 'process_medium' 14 | 15 | input: 16 | path input 17 | val param_stem 18 | val dataname 19 | val pin_S 20 | val pin_z 21 | val bounds 22 | val svd_type 23 | val grouping_level_1 24 | val grouping_level_2 25 | val isLight 26 | val isSICILIAN 27 | val rank_quant 28 | 29 | output: 30 | path outname_pq , emit: pq 31 | path outname_tsv , emit: tsv 32 | path "*.log" , emit: log 33 | path "mat_samplesheet.tsv" , emit: matSheet 34 | 35 | script: 36 | outname_pq = "${dataname}_sym_SVD_${svd_type}_${param_stem}.pq" 37 | outname_tsv = "${dataname}_sym_SVD_${svd_type}_${param_stem}_subcol.tsv" 38 | outname_log = "calc_splizvd.log" 39 | 40 | """ 41 | calc_splizvd.py \\ 42 | --input ${input} \\ 43 | --pinning_S ${pin_S} \\ 44 | --pinning_z ${pin_z} \\ 45 | --lower_bound ${bounds} \\ 46 | --isLight ${isLight} \\ 47 | --isSICILIAN ${isSICILIAN} \\ 48 | --svd_type ${svd_type} \\ 49 | --grouping_level_1 ${grouping_level_1} \\ 50 | --grouping_level_2 ${grouping_level_2} \\ 51 | --outname_pq ${outname_pq} \\ 52 | --outname_tsv ${outname_tsv} \\ 53 | --outname_log ${outname_log} \\ 54 | --workdir \$PWD \\ 55 | --rank_quant ${rank_quant} 56 | 57 | """ 58 | 59 | } 60 | -------------------------------------------------------------------------------- /modules/local/class_input_10X.nf: -------------------------------------------------------------------------------- 1 | process CLASS_INPUT_10X { 2 | tag "${params.dataname}" 3 | 4 | label 'process_high' 5 | 6 | input: 7 | tuple val(sample_ID), file(bam) 8 | val dataname 9 | val libraryType 10 | path annotator_pickle 11 | path gtf 12 | 13 | output: 14 | tuple val(sample_ID), path(outname), emit: class_input 15 | 16 | script: 17 | outname = "${sample_ID}.class_input" 18 | 19 | """ 20 | light_class_input_subcols.py \\ 21 | --bams ${bam} \\ 22 | --libraryType ${libraryType} \\ 23 | --annotator 
${annotator_pickle} \\ 24 | --gtf ${gtf} \\ 25 | --outname ${outname} 26 | """ 27 | 28 | } -------------------------------------------------------------------------------- /modules/local/class_input_SS2.nf: -------------------------------------------------------------------------------- 1 | process CLASS_INPUT_SS2 { 2 | tag "${params.dataname}" 3 | 4 | label 'process_high' 5 | 6 | input: 7 | tuple val(sample_ID), file(bam_R1), file(bam_R2) 8 | val dataname 9 | val libraryType 10 | path annotator_pickle 11 | path gtf 12 | 13 | output: 14 | tuple val(sample_ID), path(outname), emit: class_input 15 | 16 | script: 17 | outname = "${sample_ID}.class_input" 18 | 19 | """ 20 | light_class_input_subcols.py \\ 21 | --bams ${bam_R1} ${bam_R2} \\ 22 | --libraryType ${libraryType} \\ 23 | --annotator ${annotator_pickle} \\ 24 | --gtf ${gtf} \\ 25 | --outname ${outname} 26 | """ 27 | 28 | } -------------------------------------------------------------------------------- /modules/local/convert_parquet.nf: -------------------------------------------------------------------------------- 1 | process CONVERT_PARQUET { 2 | tag "${params.dataname}" 3 | 4 | input: 5 | path tsv 6 | 7 | output: 8 | path "*.pq", emit: pq 9 | 10 | script: 11 | pq = "${tsv.baseName}.pq" 12 | """ 13 | parquet_to_tsv.py \\ 14 | --parquet ${pq} \\ 15 | --tsv ${tsv} \\ 16 | --reverse 17 | """ 18 | } -------------------------------------------------------------------------------- /modules/local/convert_split_parquet.nf: -------------------------------------------------------------------------------- 1 | process CONVERT_SPLIT_PARQUET { 2 | tag "${params.dataname}" 3 | //label 'process_high_memory' 4 | 5 | input: 6 | path tsv 7 | 8 | output: 9 | path "*.pq", emit: pq 10 | 11 | script: 12 | basename = tsv.baseName 13 | """ 14 | convert_tsv_to_parquet.py \\ 15 | --tsv ${tsv} \\ 16 | --splitChr \\ 17 | --basename ${basename} 18 | """ 19 | } -------------------------------------------------------------------------------- /modules/local/find_spliz_sites.nf: -------------------------------------------------------------------------------- 1 | process FIND_SPLIZ_SITES { 2 | tag "${params.dataname}" 3 | //label 'process_high_memory' 4 | publishDir "${params.outdir}/SpliZ_sites", 5 | mode: "copy", 6 | pattern: "*.tsv" 7 | 8 | label 'process_medium' 9 | 10 | input: 11 | path perm_pvals 12 | val libraryType 13 | path geneMat_samplesheet 14 | 15 | output: 16 | path first_evec , emit: first_evec 17 | path second_evec , emit: second_evec 18 | path third_evec , emit: third_evec 19 | 20 | script: 21 | param_stem = perm_pvals.baseName 22 | 23 | first_evec = "first_evec_${param_stem}.tsv" 24 | second_evec = "second_evec_${param_stem}.tsv" 25 | third_evec = "third_evec_${param_stem}.tsv" 26 | 27 | """ 28 | find_SpliZ_sites.R \\ 29 | ${perm_pvals} \\ 30 | ${first_evec} \\ 31 | ${second_evec} \\ 32 | ${third_evec} \\ 33 | ${libraryType} \\ 34 | ${geneMat_samplesheet} 35 | """ 36 | 37 | } 38 | -------------------------------------------------------------------------------- /modules/local/preprocess_tsv.nf: -------------------------------------------------------------------------------- 1 | include { CONVERT_PARQUET } from '../../modules/local/convert_parquet' 2 | 3 | workflow PREPROCESS_TSV { 4 | take: 5 | ch_input 6 | 7 | main: 8 | CONVERT_PARQUET ( 9 | ch_input, 10 | params.dataname 11 | ) 12 | 13 | emit: 14 | pq = CONVERT_PARQUET.out.pq 15 | } -------------------------------------------------------------------------------- 
/modules/local/process_class_input.nf: -------------------------------------------------------------------------------- 1 | process PROCESS_CLASS_INPUT { 2 | 3 | publishDir "${params.outdir}/class_input", 4 | mode: 'copy', 5 | pattern: '*.pq' 6 | 7 | label 'process_medium' 8 | 9 | input: 10 | path class_input 11 | val dataname 12 | val libraryType 13 | path meta 14 | 15 | output: 16 | path "*.pq", emit: pq 17 | 18 | script: 19 | outname = "${dataname}.pq" 20 | """ 21 | process_CI.py \\ 22 | --input_file ${class_input} \\ 23 | --meta ${meta} \\ 24 | --libraryType ${libraryType} \\ 25 | --outname ${outname} 26 | """ 27 | } -------------------------------------------------------------------------------- /modules/local/pval_permutations.nf: -------------------------------------------------------------------------------- 1 | process PVAL_PERMUTATIONS { 2 | tag "${params.dataname}" 3 | 4 | publishDir "${params.outdir}/variance_adjusted_permutations", 5 | mode: "copy", 6 | pattern: "*.tsv" 7 | publishDir "${params.outdir}/logs", 8 | mode: 'copy', 9 | pattern: '*.log' 10 | 11 | label 'process_medium' 12 | 13 | input: 14 | val splizvd_pq 15 | val param_stem 16 | val dataname 17 | val n_perms 18 | val grouping_level_2 19 | val grouping_level_1 20 | 21 | output: 22 | path outname_all_pvals , emit: all_pvals 23 | path outname_perm_pvals , emit: perm_pvals 24 | path outname_log , emit: log 25 | 26 | script: 27 | outname_all_pvals = "${dataname}_outdf_${grouping_level_2}-${grouping_level_1}_${n_perms}_${param_stem}.tsv" 28 | outname_perm_pvals = "${dataname}_pvals_${grouping_level_2}-${grouping_level_1}_${n_perms}_${param_stem}.tsv" 29 | outname_log = "pval_permutations.log" 30 | 31 | """ 32 | variance_adjusted_permutations_bytiss.py \\ 33 | --input ${splizvd_pq} \\ 34 | --num_perms ${n_perms} \\ 35 | --grouping_level_2 ${grouping_level_2} \\ 36 | --grouping_level_1 ${grouping_level_1} \\ 37 | --outname_all_pvals ${outname_all_pvals} \\ 38 | --outname_perm_pvals ${outname_perm_pvals} \\ 39 | --outname_log ${outname_log} 40 | """ 41 | } -------------------------------------------------------------------------------- /modules/local/summarize_results.nf: -------------------------------------------------------------------------------- 1 | process SUMMARIZE_RESULTS { 2 | tag "${params.dataname}" 3 | 4 | publishDir "${params.outdir}", 5 | mode: "copy", 6 | pattern: "*.tsv" 7 | publishDir "${params.outdir}/logs", 8 | mode: 'copy', 9 | pattern: '*.log' 10 | 11 | label 'process_medium' 12 | 13 | input: 14 | path perm_pvals 15 | val param_stem 16 | val dataname 17 | path first_evec 18 | path second_evec 19 | path third_evec 20 | path splizvd_tsv 21 | val grouping_level_2 22 | val grouping_level_1 23 | 24 | output: 25 | path outname , emit: summary 26 | path outname_log , emit: log 27 | 28 | script: 29 | outname = "summary_${dataname}_${grouping_level_2}-${grouping_level_1}_${param_stem}.tsv" 30 | outname_log = "summarize_results.log" 31 | 32 | """ 33 | final_summary.py \\ 34 | --perm_pvals ${perm_pvals} \\ 35 | --first_evec ${first_evec} \\ 36 | --second_evec ${second_evec} \\ 37 | --third_evec ${third_evec} \\ 38 | --splizvd ${splizvd_tsv} \\ 39 | --grouping_level_2 ${grouping_level_2} \\ 40 | --grouping_level_1 ${grouping_level_1} \\ 41 | --outname ${outname} \\ 42 | --outname_log ${outname_log} 43 | """ 44 | } -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | /* 2 | * 
------------------------------------------------- 3 | * nf-core/spliz Nextflow config file 4 | * ------------------------------------------------- 5 | * Default config options for all environments. 6 | */ 7 | 8 | // Global default params, used in configs 9 | params { 10 | // Workflow flags for SpliZ 11 | // TODO nf-core: Specify your pipeline's command line flags 12 | dataname = null 13 | input_file = null 14 | SICILIAN = false 15 | pin_S = 0.01 16 | pin_z = 0.0 17 | bounds = 5 18 | light = false 19 | svd_type = "normdonor" 20 | n_perms = 100 21 | grouping_level_1 = null 22 | grouping_level_2 = null 23 | libraryType = null 24 | run_analysis = false 25 | samplesheet = null 26 | annotator_pickle = null 27 | exon_pickle = null 28 | splice_pickle = null 29 | meta = null 30 | gtf = null 31 | rank_quant = 0 32 | 33 | outdir = './results/${params.dataname}' 34 | publish_dir_mode = 'copy' 35 | 36 | // Boilerplate options 37 | genome = false 38 | genomes = false 39 | multiqc_config = false 40 | email = false 41 | email_on_fail = false 42 | max_multiqc_email_size = 25.MB 43 | plaintext_email = false 44 | monochrome_logs = false 45 | help = false 46 | igenomes_base = 's3://ngi-igenomes/igenomes' 47 | tracedir = "${params.outdir}/pipeline_info" 48 | igenomes_ignore = false 49 | custom_config_version = 'master' 50 | custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" 51 | hostnames = false 52 | config_profile_name = null 53 | config_profile_description = false 54 | config_profile_contact = false 55 | config_profile_url = false 56 | validate_params = true 57 | show_hidden_params = false 58 | schema_ignore_params = 'genomes' 59 | 60 | // Defaults only, expecting to be overwritten 61 | max_memory = 800.GB 62 | max_cpus = 16 63 | max_time = 240.h 64 | 65 | } 66 | 67 | // Container slug. Stable releases should specify release tag! 68 | // Developmental code should specify :dev 69 | process.container = 'kaitlinchaung/spliz:dev' 70 | 71 | // Load base.config by default for all pipelines 72 | includeConfig 'conf/base.config' 73 | 74 | // Load nf-core custom profiles from different Institutions 75 | try { 76 | includeConfig "${params.custom_config_base}/nfcore_custom.config" 77 | } catch (Exception e) { 78 | System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") 79 | } 80 | 81 | profiles { 82 | sherlock { 83 | process.executor = 'slurm' 84 | process.clusterOptions = '-p owners' 85 | 86 | process.memory = { 20.GB * task.attempt } 87 | process.time = { 1.h * task.attempt } 88 | process.errorStrategy = { task.exitStatus in [1,130,143,137,104,134,139] ? 
'retry' : 'finish' } 89 | process.maxRetries = 3 90 | } 91 | conda { 92 | docker.enabled = false 93 | singularity.enabled = false 94 | podman.enabled = false 95 | shifter.enabled = false 96 | charliecloud.enabled = false 97 | process.conda = "$projectDir/environment.yml" 98 | createTimeout = "2 h" 99 | } 100 | debug { process.beforeScript = 'echo $HOSTNAME' } 101 | docker { 102 | process.container = 'kaitlinchaung/spliz:v0.8' 103 | docker.enabled = true 104 | docker.userEmulation = true 105 | singularity.enabled = false 106 | podman.enabled = false 107 | shifter.enabled = false 108 | charliecloud.enabled = false 109 | } 110 | singularity { 111 | process.container = 'kaitlinchaung/spliz:v0.8' 112 | singularity.enabled = true 113 | singularity.autoMounts = true 114 | docker.enabled = false 115 | podman.enabled = false 116 | shifter.enabled = false 117 | charliecloud.enabled = false 118 | } 119 | podman { 120 | singularity.enabled = false 121 | docker.enabled = false 122 | podman.enabled = true 123 | shifter.enabled = false 124 | charliecloud.enabled = false 125 | } 126 | shifter { 127 | singularity.enabled = false 128 | docker.enabled = false 129 | podman.enabled = false 130 | shifter.enabled = true 131 | charliecloud.enabled = false 132 | } 133 | charliecloud { 134 | singularity.enabled = false 135 | docker.enabled = false 136 | podman.enabled = false 137 | shifter.enabled = false 138 | charliecloud.enabled = true 139 | } 140 | test { includeConfig 'conf/test.config' } 141 | test_full { includeConfig 'conf/test_full.config' } 142 | small_test_data { includeConfig 'small_data/small.config'} 143 | } 144 | 145 | // Export these variables to prevent local Python/R libraries from conflicting with those in the container 146 | env { 147 | PYTHONNOUSERSITE = 1 148 | R_PROFILE_USER = "/.Rprofile" 149 | R_ENVIRON_USER = "/.Renviron" 150 | } 151 | 152 | // Capture exit codes from upstream processes when piping 153 | process.shell = ['/bin/bash', '-euo', 'pipefail'] 154 | 155 | def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') 156 | timeline { 157 | enabled = true 158 | file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" 159 | } 160 | report { 161 | enabled = true 162 | file = "${params.tracedir}/execution_report_${trace_timestamp}.html" 163 | } 164 | trace { 165 | enabled = true 166 | file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" 167 | } 168 | dag { 169 | enabled = true 170 | file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" 171 | } 172 | 173 | manifest { 174 | name = 'salzmanlab/spliz' 175 | author = 'Salzman Lab' 176 | homePage = 'https://github.com/salzmanlab/SpliZ' 177 | description = 'Code to calculate the Splicing Z Score (SZS) for single cell RNA-seq splicing analysis' 178 | mainScript = 'main.nf' 179 | nextflowVersion = '>=20.04.0' 180 | version = '1.0dev' 181 | } 182 | 183 | // Function to ensure that resource requirements don't go beyond 184 | // a maximum limit 185 | def check_max(obj, type) { 186 | if (type == 'memory') { 187 | try { 188 | if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) 189 | return params.max_memory as nextflow.util.MemoryUnit 190 | else 191 | return obj 192 | } catch (all) { 193 | println " ### ERROR ### Max memory '${params.max_memory}' is not valid! 
Using default value: $obj" 194 | return obj 195 | } 196 | } else if (type == 'time') { 197 | try { 198 | if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) 199 | return params.max_time as nextflow.util.Duration 200 | else 201 | return obj 202 | } catch (all) { 203 | println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" 204 | return obj 205 | } 206 | } else if (type == 'cpus') { 207 | try { 208 | return Math.min( obj, params.max_cpus as int ) 209 | } catch (all) { 210 | println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! Using default value: $obj" 211 | return obj 212 | } 213 | } 214 | } 215 | -------------------------------------------------------------------------------- /nextflow_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema", 3 | "$id": "https://raw.githubusercontent.com/nf-core/spliz/master/nextflow_schema.json", 4 | "title": "nf-core/spliz pipeline parameters", 5 | "description": "Code to calculate the Splicing Z Score (SZS) for single cell RNA-seq splicing analysis", 6 | "type": "object", 7 | "definitions": { 8 | "input_output_options": { 9 | "title": "Input/output options", 10 | "type": "object", 11 | "fa_icon": "fas fa-terminal", 12 | "description": "Define where the pipeline should find input data and save output data.", 13 | "properties": { 14 | "dataname": { 15 | "type": "string", 16 | "description": "Name identifier of the SpliZ run" 17 | }, 18 | "input_file": { 19 | "type": "string", 20 | "fa_icon": "fas fa-dna", 21 | "description": "Input parquet or tsv file" 22 | }, 23 | "outdir": { 24 | "type": "string", 25 | "description": "Output directory for results", 26 | "default": "./results/${params.dataname}", 27 | "hidden": true 28 | }, 29 | "SICILIAN": { 30 | "type": "boolean", 31 | "description": "Is the input file SICILIAN output?" 32 | }, 33 | "pin_S": { 34 | "type": "number", 35 | "description": "Bound splice site residuals at this quantile (e.g. values in the lower pin_S quantile and the upper 1 - pin_S quantile will be rounded to the quantile limits)" 36 | }, 37 | "pin_z": { 38 | "type": "number", 39 | "description": "Bound SpliZ scores at this quantile (e.g. 
values in the lower pin_z quantile and the upper 1 - pin_z quantile will be rounded to the quantile limits)" 40 | }, 41 | "bounds": { 42 | "type": "integer", 43 | "description": "Only include cell/gene pairs that have more than this many junctional reads for the gene" 44 | }, 45 | "light": { 46 | "type": "boolean", 47 | "description": "Output the minimum number of columns", 48 | "default": true 49 | }, 50 | "svd_type": { 51 | "type": "string", 52 | "description": "Type of SVD calculation" 53 | }, 54 | "grouping_level_1": { 55 | "type": "string", 56 | "description": "Column to partition data by" 57 | }, 58 | "grouping_level_2": { 59 | "type": "string", 60 | "description": "Column to group data by" 61 | }, 62 | "n_perms": { 63 | "type": "integer", 64 | "description": "Number of permutations" 65 | }, 66 | "annotator_pickle": { 67 | "type": "string", 68 | "description": "Annotator pickle file" 69 | }, 70 | "exon_pickle": { 71 | "type": "string", 72 | "description": "Exon pickle file" 73 | }, 74 | "splice_pickle": { 75 | "type": "string", 76 | "description": "Splice pickle file" 77 | }, 78 | "libraryType": { 79 | "type": "string", 80 | "description": "Options: 10X (for 10X chromium), SS2 (for Smart-seq2), and SLS (for Slide-seq or Slide-seq2)" 81 | }, 82 | "gtf": { 83 | "type": "string", 84 | "description": "GTF annotation file" 85 | }, 86 | "rank_quant": { 87 | "type": "number", 88 | "description": "Bound SpliZ ranks for each donor/acceptor at this quantile (e.g. values in the lower rank_quant quantile and the upper 1 - rank_quant quantile will be rounded to the quantile limits)" 89 | }, 90 | "help": { 91 | "type": "boolean", 92 | "description": "Display help text.", 93 | "hidden": true, 94 | "fa_icon": "fas fa-question-circle" 95 | }, 96 | "run_analysis": { 97 | "type": "boolean", 98 | "description": "Run analysis steps?" 99 | }, 100 | "samplesheet": { 101 | "type": "string" 102 | }, 103 | "meta": { 104 | "type": "string", 105 | "description": "Metadata file containing entries for each barcode/grouping_col_1/grouping_col_2 combination" 106 | } 107 | }, 108 | "required": [ 109 | "dataname" 110 | ] 111 | }, 112 | "max_job_request_options": { 113 | "title": "Max job request options", 114 | "type": "object", 115 | "fa_icon": "fab fa-acquisitions-incorporated", 116 | "description": "Set the top limit for requested resources for any single job.", 117 | "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", 118 | "properties": { 119 | "max_cpus": { 120 | "type": "integer", 121 | "description": "Maximum number of CPUs that can be requested for any single job.", 122 | "default": 16, 123 | "fa_icon": "fas fa-microchip", 124 | "hidden": true, 125 | "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. 
`--max_cpus 1`" 126 | }, 127 | "max_memory": { 128 | "type": "string", 129 | "description": "Maximum amount of memory that can be requested for any single job.", 130 | "default": "128.GB", 131 | "fa_icon": "fas fa-memory", 132 | "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", 133 | "hidden": true, 134 | "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" 135 | }, 136 | "max_time": { 137 | "type": "string", 138 | "description": "Maximum amount of time that can be requested for any single job.", 139 | "default": "240.h", 140 | "fa_icon": "far fa-clock", 141 | "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", 142 | "hidden": true, 143 | "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" 144 | } 145 | } 146 | }, 147 | "institutional_config_options": { 148 | "title": "Institutional config options", 149 | "type": "object", 150 | "fa_icon": "fas fa-university", 151 | "description": "Parameters used to describe centralised config profiles. These should not be edited.", 152 | "help_text": "The centralised nf-core configuration profiles use a handful of pipeline parameters to describe themselves. This information is then printed to the Nextflow log when you run a pipeline. You should not need to change these values when you run a pipeline.", 153 | "properties": { 154 | "custom_config_version": { 155 | "type": "string", 156 | "description": "Git commit id for Institutional configs.", 157 | "default": "master", 158 | "hidden": true, 159 | "fa_icon": "fas fa-users-cog", 160 | "help_text": "Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default: `master`.\n\n```bash\n## Download and use config file with following git commit id\n--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96\n```" 161 | }, 162 | "custom_config_base": { 163 | "type": "string", 164 | "description": "Base directory for Institutional configs.", 165 | "default": "https://raw.githubusercontent.com/nf-core/configs/master", 166 | "hidden": true, 167 | "help_text": "If you're running offline, nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell nextflow where to find them with the `custom_config_base` option. 
For example:\n\n```bash\n## Download and unzip the config files\ncd /path/to/my/configs\nwget https://github.com/nf-core/configs/archive/master.zip\nunzip master.zip\n\n## Run the pipeline\ncd /path/to/my/data\nnextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/\n```\n\n> Note that the nf-core/tools helper package has a `download` command to download all required pipeline files + singularity containers + institutional configs in one go for you, to make this process easier.", 168 | "fa_icon": "fas fa-users-cog" 169 | }, 170 | "hostnames": { 171 | "type": "string", 172 | "description": "Institutional configs hostname.", 173 | "hidden": true, 174 | "fa_icon": "fas fa-users-cog" 175 | }, 176 | "config_profile_name": { 177 | "type": "string", 178 | "description": "Institutional config name.", 179 | "hidden": true, 180 | "fa_icon": "fas fa-users-cog" 181 | }, 182 | "config_profile_description": { 183 | "type": "string", 184 | "description": "Institutional config description.", 185 | "hidden": true, 186 | "fa_icon": "fas fa-users-cog" 187 | }, 188 | "config_profile_contact": { 189 | "type": "string", 190 | "description": "Institutional config contact information.", 191 | "hidden": true, 192 | "fa_icon": "fas fa-users-cog" 193 | }, 194 | "config_profile_url": { 195 | "type": "string", 196 | "description": "Institutional config URL link.", 197 | "hidden": true, 198 | "fa_icon": "fas fa-users-cog" 199 | } 200 | } 201 | } 202 | }, 203 | "allOf": [ 204 | { 205 | "$ref": "#/definitions/input_output_options" 206 | }, 207 | { 208 | "$ref": "#/definitions/max_job_request_options" 209 | }, 210 | { 211 | "$ref": "#/definitions/institutional_config_options" 212 | } 213 | ], 214 | "properties": { 215 | "publish_dir_mode": { 216 | "type": "string", 217 | "default": "copy", 218 | "hidden": true, 219 | "description": "Method used to save pipeline results to output directory.", 220 | "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", 221 | "fa_icon": "fas fa-copy", 222 | "enum": [ 223 | "symlink", 224 | "rellink", 225 | "link", 226 | "copy", 227 | "copyNoFollow", 228 | "move" 229 | ] 230 | }, 231 | "validate_params": { 232 | "type": "boolean", 233 | "description": "Boolean whether to validate parameters against the schema at runtime", 234 | "default": true, 235 | "fa_icon": "fas fa-check-square", 236 | "hidden": true 237 | }, 238 | "email_on_fail": { 239 | "type": "string", 240 | "description": "Email address for completion summary, only when pipeline fails.", 241 | "fa_icon": "fas fa-exclamation-triangle", 242 | "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$", 243 | "hidden": true, 244 | "help_text": "This works exactly as with `--email`, except emails are only sent if the workflow is not successful." 245 | }, 246 | "plaintext_email": { 247 | "type": "boolean", 248 | "description": "Send plain-text email instead of HTML.", 249 | "fa_icon": "fas fa-remove-format", 250 | "hidden": true, 251 | "help_text": "Set to receive plain-text e-mails instead of HTML formatted." 
252 | }, 253 | "max_multiqc_email_size": { 254 | "type": "string", 255 | "description": "File size limit when attaching MultiQC reports to summary emails.", 256 | "default": "25.MB", 257 | "fa_icon": "fas fa-file-upload", 258 | "hidden": true, 259 | "help_text": "If file generated by pipeline exceeds the threshold, it will not be attached." 260 | }, 261 | "monochrome_logs": { 262 | "type": "boolean", 263 | "description": "Do not use coloured log outputs.", 264 | "fa_icon": "fas fa-palette", 265 | "hidden": true, 266 | "help_text": "Set to disable colourful command line output and live life in monochrome." 267 | }, 268 | "multiqc_config": { 269 | "type": "string", 270 | "description": "Custom config file to supply to MultiQC.", 271 | "fa_icon": "fas fa-cog", 272 | "hidden": true 273 | }, 274 | "tracedir": { 275 | "type": "string", 276 | "description": "Directory to keep pipeline Nextflow logs and reports.", 277 | "default": "${params.outdir}/pipeline_info", 278 | "fa_icon": "fas fa-cogs", 279 | "hidden": true 280 | }, 281 | "email": { 282 | "type": "string" 283 | }, 284 | "igenomes_base": { 285 | "type": "string", 286 | "default": "s3://ngi-igenomes/igenomes" 287 | }, 288 | "igenomes_ignore": { 289 | "type": "string" 290 | }, 291 | "show_hidden_params": { 292 | "type": "string" 293 | }, 294 | "genome": { 295 | "type": "string" 296 | } 297 | } 298 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ###### Requirements with Version Specifiers ###### 2 | 3 | argparse == 1.4.0 4 | datetime == 4.4 5 | numpy == 1.22.3 6 | pandas == 1.4.1 7 | pysam == 0.16.0.1 8 | utils == 1.0.1 9 | tqdm == 4.62.0 10 | pyarrow == 5.0.0 11 | scipy == 1.7.1 12 | statsmodels == 0.12.2 -------------------------------------------------------------------------------- /small_data/small.config: -------------------------------------------------------------------------------- 1 | params { 2 | dataname = "test" 3 | input_file = "https://raw.githubusercontent.com/salzmanlab/SpliZ/main/small_data/small.pq" 4 | SICILIAN = true 5 | pin_S = 0.1 6 | pin_z = 0.0 7 | bounds = 5 8 | light = false 9 | svd_type = "normdonor" 10 | n_perms = 100 11 | grouping_level_2 = "compartment" 12 | grouping_level_1 = "tissue" 13 | libraryType = "10X" 14 | run_analysis = true 15 | } 16 | 17 | params.outdir = "./results/${params.dataname}" 18 | params.tracedir = "./results/${params.dataname}/pipeline_info" 19 | params.schema_ignore_params = "input,single_end,show_hidden_params,validate_params,igenomes_ignore,tracedir,igenomes_base,help,monochrome_logs,plaintext_email,max_multiqc_email_size,email_on_fail,email,multiqc_config,publish_dir_mode,genome,genomes" 20 | -------------------------------------------------------------------------------- /small_data/small.pq: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salzman-lab/SpliZ/594532c1b12a1a30b5e00cd9e1fbed30ef28047e/small_data/small.pq -------------------------------------------------------------------------------- /subworkflows/local/analysis.nf: -------------------------------------------------------------------------------- 1 | include { PVAL_PERMUTATIONS } from '../../modules/local/pval_permutations' 2 | include { FIND_SPLIZ_SITES } from '../../modules/local/find_spliz_sites' 3 | include { SUMMARIZE_RESULTS } from '../../modules/local/summarize_results' 4 | 5 | workflow ANALYSIS { 6 | take: 7 | 
splizvd_tsv 8 | splizvd_pq 9 | param_stem 10 | geneMat_samplesheet 11 | 12 | main: 13 | // Step 1: Calculate variance adjusted permutations 14 | PVAL_PERMUTATIONS ( 15 | splizvd_pq, 16 | param_stem, 17 | params.dataname, 18 | params.n_perms, 19 | params.grouping_level_2, 20 | params.grouping_level_1 21 | ) 22 | 23 | PVAL_PERMUTATIONS.out.perm_pvals 24 | .set{ pval_permutations } 25 | 26 | // Step 2: Find SpliZ sites 27 | FIND_SPLIZ_SITES ( 28 | pval_permutations, 29 | params.libraryType, 30 | geneMat_samplesheet 31 | ) 32 | 33 | // Step 3: Summarize results 34 | SUMMARIZE_RESULTS ( 35 | pval_permutations, 36 | param_stem, 37 | params.dataname, 38 | FIND_SPLIZ_SITES.out.first_evec, 39 | FIND_SPLIZ_SITES.out.second_evec, 40 | FIND_SPLIZ_SITES.out.third_evec, 41 | splizvd_tsv, 42 | params.grouping_level_2, 43 | params.grouping_level_1 44 | ) 45 | } 46 | -------------------------------------------------------------------------------- /subworkflows/local/convert_bam.nf: -------------------------------------------------------------------------------- 1 | include { CLASS_INPUT_10X } from '../../modules/local/class_input_10X' 2 | include { CLASS_INPUT_SS2 } from '../../modules/local/class_input_SS2' 3 | include { PROCESS_CLASS_INPUT } from '../../modules/local/process_class_input' 4 | include { ANN_SPLICES } from '../../modules/local/ann_splices' 5 | 6 | workflow CONVERT_BAM { 7 | take: 8 | ch_bam 9 | 10 | main: 11 | 12 | if ((params.libraryType == '10X') || (params.libraryType == "SLS")) { 13 | CLASS_INPUT_10X ( 14 | ch_bam, 15 | params.dataname, 16 | params.libraryType, 17 | params.annotator_pickle, 18 | params.gtf 19 | ) 20 | ch_light_class_input = CLASS_INPUT_10X.out.class_input 21 | } else if (params.libraryType == 'SS2') { 22 | CLASS_INPUT_SS2 ( 23 | ch_bam, 24 | params.dataname, 25 | params.libraryType, 26 | params.annotator_pickle, 27 | params.gtf 28 | ) 29 | ch_light_class_input = CLASS_INPUT_SS2.out.class_input 30 | } 31 | 32 | ch_light_class_input 33 | .collectFile(newLine: true) { files -> 34 | files.toString() 35 | } 36 | .set { ch_class_input } 37 | 38 | ch_class_input.view() 39 | 40 | PROCESS_CLASS_INPUT ( 41 | ch_class_input, 42 | params.dataname, 43 | params.libraryType, 44 | params.meta 45 | ) 46 | 47 | ANN_SPLICES ( 48 | PROCESS_CLASS_INPUT.out.pq, 49 | params.exon_pickle, 50 | params.splice_pickle 51 | ) 52 | 53 | emit: 54 | tsv = ANN_SPLICES.out.tsv 55 | } -------------------------------------------------------------------------------- /subworkflows/local/preprocess.nf: -------------------------------------------------------------------------------- 1 | include { CONVERT_PARQUET } from './../../modules/local/convert_parquet' 2 | include { CONVERT_BAM } from './convert_bam' 3 | 4 | workflow PREPROCESS { 5 | 6 | main: 7 | 8 | convert_bam = false 9 | 10 | if (params.input_file && params.samplesheet) { 11 | exit 1, "Invalid input, provide either input_file or samplesheet but not both." 12 | } else if (params.samplesheet) { 13 | if (params.SICILIAN) { 14 | exit 1, "Invalid input, SICILIAN inputs must be provided as input_file." 
15 | } else { 16 | if ((params.libraryType == '10X') || (params.libraryType == "SLS")) { 17 | ch_bam = Channel.fromPath(params.samplesheet) 18 | .splitCsv(header:false) 19 | .map { row -> 20 | tuple( 21 | row[0], // bam file sample_ID 22 | file(row[1]) // bam file path 23 | ) 24 | } 25 | convert_bam = true 26 | } else if (params.libraryType == 'SS2') { 27 | ch_bam = Channel.fromPath(params.samplesheet) 28 | .splitCsv(header:false) 29 | .map { row -> 30 | tuple( 31 | row[0], // bam file sample_ID 32 | file(row[1]), // R1 bam file path 33 | file(row[2]) // R2 bam file path 34 | ) 35 | } 36 | convert_bam = true 37 | } 38 | } 39 | } else if (params.input_file) { 40 | input_file = file(params.input_file) 41 | def is_valid_input_file = input_file.extension in ["tsv", "pq", "txt", "bam"] 42 | if (!is_valid_input_file) { 43 | exit 1, "Invalid input file type supplied, options are *.bam, *.pq, *.txt, or *.tsv." 44 | } 45 | if (params.SICILIAN) { 46 | if (input_file.extension == "bam") { 47 | exit 1, "Invalid input, SICILIAN input must be a tsv, pq, or txt file." 48 | } else { 49 | ch_input = Channel.fromPath(params.input_file) 50 | } 51 | } else { 52 | if (input_file.extension == "bam") { 53 | if (!params.dataname) { 54 | exit 1, "Must provide dataname for bam file." 55 | } 56 | ch_bam = Channel.fromPath(params.input_file) 57 | .map { it -> 58 | tuple( 59 | params.dataname, 60 | file(it) 61 | ) 62 | } 63 | convert_bam = true 64 | } else { 65 | ch_input = Channel.fromPath(params.input_file) 66 | } 67 | } 68 | } else { 69 | exit 1, "No input_file or samplesheet provided." 70 | } 71 | 72 | if (convert_bam) { 73 | CONVERT_BAM ( 74 | ch_bam 75 | ) 76 | ch_input = CONVERT_BAM.out.tsv 77 | } 78 | 79 | emit: 80 | input = ch_input 81 | 82 | } -------------------------------------------------------------------------------- /subworkflows/local/spliz.nf: -------------------------------------------------------------------------------- 1 | include { CALC_SPLIZVD } from '../../modules/local/calc_splizvd' 2 | 3 | workflow SPLIZ { 4 | take: 5 | ch_input 6 | 7 | main: 8 | 9 | def suff_light = params.light ? "_light" : "" 10 | def suff_SICILIAN = params.SICILIAN ? "_SICILIAN" : "" 11 | def suff_rank_quant = params.rank_quant == 0 ? "" : "_r_${params.rank_quant}" 12 | 13 | def isLight = params.light ? "1" : "0" 14 | def isSICILIAN = params.SICILIAN ? 
"1" : "0" 15 | 16 | param_stem = "S_${params.pin_S}_z_${params.pin_z}_b_${params.bounds}${suff_rank_quant}${suff_light}${suff_SICILIAN}" 17 | 18 | // Step 1: Calculate RIJK zscore 19 | CALC_SPLIZVD ( 20 | ch_input, 21 | param_stem, 22 | params.dataname, 23 | params.pin_S, 24 | params.pin_z, 25 | params.bounds, 26 | params.svd_type, 27 | params.grouping_level_1, 28 | params.grouping_level_2, 29 | isLight, 30 | isSICILIAN, 31 | params.rank_quant 32 | ) 33 | 34 | emit: 35 | geneMat_samplesheet = CALC_SPLIZVD.out.matSheet 36 | splizvd_tsv = CALC_SPLIZVD.out.tsv 37 | splizvd_pq = CALC_SPLIZVD.out.pq 38 | param_stem = param_stem 39 | } 40 | -------------------------------------------------------------------------------- /workflows/spliz_pipeline.nf: -------------------------------------------------------------------------------- 1 | /* 2 | ======================================================================================== 3 | VALIDATE INPUTS 4 | ======================================================================================== 5 | */ 6 | 7 | // Check params with defined inputs 8 | def is_valid_svd_type = params.svd_type in ["normgene", "normdonor"] 9 | if (!is_valid_svd_type) { 10 | exit 1, "Invalid svd_type; options are 'normgene' and 'normdonor'." 11 | } 12 | 13 | def is_valid_libraryType = params.libraryType in ["SS2", "10X", "SLS"] 14 | if (!is_valid_libraryType) { 15 | exit 1, "Invalid libraryType; options are 'SS2', '10X', and 'SLS'." 16 | } 17 | 18 | /* 19 | ======================================================================================== 20 | IMPORT LOCAL MODULES/SUBWORKFLOWS 21 | ======================================================================================== 22 | */ 23 | include { PREPROCESS } from './../subworkflows/local/preprocess' 24 | include { SPLIZ } from './../subworkflows/local/spliz' 25 | include { ANALYSIS } from './../subworkflows/local/analysis' 26 | 27 | /* 28 | ======================================================================================== 29 | RUN MAIN WORKFLOW 30 | ======================================================================================== 31 | */ 32 | 33 | workflow SPLIZ_PIPELINE { 34 | 35 | PREPROCESS () 36 | 37 | 38 | SPLIZ ( 39 | PREPROCESS.out.input 40 | ) 41 | 42 | if (params.run_analysis) { 43 | ANALYSIS ( 44 | SPLIZ.out.splizvd_tsv, 45 | SPLIZ.out.splizvd_pq, 46 | SPLIZ.out.param_stem, 47 | SPLIZ.out.geneMat_samplesheet 48 | ) 49 | } 50 | } 51 | 52 | /* 53 | ======================================================================================== 54 | THE END 55 | ======================================================================================== 56 | */ 57 | --------------------------------------------------------------------------------