├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── dev.yml ├── .gitignore ├── .pre-commit-config.yaml ├── AUTHORS.md ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── data ├── README.md ├── rep4.tar.gz └── rep9.tar.gz ├── docs ├── api.md ├── changelog.md ├── contributing.md ├── index.md ├── installation.md └── usage.md ├── ids ├── 1000L2_barcodes.txt ├── 1000L3_barcodes.txt ├── 1000L5_barcodes.txt ├── 1000L6_barcodes.txt ├── 1000L7_barcodes.txt ├── 1000L8_barcodes.txt └── 1000L9_barcodes.txt ├── makefile ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── stpipeline ├── __init__.py ├── common │ ├── __init__.py │ ├── clustering.py │ ├── dataset.py │ ├── distance.py │ ├── fastq_utils.py │ ├── filter.py │ ├── gff_reader.py │ ├── sam_utils.py │ ├── saturation.py │ ├── stats.py │ ├── transcript.py │ ├── unique_events_parser.py │ └── utils.py ├── core │ ├── __init__.py │ ├── annotation.py │ ├── mapping.py │ └── pipeline.py ├── scripts │ ├── __init__.py │ ├── adjust_matrix_coordinates.py │ ├── convertEnsemblToNames.py │ ├── filter_gene_type_matrix.py │ ├── merge_fastq.py │ ├── multi_qa.py │ ├── st_pipeline_run.py │ └── st_qa.py └── version.py ├── tests ├── __init__.py ├── annotation_test.py ├── clustering_test.py ├── config │ ├── Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz │ ├── annotations │ │ └── Homo_sapiens.GRCh38.79_chr19.gtf │ ├── contaminant_genomes │ │ └── R45S5_R5S1 │ │ │ └── Rn45s_Rn5s.fasta │ └── idfiles │ │ └── 150204_arrayjet_1000L2_probes.txt ├── dataset_test.py ├── fastq_utils_test.py ├── filter_test.py ├── gff_reader_test.py ├── input │ └── arrayjet_1002 │ │ ├── testdata_R1.fastq.gz │ │ └── testdata_R2.fastq.gz ├── integration_test.py ├── mapping_test.py ├── sam_utils_test.py ├── saturation_test.py ├── stats_test.py ├── unique_events_parser_test.py └── utils_test.py ├── workflow.pdf └── workflow_extended.pdf /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 16 | 1. Command executed 17 | 2. Environment used 18 | 3. Some data to reproduce the problem 19 | 4. User 20 | 21 | **Expected behavior** 22 | A clear and concise description of what you expected to happen. 23 | 24 | **Additional context** 25 | Add any other context about the problem here. 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. 
I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the changes and the related issue. 4 | Please also include relevant motivation and context. 5 | List any dependencies that are required for this change. 6 | 7 | Fixes # (issue) 8 | 9 | ## Type of change 10 | 11 | Please delete options that are not relevant. 12 | 13 | - [ ] Bug fix (non-breaking change which fixes an issue) 14 | - [ ] New feature (non-breaking change which adds functionality) 15 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 16 | - [ ] This change requires a documentation update 17 | -------------------------------------------------------------------------------- /.github/workflows/dev.yml: -------------------------------------------------------------------------------- 1 | name: dev build CI 2 | 3 | # Controls when the action will run. 4 | on: 5 | # Triggers the workflow on push or pull request events 6 | push: 7 | branches: 8 | - '*' 9 | pull_request: 10 | branches: 11 | - '*' 12 | # Allows you to run this workflow manually from the Actions tab 13 | workflow_dispatch: 14 | 15 | # contains 3 jobs: test, publish_dev_build and notification 16 | jobs: 17 | test: 18 | # The type of runner that the job will run on 19 | strategy: 20 | matrix: 21 | python-versions: ['3.10', '3.11', '3.12'] 22 | os: [ubuntu-latest] 23 | runs-on: ${{ matrix.os }} 24 | 25 | # Steps represent a sequence of tasks that will be executed as part of the job 26 | steps: 27 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 28 | - uses: actions/checkout@v4 29 | - uses: actions/setup-python@v5 30 | with: 31 | python-version: ${{ matrix.python-versions }} 32 | 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install poetry 37 | 38 | - name: Install samtools 39 | run: | 40 | sudo apt-get update 41 | sudo apt-get install -y samtools 42 | 43 | - name: Install STAR (Precompiled Binary) 44 | run: | 45 | STAR_VERSION="2.7.11a" 46 | wget -q https://github.com/alexdobin/STAR/releases/download/${STAR_VERSION}/STAR_${STAR_VERSION}.zip 47 | unzip STAR_${STAR_VERSION}.zip 48 | sudo mv STAR_${STAR_VERSION}/Linux_x86_64_static/STAR /usr/local/bin/ 49 | 50 | - name: Verify STAR and samtools installation 51 | run: | 52 | STAR --version 53 | samtools --version 54 | 55 | - name: test 56 | run: | 57 | poetry install -E dev 58 | poetry run pytest 59 | 60 | publish_dev_build: 61 | # if test failed, we should not publish 62 | needs: test 63 | # you may need to change os below 64 | runs-on: ubuntu-latest 65 | steps: 66 | - uses: actions/checkout@v4 67 | with: 68 | fetch-depth: 0 # fetch all commits/branches for mike 69 | - uses: actions/setup-python@v5 70 | with: 71 | python-version: '3.10' 72 | 73 | - name: Install dependencies 74 | run: | 75 | python -m pip install --upgrade pip 76 | pip install poetry 77 | 78 | - name: build documentation 79 | run: 
| 80 | poetry install -E doc 81 | poetry run mkdocs build 82 | git config --global user.name Docs deploy 83 | git config --global user.email docs@dummy.bot.com 84 | poetry run mike deploy -p "`poetry version --short`.dev" 85 | poetry run mike set-default -p "`poetry version --short`.dev" 86 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | site/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | pytest-report.xml 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # mkdocs documentation 57 | /site 58 | 59 | # mypy 60 | .mypy_cache/ 61 | 62 | # IDE settings 63 | .vscode/ 64 | 65 | # mkdocs build dir 66 | site/ 67 | 68 | # vscode extension - local history 69 | .history/ 70 | 71 | # Cache 72 | __pycache__ 73 | 74 | # OSX 75 | .DS_Store 76 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/Lucas-C/pre-commit-hooks 3 | rev: v1.5.4 4 | hooks: 5 | - id: forbid-crlf 6 | - id: remove-crlf 7 | - id: forbid-tabs 8 | exclude_types: [csv, tsv] 9 | exclude: ^(tests/|makefile) 10 | - id: remove-tabs 11 | exclude_types: [csv, tsv] 12 | exclude: ^(tests/|makefile) 13 | - repo: https://github.com/pre-commit/pre-commit-hooks 14 | rev: v4.1.0 15 | hooks: 16 | - id: trailing-whitespace 17 | exclude: ^tests 18 | - id: end-of-file-fixer 19 | exclude: ^tests 20 | - id: check-merge-conflict 21 | - id: pretty-format-json 22 | args: [--autofix] 23 | - id: check-yaml 24 | args: [--unsafe] 25 | - repo: https://github.com/astral-sh/ruff-pre-commit 26 | rev: v0.1.14 27 | hooks: 28 | - id: ruff 29 | args: [--fix] 30 | - id: ruff-format 31 | - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks 32 | rev: v2.12.0 33 | hooks: 34 | - id: pretty-format-yaml 35 | args: [--autofix, --indent, '2'] 36 | - repo: local 37 | hooks: 38 | - id: mypy 39 | name: mypy 40 | entry: mypy 41 | exclude: ^(tests/|docsrc/) 42 | language: python 43 | types: [python] 44 | require_serial: true 45 | verbose: true 46 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Authors 2 | 3 | - Jose Fernandez Navarro 4 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome, and they are greatly appreciated! 
Every little bit 4 | helps, and credit will always be given. 5 | 6 | You can contribute in many ways: 7 | 8 | ## Types of Contributions 9 | 10 | ### Report Bugs 11 | 12 | Report bugs at https://github.com/jfnavarro/st_pipeline/issues. 13 | 14 | If you are reporting a bug, please include: 15 | 16 | * Your operating system name and version. 17 | * Any details about your local setup that might be helpful in troubleshooting. 18 | * Detailed steps to reproduce the bug. 19 | 20 | ### Fix Bugs 21 | 22 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help 23 | wanted" is open to whoever wants to implement it. 24 | 25 | ### Implement Features 26 | 27 | Look through the GitHub issues for features. Anything tagged with "enhancement" 28 | and "help wanted" is open to whoever wants to implement it. 29 | 30 | ### Write Documentation 31 | 32 | ST Pipeline could always use more documentation, whether as part of the 33 | official ST Pipeline docs, in docstrings, or even on the web in blog posts, 34 | articles, and such. 35 | 36 | ### Submit Feedback 37 | 38 | The best way to send feedback is to file an issue at https://github.com/jfnavarro/st_pipeline/issues. 39 | 40 | If you are proposing a feature: 41 | 42 | * Explain in detail how it would work. 43 | * Keep the scope as narrow as possible, to make it easier to implement. 44 | 45 | ## Get Started 46 | 47 | Ready to contribute? Here's how to set up `ST Pipeline` for local development. 48 | 49 | 1. Fork the `ST Pipeline` repo on GitHub. 50 | 2. Clone your fork locally 51 | 52 | ``` console 53 | git clone git@github.com:jfnavarro/st_pipeline.git 54 | ``` 55 | 56 | 3. Ensure [poetry](https://python-poetry.org/docs/) is installed. 57 | 4. Ensure [STAR](https://github.com/alexdobin/STAR) and [samtools](https://www.htslib.org/) are installed. 58 | 5. Install dependencies and start your virtualenv: 59 | 60 | ``` console 61 | poetry install -E test -E doc -E dev 62 | ``` 63 | 64 | Note that you can use your own Python environment (e.g Anaconda) by 65 | changing the default behaviour in poetry with this command: 66 | 67 | ``` console 68 | poetry config virtualenvs.create false 69 | ``` 70 | 71 | 6. Create a branch for local development: 72 | 73 | ``` console 74 | git checkout -b name-of-your-bugfix-or-feature 75 | ``` 76 | 77 | Now you can make your changes locally. 78 | 79 | 7. When you're done making changes, check that your changes pass the 80 | tests, including testing other Python versions, with pytest: 81 | 82 | ``` console 83 | poetry run pytest 84 | ``` 85 | 86 | 8. Commit your changes and push your branch to GitHub: 87 | 88 | ``` console 89 | git add . 90 | git commit -m "Your detailed description of your changes." 91 | git push origin name-of-your-bugfix-or-feature 92 | ``` 93 | 94 | 9. Submit a pull request through the GitHub website. 95 | 96 | ## Pull Request Guidelines 97 | 98 | Before you submit a pull request, check that it meets these guidelines: 99 | 100 | 1. The pull request should include tests. 101 | 2. If the pull request adds functionality, the docs should be updated. Put 102 | your new functionality into a function with a docstring, and add the 103 | feature to the list in README.md. 104 | 3. The pull request should work for Python 3.10, 3.11 and 3.12. Check 105 | https://github.com/jfnavarro/st_pipeline/actions 106 | and make sure that the tests pass for all supported Python versions. 
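If you want to try another supported interpreter locally before opening the pull request, a minimal sketch (assuming that interpreter is already installed on your machine) is:

``` console
poetry env use python3.11
poetry install -E dev
poetry run pytest
```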
107 | 108 | ## Testing 109 | 110 | You can run the tests with pytest: 111 | 112 | ``` console 113 | poetry run pytest 114 | ``` 115 | 116 | Replace test_your_module.py with the actual name of your test file. 117 | 118 | ## Makefile 119 | 120 | A `makefile` is included in the repo with the following actions: 121 | 122 | To run formatting tools 123 | 124 | ``` console 125 | make format 126 | ``` 127 | 128 | To run linting tools 129 | 130 | ``` console 131 | make lint 132 | ``` 133 | 134 | To run the tests 135 | 136 | ``` console 137 | make unittet 138 | ``` 139 | 140 | To run the tests with coverage 141 | 142 | ``` console 143 | make coverage 144 | ``` 145 | 146 | To clean the temporary files and cache 147 | 148 | ``` console 149 | make clean 150 | ``` 151 | 152 | ## Deploying 153 | 154 | A reminder for the maintainers on how to deploy. 155 | Make sure all your changes are committed (including an entry in CHANGELOG.md). 156 | Make sure you have updated the version in `pyproject.toml` and `stpipeline/version.py`. 157 | Then run: 158 | 159 | ``` console 160 | git tag -m "message" 161 | git push --tags 162 | ``` 163 | 164 | GitHub Actions will then create a release and publish documentation if tests pass. 165 | 166 | You can also create the documentation manually by running: 167 | 168 | ``` console 169 | poetry run mkdocs build 170 | ``` 171 | 172 | ## Publish package 173 | 174 | Ensure that you have configured your PyPi tokens. 175 | 176 | ```console 177 | poetry config repositories.testpypi https://test.pypi.org/legacy/ 178 | poetry config repositories.pypi https://upload.pypi.org/legacy/ 179 | ``` 180 | 181 | and 182 | 183 | ``` console 184 | poetry config pypi-token.pypi YOUR_PYPI_API_TOKEN 185 | poetry config pypi-token.testpypi YOUR_TEST_PYPI_API_TOKEN 186 | ``` 187 | 188 | Then run: 189 | 190 | ``` console 191 | poetry build 192 | poetry publish -r test-pypi # optional 193 | poetry publish 194 | ``` 195 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | # Set environment variables 4 | ENV POETRY_VERSION=2.0.1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | POETRY_NO_INTERACTION=1 \ 7 | PATH="/root/.local/bin:$PATH" 8 | 9 | # Install system dependencies, Poetry, STAR, and Samtools 10 | RUN apt-get update \ 11 | && apt-get install -y --no-install-recommends \ 12 | build-essential \ 13 | curl \ 14 | libpq-dev \ 15 | libffi-dev \ 16 | libssl-dev \ 17 | git \ 18 | gcc \ 19 | wget \ 20 | unzip \ 21 | zlib1g-dev \ 22 | libbz2-dev \ 23 | liblzma-dev \ 24 | libcurl4-gnutls-dev \ 25 | libncurses5-dev \ 26 | && wget https://github.com/alexdobin/STAR/archive/refs/tags/2.7.10b.zip \ 27 | && unzip 2.7.10b.zip \ 28 | && cd STAR-2.7.10b/source \ 29 | && make STAR \ 30 | && mv STAR /usr/local/bin/ \ 31 | && mkdir -p /app \ 32 | && cd /app \ 33 | && wget https://github.com/samtools/samtools/releases/download/1.17/samtools-1.17.tar.bz2 \ 34 | && tar -xjf samtools-1.17.tar.bz2 \ 35 | && cd samtools-1.17 \ 36 | && ./configure \ 37 | && make \ 38 | && make install \ 39 | && cd /app \ 40 | && apt-get clean \ 41 | && rm -rf /var/lib/apt/lists/* 2.7.10b.zip STAR-2.7.10b samtools-1.17 samtools-1.17.tar.bz2 42 | 43 | # Install Poetry 44 | RUN curl -sSL https://install.python-poetry.org | python3 - 45 | 46 | # Set working directory 47 | WORKDIR /app 48 | 49 | # Copy project files 50 | COPY pyproject.toml poetry.lock README.md /app/ 51 | 52 | # Install dependencies using 
Poetry 53 | RUN poetry install --no-root --only main 54 | 55 | # Copy the entire project 56 | COPY . /app 57 | 58 | # Ensure scripts are executable 59 | RUN chmod +x /app/stpipeline/scripts/*.py 60 | 61 | # Set entrypoint for the container 62 | ENTRYPOINT ["poetry", "run"] 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2024 Jose Fernandez Navarro. 3 | All rights reserved. 4 | 5 | * Jose Fernandez Navarro 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a 8 | copy of this software and associated documentation files (the "Software"), 9 | to deal in the Software without restriction, including without limitation 10 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 11 | and/or sell copies of the Software, and to permit persons to whom the 12 | Software is furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in all 15 | copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 18 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 19 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 20 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 21 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spatial Transcriptomics (ST) Pipeline 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | [![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-310/) 5 | [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-311/) 6 | [![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-312/) 7 | [![PyPI version](https://badge.fury.io/py/stpipeline.svg)](https://badge.fury.io/py/stpipeline) 8 | [![Build Status](https://github.com/jfnavarro/st_pipeline/actions/workflows/dev.yml/badge.svg)](https://github.com/jfnavarro/st_pipeline/actions/workflows/dev) 9 | 10 | The ST Pipeline provides the tools, algorithms and scripts needed to process and analyze the raw 11 | data generated with Spatial Transcriptomics or Visium in FASTQ format to generate datasets for down-stream analysis. 12 | 13 | The ST Pipeline can also be used to process single cell/nuclei RNA-seq data as long as a 14 | file with molecular `barcodes` identifying each cell is provided (same template as the files in the folder "ids"). 15 | 16 | The ST Pipeline can also be used to process bulk RNA-seq data, in this case the barcodes file is not required. 17 | 18 | The ST Pipeline has been optimized for speed, robustness and it is very easy to use with many parameters to adjust all the settings. 19 | The ST Pipeline is fully parallel and it has constant memory use. 20 | The ST Pipeline allows to skip any of the main steps and provides multiple customization options. 
21 | The ST Pipeline allows to use either the genome or the transcriptome as reference. 22 | 23 | Basically what the ST pipeline does (default mode) is: 24 | 25 | - Quality trimming step (read 1 and read 2): 26 | - Remove low quality bases 27 | - Sanity check (reads same length, reads order, etc..) 28 | - Check quality UMI 29 | - Remove artifacts (PolyT, PolyA, PolyG, PolyN and PolyC) of user defined length 30 | - Check for AT and GC content 31 | - Discard reads with a minimum number of bases of that failed any of the checks above 32 | - Contamimant filter step (e.x. rRNA genome) (Optional) 33 | - Mapping with [STAR](https://github.com/alexdobin/STAR) step (only read 2) (Optional) 34 | - Demultiplexing with [Taggd](https://github.com/jfnavarro/taggd) step (only read 1) (Optional) 35 | - Keep reads (read 2) that contain a valid barcode and are correctly mapped 36 | - Annotate the reads to the reference (Optional) 37 | - Group annotated reads by barcode (spot position), gene and genomic location (with an offset) to get a read count 38 | - In the grouping/counting only unique molecules (UMIs) are kept (Optional) 39 | 40 | You can see a graphical more detailed description of the workflow in the documents `workflow.pdf` and `workflow_extended.pdf` 41 | 42 | The output dataset is a matrix of counts (genes as columns, spots as rows) in TSV format. 43 | The ST pipeline will also output a log file with useful stats and information. 44 | 45 | ## Installation 46 | 47 | For users see [install](docs/installation.md) 48 | 49 | For developers [contributing](CONTRIBUTING.md) 50 | 51 | ## Usage 52 | 53 | See [usage](docs/usage.md) 54 | 55 | ## Authors 56 | 57 | See [authors](AUTHORS.md) 58 | 59 | ## License 60 | 61 | The ST pipeline is open source under the MIT license which means that you can use it, 62 | change it and re-distribute but you must always refer to our license (see LICENSE). 63 | 64 | ## Credits 65 | 66 | If you use the ST Pipeline, please refer its publication: 67 | ST Pipeline: An automated pipeline for spatial mapping of unique transcripts 68 | Oxford BioInformatics 69 | 10.1093/bioinformatics/btx211 70 | 71 | ## Example dataset 72 | 73 | You can see a real dataset obtained from the public data from 74 | the following publication (http://science.sciencemag.org/content/353/6294/78) 75 | in the folder called "data". 76 | 77 | ## Contact 78 | 79 | For questions, bugs, feedback, etc.. you can contact: 80 | 81 | Jose Fernandez Navarro 82 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | # Example datasets 2 | 3 | These datasets were generated from the publicly available raw FASTQ files 4 | of the Mouse Olfatory Bulb Replicates number 4 and 9 from the publication http://science.sciencemag.org/content/353/6294/78 5 | 6 | The data was analysed with the ST Pipeline 1.3.1 and with a STAR genome index generated from 7 | the Mus Musculus Ensembl annotation version 86. The annotation file used Mus Musculus GenCode 25 vM11. 8 | A contaminant genome STAR index was used generated from the Ensembl non coding RNA Mus musculus version 86. 9 | The IDs file used to demultiplex were the 1000L2 and 1000L5. 
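Each archive corresponds to one replicate. As a quick orientation, below is a minimal sketch for loading the counts matrix (spots as rows, genes as columns) with pandas; the file name `stdata.tsv` is the current default output name and is an assumption here, since the archives were produced with version 1.3.1 and may use a different name:

```python
import pandas as pd

# Load the TSV counts matrix produced by the pipeline
counts = pd.read_csv("stdata.tsv", sep="\t", index_col=0)
print(counts.shape)  # (number of spots, number of genes)
```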
10 | 11 | The following settings were used (NOTE that the name of the parameters in the example are for version 1.3.1): 12 | 13 | ```bash 14 | st_pipeline_run \ 15 | --output-folder OUTPUT \ 16 | --ids id.txt \ 17 | --ref-map path_to_genome_index \ 18 | --ref-annotation path_to_annotation_file.gtf \ 19 | --expName SOME_NAME \ 20 | --remove-polyA 10 \ 21 | --remove-polyT 10 \ 22 | --remove-polyG 10 \ 23 | --remove-polyC 10 \ 24 | --htseq-no-ambiguous \ 25 | --verbose \ 26 | --mapping-threads 16 \ 27 | --log-file OUTPUT_log.txt \ 28 | --two-pass-mode \ 29 | --umi-filter \ 30 | --filter-AT-content 90 \ 31 | --filter-GC-content 90 \ 32 | --contaminant-index path_to_contaminant_index \ 33 | --min-length-qual-trimming 30 \ 34 | --disable-clipping \ 35 | R1.fastq R2.fastq 36 | ``` 37 | -------------------------------------------------------------------------------- /data/rep4.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/data/rep4.tar.gz -------------------------------------------------------------------------------- /data/rep9.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/data/rep9.tar.gz -------------------------------------------------------------------------------- /docs/api.md: -------------------------------------------------------------------------------- 1 | :: ../stpipeline 2 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../CHANGELOG.md" 3 | %} 4 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../CONTRIBUTING.md" 3 | %} 4 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | {% 2 | include-markdown "../README.md" 3 | %} 4 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Python 3.10, 3.11 or 3.12 is required. 4 | 5 | ## Requirements 6 | 7 | The ST Pipeline requires [STAR][] installed in the system (minimum version 2.5.4 if you use a ST Pipeline version >= 1.6.0). 8 | The ST Pipeline requires [samtools][] installed in the system. 9 | 10 | If you use anaconda you can install STAR and samtools with: 11 | 12 | ``` console 13 | conda install -c bioconda star samtools 14 | ``` 15 | 16 | The ST Pipeline needs a computer with at least 32GB of RAM (depending on the size of the genome) and 8 cpu cores. 17 | 18 | ## Dependencies 19 | 20 | The ST Pipeline depends on some Python packages that will 21 | be automatically installed during the installation process. 22 | You can see them in the file `requirements.txt` 23 | 24 | ## From source 25 | 26 | The source for `ST Pipeline` can be downloaded from the [Github repo][]. 
27 | 28 | You can either clone the public repository: 29 | 30 | ``` console 31 | git clone https://github.com/jfnavarro/stpipeline 32 | ``` 33 | 34 | Or download the [tarball][]: 35 | 36 | ``` console 37 | curl -OJL https://github.com/jfnavarro/stpipeline/tarball/master 38 | ``` 39 | 40 | Once you have a copy of the source, you can install it with: 41 | 42 | ### Using Poetry 43 | 44 | If you don't have [Poetry](https://python-poetry.org/docs/) installed 45 | you can use the following command: 46 | 47 | ``` console 48 | curl -sSL https://install.python-poetry.org | python - 49 | ``` 50 | 51 | Install the package: 52 | 53 | ``` console 54 | poetry install 55 | ``` 56 | 57 | Now you can run the ST Pipeline: 58 | 59 | ``` console 60 | poetry run st_pipeline_run --help 61 | ``` 62 | 63 | ### Using Pip 64 | 65 | If you don't have [pip][] installed, this [Python installation guide][] 66 | can guide you through the process. 67 | 68 | Install the package: 69 | 70 | ``` console 71 | pip install . 72 | ``` 73 | 74 | You can also use the official PyPy repositories: 75 | 76 | ``` console 77 | pip install stpipeline 78 | ``` 79 | 80 | Now you can run ST Pipeline: 81 | 82 | ``` console 83 | st_pipeline_run --help 84 | ``` 85 | 86 | ### Using Docker 87 | 88 | Before installing, ensure that [Docker](https://www.docker.com/) is installed 89 | in your environment. 90 | 91 | First, build a Docker image: 92 | 93 | ``` console 94 | docker buildx build --platform linux/amd64 -t stpipeline . 95 | ``` 96 | 97 | Then, you can run ST Pipeline using Docker: 98 | 99 | To run `ST Pipeline` commands: 100 | 101 | ``` console 102 | docker run --rm stpipeline st_pipeline_run --help 103 | ``` 104 | 105 | ### Using Anaconda 106 | 107 | Before installing, ensure you have either [Anaconda](https://www.anaconda.com/) 108 | or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) installed in your environment. 109 | 110 | First, create the environment: 111 | 112 | ``` console 113 | conda env create -n stpipeline python=3.10 114 | ``` 115 | 116 | Then, activate the environment: 117 | 118 | ``` console 119 | conda activate stpipeline 120 | ``` 121 | 122 | Install the package: 123 | 124 | ``` console 125 | pip install . 
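# Optionally, make STAR and samtools available in the same environment
# (these external tools are required by the pipeline, see Requirements above)
conda install -c bioconda star samtools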
126 | ``` 127 | 128 | Now you can run ST Pipeline: 129 | 130 | ``` console 131 | st_pipeline_run --help 132 | ``` 133 | 134 | [STAR]: https://github.com/alexdobin/STAR 135 | [samtools]: https://www.htslib.org/ 136 | [pip]: https://pip.pypa.io 137 | [Python installation guide]: http://docs.python-guide.org/en/latest/starting/installation/ 138 | [Github repo]: https://github.com/jfnavarro/st_pipeline/ 139 | [tarball]: https://github.com/jfnavarro/st_pipeline/releases 140 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | sources = stpipeline 2 | 3 | .PHONY: test format lint unittest coverage pre-commit clean 4 | test: format lint unittest 5 | 6 | format: 7 | poetry run isort $(sources) tests 8 | poetry run ruff format $(sources) tests 9 | 10 | lint: 11 | poetry run ruff check --fix $(sources) tests 12 | poetry run mypy $(sources) 13 | 14 | unittest: 15 | poetry run pytest 16 | 17 | coverage: 18 | poetry run pytest --cov=$(sources) --cov-branch --cov-report=term-missing tests 19 | 20 | pre-commit: 21 | pre-commit run --all-files 22 | 23 | clean: 24 | rm -rf .mypy_cache .pytest_cache 25 | rm -rf *.egg-info 26 | rm -rf .tox dist site 27 | rm -rf coverage.xml .coverage 28 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ST Pipeline 2 | site_url: https://github.com/jfnavarro/st_pipeline 3 | repo_url: https://github.com/jfnavarro/st_pipeline 4 | repo_name: jfnavarro/st_pipeline 5 | #strict: true 6 | nav: 7 | - Home: index.md 8 | - Installation: installation.md 9 | - Usage: usage.md 10 | - Modules: api.md 11 | - Contributing: contributing.md 12 | - Changelog: changelog.md 13 | theme: 14 | name: material 15 | language: en 16 | #logo: assets/logo.png 17 | palette: 18 | scheme: preference 19 | primary: indigo 20 | accent: indigo 21 | features: 22 | - navigation.indexes 23 | - navigation.instant 24 | - navigation.tabs.sticky 25 | markdown_extensions: 26 | - pymdownx.emoji: 27 | emoji_index: !!python/name:material.extensions.emoji.twemoji 28 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 29 | - pymdownx.critic 30 | - pymdownx.caret 31 | - pymdownx.mark 32 | - pymdownx.tilde 33 | - pymdownx.tabbed 34 | - attr_list 35 | - pymdownx.arithmatex: 36 | generic: true 37 | - pymdownx.highlight: 38 | linenums: false 39 | - pymdownx.superfences 40 | - pymdownx.inlinehilite 41 | - pymdownx.details 42 | - admonition 43 | - toc: 44 | baselevel: 2 45 | permalink: true 46 | slugify: !!python/object/apply:pymdownx.slugs.slugify {kwds: {case: lower}} 47 | - meta 48 | plugins: 49 | - include-markdown 50 | - search: 51 | lang: en 52 | - mkdocstrings: 53 | default_handler: python 54 | handlers: 55 | python: 56 | rendering: 57 | show_source: true 58 | watch: 59 | - stpipeline 60 | extra: 61 | social: 62 | - icon: fontawesome/brands/github 63 | link: https://github.com/jfnavarro/st_pipeline 64 | name: Github 65 | - icon: material/email 66 | link: mailto:jc.fernandez.navarro@gmail.com 67 | # uncomment the following and put your google tracking id below to enable GA 68 | #google_analytics: 69 | # - UA-xxx 70 | # - auto 71 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "stpipeline" 3 
| version = "2.0.0" 4 | description = "ST Pipeline: An automated pipeline for spatial mapping of unique transcripts" 5 | authors = ["Jose Fernandez Navarro "] 6 | license = "MIT" 7 | readme = "README.md" 8 | keywords = ["visium", "analysis", "pipeline", "spatial", "transcriptomics", "toolkit"] 9 | repository = "https://github.com/jfnavarro/st_pipeline" 10 | classifiers = [ 11 | "Development Status :: 5 - Production/Stable", 12 | "Intended Audience :: Science/Research", 13 | "Topic :: Software Development", 14 | "Topic :: Scientific/Engineering :: Bio-Informatics", 15 | "License :: OSI Approved :: MIT License", 16 | "Programming Language :: Python :: 3.10", 17 | "Programming Language :: Python :: 3.11", 18 | "Programming Language :: Python :: 3.12", 19 | "Operating System :: Unix", 20 | "Operating System :: MacOS", 21 | "Environment :: Console", 22 | ] 23 | include = [ 24 | { path = "README.md" }, 25 | { path = "LICENSE" }, 26 | { path = "doc/**" } 27 | ] 28 | 29 | [tool.poetry.dependencies] 30 | python = ">=3.10,<3.13" 31 | argparse = "^1.4.0" 32 | numpy = "^2.2.1" 33 | pandas = "^2.2.3" 34 | scipy = "^1.15.0" 35 | scikit-learn = "^1.6.0" 36 | regex = "^2024.11.6" 37 | pre-commit = "^4.0.1" 38 | taggd = ">=0.4.0" 39 | htseq = "^2.0.9" 40 | pysam = "^0.22.1" 41 | seaborn = "^0.13.2" 42 | types-regex = "^2024.11.6.20241221" 43 | pandas-stubs = "^2.2.3.241126" 44 | dnaio = "^1.2.3" 45 | distance = "^0.1.3" 46 | 47 | # Marked as optional 48 | toml = { version = "^0.10", optional = true } 49 | ruff = { version = "^0.1", optional = true } 50 | isort = { version = "^5.12", optional = true } 51 | pytest = { version = "^7.2", optional = true } 52 | pytest-cov = { version = "^4.0", optional = true } 53 | mkdocs = { version = "^1.4", optional = true } 54 | mkdocs-include-markdown-plugin = { version = "^3.4", optional = true } 55 | mkdocs-material = { version = "^9.1", optional = true } 56 | mkdocstrings = { version = "^0.20", optional = true } 57 | mkdocstrings-python = { version = "^0.9", optional = true } 58 | mkdocs-autorefs = { version = "^0.4", optional = true } 59 | mike = { version = "^1.1", optional = true } 60 | setuptools = { version = "^68", optional = true } 61 | virtualenv = { version = "^20.21", optional = true } 62 | 63 | [tool.poetry.scripts] 64 | st_qa = "stpipeline.scripts.st_qa:main" 65 | st_pipeline_run = "stpipeline.scripts.st_pipeline_run:main" 66 | multi_qa = "stpipeline.scripts.multi_qa:main" 67 | merge_fastq = "stpipeline.scripts.merge_fastq:main" 68 | filter_gene_type_matrix = "stpipeline.scripts.filter_gene_type_matrix:main" 69 | convertEnsemblToNames = "stpipeline.scripts.convertEnsemblToNames:main" 70 | adjust_matrix_coordinates = "stpipeline.scripts.adjust_matrix_coordinates:main" 71 | 72 | [tool.poetry.extras] 73 | 74 | dev = [ 75 | "toml", 76 | "ruff", 77 | "isort", 78 | "pytest", 79 | "pytest-cov" 80 | ] 81 | 82 | doc = [ 83 | "mkdocs", 84 | "mkdocs-include-markdown-plugin", 85 | "mkdocs-material", 86 | "mkdocstrings", 87 | "mkdocstrings-python", 88 | "mkdocs-autorefs", 89 | "mike", 90 | "setuptools", 91 | "virtualenv" 92 | ] 93 | 94 | [build-system] 95 | requires = ["poetry-core>=1.0.0"] 96 | build-backend = "poetry.core.masonry.api" 97 | 98 | [tool.ruff] 99 | # Exclude a variety of commonly ignored directories. 
100 | exclude = [ 101 | ".bzr", 102 | ".direnv", 103 | ".eggs", 104 | ".git", 105 | ".git-rewrite", 106 | ".hg", 107 | ".ipynb_checkpoints", 108 | ".mypy_cache", 109 | ".nox", 110 | ".pants.d", 111 | ".pyenv", 112 | ".pytest_cache", 113 | ".pytype", 114 | ".ruff_cache", 115 | ".svn", 116 | ".tox", 117 | ".venv", 118 | ".vscode", 119 | "__pypackages__", 120 | "_build", 121 | "buck-out", 122 | "build", 123 | "dist", 124 | "node_modules", 125 | "site-packages", 126 | "venv", 127 | "makefile", 128 | "docs", 129 | "docsrc", 130 | "*.yml", 131 | "*.yaml", 132 | "*.md", 133 | "*.rst", 134 | "*.ipynb", 135 | "*.pyx", 136 | "requirements.txt" 137 | ] 138 | 139 | # Same as Black. 140 | line-length = 120 141 | indent-width = 4 142 | 143 | # Assume Python 3.9 144 | target-version = "py39" 145 | 146 | [tool.ruff.lint] 147 | ignore = ["E203","E266","F403","F401"] 148 | select = ["B","C","E","F","W","B9"] 149 | pydocstyle.convention = "google" 150 | 151 | # Allow fix for all enabled rules (when `--fix`) is provided. 152 | fixable = ["ALL"] 153 | unfixable = [] 154 | 155 | # Allow unused variables when underscore-prefixed. 156 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 157 | 158 | [tool.ruff.format] 159 | # Like Black, use double quotes for strings. 160 | quote-style = "double" 161 | 162 | # Like Black, indent with spaces, rather than tabs. 163 | indent-style = "space" 164 | 165 | # Like Black, respect magic trailing commas. 166 | skip-magic-trailing-comma = false 167 | 168 | # Like Black, automatically detect the appropriate line ending. 169 | line-ending = "auto" 170 | 171 | [tool.ruff.lint.mccabe] 172 | max-complexity = 50 173 | 174 | [tool.mypy] 175 | # Ensure we know what we do 176 | warn_redundant_casts = true 177 | warn_unused_ignores = true 178 | warn_unused_configs = true 179 | 180 | # Imports management 181 | ignore_missing_imports = false 182 | 183 | # Ensure full coverage 184 | disallow_untyped_defs = true 185 | #disallow_incomplete_defs = true 186 | disallow_untyped_calls = true 187 | disallow_untyped_decorators = true 188 | # Restrict dynamic typing (a little) 189 | # e.g. 
`x: List[Any]` or x: List` 190 | disallow_any_generics = true 191 | 192 | # Show errors codes 193 | show_error_codes = true 194 | 195 | # From functions not declared to return Any 196 | warn_return_any = true 197 | 198 | exclude = ["tests", "docsrc"] 199 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse==1.4.0 ; python_version >= "3.10" and python_version < "3.13" 2 | cfgv==3.4.0 ; python_version >= "3.10" and python_version < "3.13" 3 | contourpy==1.3.1 ; python_version >= "3.10" and python_version < "3.13" 4 | cycler==0.12.1 ; python_version >= "3.10" and python_version < "3.13" 5 | distance==0.1.3 ; python_version >= "3.10" and python_version < "3.13" 6 | distlib==0.3.9 ; python_version >= "3.10" and python_version < "3.13" 7 | dnaio==1.2.3 ; python_version >= "3.10" and python_version < "3.13" 8 | filelock==3.16.1 ; python_version >= "3.10" and python_version < "3.13" 9 | fonttools==4.55.3 ; python_version >= "3.10" and python_version < "3.13" 10 | htseq==2.0.9 ; python_version >= "3.10" and python_version < "3.13" 11 | identify==2.6.5 ; python_version >= "3.10" and python_version < "3.13" 12 | isal==1.7.1 ; python_version >= "3.10" and python_version < "3.13" and (platform_machine == "x86_64" or platform_machine == "AMD64" or platform_machine == "aarch64") 13 | joblib==1.4.2 ; python_version >= "3.10" and python_version < "3.13" 14 | kiwisolver==1.4.8 ; python_version >= "3.10" and python_version < "3.13" 15 | matplotlib==3.10.0 ; python_version >= "3.10" and python_version < "3.13" 16 | nodeenv==1.9.1 ; python_version >= "3.10" and python_version < "3.13" 17 | numpy==2.2.1 ; python_version >= "3.10" and python_version < "3.13" 18 | packaging==24.2 ; python_version >= "3.10" and python_version < "3.13" 19 | pandas-stubs==2.2.3.241126 ; python_version >= "3.10" and python_version < "3.13" 20 | pandas==2.2.3 ; python_version >= "3.10" and python_version < "3.13" 21 | pillow==11.1.0 ; python_version >= "3.10" and python_version < "3.13" 22 | platformdirs==4.3.6 ; python_version >= "3.10" and python_version < "3.13" 23 | pre-commit==4.0.1 ; python_version >= "3.10" and python_version < "3.13" 24 | pyparsing==3.2.1 ; python_version >= "3.10" and python_version < "3.13" 25 | pysam==0.22.1 ; python_version >= "3.10" and python_version < "3.13" 26 | python-dateutil==2.9.0.post0 ; python_version >= "3.10" and python_version < "3.13" 27 | pytz==2024.2 ; python_version >= "3.10" and python_version < "3.13" 28 | pyyaml==6.0.2 ; python_version >= "3.10" and python_version < "3.13" 29 | regex==2024.11.6 ; python_version >= "3.10" and python_version < "3.13" 30 | scikit-learn==1.6.1 ; python_version >= "3.10" and python_version < "3.13" 31 | scipy==1.15.1 ; python_version >= "3.10" and python_version < "3.13" 32 | seaborn==0.13.2 ; python_version >= "3.10" and python_version < "3.13" 33 | six==1.17.0 ; python_version >= "3.10" and python_version < "3.13" 34 | taggd==0.4.0 ; python_version >= "3.10" and python_version < "3.13" 35 | threadpoolctl==3.5.0 ; python_version >= "3.10" and python_version < "3.13" 36 | types-pytz==2024.2.0.20241221 ; python_version >= "3.10" and python_version < "3.13" 37 | types-regex==2024.11.6.20241221 ; python_version >= "3.10" and python_version < "3.13" 38 | tzdata==2024.2 ; python_version >= "3.10" and python_version < "3.13" 39 | virtualenv==20.28.1 ; python_version >= "3.10" and python_version < "3.13" 40 | xopen==2.0.2 ; 
python_version >= "3.10" and python_version < "3.13" 41 | zlib-ng==0.5.1 ; python_version >= "3.10" and python_version < "3.13" and (platform_machine == "x86_64" or platform_machine == "AMD64" or platform_machine == "aarch64") 42 | -------------------------------------------------------------------------------- /stpipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/stpipeline/__init__.py -------------------------------------------------------------------------------- /stpipeline/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/stpipeline/common/__init__.py -------------------------------------------------------------------------------- /stpipeline/common/clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains some functions to cluster 3 | molecular barcodes (UMIs) sequences by hamming distance 4 | """ 5 | 6 | import random 7 | from collections import Counter, defaultdict 8 | from typing import Any, Dict, List, Set 9 | 10 | import numpy as np 11 | from scipy.cluster.hierarchy import fcluster, linkage # type: ignore 12 | 13 | from stpipeline.common.distance import hamming_distance 14 | 15 | 16 | def _breadth_first_search(node: str, adj_list: Dict[str, List[str]]) -> Set[str]: 17 | """ 18 | Performs a breadth-first search (BFS) to find all connected components starting from a node. 19 | """ 20 | searched = set() 21 | queue = {node} 22 | found = set(queue) 23 | while queue: 24 | current = queue.pop() 25 | searched.add(current) 26 | # Convert neighbors to a set to handle list inputs 27 | neighbors = set(adj_list[current]) - searched 28 | found.update(neighbors) 29 | # Add new neighbors to the queue 30 | queue.update(neighbors) 31 | return found 32 | 33 | 34 | def _remove_umis(adj_list: Dict[str, List[str]], cluster: List[str], nodes: List[str]) -> Set[str]: 35 | """ 36 | Removes the specified nodes from the cluster and returns 37 | the remaining nodes 38 | """ 39 | nodes_to_remove = set([node for x in nodes for node in adj_list[x]] + nodes) 40 | return set(cluster) - nodes_to_remove 41 | 42 | 43 | def _get_adj_list_adjacency(umis: List[str], allowed_mismatches: int) -> Dict[str, List[str]]: 44 | """ 45 | Constructs an adjacency list where each UMI points to all other UMIs within 46 | the allowed mismatches. 47 | """ 48 | return {umi: [umi2 for umi2 in umis if hamming_distance(umi, umi2) <= allowed_mismatches] for umi in umis} 49 | 50 | 51 | def _get_connected_components_adjacency(adj_list: Dict[str, List[str]], counts: Counter[str]) -> List[List[str]]: 52 | """ 53 | Traverses the adjacency list to find all connected components. 54 | """ 55 | found = set() 56 | components = [] 57 | for node in sorted(adj_list, key=lambda x: counts[x], reverse=True): 58 | if node not in found: 59 | nodes = _breadth_first_search(node, adj_list) 60 | found.update(nodes) 61 | components.append(list(nodes)) 62 | return components 63 | 64 | 65 | def _get_best_adjacency(cluster: List[str], adj_list: Dict[str, List[str]], counts: Counter[str]) -> List[str]: 66 | """ 67 | Identifies the best UMI or set of UMIs from a cluster based on adjacency and counts. 
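    UMIs are considered in descending count order; the shortest prefix of that
    ordering whose members, together with their adjacency neighbours, cover the
    whole cluster is returned. If no proper prefix covers the cluster, the full
    cluster is returned unchanged.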
68 | """ 69 | if len(cluster) == 1: 70 | return cluster 71 | sorted_nodes = sorted(cluster, key=lambda x: counts[x], reverse=True) 72 | for i in range(len(sorted_nodes) - 1): 73 | if len(_remove_umis(adj_list, cluster, sorted_nodes[: i + 1])) == 0: 74 | return sorted_nodes[: i + 1] 75 | return cluster 76 | 77 | 78 | def _reduce_clusters_adjacency( 79 | adj_list: Dict[str, List[str]], clusters: List[List[str]], counts: Counter[str] 80 | ) -> List[str]: 81 | """ 82 | Reduces clusters to their best representative UMIs. 83 | """ 84 | unique_umis = [] 85 | for cluster in clusters: 86 | unique_umis += _get_best_adjacency(cluster, adj_list, counts) 87 | return unique_umis 88 | 89 | 90 | def _get_adj_list_directional_adjacency( 91 | umis: List[str], counts: Counter[str], allowed_mismatches: int 92 | ) -> Dict[str, List[str]]: 93 | """ 94 | Constructs a directional adjacency list where each UMI points to all other UMIs within 95 | the allowed mismatches and satisfying the directional count condition. 96 | """ 97 | return { 98 | umi: [ 99 | umi2 100 | for umi2 in umis 101 | if hamming_distance(umi, umi2) <= allowed_mismatches and counts[umi] >= (counts[umi2] * 2) - 1 102 | ] 103 | for umi in umis 104 | } 105 | 106 | 107 | def _reduce_clusters_directional_adjacency(clusters: List[List[str]]) -> List[str]: 108 | """ 109 | Reduces clusters to their best representative UMIs by selecting one UMI per cluster. 110 | """ 111 | return [cluster.pop() for cluster in clusters] 112 | 113 | 114 | def dedup_hierarchical(molecular_barcodes: List[str], allowed_mismatches: int, method: str = "single") -> List[str]: 115 | """ 116 | Deduplicates molecular barcodes using hierarchical clustering. 117 | 118 | Args: 119 | molecular_barcodes: A list of UMIs to cluster. 120 | allowed_mismatches: Maximum allowed mismatches (distance) between UMIs in a cluster. 121 | method: The linkage method for clustering, "single" for more restrictive or "complete" 122 | for less restrictive. Defaults to "single". 123 | 124 | Returns: 125 | A list of unique UMIs after deduplication. 126 | 127 | Raises: 128 | RuntimeError: If the input list is empty or another error occurs during clustering. 129 | """ 130 | if len(molecular_barcodes) == 0: 131 | raise RuntimeError("The input UMIs cannot be empty") 132 | 133 | if len(molecular_barcodes) == 1: 134 | return molecular_barcodes 135 | 136 | if len(molecular_barcodes) == 2: 137 | return ( 138 | molecular_barcodes 139 | if hamming_distance(molecular_barcodes[0], molecular_barcodes[1]) <= allowed_mismatches 140 | else [random.choice(molecular_barcodes)] 141 | ) 142 | 143 | def d(coord: Any) -> int: 144 | i, j = coord 145 | return hamming_distance(molecular_barcodes[i], molecular_barcodes[j]) 146 | 147 | indices = np.triu_indices(len(molecular_barcodes), 1) 148 | distance_matrix = np.apply_along_axis(d, 0, indices) 149 | linkage_cluster = linkage(distance_matrix, method=method) 150 | flat_clusters = fcluster(linkage_cluster, allowed_mismatches, criterion="distance") 151 | 152 | items = defaultdict(list) 153 | for i, item in enumerate(flat_clusters): 154 | items[item].append(i) 155 | 156 | return [molecular_barcodes[random.choice(members)] for members in list(items.values())] 157 | 158 | 159 | def dedup_adj(molecular_barcodes: List[str], allowed_mismatches: int) -> List[str]: 160 | """ 161 | Deduplicates molecular barcodes using an adjacency-based clustering algorithm. 
162 | 163 | This function clusters similar UMIs based on an adjacency distance matrix, as described 164 | in the algorithm from http://genome.cshlp.org/content/early/2017/01/18/gr.209601.116.abstract. 165 | 166 | Args: 167 | molecular_barcodes: A list of UMIs to deduplicate. 168 | allowed_mismatches: Maximum allowable Hamming distance between UMIs in a cluster. 169 | 170 | Returns: 171 | A list of unique UMIs after deduplication. 172 | 173 | Raises: 174 | RuntimeError: If the input list is empty or another error occurs during clustering. 175 | """ 176 | if len(molecular_barcodes) == 0: 177 | raise RuntimeError("The input UMIs cannot be empty") 178 | c = Counter(molecular_barcodes) 179 | adj_list = _get_adj_list_adjacency(list(c.keys()), allowed_mismatches) 180 | clusters = _get_connected_components_adjacency(adj_list, c) 181 | unique_umis = _reduce_clusters_adjacency(adj_list, clusters, c) 182 | return unique_umis 183 | 184 | 185 | def dedup_dir_adj(molecular_barcodes: List[str], allowed_mismatches: int) -> List[str]: 186 | """ 187 | Deduplicates molecular barcodes using a directional adjacency-based clustering algorithm. 188 | 189 | This function clusters similar UMIs based on a directional adjacency distance matrix, as described 190 | in the algorithm from http://genome.cshlp.org/content/early/2017/01/18/gr.209601.116.abstract. 191 | 192 | Args: 193 | molecular_barcodes: A list of UMIs to deduplicate. 194 | allowed_mismatches: Maximum allowable Hamming distance between UMIs in a cluster. 195 | 196 | Returns: 197 | A list of unique UMIs after deduplication. 198 | 199 | Raises: 200 | RuntimeError: If the input list is empty or another error occurs during clustering. 201 | """ 202 | if len(molecular_barcodes) == 0: 203 | raise RuntimeError("The input UMIs cannot be empty") 204 | c = Counter(molecular_barcodes) 205 | adj_list = _get_adj_list_directional_adjacency(list(c.keys()), c, allowed_mismatches) 206 | clusters = _get_connected_components_adjacency(adj_list, c) 207 | unique_umis = _reduce_clusters_directional_adjacency(clusters) 208 | return unique_umis 209 | -------------------------------------------------------------------------------- /stpipeline/common/dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains routines to create 3 | a ST dataset and some statistics. The dataset 4 | will contain several files with the ST data in different 5 | formats 6 | """ 7 | 8 | import logging 9 | import os 10 | import random 11 | from collections import defaultdict 12 | from typing import Any, Callable, Dict, List, Optional, Union 13 | 14 | import numpy as np 15 | import pandas as pd 16 | 17 | from stpipeline.common.clustering import dedup_adj, dedup_dir_adj, dedup_hierarchical 18 | from stpipeline.common.transcript import Transcript 19 | from stpipeline.common.unique_events_parser import parse_unique_events 20 | 21 | logger = logging.getLogger("STPipeline") 22 | 23 | 24 | def compute_unique_umis( 25 | transcripts: List[Transcript], 26 | umi_counting_offset: int, 27 | umi_allowed_mismatches: int, 28 | group_umi_func: Callable[[List[str], int], List[str]], 29 | ) -> List[Transcript]: 30 | """ 31 | Computes unique UMIs from a list of transcripts, grouping them by genomic coordinates and strand. 32 | 33 | The function groups UMIs by strand and start position (with an offset), clusters the UMIs within 34 | each group using a grouping function, and computes unique UMIs based on Hamming distance. 
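    For every unique UMI in a group, a single representative transcript is kept,
    chosen at random among the transcripts carrying that UMI.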
35 | 36 | Args: 37 | transcripts: A list of transcript data. Each transcript should include positional and strand information. 38 | umi_counting_offset: The maximum offset allowed when grouping transcripts. 39 | umi_allowed_mismatches: The maximum allowed mismatches for UMIs in a group. 40 | group_umi_func: A function to group UMIs, accepting a list of UMIs and mismatch threshold. 41 | 42 | Returns: 43 | A list of unique transcripts, one for each unique UMI. 44 | """ 45 | # Sort transcripts by strand and start position 46 | sorted_transcripts = sorted(transcripts, key=lambda x: (x.strand, x.start)) 47 | grouped_transcripts = defaultdict(list) 48 | unique_transcripts = [] 49 | num_transcripts = len(transcripts) 50 | for i in range(num_transcripts - 1): 51 | current = sorted_transcripts[i] 52 | nextone = sorted_transcripts[i + 1] 53 | grouped_transcripts[current.umi].append(current) 54 | if abs(current.start - nextone.start) > umi_counting_offset or current.strand != nextone.strand: 55 | # A new group has been reached (strand, start-pos, offset) 56 | unique_umis = group_umi_func(list(grouped_transcripts.keys()), umi_allowed_mismatches) 57 | unique_transcripts += [random.choice(grouped_transcripts[u_umi]) for u_umi in unique_umis] 58 | grouped_transcripts = defaultdict(list) 59 | 60 | # Process the last group 61 | lastone = sorted_transcripts[num_transcripts - 1] 62 | grouped_transcripts[lastone.umi].append(lastone) 63 | unique_umis = group_umi_func(list(grouped_transcripts.keys()), umi_allowed_mismatches) 64 | unique_transcripts += [random.choice(grouped_transcripts[u_umi]) for u_umi in unique_umis] 65 | 66 | return unique_transcripts 67 | 68 | 69 | def createDataset( 70 | input_file: str, 71 | output_folder: str, 72 | gff_filename: Optional[str] = None, 73 | umi_cluster_algorithm: str = "AdjacentBi", 74 | umi_allowed_mismatches: int = 1, 75 | umi_counting_offset: int = 250, 76 | disable_umi: bool = False, 77 | output_template: Optional[str] = None, 78 | verbose: bool = True, 79 | ) -> Dict[str, Any]: 80 | """ 81 | Parses an annotated and demultiplexed BAM file with reads, it groups them by gene-barcode 82 | to count unique transcripts, and removes duplicates using UMIs. It creates a data frame of unique 83 | transcripts by spots and genes. It returns a dictionary of basic stats. 84 | 85 | Args: 86 | input_file: Path to the BAM file containing annotated-demultiplexed records. 87 | output_folder: Directory for output files. 88 | gff_filename: Annotation reference file. Defaults to None. 89 | umi_cluster_algorithm : Algorithm for clustering UMIs. Defaults to "hierarchical". 90 | umi_allowed_mismatches: Allowed mismatches for UMI deduplication. Defaults to 1. 91 | umi_counting_offset: Offset for grouping transcripts by position. Defaults to 250. 92 | disable_umi: Disables UMI deduplication if True. Defaults to False. 93 | output_template: Template for output file names. Defaults to None. 94 | verbose: Enables verbose logging if True. Defaults to True. 95 | 96 | Returns: 97 | A dictionary with basic stats. 98 | 99 | Raises: 100 | RuntimeError: If input file is missing or errors occur during processing. 
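    Example:
        A minimal, illustrative call (file and folder names are hypothetical):

            stats = createDataset(
                input_file="annotated_demultiplexed.bam",
                output_folder="output",
                gff_filename="annotation.gtf",
                umi_cluster_algorithm="AdjacentBi",
                umi_allowed_mismatches=1,
            )
            print(stats["genes_found"], stats["duplicates_found"])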
101 | """ 102 | if not os.path.isfile(input_file): 103 | error = f"Error creating dataset, input file not present {input_file}" 104 | logger.error(error) 105 | raise RuntimeError(error) 106 | 107 | # Set default filenames for the output files 108 | filenameDataFrame = f"{output_template}_stdata.tsv" if output_template else "stdata.tsv" 109 | filenameReadsBED = f"{output_template}_reads.bed" if output_template else "reads.bed" 110 | 111 | # Initialize counters 112 | total_record = 0 113 | discarded_reads = 0 114 | 115 | # Obtain the appropriate UMI clustering function 116 | group_umi_func = {"hierarchical": dedup_hierarchical, "Adjacent": dedup_adj, "AdjacentBi": dedup_dir_adj}.get( 117 | umi_cluster_algorithm 118 | ) 119 | 120 | if not group_umi_func: 121 | error = f"Error creating dataset. Incorrect clustering algorithm {umi_cluster_algorithm}" 122 | logger.error(error) 123 | raise RuntimeError(error) 124 | 125 | # Containers to store data for creating the DataFrame 126 | list_row_values = [] 127 | list_indexes = [] 128 | 129 | # Parse unique events to generate the unique counts (reads) dataframe and a BED file 130 | unique_events = parse_unique_events(input_file, gff_filename) 131 | with open(os.path.join(output_folder, filenameReadsBED), "w") as reads_handler: 132 | # unique_events is a list of tuples (gene, spots) 133 | # where gene is a str and spots is a dictionary of transcripts per spot [spot] -> List[Transcript] 134 | # this loop is to make the transcripts unique (deduplicate them) 135 | for gene, spots in unique_events: 136 | unique_transcripts_by_spot = {} 137 | for spot_coordinates, transcripts in spots.items(): 138 | x, y = spot_coordinates 139 | transcripts_count = len(transcripts) 140 | 141 | # Compute unique transcripts based on UMI, strand, and start position 142 | unique_transcripts = ( 143 | compute_unique_umis(transcripts, umi_counting_offset, umi_allowed_mismatches, group_umi_func) # type: ignore 144 | if not disable_umi 145 | else transcripts 146 | ) 147 | 148 | unique_transcripts_count = len(unique_transcripts) 149 | assert 0 < unique_transcripts_count <= transcripts_count 150 | discarded_reads += transcripts_count - unique_transcripts_count 151 | unique_transcripts_by_spot[f"{x}x{y}"] = unique_transcripts_count 152 | 153 | # Write unique transcripts to the BED file 154 | for t in unique_transcripts: 155 | reads_handler.write( 156 | f"{t.chrom}\t{t.start}\t{t.end}\t{t.clear_name}\t{t.mapping_quality}\t{t.strand}\t{gene}\t{x}\t{y}\n" 157 | ) 158 | 159 | total_record += 1 160 | 161 | # Add data for the DataFrame 162 | list_indexes.append(gene) 163 | list_row_values.append(unique_transcripts_by_spot) 164 | 165 | if total_record == 0: 166 | error = "Error creating dataset, input file did not contain any transcript" 167 | logger.error(error) 168 | raise RuntimeError(error) 169 | 170 | # Create the counts DataFrame 171 | counts_table = pd.DataFrame(list_row_values, index=list_indexes).fillna(0).T 172 | 173 | # Write the counts DataFrame to a TSV file 174 | counts_table.to_csv(os.path.join(output_folder, filenameDataFrame), sep="\t", na_rep=0) # type: ignore 175 | 176 | # Compute statistics for the dataset 177 | total_spots, number_genes = counts_table.shape 178 | total_reads = np.sum(counts_table.values, dtype=np.int32) 179 | aggregated_spot_counts = counts_table.sum(axis=1) 180 | aggregated_gene_counts = (counts_table != 0).sum(axis=1) 181 | 182 | stats_dict: Dict[str, Union[int, float]] = {} 183 | stats_dict["max_genes_feature"] = int(aggregated_gene_counts.max()) 184 | 
stats_dict["min_genes_feature"] = int(aggregated_gene_counts.min()) 185 | stats_dict["max_reads_feature"] = float(aggregated_spot_counts.max()) 186 | stats_dict["min_reads_feature"] = float(aggregated_spot_counts.min()) 187 | stats_dict["average_reads_feature"] = float(np.mean(aggregated_spot_counts)) 188 | stats_dict["average_genes_feature"] = float(np.mean(aggregated_gene_counts)) 189 | stats_dict["std_reads_feature"] = float(np.std(aggregated_spot_counts)) 190 | stats_dict["std_genes_feature"] = float(np.std(aggregated_gene_counts)) 191 | stats_dict["reads_after_duplicates_removal"] = int(total_reads) 192 | stats_dict["barcodes_found"] = total_spots 193 | stats_dict["genes_found"] = number_genes 194 | stats_dict["duplicates_found"] = discarded_reads 195 | 196 | # Log statistics if verbose mode is enabled 197 | if verbose: 198 | logger.info(f"Number of reads present: {total_reads}") 199 | logger.info(f"Number of unique events (gene-spot) present: {total_record}") 200 | logger.info(f"Number of unique genes present: {number_genes}") 201 | logger.info(f"Max number of genes over all spots: {stats_dict['max_genes_feature']}") 202 | logger.info(f"Min number of genes over all spots: {stats_dict['min_genes_feature']}") 203 | logger.info(f"Max number of reads over all spots: {stats_dict['max_reads_feature']}") 204 | logger.info(f"Min number of reads over all spots: {stats_dict['min_reads_feature']}") 205 | logger.info(f"Average number genes per spot: {stats_dict['average_genes_feature']}") 206 | logger.info(f"Average number reads per spot: {stats_dict['average_reads_feature']}") 207 | logger.info(f"Std. number genes per spot: {stats_dict['std_genes_feature']}") 208 | logger.info(f"Std. number reads per spot: {stats_dict['std_reads_feature']}") 209 | logger.info(f"Number of discarded reads (possible duplicates): {discarded_reads}") 210 | 211 | return stats_dict 212 | -------------------------------------------------------------------------------- /stpipeline/common/distance.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for distance metrics. 3 | """ 4 | 5 | import distance # type: ignore 6 | 7 | 8 | def hamming_distance(a: str, b: str) -> int: 9 | """ 10 | Calculates the Hamming distance between two strings using the `distance` library. 11 | 12 | Args: 13 | a: First string. 14 | b: Second string. 15 | 16 | Returns: 17 | The Hamming distance between the two strings. 18 | 19 | Raises: 20 | ValueError: If the strings are of unequal length. 21 | """ 22 | if len(a) != len(b): 23 | raise ValueError("Strings must be of equal length to calculate Hamming distance") 24 | return distance.hamming(a, b) # type: ignore 25 | -------------------------------------------------------------------------------- /stpipeline/common/fastq_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains some specific functions for 3 | to parse and modify FASTQ files 4 | """ 5 | 6 | import re 7 | from typing import Tuple, Union 8 | 9 | import regex 10 | 11 | 12 | def remove_adaptor(sequence: str, quality: str, adaptor: str, missmatches: int = 2) -> Tuple[str, str]: 13 | """ 14 | Trims a given adaptor sequence from a FASTQ read if it is found. 15 | 16 | Args: 17 | sequence: The sequence of the read. 18 | quality: The quality string corresponding to the read. 19 | adaptor: The adaptor sequence to search for. 20 | missmatches: The allowed number of mismatches when searching for the adaptor. Defaults to 2. 
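    Example:
        A minimal behaviour sketch; the read, qualities and adaptor below are made up for illustration:

            seq = "TTTGCACACGTAAAAAAAAAA"   # read ending in a poly-A stretch
            qual = "I" * len(seq)
            trimmed_seq, trimmed_qual = remove_adaptor(seq, qual, "AAAAAAAA", missmatches=2)
            # Sequence and qualities are cut at the start of the matched poly-A
            # stretch; if no match is found, both are returned unchanged.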
21 | 22 | Returns: 23 | A tuple containing the trimmed sequence and quality strings. 24 | 25 | Raises: 26 | ValueError: If the input sequence and quality lengths do not match. 27 | """ 28 | if len(sequence) < len(adaptor) or len(sequence) != len(quality): 29 | return sequence, quality 30 | 31 | try: 32 | if missmatches == 0: 33 | pos = sequence.find(adaptor) 34 | else: 35 | candidates = regex.findall(rf"(?:{adaptor}){{s<={missmatches}}}", sequence, overlapped=False) 36 | if len(candidates) > 0: 37 | local_seq = candidates[0] 38 | # Miss-matches may happen at the start 39 | # so we account for it 40 | local_pos = 0 41 | if adaptor[0] != local_seq[0]: 42 | local_pos = local_seq.find(adaptor[0]) 43 | # We now look for the first base of the matched adaptor 44 | pos = sequence.find(local_seq[local_pos:]) 45 | else: 46 | pos = -1 47 | 48 | if pos != -1: 49 | return sequence[:pos], quality[:pos] 50 | else: 51 | return sequence, quality 52 | except Exception as e: 53 | raise RuntimeError("Failed to trim adaptor") from e 54 | 55 | 56 | def quality_trim_index(bases: str, qualities: str, cutoff: int, base: int = 33) -> int: 57 | """ 58 | Find the position at which to trim a low-quality end from a nucleotide sequence. 59 | 60 | Qualities are assumed to be ASCII-encoded as chr(qual + base). 61 | 62 | This algorithm is derived from BWA's 'bwa_trim_read': 63 | - Subtract the cutoff value from all qualities. 64 | - Compute partial sums from all indices to the end of the sequence. 65 | - Trim sequence at the index at which the sum is minimal. 66 | 67 | Args: 68 | bases: Nucleotide sequence. 69 | qualities: ASCII-encoded quality scores. 70 | cutoff: Quality cutoff value. 71 | base: Base value for ASCII encoding. Defaults to 33. 72 | 73 | Returns: 74 | Index position to trim the sequence. 75 | 76 | Note: 77 | This function handles Illumina NextSeq data specifically by treating high-quality 'G' bases 78 | at the end of reads as having a quality of (cutoff - 1). 79 | 80 | References: 81 | CutAdapt (https://github.com/marcelm/cutadapt/) 82 | """ 83 | s = 0 84 | max_qual = 0 85 | max_i = len(qualities) 86 | for i in reversed(range(max_i)): 87 | q = ord(qualities[i]) - base 88 | if bases[i] == "G": 89 | q = cutoff - 1 90 | s += cutoff - q 91 | if s < 0: 92 | break 93 | if s > max_qual: 94 | max_qual = s 95 | max_i = i 96 | return max_i 97 | 98 | 99 | def trim_quality( 100 | sequence: str, quality: str, min_qual: int = 20, min_length: int = 30, phred: int = 33 101 | ) -> Tuple[Union[str, None], Union[str, None]]: 102 | """ 103 | Quality trims a FASTQ read using a BWA-like approach. 104 | 105 | The function trims a nucleotide sequence and its quality scores based on a minimum quality threshold. 106 | If the trimmed sequence is shorter than a minimum length, it returns None. 107 | 108 | Args: 109 | sequence: Nucleotide sequence of the read. 110 | quality: Quality scores of the read, ASCII-encoded. 111 | min_qual: Quality threshold to trim. Defaults to 20. 112 | min_length: Minimum valid length for a read after trimming. Defaults to 30. 113 | phred: Phred encoding format for quality scores (33 or 64). Defaults to 33. 114 | 115 | Returns: 116 | A tuple containing the trimmed sequence and quality scores, 117 | or (None, None) if trimming results in a sequence shorter than `min_length`. 
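    Example:
        A small illustrative sketch (the read below is synthetic, not real data):

            seq = "ACGT" * 10                 # 40 bp read
            qual = "I" * 30 + "#" * 10        # high-quality bases, then a low-quality tail
            new_seq, new_qual = trim_quality(seq, qual, min_qual=20, min_length=30, phred=33)
            # The low-quality tail is trimmed, leaving the first 30 bases;
            # (None, None) would be returned if fewer than `min_length` bases remained.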
118 | """ 119 | if len(sequence) < min_length: 120 | return None, None 121 | 122 | # Get the position at which to trim (number of bases to trim) 123 | cut_index = quality_trim_index(sequence, quality, min_qual, phred) 124 | 125 | # Check if the trimmed sequence would have at least the minimum length 126 | if (cut_index + 1) >= min_length: 127 | new_seq = sequence[:cut_index] 128 | new_qual = quality[:cut_index] 129 | return new_seq, new_qual 130 | else: 131 | return None, None 132 | 133 | 134 | def check_umi_template(umi: str, template: str) -> bool: 135 | """ 136 | Validates that a UMI (molecular barcode) matches a given template pattern. 137 | 138 | Args: 139 | umi: Molecular barcode to validate. 140 | template: Regular expression template describing the expected format of the UMI. 141 | 142 | Returns: 143 | True if the UMI matches the template, False otherwise. 144 | """ 145 | p = re.compile(template) 146 | return p.match(umi) is not None 147 | 148 | 149 | def has_sufficient_content(sequence: str, chars_to_count: str, threshold: float) -> bool: 150 | """ 151 | Checks if the content of specified characters in a sequence meets or exceeds a given threshold. 152 | 153 | Args: 154 | sequence: The sequence to evaluate. 155 | chars_to_count: The characters to count (e.g., "AT"). 156 | threshold: The content threshold as a percentage (0-100). 157 | 158 | Returns: 159 | True if the content of specified characters is greater than or equal to the threshold, False otherwise. 160 | """ 161 | if len(sequence) == 0: 162 | raise ValueError("The sequence cannot be empty.") 163 | if not chars_to_count: 164 | raise ValueError("chars_to_count must not be empty.") 165 | count = sum(sequence.count(char) for char in chars_to_count) 166 | content_percentage = (count / len(sequence)) * 100 167 | return content_percentage >= threshold 168 | -------------------------------------------------------------------------------- /stpipeline/common/filter.py: -------------------------------------------------------------------------------- 1 | """ 2 | A module that contains functions to parse and filter input reads for ST data processing. 3 | """ 4 | 5 | import logging 6 | import os 7 | from typing import Optional, Tuple 8 | 9 | import dnaio 10 | import pysam 11 | 12 | from stpipeline.common.fastq_utils import check_umi_template, has_sufficient_content, remove_adaptor, trim_quality 13 | from stpipeline.common.sam_utils import convert_to_AlignedSegment 14 | 15 | logger = logging.getLogger("STPipeline") 16 | 17 | bam_header = {"HD": {"VN": "1.5", "SO": "unsorted"}, "RG": [{"ID": "0", "SM": "unknown_sample", "PL": "ILLUMINA"}]} 18 | 19 | 20 | def filter_input_data( 21 | fw_file: str, 22 | rv_file: str, 23 | out_file: str, 24 | out_file_discarded: Optional[str], 25 | barcode_length: int, 26 | start_position: int, 27 | filter_AT_content: float, 28 | filter_GC_content: float, 29 | umi_start: int, 30 | umi_end: int, 31 | min_qual: int, 32 | min_length: int, 33 | polyA_min_distance: int, 34 | polyT_min_distance: int, 35 | polyG_min_distance: int, 36 | polyC_min_distance: int, 37 | polyN_min_distance: int, 38 | qual64: bool, 39 | umi_filter: bool, 40 | umi_filter_template: str, 41 | umi_quality_bases: int, 42 | adaptor_missmatches: int, 43 | overhang: int, 44 | disable_umi: bool, 45 | disable_barcode: bool, 46 | disable_trimming: bool, 47 | ) -> Tuple[int, int]: 48 | """ 49 | Handles input read filtering and quality trimming for sequencing data (paired FASTQ files). 
50 | - It performs a sanity check (forward and reverse reads same length and order) 51 | - It performs a BWA-based quality trimming discarding very short reads 52 | - It removes adaptors from the reads (optional) 53 | - It checks for AT and GC content (optional) 54 | - It performs a sanity check on the UMI (optional) 55 | Reads that do not pass the filters are discarded (both R1 and R2) 56 | Reads that pass the filter are written as BAM (R2) 57 | 58 | Args: 59 | fw_file: Path to the FASTQ file containing forward reads (R1). 60 | rv_file: Path to the FASTQ file containing reverse reads (R2). 61 | out_file: Path to the output BAM file. 62 | out_file_discarded: Path to the output FASTQ file for discarded reads. 63 | barcode_length: Length of the barcode sequence. 64 | start_position: Starting position of the barcode in the sequence. 65 | filter_AT_content: Maximum allowed percentage of A and T bases in a read for filtering. 66 | filter_GC_content: Maximum allowed percentage of G and C bases in a read for filtering. 67 | umi_start: Starting position of the UMI in the sequence. 68 | umi_end: Ending position of the UMI in the sequence. 69 | min_qual: Minimum quality threshold for quality trimming. 70 | min_length: Minimum valid length for a read after trimming. 71 | polyA_min_distance: Minimum distance for PolyA adaptor trimming. 72 | polyT_min_distance: Minimum distance for PolyT adaptor trimming. 73 | polyG_min_distance: Minimum distance for PolyG adaptor trimming. 74 | polyC_min_distance: Minimum distance for PolyC adaptor trimming. 75 | polyN_min_distance: Minimum distance for PolyN adaptor trimming. 76 | qual64: True if quality scores are in Phred64 format, False for Phred33. 77 | umi_filter: If True, applies UMI quality template filtering. 78 | umi_filter_template: Template for UMI quality filtering. 79 | umi_quality_bases: Maximum number of low-quality bases allowed in a UMI. 80 | adaptor_missmatches: Number of mismatches allowed when removing adaptors. 81 | overhang: Overhang for barcode extraction. 82 | disable_umi: If True, skips UMI filtering. 83 | disable_barcode: If True, skips barcode extraction. 84 | disable_trimming: If True, does not perform any trimming. 85 | 86 | Returns: 87 | Total reads processed and remaining reads after filtering. 88 | 89 | Raises: 90 | RuntimeError: If input files are missing or errors occur during processing. 
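    Example:
        A usage sketch showing only the call shape; every path and parameter value below is
        hypothetical and should not be read as recommended settings:

            total, kept = filter_input_data(
                fw_file="sample_R1.fastq.gz",
                rv_file="sample_R2.fastq.gz",
                out_file="R2_quality_trimmed.bam",
                out_file_discarded="R2_discarded.fastq",
                barcode_length=18,
                start_position=0,
                filter_AT_content=90.0,
                filter_GC_content=90.0,
                umi_start=18,
                umi_end=27,
                min_qual=20,
                min_length=30,
                polyA_min_distance=10,
                polyT_min_distance=10,
                polyG_min_distance=10,
                polyC_min_distance=10,
                polyN_min_distance=10,
                qual64=False,
                umi_filter=False,
                umi_filter_template="WSNNWSNNV",
                umi_quality_bases=6,
                adaptor_missmatches=0,
                overhang=2,
                disable_umi=False,
                disable_barcode=False,
                disable_trimming=False,
            )
            logger.info(f"{kept}/{total} read pairs passed the filters")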
91 | """ 92 | if not os.path.isfile(fw_file) or not os.path.isfile(rv_file): 93 | error = f"Error doing quality trimming, input file/s not present {fw_file} {rv_file}" 94 | logger.error(error) 95 | raise RuntimeError(error) 96 | 97 | keep_discarded_files = out_file_discarded is not None 98 | 99 | # Build fake sequence adaptors with the parameters given 100 | adaptorA = "".join("A" for _ in range(polyA_min_distance)) 101 | adaptorT = "".join("T" for _ in range(polyT_min_distance)) 102 | adaptorG = "".join("G" for _ in range(polyG_min_distance)) 103 | adaptorC = "".join("C" for _ in range(polyC_min_distance)) 104 | adaptorN = "".join("N" for _ in range(polyN_min_distance)) 105 | 106 | # Quality format 107 | phred = 64 if qual64 else 33 108 | 109 | # Some counters 110 | total_reads = 0 111 | dropped_umi = 0 112 | dropped_umi_template = 0 113 | dropped_AT = 0 114 | dropped_GC = 0 115 | dropped_adaptor = 0 116 | too_short_after_trimming = 0 117 | 118 | bam_file = pysam.AlignmentFile(out_file, "wb", header=bam_header) 119 | if keep_discarded_files: 120 | out_writer_discarded = dnaio.open(out_file_discarded, mode="w") # type: ignore 121 | 122 | try: 123 | with dnaio.open(fw_file, rv_file) as reader: 124 | for r1, r2 in reader: 125 | header_fw, sequence_fw, quality_fw = r1.name, r1.sequence, r1.qualities 126 | header_rv, sequence_rv, quality_rv = r2.name, r2.sequence, r2.qualities 127 | discard_read = False 128 | total_reads += 1 129 | 130 | if header_fw.split()[0] != header_rv.split()[0]: 131 | logger.warning(f"Pair reads found with different names {header_fw} and {header_rv}.") 132 | 133 | if not disable_barcode: 134 | barcode = sequence_fw[ 135 | max(0, start_position - overhang) : (start_position + barcode_length + overhang) 136 | ] 137 | else: 138 | barcode = None 139 | 140 | if not disable_umi: 141 | umi_seq = sequence_fw[umi_start:umi_end] 142 | if umi_filter and not disable_trimming and not check_umi_template(umi_seq, umi_filter_template): 143 | dropped_umi_template += 1 144 | discard_read = True 145 | 146 | umi_qual = quality_fw[umi_start:umi_end] 147 | if ( 148 | not discard_read 149 | and not disable_trimming 150 | and len([b for b in umi_qual if (ord(b) - phred) < min_qual]) > umi_quality_bases 151 | ): 152 | dropped_umi += 1 153 | discard_read = True 154 | else: 155 | umi_seq = None 156 | 157 | if ( 158 | not discard_read 159 | and not disable_trimming 160 | and filter_AT_content > 0 161 | and has_sufficient_content(sequence_rv, "AT", filter_AT_content) 162 | ): 163 | dropped_AT += 1 164 | discard_read = True 165 | 166 | if ( 167 | not discard_read 168 | and not disable_trimming 169 | and filter_GC_content > 0 170 | and has_sufficient_content(sequence_rv, "GC", filter_GC_content) 171 | ): 172 | dropped_GC += 1 173 | discard_read = True 174 | 175 | if not discard_read and not disable_trimming: 176 | if polyA_min_distance >= 5 and len(sequence_rv) > min_length: 177 | sequence_rv, quality_rv = remove_adaptor(sequence_rv, quality_rv, adaptorA, adaptor_missmatches) 178 | if polyT_min_distance >= 5 and len(sequence_rv) > min_length: 179 | sequence_rv, quality_rv = remove_adaptor(sequence_rv, quality_rv, adaptorT, adaptor_missmatches) 180 | if polyG_min_distance >= 5 and len(sequence_rv) > min_length: 181 | sequence_rv, quality_rv = remove_adaptor(sequence_rv, quality_rv, adaptorG, adaptor_missmatches) 182 | if polyC_min_distance >= 5 and len(sequence_rv) > min_length: 183 | sequence_rv, quality_rv = remove_adaptor(sequence_rv, quality_rv, adaptorC, adaptor_missmatches) 184 | if 
polyN_min_distance >= 5 and len(sequence_rv) > min_length: 185 | sequence_rv, quality_rv = remove_adaptor(sequence_rv, quality_rv, adaptorN, adaptor_missmatches) 186 | 187 | if len(sequence_rv) < min_length: 188 | dropped_adaptor += 1 189 | discard_read = True 190 | 191 | if not discard_read and not disable_trimming: 192 | sequence_rv, quality_rv = trim_quality(sequence_rv, quality_rv, min_qual, min_length, phred) 193 | if not sequence_rv or not quality_rv: 194 | too_short_after_trimming += 1 195 | discard_read = True 196 | 197 | if not discard_read: 198 | bam_file.write(convert_to_AlignedSegment(header_rv, sequence_rv, quality_rv, barcode, umi_seq)) 199 | elif keep_discarded_files: 200 | out_writer_discarded.write(dnaio.SequenceRecord(r2.name, r2.sequence, r2.qualities)) 201 | except Exception as e: 202 | logger.error(f"Error during quality trimming: {e}") 203 | raise e 204 | finally: 205 | bam_file.close() 206 | if keep_discarded_files: 207 | out_writer_discarded.close() 208 | 209 | dropped_rv = ( 210 | dropped_umi + dropped_umi_template + dropped_AT + dropped_GC + dropped_adaptor + too_short_after_trimming 211 | ) 212 | logger.info(f"Trimming stats total reads (pair): {total_reads}") 213 | logger.info(f"Trimming stats {dropped_rv} reads have been dropped!") 214 | logger.info(f"Trimming stats you just lost about {(dropped_rv / total_reads):.2%} of your data") 215 | logger.info(f"Trimming stats reads remaining: {total_reads - dropped_rv}") 216 | logger.info(f"Trimming stats dropped pairs due to incorrect UMI: {dropped_umi_template}") 217 | logger.info(f"Trimming stats dropped pairs due to low quality UMI: {dropped_umi}") 218 | logger.info(f"Trimming stats dropped pairs due to high AT content: {dropped_AT}") 219 | logger.info(f"Trimming stats dropped pairs due to high GC content: {dropped_GC}") 220 | logger.info(f"Trimming stats dropped pairs due to presence of artifacts: {dropped_adaptor}") 221 | logger.info(f"Trimming stats dropped pairs due to being too short: {too_short_after_trimming}") 222 | 223 | return total_reads, total_reads - dropped_rv 224 | -------------------------------------------------------------------------------- /stpipeline/common/gff_reader.py: -------------------------------------------------------------------------------- 1 | """ 2 | A simple module to parse a GFF/GTF file and query it 3 | """ 4 | 5 | import gzip 6 | import re 7 | from typing import Any, Dict, Generator, List, Optional, Union 8 | 9 | # Code snipped from: 10 | # https://gist.github.com/slowkow/8101481 11 | 12 | GTF_HEADER = ["seqname", "source", "feature", "start", "end", "score", "strand", "frame"] 13 | R_SEMICOLON = re.compile(r"\s*;\s*") 14 | R_COMMA = re.compile(r"\s*,\s*") 15 | R_KEYVALUE = re.compile(r"(\s+|\s*=\s*)") 16 | 17 | 18 | def gff_lines(filename: str) -> Generator[Dict[str, Any], None, None]: 19 | """ 20 | Opens an optionally gzipped GTF/GFF file and generates a dictionary for each line. 21 | 22 | Args: 23 | filename: Path to the GTF/GFF file. The file can be gzipped. 24 | 25 | Yields: 26 | Parsed fields from each GTF/GFF line. 27 | """ 28 | fn_open = gzip.open if filename.endswith(".gz") else open 29 | # 'rt' ensures reading text from gzipped file 30 | with fn_open(filename, "rt") as fh: 31 | for line in fh: 32 | if line.startswith("#"): 33 | continue 34 | else: 35 | yield gff_parse(line) 36 | 37 | 38 | def _get_value(value: Optional[str]) -> Optional[Union[str, List[Optional[str]]]]: 39 | """ 40 | Processes a value from the GTF/GFF file, stripping quotes and handling lists. 
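    For example (the attribute values below are made up for illustration):

        _get_value('"ENSG00000223972"')   # -> 'ENSG00000223972'
        _get_value("exon_1,exon_2")       # -> ['exon_1', 'exon_2']
        _get_value(".")                   # -> None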
41 | 42 | Args: 43 | value: The value to process. 44 | 45 | Returns: 46 | Processed value, or None if the value is equivalent to null. 47 | """ 48 | if not value: 49 | return None 50 | # Strip double and single quotes 51 | value = value.strip("\"'") 52 | if "," in value: 53 | # Return a list if the value contains commas 54 | value = re.split(R_COMMA, value) # type: ignore 55 | # Handle equivalent-to-null values 56 | elif value in ["", ".", "NA"]: 57 | return None 58 | return value 59 | 60 | 61 | def gff_parse(line: str) -> Dict[str, Any]: 62 | """ 63 | Parses a single GTF/GFF line and returns a dictionary of its fields. 64 | 65 | Args: 66 | line: A single line from a GTF/GFF file. 67 | 68 | Returns: 69 | Parsed fields from the line as key-value pairs. 70 | """ 71 | result: Dict[str, Any] = {} 72 | fields = line.rstrip().split("\t") 73 | 74 | # Parse standard fields 75 | for i, col in enumerate(GTF_HEADER): 76 | result[col] = _get_value(fields[i]) 77 | 78 | # Parse INFO field 79 | infos = [x for x in re.split(R_SEMICOLON, fields[8]) if x.strip()] 80 | for i, info in enumerate(infos, 1): 81 | try: 82 | # Parse "key=value" 83 | key, _, value = re.split(R_KEYVALUE, info, maxsplit=1) 84 | except ValueError: 85 | # Use INFO1, INFO2, etc. for unnamed values 86 | key = f"INFO{i}" 87 | value = info 88 | # Ignore fields with no value 89 | if value: 90 | result[key] = _get_value(value) 91 | 92 | return result 93 | -------------------------------------------------------------------------------- /stpipeline/common/sam_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains some functions and utilities for SAM/BAM files 3 | """ 4 | 5 | import math 6 | import os 7 | from typing import List 8 | 9 | import pysam 10 | 11 | 12 | def split_bam(input_bam: str, temp_dir: str, threads: int) -> List[str]: 13 | """ 14 | Splits a BAM file into chunks with equal read counts. The number of chunks 15 | equals the number of CPU cores specified. 16 | 17 | Args: 18 | input_bam: Path to the BAM file to be split. 19 | temp_dir: Directory where the created files will be stored. 20 | threads: Number of CPU cores to use for splitting. 21 | 22 | Returns: 23 | List of paths to the split BAM files. 24 | """ 25 | pysam.index(input_bam, os.path.join(temp_dir, f"{input_bam}.bai")) # type: ignore 26 | input_bamfile = pysam.AlignmentFile(input_bam, mode="rb") 27 | assert input_bamfile.check_index() 28 | 29 | output_file_names = {part: os.path.join(temp_dir, f"{input_bam}.part_{part}.bam") for part in range(threads)} 30 | 31 | output_bamfiles = { 32 | part: pysam.AlignmentFile(file_name, mode="wb", template=input_bamfile) 33 | for part, file_name in output_file_names.items() 34 | } 35 | 36 | total_read_count = input_bamfile.mapped + input_bamfile.unmapped 37 | reads_per_part = math.ceil(total_read_count / threads) 38 | read_counter = 0 39 | part = 0 40 | for record in input_bamfile.fetch(until_eof=True): 41 | output_bamfiles[part].write(record) 42 | read_counter += 1 43 | if read_counter == reads_per_part: 44 | part += 1 45 | read_counter = 0 46 | 47 | input_bamfile.close() 48 | return list(output_file_names.values()) 49 | 50 | 51 | def convert_to_AlignedSegment( 52 | header: str, sequence: str, quality: str, barcode_sequence: str, umi_sequence: str 53 | ) -> pysam.AlignedSegment: 54 | """ 55 | Converts input variables to an unaligned `pysam.AlignedSegment` with UMI and 56 | barcode information as tags. 57 | 58 | Args: 59 | header: Header information for the segment. 
60 | sequence: DNA/RNA sequence. 61 | quality: Base calling quality values. 62 | barcode_sequence: Barcode sequence. 63 | umi_sequence: Unique molecular identifier sequence. 64 | 65 | Returns: 66 | A new AlignedSegment object with the provided data. 67 | """ 68 | aligned_segment = pysam.AlignedSegment() 69 | aligned_segment.query_name = header.split()[0] 70 | aligned_segment.query_sequence = sequence 71 | aligned_segment.query_qualities = pysam.qualitystring_to_array(quality) 72 | aligned_segment.flag |= pysam.FUNMAP 73 | aligned_segment.set_tag("B0", barcode_sequence) 74 | aligned_segment.set_tag("B3", umi_sequence) 75 | aligned_segment.set_tag("RG", "0") 76 | return aligned_segment 77 | 78 | 79 | def merge_bam(merged_file_name: str, files_to_merge: List[str], ubam: bool = False) -> int: 80 | """ 81 | Merges multiple partial BAM files into a single file. 82 | 83 | Args: 84 | merged_file_name: Path to the output merged BAM file. 85 | files_to_merge: List of paths to the partial BAM files. 86 | ubam: Indicates if the files are unaligned BAM (uBAM). Default is False. 87 | 88 | Returns: 89 | Total number of records in the merged BAM file. 90 | """ 91 | assert files_to_merge, "The list of files to merge cannot be empty." 92 | num_records = 0 93 | 94 | with pysam.AlignmentFile(files_to_merge[0], mode="rb", check_sq=not ubam) as input_bamfile: 95 | merged_file = pysam.AlignmentFile(merged_file_name, mode="wb", template=input_bamfile) 96 | for file_name in files_to_merge: 97 | with pysam.AlignmentFile(file_name, mode="rb", check_sq=not ubam) as input_file: 98 | for record in input_file.fetch(until_eof=True): 99 | merged_file.write(record) 100 | num_records += 1 101 | merged_file.close() 102 | 103 | return num_records 104 | -------------------------------------------------------------------------------- /stpipeline/common/saturation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains routines 3 | to compute saturation points on a 4 | set of annotated reads in BAM/SAM format 5 | """ 6 | 7 | import logging 8 | import math 9 | import os 10 | import random 11 | from collections import defaultdict 12 | from typing import Dict, List, Optional, Tuple 13 | 14 | import pysam 15 | 16 | from stpipeline.common.dataset import createDataset 17 | from stpipeline.common.utils import safe_remove 18 | 19 | logger = logging.getLogger("STPipeline") 20 | 21 | 22 | def compute_saturation( 23 | nreads: int, 24 | annotated_reads: str, 25 | gff_filename: str, 26 | umi_cluster_algorithm: str, 27 | umi_allowed_mismatches: int, 28 | umi_counting_offset: int, 29 | disable_umi: bool, 30 | expName: str, 31 | temp_folder: str, 32 | saturation_points: Optional[List[int]] = None, 33 | ) -> None: 34 | """ 35 | Computes saturation points from annotated reads and logs the results. 36 | 37 | Args: 38 | nreads: Total number of reads in the annotated_reads file. 39 | annotated_reads: Path to a BAM file with the annotated reads. 40 | gff_filename: Path to the GFF file. 41 | umi_cluster_algorithm: Clustering algorithm for UMIs. 42 | umi_allowed_mismatches: Number of allowed mismatches. 43 | umi_counting_offset: Number of bases allowed as offset when counting UMIs. 44 | disable_umi: If True, disables UMI filtering. 45 | expName: Experiment name for logging and file organization. 46 | temp_folder: Path to temporary folder for intermediate files. 47 | saturation_points: List of saturation points to be used. 
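    Example:
        A minimal call sketch; the file names and settings are hypothetical:

            compute_saturation(
                nreads=1_000_000,
                annotated_reads="annotated_sorted.bam",
                gff_filename="Homo_sapiens.GRCh38.79.gtf",
                umi_cluster_algorithm="AdjacentBi",
                umi_allowed_mismatches=1,
                umi_counting_offset=250,
                disable_umi=False,
                expName="my_experiment",
                temp_folder="/tmp/st_run",
                saturation_points=[10_000, 100_000, 500_000],
            )
            # Results are written to the log; points above `nreads` are dropped.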
48 | 49 | Raises: 50 | RuntimeError: If the input file is missing or invalid. 51 | """ 52 | if not os.path.isfile(annotated_reads): 53 | msg = f"Error, input file not present: {annotated_reads}" 54 | logger.error(msg) 55 | raise RuntimeError(msg) 56 | 57 | saturation_points = _determine_saturation_points(nreads, saturation_points) 58 | files, file_names, subsampling = _generate_subsamples(nreads, annotated_reads, saturation_points, temp_folder) 59 | 60 | _write_subsamples_to_files(files, subsampling, annotated_reads, saturation_points) 61 | 62 | results = _compute_saturation_metrics( 63 | file_names, 64 | saturation_points, 65 | gff_filename, 66 | umi_cluster_algorithm, 67 | umi_allowed_mismatches, 68 | umi_counting_offset, 69 | disable_umi, 70 | temp_folder, 71 | expName, 72 | ) 73 | 74 | _cleanup_files(file_names) 75 | 76 | # TODO write this to a CSV file 77 | logger.info("Saturation points: %s", ", ".join(map(str, saturation_points))) 78 | logger.info("Reads per saturation point: %s", ", ".join(map(str, results["reads"]))) 79 | logger.info("Genes per saturation point: %s", ", ".join(map(str, results["genes"]))) 80 | logger.info("Average genes/spot per saturation point: %s", ", ".join(map(str, results["avg_genes"]))) 81 | logger.info("Average reads/spot per saturation point: %s", ", ".join(map(str, results["avg_reads"]))) 82 | 83 | 84 | def _determine_saturation_points(nreads: int, saturation_points: Optional[List[int]]) -> List[int]: 85 | """ 86 | Creates a list of saturation points using the total number of reads. If points are 87 | provided, they are filtered to those below the total read count; otherwise they are 88 | drawn from an exponential distribution. 89 | """ 90 | if saturation_points is None: 91 | saturation_points = [int(math.floor(1e3 + (math.exp(x) * 1e3))) for x in range(15)] 92 | points = [p for p in sorted(saturation_points) if p < nreads] 93 | if not points: 94 | msg = "All saturation points are greater than the number of reads." 95 | logger.error(msg) 96 | raise RuntimeError(msg) 97 | return points 98 | 99 | 100 | def _generate_subsamples( 101 | nreads: int, annotated_reads: str, saturation_points: List[int], temp_folder: Optional[str] 102 | ) -> Tuple[Dict[int, pysam.AlignmentFile], Dict[int, str], Dict[int, List[int]]]: 103 | """ 104 | Creates the saturation files for each saturation point. 105 | """ 106 | file_ext = os.path.splitext(annotated_reads)[1].lower() 107 | flag_read, flag_write = ("rb", "wb") if file_ext != ".sam" else ("r", "wh") 108 | 109 | annotated_sam = pysam.AlignmentFile(annotated_reads, flag_read) # type: ignore 110 | files, file_names, subsampling = {}, {}, {} 111 | 112 | for spoint in saturation_points: 113 | file_name = f"subsample_{spoint}{file_ext}" 114 | file_name = os.path.join(temp_folder, file_name) if temp_folder else file_name 115 | files[spoint] = pysam.AlignmentFile(file_name, flag_write, template=annotated_sam) # type: ignore 116 | file_names[spoint] = file_name 117 | indices = list(range(nreads)) 118 | random.shuffle(indices) 119 | subsampling[spoint] = sorted(indices[:spoint]) 120 | 121 | annotated_sam.close() 122 | return files, file_names, subsampling 123 | 124 | 125 | def _write_subsamples_to_files( 126 | files: Dict[int, pysam.AlignmentFile], 127 | subsampling: Dict[int, List[int]], 128 | annotated_reads: str, 129 | saturation_points: List[int], 130 | ) -> None: 131 | """ 132 | Fills each saturation point file with the corresponding subsample of the annotated reads.
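    Illustrative example of the bookkeeping (the numbers are made up): with `nreads=10` and a
    saturation point of 4, `subsampling[4]` could be the sorted index list [1, 4, 6, 9]; while
    streaming the BAM file in order, only the reads at those running indexes are written to `files[4]`.

        import random
        nreads, spoint = 10, 4
        picked = sorted(random.sample(range(nreads), spoint))  # e.g. [1, 4, 6, 9]
        # equivalent to the shuffle-and-slice selection performed in _generate_subsamples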
133 | """ 134 | annotated_sam = pysam.AlignmentFile(annotated_reads, "rb") 135 | index = 0 136 | sub_indexes = defaultdict(int) # type: ignore 137 | 138 | for read in annotated_sam.fetch(until_eof=True): 139 | for spoint in saturation_points: 140 | sub_index = sub_indexes[spoint] 141 | if sub_index < len(subsampling[spoint]) and subsampling[spoint][sub_index] == index: 142 | files[spoint].write(read) 143 | sub_indexes[spoint] += 1 144 | index += 1 145 | 146 | annotated_sam.close() 147 | for file_sam in files.values(): 148 | file_sam.close() 149 | 150 | 151 | def _compute_saturation_metrics( 152 | file_names: Dict[int, str], 153 | saturation_points: List[int], 154 | gff_filename: str, 155 | umi_cluster_algorithm: str, 156 | umi_allowed_mismatches: int, 157 | umi_counting_offset: int, 158 | disable_umi: bool, 159 | temp_folder: str, 160 | expName: str, 161 | ) -> Dict[str, List[int]]: 162 | """ 163 | Generates the dataset for each saturation point (file) and fetch stats. 164 | """ 165 | results = {"reads": [], "genes": [], "avg_genes": [], "avg_reads": []} # type: ignore 166 | 167 | for spoint in saturation_points: 168 | input_file = file_names[spoint] 169 | try: 170 | stats = createDataset( 171 | input_file, 172 | temp_folder, 173 | gff_filename, 174 | umi_cluster_algorithm, 175 | umi_allowed_mismatches, 176 | umi_counting_offset, 177 | disable_umi, 178 | expName, 179 | verbose=False, 180 | ) 181 | except Exception as e: 182 | logger.error("Error computing saturation curve: createDataset failed.") 183 | raise e 184 | 185 | results["reads"].append(stats["reads_after_duplicates_removal"]) 186 | results["genes"].append(stats["genes_found"]) 187 | results["avg_genes"].append(stats["average_genes_feature"]) 188 | results["avg_reads"].append(stats["average_reads_feature"]) 189 | 190 | return results 191 | 192 | 193 | def _cleanup_files(file_names: Dict[int, str]) -> None: 194 | """ 195 | Remove the files created during the saturation curve. 196 | """ 197 | for file_name in file_names.values(): 198 | safe_remove(file_name) 199 | -------------------------------------------------------------------------------- /stpipeline/common/stats.py: -------------------------------------------------------------------------------- 1 | """ 2 | This shared object is used to collect 3 | different statistics and QA parameters for 4 | the pipeline run. 5 | """ 6 | 7 | import json 8 | from dataclasses import asdict, dataclass, field 9 | from typing import List 10 | 11 | 12 | @dataclass 13 | class Stats: 14 | """ 15 | Stats collects information and statistics for use in the ST pipeline. 16 | """ 17 | 18 | input_reads_forward: int = 0 19 | input_reads_reverse: int = 0 20 | reads_after_trimming_forward: int = 0 21 | reads_after_trimming_reverse: int = 0 22 | reads_after_rRNA_trimming: int = 0 23 | reads_after_mapping: int = 0 24 | reads_after_annotation: int = 0 25 | reads_after_demultiplexing: int = 0 26 | reads_after_duplicates_removal: int = 0 27 | genes_found: int = 0 28 | duplicates_found: int = 0 29 | pipeline_version: str = "-" 30 | mapper_tool: str = "-" 31 | annotation_tool: str = "-" 32 | demultiplex_tool: str = "-" 33 | input_parameters: List[str] = field(default_factory=list) 34 | max_genes_feature: int = 0 35 | min_genes_feature: int = 0 36 | max_reads_feature: int = 0 37 | min_reads_feature: int = 0 38 | average_gene_feature: float = 0.0 39 | average_reads_feature: float = 0.0 40 | 41 | def __str__(self) -> str: 42 | """ 43 | Returns a string representation of the Stats object. 
44 | 45 | Returns: 46 | A formatted string of all stats attributes. 47 | """ 48 | return "\n".join(f"{field_name}: {getattr(self, field_name)}" for field_name in self.__dataclass_fields__) 49 | 50 | def write_json(self, filename: str) -> None: 51 | """ 52 | Writes the stats to a JSON file. 53 | 54 | Args: 55 | filename: The path to the JSON file to write. 56 | """ 57 | with open(filename, "w") as file: 58 | json.dump(asdict(self), file, indent=2, separators=(",", ": ")) 59 | 60 | @classmethod 61 | def from_json(cls, filename: str) -> "Stats": 62 | """ 63 | Creates a Stats object from a JSON file. 64 | 65 | Args: 66 | filename: The path to the JSON file to read. 67 | 68 | Returns: 69 | A Stats object populated with data from the JSON file. 70 | """ 71 | with open(filename, "r") as file: 72 | data = json.load(file) 73 | return cls(**data) 74 | -------------------------------------------------------------------------------- /stpipeline/common/transcript.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class Transcript: 6 | """ 7 | Represents a transcript with associated genomic and metadata information. 8 | 9 | Attributes: 10 | chrom (str): Chromosome name. 11 | start (int): Start position of the transcript. 12 | end (int): End position of the transcript. 13 | clear_name (str): Clear name of the transcript. 14 | mapping_quality (int): Mapping quality score. 15 | strand (str): Strand information ('+' or '-'). 16 | umi (str): Unique Molecular Identifier (UMI). 17 | """ 18 | 19 | chrom: str 20 | start: int 21 | end: int 22 | clear_name: str 23 | mapping_quality: int 24 | strand: str 25 | umi: str 26 | -------------------------------------------------------------------------------- /stpipeline/common/unique_events_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module defines the GeneBuffer class and the parse_unique_events function. 3 | """ 4 | 5 | import logging 6 | import operator 7 | import os 8 | from typing import Dict, Generator, List, Optional, Tuple 9 | 10 | import pysam 11 | 12 | from stpipeline.common.gff_reader import gff_lines 13 | from stpipeline.common.transcript import Transcript 14 | 15 | logger = logging.getLogger("STPipeline") 16 | 17 | 18 | class GeneBuffer: 19 | """ 20 | This object defines a buffer by holding a dictionary 21 | of genes, spot coordinates, and transcripts. 22 | It assumes the transcripts are added in a coordinate-ordered fashion. 23 | 24 | Attributes: 25 | buffer: Dictionary storing gene data. 26 | last_position: Last genomic position processed. 27 | last_chromosome: Last chromosome processed. 28 | gene_end_coordinates: Dictionary mapping gene IDs to their end coordinates and chromosomes. 29 | """ 30 | 31 | def __init__(self, gff_filename: str): 32 | """ 33 | Initializes the GeneBuffer object and computes gene end coordinates from a GFF file. 34 | 35 | Args: 36 | gff_filename: Path to the GFF file containing gene annotations. 
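        Example:
            A usage sketch; the GTF path and gene ID below are hypothetical:

                buffer = GeneBuffer("annotations/chr19.gtf")
                chrom, end = buffer.get_gene_end_position("ENSG00000104881")
                # `buffer.gene_end_coordinates` maps every gene_id in the GTF
                # to its chromosome and right-most end coordinate.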
37 | """ 38 | if not gff_filename or not os.path.isfile(gff_filename): 39 | raise ValueError(f"Annotation file {gff_filename} not found.") 40 | self.buffer: Dict[str, Dict[Tuple[int, int], List[Transcript]]] = {} 41 | self.last_position = 0 42 | self.last_chromosome = "chrom" 43 | self.gene_end_coordinates: Dict[str, Tuple[Optional[str], int]] = {} 44 | self.compute_gene_end_coordinates(gff_filename) 45 | 46 | def compute_gene_end_coordinates(self, gff_filename: str) -> None: 47 | """ 48 | Reads the end coordinates and chromosomes of all genes present in the GFF file 49 | and saves them in a dictionary with the gene ID as the key. 50 | 51 | Args: 52 | gff_filename: Path to the GFF file. 53 | 54 | Raises: 55 | ValueError: If the gene_id attribute is missing in the GFF file. 56 | """ 57 | logger.debug(f"Parsing GFF file {gff_filename} to compute gene end coordinates.") 58 | 59 | # Parse the annotation file to add gene annotations to dictionary 60 | gene_end_coordinates: Dict[str, Tuple[Optional[str], int]] = {} 61 | for line in gff_lines(gff_filename): 62 | seqname = line["seqname"] 63 | end = int(line["end"]) 64 | gene_id = line.get("gene_id", None) 65 | if not gene_id: 66 | msg = f"The gene_id attribute is missing in the annotation file ({gff_filename})." 67 | logger.error(msg) 68 | raise ValueError(msg) 69 | if gene_id[0] == '"' and gene_id[-1] == '"': 70 | gene_id = gene_id[1:-1] 71 | if gene_id in gene_end_coordinates: 72 | if end > gene_end_coordinates[gene_id][1]: 73 | gene_end_coordinates[gene_id] = (seqname, end) 74 | else: 75 | gene_end_coordinates[gene_id] = (seqname, end) 76 | # include the __no_feature gene 77 | gene_end_coordinates["__no_feature"] = (None, -1) 78 | self.gene_end_coordinates = gene_end_coordinates 79 | 80 | def get_gene_end_position(self, gene: str) -> Tuple[Optional[str], int]: 81 | """ 82 | Returns the genomic end coordinate and chromosome of the given gene. 83 | 84 | Args: 85 | gene: Gene ID. 86 | 87 | Returns: 88 | Chromosome and end coordinate of the gene. 89 | 90 | Raises: 91 | ValueError: If the gene is not found in the annotation file or is ambiguous. 92 | """ 93 | try: 94 | return self.gene_end_coordinates[gene] 95 | except KeyError: 96 | # Handle ambigous genes as defined in htseq 97 | if "__ambiguous[" in gene: 98 | ambiguous_genes = gene[gene.index("[") + 1 : gene.index("]")].split("+") 99 | try: 100 | return max( 101 | [self.gene_end_coordinates[amb_gene] for amb_gene in ambiguous_genes], 102 | key=operator.itemgetter(1), 103 | ) 104 | except KeyError: 105 | msg = f"Ambiguous gene {gene} not found in annotation file." 106 | logger.error(msg) 107 | raise ValueError(msg) from None 108 | else: 109 | msg = f"Gene {gene} not found in annotation file." 110 | logger.error(msg) 111 | raise ValueError(msg) from None 112 | 113 | def add_transcript( 114 | self, gene: str, spot_coordinates: Tuple[int, int], transcript: Transcript, position: int 115 | ) -> None: 116 | """ 117 | Adds a transcript to the gene buffer. 118 | 119 | Args: 120 | gene: Gene name. 121 | spot_coordinates: Spot coordinates (x, y). 122 | transcript: Transcript information. 123 | position: Transcript's left-most genomic coordinate. 
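        Example:
            Continuing the sketch above; the transcript values are made up:

                t = Transcript(chrom="19", start=1000, end=1300, clear_name="read_1",
                               mapping_quality=60, strand="+", umi="ACGTACGT")
                buffer.add_transcript("ENSG00000104881", (10, 12), t, position=1000)
                for gene, spots in buffer.check_and_clear_buffer():
                    ...  # genes whose end coordinate lies behind the last position seen are flushed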
124 | """ 125 | self.last_position = position 126 | self.last_chromosome = transcript.chrom 127 | self.buffer.setdefault(gene, {}).setdefault(spot_coordinates, []).append(transcript) 128 | 129 | def check_and_clear_buffer( 130 | self, empty: bool = False 131 | ) -> Generator[Tuple[str, Dict[Tuple[int, int], List[Transcript]]], None, None]: 132 | """ 133 | Checks and clears the buffer, yielding genes that are outside the current chromosome or position. 134 | 135 | Args: 136 | empty: If True, forces clearing the buffer. 137 | 138 | Yields: 139 | Gene name and its buffer content. 140 | """ 141 | for gene in list(self.buffer.keys()): 142 | if gene == "__no_feature" and not empty: 143 | continue 144 | chrom, end_position = self.get_gene_end_position(gene) 145 | if empty or self.last_position > end_position or self.last_chromosome != chrom: 146 | yield gene, self.buffer.pop(gene) 147 | 148 | 149 | def parse_unique_events( 150 | input_file: str, gff_filename: Optional[str] = None 151 | ) -> Generator[Tuple[str, Dict[Tuple[int, int], List[Transcript]]], None, None]: 152 | """ 153 | Parses transcripts from a coordinate-sorted BAM file and organizes them by gene and spot coordinates. 154 | 155 | Args: 156 | input_file: Path to the input BAM file containing annotated records. 157 | gff_filename: Path to the GFF file containing gene coordinates. 158 | 159 | Yields: 160 | Gene name and a dictionary mapping spot coordinates to transcripts. 161 | """ 162 | genes_buffer = GeneBuffer(gff_filename) if gff_filename else None 163 | genes_dict: Dict[str, Dict[Tuple[int, int], List[Transcript]]] = {} 164 | sam_file = pysam.AlignmentFile(input_file, "rb") 165 | 166 | for rec in sam_file.fetch(until_eof=True): 167 | clear_name = rec.query_name 168 | mapping_quality = rec.mapping_quality 169 | start = rec.reference_start - rec.query_alignment_start 170 | end = rec.reference_end + (rec.query_length - rec.query_alignment_end) # type: ignore 171 | chrom = sam_file.get_reference_name(rec.reference_id) 172 | strand = "-" if rec.is_reverse else "+" 173 | 174 | if strand == "-": 175 | start, end = end, start 176 | 177 | x, y, gene, umi = -1, -1, "None", "None" 178 | for k, v in rec.get_tags(): 179 | if k == "B1": 180 | x = int(v) 181 | elif k == "B2": 182 | y = int(v) 183 | elif k == "XF": 184 | gene = str(v) 185 | elif k == "B3": 186 | umi = str(v) 187 | 188 | transcript = Transcript(chrom, start, end, clear_name, mapping_quality, strand, umi) # type: ignore 189 | 190 | if genes_buffer: 191 | genes_buffer.add_transcript(gene, (x, y), transcript, rec.reference_start) 192 | for g, t in genes_buffer.check_and_clear_buffer(): 193 | yield g, t 194 | else: 195 | genes_dict.setdefault(gene, {}).setdefault((x, y), []).append(transcript) 196 | 197 | sam_file.close() 198 | 199 | if genes_buffer: 200 | for g, t in genes_buffer.check_and_clear_buffer(True): 201 | yield g, t 202 | else: 203 | yield from genes_dict.items() 204 | -------------------------------------------------------------------------------- /stpipeline/common/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains some general utilities 3 | """ 4 | 5 | import os 6 | import shutil 7 | import subprocess 8 | import threading 9 | from datetime import datetime 10 | from typing import IO, Any, Optional 11 | 12 | 13 | def which_program(program: str) -> bool: 14 | """ 15 | Check if a program is installed and available in the system's PATH. 16 | 17 | Args: 18 | program: The name of the program to check. 
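    For example (illustrative):

        if not which_program("STAR"):
            raise RuntimeError("STAR was not found in PATH")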
19 | 20 | Returns: 21 | True if the program is found and executable, False otherwise. 22 | """ 23 | return shutil.which(program) is not None 24 | 25 | 26 | class TimeStamper: 27 | """ 28 | Thread-safe time stamper to generate unique numeric timestamps. 29 | """ 30 | 31 | def __init__(self) -> None: 32 | self.lock = threading.Lock() 33 | self.prev: Optional[datetime] = None 34 | self.count: int = 0 35 | 36 | def get_timestamp(self) -> datetime: 37 | """ 38 | Generates a unique numeric timestamp. 39 | 40 | Returns: 41 | A unique timestamp as a datetime object. 42 | """ 43 | with self.lock: 44 | ts = datetime.now() 45 | if ts == self.prev: 46 | ts += ".%04d" % self.count # type: ignore 47 | self.count += 1 48 | else: 49 | self.prev = ts 50 | self.count = 1 51 | return ts 52 | 53 | 54 | def safe_remove(filename: Optional[str]) -> None: 55 | """ 56 | Safely removes a file if it exists. 57 | 58 | Args: 59 | filename: Path to the file. 60 | """ 61 | if filename and os.path.isfile(filename): 62 | try: 63 | os.remove(filename) 64 | except Exception as e: 65 | print(f"Error removing file {filename}: {e}") 66 | 67 | 68 | def safe_open_file(filename: str, mode: str) -> IO[Any]: 69 | """ 70 | Safely opens a file. 71 | 72 | For write mode, removes the previous file if it exists. 73 | For read mode, checks that the file exists. 74 | 75 | Args: 76 | filename: Path to the file. 77 | mode: File open mode. 78 | 79 | Returns: 80 | The opened file descriptor. 81 | 82 | Raises: 83 | IOError: If the file does not exist for read mode or invalid mode is provided. 84 | """ 85 | if mode not in ["w", "r"]: 86 | raise IOError(f"Error: Invalid mode: {mode}") 87 | if "w" in mode: 88 | safe_remove(filename) 89 | elif "r" in mode and not os.path.isfile(filename): 90 | raise IOError(f"Error: File does not exist: {filename}") 91 | 92 | return open(filename, mode) 93 | 94 | 95 | def file_ok(file: Optional[str]) -> bool: 96 | """ 97 | Checks if a file exists and is not empty. 98 | 99 | Args: 100 | file: Path to the file. 101 | 102 | Returns: 103 | True if the file exists and is not empty, otherwise False. 104 | """ 105 | return file is not None and os.path.isfile(file) and os.path.getsize(file) > 0 106 | 107 | 108 | def get_star_version() -> str: 109 | """ 110 | Gets the version of the STAR binary. 111 | 112 | Returns: 113 | The version of STAR or "Not available" if not found. 114 | """ 115 | try: 116 | proc = subprocess.Popen( 117 | ["STAR", "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, close_fds=True 118 | ) 119 | stdout, _ = proc.communicate() 120 | return stdout.decode().strip() 121 | except Exception: 122 | return "Not available" 123 | 124 | 125 | def get_taggd_count_version() -> str: 126 | """ 127 | Gets the version of the Taggd binary. 128 | 129 | Returns: 130 | The version of Taggd or "Not available" if not found. 131 | """ 132 | try: 133 | proc = subprocess.Popen( 134 | ["pip", "show", "taggd"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, close_fds=True 135 | ) 136 | stdout, _ = proc.communicate() 137 | for line in stdout.decode().splitlines(): 138 | if "Version:" in line: 139 | return line.split()[-1] 140 | except Exception: 141 | pass 142 | return "Not available" 143 | 144 | 145 | def get_htseq_count_version() -> str: 146 | """ 147 | Gets the version of the HTSeqCount binary. 148 | 149 | Returns: 150 | The version of HTSeqCount or "Not available" if not found. 
151 | """ 152 | try: 153 | proc = subprocess.Popen( 154 | ["pip", "show", "htseq"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=False, close_fds=True 155 | ) 156 | stdout, _ = proc.communicate() 157 | for line in stdout.decode().splitlines(): 158 | if "Version:" in line: 159 | return line.split()[-1] 160 | except Exception: 161 | pass 162 | return "Not available" 163 | -------------------------------------------------------------------------------- /stpipeline/core/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | -------------------------------------------------------------------------------- /stpipeline/core/annotation.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a modified version of htseq-count 3 | with modifications and improvements to perform annotation 4 | of ST mapped reads (BAM file). 5 | """ 6 | 7 | import logging 8 | import os 9 | from typing import List, Optional, Tuple, Dict 10 | 11 | import HTSeq # type: ignore 12 | import pysam 13 | 14 | from stpipeline.common.utils import file_ok 15 | 16 | logger = logging.getLogger("STPipeline") 17 | 18 | 19 | def invert_strand(iv: HTSeq.GenomicInterval) -> HTSeq.GenomicInterval: 20 | """ 21 | Inverts the strand of a genomic interval. 22 | 23 | Args: 24 | iv: A genomic interval. 25 | 26 | Returns: 27 | A copy of the genomic interval with the strand inverted. 28 | 29 | Raises: 30 | ValueError: If the strand is not '+' or '-'. 31 | """ 32 | iv2 = iv.copy() 33 | if iv2.strand == "+": 34 | iv2.strand = "-" 35 | elif iv2.strand == "-": 36 | iv2.strand = "+" 37 | else: 38 | raise ValueError(f"Illegal strand {iv}") 39 | return iv2 40 | 41 | 42 | class ReadCounter: 43 | def __init__( 44 | self, 45 | sam_filename: str, 46 | gff_filename: str, 47 | samtype: str, 48 | stranded: str, 49 | overlap_mode: str, 50 | feature_type: List[str], 51 | id_attribute: str, 52 | minaqual: int, 53 | samout: str, 54 | include_non_annotated: bool, 55 | htseq_no_ambiguous: bool, 56 | output_discarded: Optional[str] = None, 57 | ) -> None: 58 | """ 59 | Counts reads in features using a modified version of HTSeq. 60 | 61 | Args: 62 | sam_filename: Path to the input SAM/BAM file. 63 | gff_filename: Path to the GFF file with features. 64 | samtype: Input file type ('sam' or 'bam'). 65 | stranded: Strand specificity ('yes', 'no', 'reverse'). 66 | overlap_mode: Overlap mode for counting ('union', 'intersection-strict', 'intersection-nonempty'). 67 | feature_type: List of feature types to include (e.g., ['exon']). 68 | id_attribute: Attribute to use as feature ID (e.g., 'gene_id'). 69 | minaqual: Minimum mapping quality for reads. 70 | samout: Path to the output SAM/BAM file. 71 | include_non_annotated: Whether to include non-annotated reads. 72 | htseq_no_ambiguous: Whether to exclude ambiguous reads. 73 | outputDiscarded: Path to the output file for discarded reads. 
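        Example:
            A construction sketch mirroring how `annotateReads` below drives this class;
            the file names are placeholders:

                counter = ReadCounter(
                    sam_filename="mapped_sorted.bam",
                    gff_filename="annotation.gtf",
                    samtype="bam",
                    stranded="yes",
                    overlap_mode="union",
                    feature_type=["exon"],
                    id_attribute="gene_id",
                    minaqual=0,
                    samout="annotated.bam",
                    include_non_annotated=False,
                    htseq_no_ambiguous=True,
                    output_discarded="discarded.bam",
                )
                n_annotated = counter.count_reads()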
74 | """ 75 | self.sam_filename = sam_filename 76 | self.gff_filename = gff_filename 77 | self.samtype = samtype 78 | self.stranded = stranded 79 | self.overlap_mode = overlap_mode 80 | self.feature_type = feature_type 81 | self.id_attribute = id_attribute 82 | self.minaqual = minaqual 83 | self.samout = samout 84 | self.include_non_annotated = include_non_annotated 85 | self.htseq_no_ambiguous = htseq_no_ambiguous 86 | self.output_discarded = output_discarded 87 | self.annotated = 0 88 | self._validate_inputs() 89 | self.features, self.counts = self._load_features() 90 | self.samoutfile, self.samdiscarded = self._open_sam_files() 91 | self.filter_htseq = ["__too_low_aQual", "__not_aligned"] 92 | if not self.include_non_annotated: 93 | self.filter_htseq.append("__no_feature") 94 | 95 | def __del__(self) -> None: 96 | """Ensures file handles are closed when the object is destroyed.""" 97 | if self.samoutfile: 98 | self.samoutfile.close() 99 | if self.samdiscarded: 100 | self.samdiscarded.close() 101 | 102 | def _validate_inputs(self) -> None: 103 | """Validates input parameters.""" 104 | if self.samtype not in ["bam", "sam"]: 105 | raise ValueError(f"Incorrect value for samtype {self.samtype}") 106 | if self.stranded not in ["yes", "no", "reverse"]: 107 | raise ValueError(f"Incorrect value for stranded option {self.stranded}") 108 | if self.overlap_mode not in ["union", "intersection-strict", "intersection-nonempty"]: 109 | raise ValueError(f"Incorrect value for overlap_mode option {self.overlap_mode}") 110 | if not self.feature_type: 111 | raise ValueError("Value types cannot be empty") 112 | 113 | def _load_features(self) -> Tuple[HTSeq.GenomicArrayOfSets, Dict[str, int]]: 114 | """Loads genomic features from the GFF file.""" 115 | features = HTSeq.GenomicArrayOfSets("auto", self.stranded != "no") 116 | counts = {} 117 | gff = HTSeq.GFF_Reader(self.gff_filename) 118 | for f in gff: 119 | if f.type in self.feature_type: 120 | feature_id = f.attr.get(self.id_attribute) 121 | if feature_id is None: 122 | raise ValueError(f"Feature {f.name} does not contain a {self.id_attribute} attribute.") 123 | if self.stranded != "no" and f.iv.strand == ".": 124 | raise ValueError(f"Feature {f.name} at {f.iv} does not have strand information.") 125 | features[f.iv] += feature_id 126 | counts[feature_id] = 0 127 | 128 | if not counts: 129 | raise RuntimeError(f"No features of type {','.join(self.feature_type)} found.") 130 | 131 | return features, counts 132 | 133 | def _open_sam_files(self) -> Tuple[pysam.AlignmentFile, Optional[pysam.AlignmentFile]]: 134 | """Opens SAM/BAM files for output and discarded reads.""" 135 | flag_write = "wb" if self.samtype == "bam" else "wh" 136 | flag_read = "rb" if self.samtype == "bam" else "r" 137 | saminfile = pysam.AlignmentFile(self.sam_filename, flag_read) # type: ignore 138 | samoutfile = pysam.AlignmentFile(self.samout, flag_write, template=saminfile) # type: ignore 139 | samdiscarded = None 140 | if self.output_discarded is not None: 141 | samdiscarded = pysam.AlignmentFile(self.output_discarded, flag_write, template=saminfile) # type: ignore 142 | saminfile.close() 143 | return samoutfile, samdiscarded 144 | 145 | def _write_to_samout(self, read: HTSeq.SAM_Alignment, assignment: str) -> None: 146 | """Writes a read and its assignment to the SAM output file.""" 147 | sam_record = read.to_pysam_AlignedSegment(self.samoutfile) 148 | sam_record.set_tag("XF", assignment, "Z") 149 | if ( 150 | read is not None 151 | and assignment not in self.filter_htseq 152 | and not 
(self.htseq_no_ambiguous and "__ambiguous" in assignment) 153 | ): 154 | self.samoutfile.write(sam_record) 155 | self.annotated += 1 156 | elif self.output_discarded is not None: 157 | self.samdiscarded.write(sam_record) # type: ignore 158 | 159 | def count_reads(self) -> int: 160 | """Counts reads in features using a modified version of HTSeq.""" 161 | self.annotated = 0 162 | try: 163 | read_seq = ( 164 | HTSeq.SAM_Reader(self.sam_filename) if self.samtype == "sam" else HTSeq.BAM_Reader(self.sam_filename) 165 | ) 166 | com = ("M", "=", "X") 167 | for r in read_seq: 168 | if not r.aligned: 169 | self._write_to_samout(r, "__not_aligned") 170 | continue 171 | if r.aQual < self.minaqual: 172 | self._write_to_samout(r, "__too_low_aQual") 173 | continue 174 | iv_seq = ( 175 | (co.ref_iv for co in r.cigar if co.type in com and co.size > 0) 176 | if self.stranded != "reverse" 177 | else (invert_strand(co.ref_iv) for co in r.cigar if co.type in com and co.size > 0) 178 | ) 179 | is_union = self.overlap_mode == "union" 180 | is_strict = self.overlap_mode == "intersection-strict" 181 | unknown_chr = False 182 | fs = set() # type: ignore 183 | for iv in iv_seq: 184 | if iv.chrom not in self.features.chrom_vectors: 185 | unknown_chr = True 186 | break 187 | for _, fs2 in self.features[iv].steps(): 188 | if is_union: 189 | fs = fs.union(fs2) 190 | elif len(fs2) > 0 or is_strict: 191 | fs = fs.intersection(fs2) if fs else fs2.copy() 192 | if not fs or unknown_chr: 193 | self._write_to_samout(r, "__no_feature") 194 | elif len(fs) > 1: 195 | self._write_to_samout(r, "__ambiguous[" + "+".join(fs) + "]") 196 | else: 197 | self._write_to_samout(r, fs.pop()) 198 | except Exception as e: 199 | raise RuntimeError("Error encountered during read counting") from e 200 | return self.annotated 201 | 202 | 203 | def annotateReads( 204 | mappedReads: str, 205 | gtfFile: str, 206 | outputFile: str, 207 | outputDiscarded: Optional[str], 208 | mode: str, 209 | strandness: str, 210 | htseq_no_ambiguous: bool, 211 | include_non_annotated: bool, 212 | feature_types: List[str], 213 | ) -> int: 214 | """ 215 | Annotates a BAM file with mapped reads using HTSeq and writes annotated records to a file. 216 | 217 | Args: 218 | mappedReads: Path to a BAM file with mapped reads sorted by coordinate. 219 | gtfFile: Path to an annotation file in GTF format. 220 | outputFile: Path to write the annotated records (BAM). 221 | outputDiscarded: Path to write the non-annotated records (BAM). 222 | mode: Overlapping mode ('union', 'intersection-strict', 'intersection-nonempty'). 223 | strandness: Strand specificity ('yes', 'no', 'reverse'). 224 | htseq_no_ambiguous: Whether to discard ambiguous annotations. 225 | include_non_annotated: Whether to include non-annotated reads as '__no_feature' in the output. 226 | feature_types: List of feature types to use for annotation (default is ['exon']). 227 | 228 | Returns: 229 | The number of annotated reads. 230 | 231 | Raises: 232 | RuntimeError: If input files are missing or errors occur during annotation. 
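    Example:
        A call sketch; the file names below are placeholders, not pipeline defaults:

            n = annotateReads(
                mappedReads="mapped_sorted.bam",
                gtfFile="annotation.gtf",
                outputFile="annotated.bam",
                outputDiscarded="annotated_discarded.bam",
                mode="intersection-nonempty",
                strandness="yes",
                htseq_no_ambiguous=True,
                include_non_annotated=False,
                feature_types=["exon"],
            )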
233 | """ 234 | 235 | if not os.path.isfile(mappedReads): 236 | raise RuntimeError(f"Input file not found: {mappedReads}") 237 | 238 | try: 239 | annotated = ReadCounter( 240 | mappedReads, 241 | gtfFile, 242 | "bam", # Type BAM for files 243 | strandness, # Strand yes/no/reverse 244 | mode, # intersection_nonempty, union, intersection_strict 245 | feature_types, # feature types in GFF 246 | "gene_id", # gene_id or gene_name 247 | 0, # Min quality score 248 | outputFile, 249 | include_non_annotated, 250 | htseq_no_ambiguous, 251 | outputDiscarded, 252 | ).count_reads() 253 | except Exception as e: 254 | error = "Error during annotation: HTSEQ execution failed" 255 | logger.error(error) 256 | raise e 257 | 258 | if not file_ok(outputFile) or annotated == 0: 259 | error = f"Error during annotation: Output file not present or empty {outputFile}" 260 | logger.error(error) 261 | raise RuntimeError(error) 262 | 263 | logger.info(f"Annotated reads: {annotated}") 264 | return annotated 265 | -------------------------------------------------------------------------------- /stpipeline/core/mapping.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions related to sequence alignment and barcode 3 | demultiplexing in the ST pipeline 4 | """ 5 | 6 | import logging 7 | import os 8 | import shutil 9 | import subprocess 10 | from typing import List, Optional 11 | 12 | from stpipeline.common.utils import file_ok 13 | 14 | logger = logging.getLogger("STPipeline") 15 | 16 | 17 | def alignReads( 18 | reverse_reads: str, 19 | ref_map: str, 20 | outputFile: str, 21 | annotation: Optional[str], 22 | outputFolder: str, 23 | trimReverse: int, 24 | invTrimReverse: int, 25 | cores: int, 26 | min_intron_size: int, 27 | max_intron_size: int, 28 | disable_multimap: bool, 29 | diable_softclipping: bool, 30 | twopassMode: bool, 31 | min_length: int, 32 | include_non_mapped: bool, 33 | star_genome_loading: str, 34 | star_sort_mem_limit: int, 35 | ) -> int: 36 | """ 37 | Perform sequence alignment using STAR. 38 | 39 | Args: 40 | reverse_reads: File containing reverse reads in BAM format. 41 | ref_map: Path to the STAR genome/transcriptome index. 42 | outputFile: Name of the SAM/BAM output file for alignments. 43 | annotation: GTF annotation file path (optional). 44 | outputFolder: Path to the output folder. 45 | trimReverse: Number of bases to trim from the 5' end of reverse reads. 46 | invTrimReverse: Number of bases to trim from the 3' end of reverse reads. 47 | cores: Number of cores for alignment. 48 | min_intron_size: Minimum allowed intron size for splice junctions. 49 | max_intron_size: Maximum allowed intron size for splice junctions. 50 | disable_multimap: If True, disallow multiple alignments. 51 | diable_softclipping: If True, disable local alignment. 52 | twopassMode: If True, enable 2-pass mode. 53 | min_length: Minimum allowed read length after trimming. 54 | include_non_mapped: If True, include unaligned reads in the output. 55 | star_genome_loading: Type of genome loading for STAR. 56 | star_sort_mem_limit: Memory limit for BAM sorting by STAR. 57 | 58 | Returns: 59 | The total number of reads mapped. 60 | 61 | Raises: 62 | RuntimeError: If input files are missing or output file creation fails. 63 | ValueError: For invalid input arguments. 64 | OSError: If STAR executable is not found. 
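    Example:
        A call sketch with made-up paths and illustrative settings (not recommended defaults):

            mapped = alignReads(
                reverse_reads="R2_quality_trimmed.bam",
                ref_map="/indexes/GRCh38_STAR",
                outputFile="mapped.bam",
                annotation=None,
                outputFolder="/tmp/st_run",
                trimReverse=0,
                invTrimReverse=0,
                cores=8,
                min_intron_size=20,
                max_intron_size=1000000,
                disable_multimap=False,
                diable_softclipping=False,
                twopassMode=False,
                min_length=30,
                include_non_mapped=False,
                star_genome_loading="NoSharedMemory",
                star_sort_mem_limit=0,
            )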
65 | """ 66 | 67 | if not file_ok(reverse_reads): 68 | error = f"Error mapping with STAR, input file not present {reverse_reads}\n" 69 | logger.error(error) 70 | raise RuntimeError(error) 71 | 72 | # STAR has predefined output names for the files 73 | tmpOutputFile = "Aligned.sortedByCoord.out.bam" 74 | log_final = "Log.final.out" 75 | 76 | if outputFolder is not None: 77 | tmpOutputFile = os.path.join(outputFolder, tmpOutputFile) 78 | log_final = os.path.join(outputFolder, log_final) 79 | 80 | multi_map_number = 1 if disable_multimap else 20 # 10 is the STAR default 81 | alignment_mode = "EndToEnd" if diable_softclipping else "Local" 82 | 83 | # Prepare STAR command arguments 84 | args = [ 85 | "STAR", 86 | "--genomeDir", 87 | ref_map, 88 | "--readFilesIn", 89 | reverse_reads, 90 | "--outFileNamePrefix", 91 | f"{outputFolder}{os.sep}", 92 | "--clip3pNbases", 93 | str(invTrimReverse), 94 | "--clip5pNbases", 95 | str(trimReverse), 96 | "--runThreadN", 97 | str(max(cores, 1)), 98 | "--outFilterType", 99 | "Normal", 100 | "--outSAMtype", 101 | "BAM", 102 | "SortedByCoordinate", 103 | "--alignEndsType", 104 | alignment_mode, 105 | "--outSAMorder", 106 | "Paired", 107 | "--outSAMprimaryFlag", 108 | "OneBestScore", 109 | "--outFilterMultimapNmax", 110 | str(multi_map_number), 111 | "--alignIntronMin", 112 | str(min_intron_size), 113 | "--alignIntronMax", 114 | str(max_intron_size), 115 | "--outFilterMatchNmin", 116 | str(min_length), 117 | "--genomeLoad", 118 | star_genome_loading, 119 | "--limitBAMsortRAM", 120 | str(star_sort_mem_limit), 121 | "--readFilesType", 122 | "SAM", 123 | "SE", # Input in BAM format 124 | "--readFilesCommand", 125 | "samtools", 126 | "view", 127 | "-h", 128 | ] 129 | 130 | if twopassMode: 131 | args += ["--twopassMode", "Basic"] 132 | 133 | if annotation: 134 | args += ["--sjdbGTFfile", annotation] 135 | 136 | if include_non_mapped: 137 | args += ["--outSAMunmapped", "Within"] 138 | else: 139 | args += ["--outSAMunmapped", "None"] 140 | 141 | try: 142 | logger.debug(f"STAR mapping, running command: {' '.join(args)}") 143 | proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, shell=False) 144 | _, errmsg = proc.communicate() 145 | 146 | if proc.returncode != 0: 147 | logger.error(f"Error mapping with STAR: {errmsg.decode()}") 148 | raise RuntimeError(f"STAR mapping failed with error: {errmsg.decode()}") 149 | except OSError as e: 150 | logger.error("Error mapping with STAR: Executable not found.") 151 | raise e 152 | 153 | if not file_ok(tmpOutputFile): 154 | error = f"Error mapping with STAR: Output file not present {tmpOutputFile}" 155 | logger.error(error) 156 | raise RuntimeError(error) 157 | 158 | # Rename output file 159 | shutil.move(tmpOutputFile, outputFile) 160 | 161 | uniquely_mapped = 0 162 | multiple_mapped = 0 163 | if not file_ok(log_final): 164 | logger.warning("Log output file from STAR is not present") 165 | else: 166 | logger.info("Mapping stats:") 167 | logger.info("Mapping stats are computed from all the pair reads present in the raw files") 168 | with open(log_final, "r") as star_log: 169 | for line in star_log: 170 | if "Uniquely mapped reads number" in line: 171 | uniquely_mapped = int(line.strip().split()[-1]) 172 | logger.info(line.strip()) 173 | elif "Number of reads mapped to multiple loci" in line: 174 | multiple_mapped += int(line.strip().split()[-1]) 175 | logger.info(line.strip()) 176 | elif "% of reads mapped to multiple loci" in line or "% of reads unmapped: too short" in line: 177 | 
logger.info(line.strip()) 178 | logger.info("Total mapped reads: {}".format(uniquely_mapped + multiple_mapped)) 179 | 180 | return uniquely_mapped + multiple_mapped 181 | 182 | 183 | def barcodeDemultiplexing( 184 | reads: str, 185 | idFile: str, 186 | mismatches: int, 187 | kmer: int, 188 | over_hang: int, 189 | taggd_metric: str, 190 | taggd_multiple_hits_keep_one: bool, 191 | taggd_trim_sequences: Optional[List[int]], 192 | cores: int, 193 | outputFilePrefix: str, 194 | keep_discarded_files: bool = False, 195 | taggd_chunk_size: int = 10000, 196 | ) -> int: 197 | """ 198 | Perform demultiplexing using Taggd. 199 | 200 | Args: 201 | reads: Path to the FASTQ/BAM file containing barcoded reads. 202 | idFile: Path to the tab-delimited barcode file (BARCODE - X - Y). 203 | mismatches: Allowed mismatches for barcode matching. 204 | kmer: K-mer length for barcode search. 205 | over_hang: Allowed flanking bases around barcodes. 206 | taggd_metric: Distance metric algorithm ('Hamming', 'Levenshtein', 'Subglobal'). 207 | taggd_multiple_hits_keep_one: If True, keep one random hit for multiple candidates. 208 | taggd_trim_sequences: List of coordinates to trim in the barcode (optional). 209 | cores: Number of subprocesses for Taggd. 210 | outputFilePrefix: Prefix for output files. 211 | keep_discarded_files: If True, generate files for unmatched reads. 212 | taggd_chunk_size: Number of reads to process in each thread. 213 | 214 | Returns: 215 | The total number of reads demultiplexed. 216 | 217 | Raises: 218 | RuntimeError: If input files are missing or output file creation fails. 219 | ValueError: For invalid input arguments. 220 | OSError: If Taggd executable is not found. 221 | """ 222 | 223 | if not os.path.isfile(reads): 224 | error = f"Error, input file not present {reads}" 225 | logger.error(error) 226 | raise RuntimeError(error) 227 | 228 | # Taggd options 229 | # --metric (subglobal (default) , Levenshtein or Hamming) 230 | # --slider-increment (space between kmer searches, 0 is default = kmer length) 231 | # --seed 232 | # --overhang additional flanking bases around read barcode to allow 233 | # --estimate-min-edit-distance is set estimate the min edit distance among true barcodes 234 | # --no-offset-speedup turns off speed up, it might yield more hits (exactly as findIndexes) 235 | # --homopolymer-filter if set excludes reads where the barcode 236 | # --chunk-size number of reads to use in each thread for parallel processing 237 | # contains a homolopymer of the given length (0 no filter), default 8 238 | 239 | args = [ 240 | "taggd_demultiplex", 241 | "--max-edit-distance", 242 | str(mismatches), 243 | "--k", 244 | str(kmer), 245 | "--barcode-tag", 246 | "B0", # if input is BAM we tell taggd which tag contains the barcode 247 | "--homopolymer-filter", 248 | "0", 249 | "--subprocesses", 250 | str(cores), 251 | "--metric", 252 | taggd_metric, 253 | "--chunk-size", 254 | str(taggd_chunk_size), 255 | "--overhang", 256 | str(over_hang if taggd_metric != "Hamming" else 0), 257 | ] 258 | 259 | if taggd_trim_sequences: 260 | args += ["--trim-sequences"] + list(map(str, taggd_trim_sequences)) 261 | 262 | if taggd_multiple_hits_keep_one: 263 | args.append("--multiple-hits-keep-one") 264 | 265 | if not keep_discarded_files: 266 | args += ["--no-unmatched-output", "--no-ambiguous-output", "--no-results-output"] 267 | 268 | args += [idFile, reads, outputFilePrefix] 269 | 270 | try: 271 | logger.debug(f"Taggd demultiplexing, running command: {' '.join(args)}") 272 | proc = subprocess.Popen(args, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, shell=False) 273 | stdout, errmsg = proc.communicate() 274 | 275 | if proc.returncode != 0: 276 | logger.error(f"Error demultiplexing with Taggd: {errmsg.decode()}") 277 | raise RuntimeError(f"Taggd demultiplexing failed with error: {errmsg.decode()}") 278 | except OSError as e: 279 | logger.error("Error demultiplexing with Taggd: Executable not found.") 280 | raise e 281 | 282 | outputFile = f"{outputFilePrefix}_matched{os.path.splitext(reads)[1].lower()}" 283 | if not file_ok(outputFile): 284 | error = f"Error demultiplexing with Taggd: Output file not present {outputFile}" 285 | logger.error(error) 286 | raise RuntimeError(error) 287 | 288 | # Write log file and collect stats 289 | outputLog = f"{outputFilePrefix}_log.txt" 290 | reads_after_demultiplexing = 0 291 | with open(outputLog, "w") as fwrite: 292 | for line in stdout.decode().splitlines(): 293 | fwrite.write(line + "\n") 294 | tokens = [ 295 | "Total reads:", 296 | "Perfect Matches:", 297 | "Imperfect Matches", 298 | "Ambiguous matches:", 299 | "Non-unique ambiguous matches:", 300 | "Unmatched:", 301 | ] 302 | if any(x in line for x in tokens): 303 | logger.info(line.strip()) 304 | if "Total reads written:" in line: 305 | logger.info(line.strip()) 306 | reads_after_demultiplexing = int(line.strip().split()[-1]) 307 | 308 | return reads_after_demultiplexing 309 | -------------------------------------------------------------------------------- /stpipeline/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/stpipeline/scripts/__init__.py -------------------------------------------------------------------------------- /stpipeline/scripts/adjust_matrix_coordinates.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Script that takes a matrix of counts 4 | where the columns are genes and the rows 5 | are the spot coordinates in a format like this: 6 | 7 | gene gene 8 | XxY 9 | XxY 10 | 11 | And then removes the spots that are not present in 12 | the spot coordinates file or if the under_tissue flag is 0 13 | The format of the spot coordinates file can be like this: 14 | 15 | x y new_x new_y 16 | 17 | or 18 | 19 | x y new_x new_y pixel_x pixel_y 20 | 21 | or 22 | 23 | x y new_x new_y pixel_x pixel_y under_tissue 24 | 25 | Optionally, the coordinates of the spots in the matrix 26 | can be changed to the adjusted new coordinates (array). 
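For illustration, a seven-column spot coordinates file could look like the one
below (all values are made up); the second spot would be dropped because its
under_tissue flag is 0:

    x   y   new_x   new_y   pixel_x   pixel_y   under_tissue
    1   1   1.03    0.98    4500.2    4312.7    1
    2   1   2.01    1.02    4688.9    4310.3    0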
27 | 28 | @Author Jose Fernandez Navarro 29 | """ 30 | 31 | import argparse 32 | import os 33 | import sys 34 | 35 | import pandas as pd 36 | 37 | 38 | def main() -> None: 39 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 40 | parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format") 41 | parser.add_argument("--outfile", help="Name of the output file") 42 | parser.add_argument( 43 | "--update-coordinates", 44 | action="store_true", 45 | default=False, 46 | help="Updates the spot coordinates in the output matrix with the\n" 47 | "new coordinates present in the coordinates file", 48 | ) 49 | parser.add_argument("--coordinates-file", required=True, help="New coordinates in a tab delimited file") 50 | args = parser.parse_args() 51 | 52 | sys.exit(run(args.counts_matrix, args.coordinates_file, args.update_coordinates, args.outfile)) 53 | 54 | 55 | def run(counts_matrix: str, coordinates_file: str, update_coordinates: bool, outfile: str) -> int: 56 | if not os.path.isfile(counts_matrix) or not os.path.isfile(coordinates_file): 57 | print("Error, input file(s) not present or invalid format") 58 | return 1 59 | 60 | if not outfile: 61 | outfile = f"adjusted_{os.path.basename(counts_matrix)}" 62 | 63 | # Get a map of the new coordinates 64 | new_coordinates = {} 65 | with open(coordinates_file, "r") as filehandler: 66 | for line in filehandler.readlines(): 67 | tokens = line.split() 68 | assert len(tokens) == 6 or len(tokens) == 4 or len(tokens) == 7 69 | if tokens[0] != "x": 70 | old_x = int(tokens[0]) 71 | old_y = int(tokens[1]) 72 | new_x = round(float(tokens[2]), 2) 73 | new_y = round(float(tokens[3]), 2) 74 | if len(tokens) == 7 and tokens[6].strip().lower() in ("0", "false"):  # skip spots not under tissue 75 | continue 76 | new_coordinates[(old_x, old_y)] = (new_x, new_y) 77 | 78 | # Read the data frame (spots as rows) 79 | counts_table = pd.read_table(counts_matrix, sep="\t", header=0, index_col=0) 80 | new_index_values = [] 81 | 82 | # Replace spot coordinates and remove row if not present 83 | for index in counts_table.index: 84 | tokens = index.split("x") 85 | x = int(tokens[0]) 86 | y = int(tokens[1]) 87 | try: 88 | new_x, new_y = new_coordinates[(x, y)] 89 | if not update_coordinates: 90 | new_x, new_y = x, y 91 | new_index_values.append(f"{new_x}x{new_y}") 92 | except KeyError: 93 | counts_table.drop(index, inplace=True) 94 | 95 | # Assign the new indexes 96 | counts_table.index = pd.Index(new_index_values) 97 | 98 | # Remove genes that have now a total count of zero 99 | counts_table = counts_table.transpose()[counts_table.sum(axis=0) > 0].transpose() 100 | 101 | # Write table again 102 | counts_table.to_csv(outfile, sep="\t") 103 | 104 | return 0 105 | 106 | 107 | if __name__ == "__main__": 108 | main() 109 | -------------------------------------------------------------------------------- /stpipeline/scripts/convertEnsemblToNames.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Script that parses a Spatial Transcriptomics (ST) data file generated 4 | with the ST Pipeline in matrix (TSV) format where the genes are named 5 | with ENSEMBL IDs and generates a new file with the ENSEMBL IDs converted to gene names. 6 | 7 | The script needs the annotation file (GFF format) used to create the ST dataset with 8 | the ST Pipeline.
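Example of programmatic use (the file names are illustrative; the same
conversion is exposed on the command line through main()):

    from stpipeline.scripts.convertEnsemblToNames import run

    # returns 0 on success and 1 on error
    status = run("stdata_ensembl.tsv", "Homo_sapiens.GRCh38.annotation.gtf", "stdata_gene_names.tsv")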
9 | 10 | @Author Jose Fernandez Navarro 11 | """ 12 | 13 | import argparse 14 | import os 15 | import sys 16 | from collections import Counter 17 | 18 | import pandas as pd 19 | 20 | from stpipeline.common.gff_reader import gff_lines 21 | 22 | 23 | def main() -> None: 24 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 25 | parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format") 26 | parser.add_argument("--output", default="output.tsv", help="Name of the output file, default output.tsv") 27 | parser.add_argument("--annotation", required=True, help="Path to the annotation file used to generate the data") 28 | args = parser.parse_args() 29 | sys.exit(run(args.counts_matrix, args.annotation, args.output)) 30 | 31 | 32 | def run(st_data_file: str, annotation: str, output_file: str) -> int: 33 | if not os.path.isfile(st_data_file) or not os.path.isfile(annotation): 34 | print("Error, input file(s) not present or invalid format") 35 | return 1 36 | 37 | # loads a map with the Ensembl ids -> gene name 38 | gene_map = {} 39 | for line in gff_lines(annotation): 40 | try: 41 | gene_map[line["gene_id"]] = line["gene_name"] 42 | except KeyError as e: 43 | print(f"Error, parsing annotation file, missing key {e}") 44 | return 1 45 | assert len(gene_map) > 0 46 | 47 | # Load the ST dataset 48 | st_data = pd.read_table(st_data_file, sep="\t", header=0, index_col=0) 49 | 50 | # Check that the annotation file given was valid 51 | if len(gene_map) < len(st_data.columns): 52 | print("Error, the annotation file given is invalid or does not match the ST data") 53 | return 1 54 | 55 | # Checks that there are no duplicated genes ids in the input data 56 | gene_ids_counter = Counter(st_data.columns) 57 | for gene_id, count in gene_ids_counter.most_common(): 58 | if count > 1: 59 | print(f"Error, Ensembl ID {gene_id} was found {count} times in the input matrix.") 60 | return 1 61 | 62 | # Iterates the genes IDs to get gene names 63 | genes_replaced = set() 64 | adjustedList = [] 65 | for gene_id in st_data.columns: 66 | try: 67 | gene_name = gene_map[gene_id] 68 | # Check if the gene_name has been "used" before 69 | if gene_name not in genes_replaced: 70 | genes_replaced.add(gene_name) 71 | else: 72 | # This means the gene name would be duplicated in the output 73 | # so we keep the Ensembl ID. 74 | # We assume input Ensembl ids are unique as we checked this before 75 | gene_name = gene_id 76 | print( 77 | f"Warning, gene {gene_name} was already matched so the original Ensembl ID {gene_id} will be kept" 78 | ) 79 | except KeyError: 80 | print(f"Warning, {gene_id} was not found in the annotation so the original Ensembl ID will be kept") 81 | gene_name = gene_id 82 | adjustedList.append(gene_name) 83 | 84 | # Update the table with the gene names 85 | st_data.columns = pd.Index(adjustedList) 86 | 87 | # Write table to file 88 | st_data.to_csv(output_file, sep="\t") 89 | 90 | return 0 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /stpipeline/scripts/filter_gene_type_matrix.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Script that takes a matrix of counts 4 | where the columns are genes and the rows 5 | are spot coordinates 6 | 7 | gene gene 8 | XxY 9 | XxY 10 | 11 | And removes the columns of genes whose functional type is not in the 12 | allowed types provided (Ensembl annotation gene type). 13 | 14 | The script needs to be given an annotation file in GFF format. 15 | 16 | @Author Jose Fernandez Navarro 17 | """ 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from typing import List 23 | 24 | import pandas as pd 25 | 26 | from stpipeline.common.gff_reader import gff_lines 27 | 28 | 29 | def main() -> None: 30 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 31 | parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format") 32 | parser.add_argument("--outfile", help="Name of the output file") 33 | parser.add_argument( 34 | "--gene-types-keep", 35 | required=True, 36 | nargs="+", 37 | type=str, 38 | help="List of Ensembl gene types to keep (e.g. protein_coding lincRNA)", 39 | ) 40 | parser.add_argument("--annotation", help="The Ensembl annotation file", required=True, type=str) 41 | parser.add_argument( 42 | "--ensembl-ids", 43 | action="store_true", 44 | default=False, 45 | help="Pass this parameter if the genes in the matrix " "are named with Ensembl Ids instead of gene names", 46 | ) 47 | args = parser.parse_args() 48 | sys.exit(run(args.counts_matrix, args.gene_types_keep, args.outfile, args.annotation, args.ensembl_ids)) 49 | 50 | 51 | def run(counts_matrix: str, gene_types_keep: List[str], outfile: str, annotation: str, ensembl_ids: bool) -> int: 52 | if not os.path.isfile(counts_matrix) or not os.path.isfile(annotation): 53 | print("Error, input file(s) not present or invalid format") 54 | return 1 55 | 56 | if not outfile: 57 | outfile = "filtered_{}".format(os.path.basename(counts_matrix)) 58 | 59 | gene_types = {} 60 | for line in gff_lines(annotation): 61 | try: 62 | gene_name = line["gene_id"] if ensembl_ids else line["gene_name"] 63 | gene_types[gene_name] = line["gene_type"] if "gene_type" in line else line["gene_biotype"] 64 | except KeyError as e: 65 | print("Error, parsing annotation file, missing key {}".format(e)) 66 | assert len(gene_types) > 0 67 | 68 | # Read the data frame (genes as columns) 69 | counts_table = pd.read_table(counts_matrix, sep="\t", header=0, index_col=0) 70 | genes = counts_table.columns 71 | 72 | # Collect the genes whose type is not among the allowed types 73 | genes_drop = [] 74 | for gene in genes: 75 | try: 76 | if gene_types[gene] not in gene_types_keep: 77 | genes_drop.append(gene) 78 | except KeyError: 79 | print(f"Warning, {gene} was not found in the annotation") 80 | 81 | if len(genes_drop) > 0: 82 | counts_table.drop(genes_drop, axis=1, inplace=True) 83 | else: 84 | print("Not a single gene could be discarded...") 85 | 86 | # Write filtered table 87 | counts_table.to_csv(outfile, sep="\t") 88 | 89 | return 0 90 | 91 | 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /stpipeline/scripts/merge_fastq.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Script that merges the FASTQ files present in an Illumina run folder path.
4 | The script merges the FASTQ files based on the identifiers (typically indexes) given 5 | as input (one identifier to each sample) and puts the merged files in the given output folder. 6 | 7 | @Author Jose Fernandez Navarro 8 | """ 9 | 10 | import argparse 11 | import glob 12 | import os 13 | import shutil 14 | import subprocess 15 | import sys 16 | from typing import IO, List, Union 17 | 18 | 19 | def run_command(command: List[str], out: Union[int, IO[bytes]] = subprocess.PIPE) -> None: 20 | """ 21 | Executes a shell command and prints its stdout and stderr. 22 | 23 | Args: 24 | command: The command to execute, represented as a list of strings. 25 | out: The output stream for the command's stdout. Defaults to subprocess.PIPE. 26 | 27 | Raises: 28 | Exception: If an error occurs during command execution. 29 | """ 30 | try: 31 | print(f"Running command: {' '.join(x for x in command).rstrip()}") 32 | proc = subprocess.Popen(command, stdout=out, stderr=subprocess.PIPE, close_fds=True, shell=False) 33 | stdout, errmsg = proc.communicate() 34 | print(stdout) 35 | print(errmsg) 36 | except Exception as e: 37 | raise e 38 | 39 | 40 | def main() -> None: 41 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 42 | parser.add_argument("--run-path", required=True, help="Path to the run folder") 43 | parser.add_argument("--out-path", required=True, help="Path to the output folder") 44 | parser.add_argument( 45 | "--identifiers", 46 | required=True, 47 | nargs="+", 48 | type=str, 49 | help="List of identifiers for each sample (E.x. S1 S2 S3 S4)", 50 | ) 51 | args = parser.parse_args() 52 | sys.exit(run(args.run_path, args.identifiers, args.out_path)) 53 | 54 | 55 | def run(run_path: str, indexes: List[str], out_path: str) -> int: 56 | if not os.path.isdir(run_path) or not os.path.isdir(out_path): 57 | print("Error, either run_path or out_path folders do not exist") 58 | return 1 59 | 60 | # First gunzip all the FASTQ files 61 | os.chdir(run_path) 62 | for file in glob.glob("*.gz"): 63 | try: 64 | run_command(["gunzip", "-f", file]) 65 | except Exception as e: 66 | print(f"Error, gunziping FASTQ file {file}, {e}") 67 | return 1 68 | 69 | # Second merge the FASTQ files 70 | for index in indexes: 71 | r1_files = sorted(glob.glob("*{}*R1*.fastq".format(index))) 72 | r2_files = sorted(glob.glob("*{}*R2*.fastq".format(index))) 73 | try: 74 | with open("{}_R1.fastq".format(index), "w") as file1: 75 | run_command(["cat"] + r1_files, out=file1) # type: ignore 76 | with open("{}_R2.fastq".format(index), "w") as file2: 77 | run_command(["cat"] + r2_files, out=file2) # type: ignore 78 | except Exception as e: 79 | print(f"Error, merging FASTQ files, {e}") 80 | return 1 81 | 82 | # Third gzip everything again 83 | for file in glob.glob("*.fastq"): 84 | try: 85 | run_command(["gzip", "-f", file]) 86 | except Exception as e: 87 | print(f"Error, gziping FASTQ file {file}, {e}") 88 | return 1 89 | 90 | # Move merged FASTQ files to output path 91 | if run_path != out_path: 92 | for index in indexes: 93 | for file in glob.glob("{}_R*".format(index)): 94 | shutil.move(file, out_path) 95 | 96 | return 0 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /stpipeline/scripts/multi_qa.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Script that creates multiple QC plots and stats 4 | from multiple Spatial Transcriptomics datasets in TSV format. 5 | 6 | This is useful when one have several consecutive sections 7 | or sections from the same model. 8 | 9 | The tool generates: 10 | 11 | - Violin plots (genes and counts) 12 | - Genes shared % pair-wise matrix 13 | - Correlation pair-wise matrix 14 | - Correlation pair-wise scatter plots 15 | - PCA plot (one dot per dataset) 16 | 17 | @Author Jose Fernandez Navarro 18 | """ 19 | 20 | import argparse 21 | import os 22 | import sys 23 | from typing import Any, List 24 | 25 | import matplotlib.patches as mpatches 26 | import matplotlib.pyplot as plt 27 | import numpy as np 28 | import pandas as pd 29 | from scipy.stats.stats import pearsonr # type: ignore 30 | from sklearn.decomposition import PCA # type: ignore 31 | 32 | color_map = [ 33 | "red", 34 | "green", 35 | "blue", 36 | "orange", 37 | "cyan", 38 | "yellow", 39 | "orchid", 40 | "saddlebrown", 41 | "darkcyan", 42 | "gray", 43 | "darkred", 44 | "darkgreen", 45 | "darkblue", 46 | "antiquewhite", 47 | "bisque", 48 | "black", 49 | "slategray", 50 | "gold", 51 | "floralwhite", 52 | "aliceblue", 53 | "plum", 54 | "cadetblue", 55 | "coral", 56 | "olive", 57 | "khaki", 58 | "lightsalmon", 59 | ] 60 | 61 | 62 | def create_violin_plot(data: List[List[float]], pos: List[int], title: str, outfile: str) -> None: 63 | """ 64 | Creates a violin plot and saves it as a PDF. 65 | 66 | Args: 67 | data: The data to plot, where each sublist represents a dataset. 68 | pos: Positions of the datasets on the x-axis. 69 | title: The title of the plot. 70 | outfile: The file path where the plot will be saved. 71 | 72 | Returns: 73 | None 74 | """ 75 | fig, ax = plt.subplots(figsize=(14, 10)) 76 | fig.subplots_adjust(left=0.075, right=0.95, top=0.9, bottom=0.25) 77 | ax.violinplot(data, pos, showmeans=True, showextrema=True, showmedians=True) 78 | ax.set_axisbelow(True) 79 | ax.set_title(title) 80 | fig.savefig(outfile, format="pdf", dpi=90) 81 | 82 | 83 | def create_pca_plot(data: Any, labels: List[str], title: str, outfile: str) -> None: 84 | """ 85 | Creates a PCA scatter plot and saves it as a PDF. 86 | 87 | Args: 88 | data: A 2D array where each row represents a data point, and the columns are the PCA components. 89 | labels: Labels corresponding to the data points for the legend. 90 | title: The title of the plot. 91 | outfile: The file path where the plot will be saved. 
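Example (a minimal sketch; the reduced matrix and labels are made up):

    import numpy as np

    create_pca_plot(
        data=np.array([[0.5, -1.2], [1.1, 0.3], [-0.7, 0.9]]),
        labels=["rep1", "rep2", "rep3"],
        title="PCA (sum gene counts)",
        outfile="pca.pdf",
    )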
92 | 93 | Returns: 94 | None 95 | """ 96 | fig, ax = plt.subplots(figsize=(14, 10)) 97 | class_colours = [color_map[i] for i in range(len(data))] 98 | ax.scatter(data[:, 0], data[:, 1], s=20, c=class_colours, edgecolor="none") 99 | ax.set_xlabel("PC1") 100 | ax.set_ylabel("PC2") 101 | ax.set_title(title) 102 | recs = [mpatches.Rectangle((0, 0), 1, 1, fc=color) for color in class_colours] 103 | ax.legend(recs, labels, loc=4, prop={"size": 6}) 104 | fig.savefig(outfile, format="pdf", dpi=90) 105 | 106 | 107 | def main() -> None: 108 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) 109 | parser.add_argument( 110 | "counts_matrix_files", nargs="+", help="One or more matrices with gene counts (genes as columns) in TSV format" 111 | ) 112 | parser.add_argument("--outdir", default=None, help="Path to the output directory") 113 | parser.add_argument( 114 | "--use-log-scale", action="store_true", default=False, help="Convert counts to log space for the correlation" 115 | ) 116 | args = parser.parse_args() 117 | 118 | sys.exit(run(args.counts_matrix_files, args.outdir, args.use_log_scale)) 119 | 120 | 121 | def run(counts_table_files: List[str], outdir: str, use_log: bool) -> int: 122 | if len(counts_table_files) == 0 or any(not os.path.isfile(f) for f in counts_table_files): 123 | print("Error, input file(s) not present or invalid format") 124 | return 1 125 | 126 | if len(counts_table_files) < 2: 127 | print("Error, minimum number of input datasets is 2") 128 | return 1 129 | 130 | if outdir is None or not os.path.isdir(outdir): 131 | outdir = os.getcwd() 132 | outdir = os.path.abspath(outdir) 133 | 134 | print(f"Output directory {outdir}") 135 | print(f"Input datasets {' '.join(counts_table_files)}") 136 | 137 | # Parse datasets and sort them by column 138 | datasets = [ 139 | (pd.read_table(x, sep="\t", header=0, index_col=0).sort_index(axis=1), os.path.splitext(os.path.basename(x))[0]) 140 | for x in counts_table_files 141 | ] 142 | 143 | # Common genes and all genes 144 | common_genes = set(datasets[0][0].columns) 145 | all_genes = set(datasets[0][0].columns) 146 | for dataset, _ in datasets[1:]: 147 | common_genes &= set(dataset.columns) 148 | all_genes |= set(dataset.columns) 149 | 150 | genes_counts = pd.DataFrame(index=pd.Index([x[1] for x in datasets]), columns=pd.Index(all_genes)) 151 | genes_counts.fillna(0, inplace=True) 152 | violin_data_reads = [] 153 | violin_data_genes = [] 154 | violin_data_pos = [] 155 | for i, (dataset, name) in enumerate(datasets): 156 | # Append sum of read counts per row 157 | violin_data_reads.append(dataset.sum(axis=1).tolist()) 158 | # Append the count of genes with non-zero values 159 | violin_data_genes.append((dataset > 0).sum(axis=1).tolist()) 160 | # Keep track of the position for plotting 161 | violin_data_pos.append(i + 1) 162 | # Update gene counts for the dataset 163 | # Add rows for dataset names 164 | if name not in genes_counts.index: 165 | genes_counts.loc[name] = 0 166 | # Ensure all dataset columns exist in genes_counts 167 | missing_cols = set(dataset.columns) - set(genes_counts.columns) 168 | for col in missing_cols: 169 | genes_counts[col] = 0 170 | # Assign dataset sums to the genes_counts DataFrame 171 | genes_counts.loc[name, dataset.columns] = dataset.sum(axis=0).values 172 | 173 | genes_counts.fillna(0, inplace=True) 174 | 175 | # Create the violin plots 176 | create_violin_plot( 177 | violin_data_reads, 178 | violin_data_pos, 179 | "Total reads", 180 | os.path.join(outdir, 
"violin_plot_reads.pdf"), 181 | ) 182 | create_violin_plot( 183 | violin_data_genes, 184 | violin_data_pos, 185 | "Total genes", 186 | os.path.join(outdir, "violin_plot_genes.pdf"), 187 | ) 188 | 189 | # Compute and plot PCA (sum gene counts) 190 | decomp_model = PCA(n_components=2, whiten=True, copy=True) 191 | reduced_data = decomp_model.fit_transform(np.log1p(genes_counts)) 192 | create_pca_plot( 193 | reduced_data, genes_counts.index.to_list(), "PCA (sum gene counts)", os.path.join(outdir, "pca.pdf") 194 | ) 195 | 196 | # Measure percentage of genes and correlations 197 | genes_similarities = pd.DataFrame(index=counts_table_files, columns=counts_table_files) 198 | genes_similarities.fillna(0, inplace=True) 199 | 200 | genes_correlations = pd.DataFrame(index=counts_table_files, columns=counts_table_files) 201 | genes_correlations.fillna(0, inplace=True) 202 | 203 | # Compute and create gene correlation plots 204 | n_col = len(counts_table_files) 205 | n_row = len(counts_table_files) 206 | plt.rcParams.update({"font.size": 6}) 207 | fig, ax = plt.subplots(n_col, n_row, sharex="col", sharey="row", figsize=(3 * n_col, 3 * n_row)) 208 | fig.subplots_adjust(hspace=0.4, wspace=0.4) 209 | for i, (d1, n1) in enumerate(datasets): 210 | genes_1 = set(d1.columns) 211 | n_genes_1 = float(len(genes_1)) 212 | common_to_all = (n_genes_1 - len(genes_1 - common_genes)) / n_genes_1 213 | print("{} shares {} genes with all the rest".format(n1, common_to_all)) 214 | for j, (d2, n2) in enumerate(datasets): 215 | genes_2 = set(d2.columns) 216 | common_to_d2 = (n_genes_1 - len(genes_1 - genes_2)) / n_genes_1 217 | common_d1_d2 = list(genes_1.intersection(genes_2)) 218 | sum_counts_1 = d1.loc[:, common_d1_d2].sum(axis=0) 219 | sum_counts_1 = np.log(sum_counts_1) if use_log else sum_counts_1 220 | sum_counts_2 = d2.loc[:, common_d1_d2].sum(axis=0) 221 | sum_counts_2 = np.log(sum_counts_2) if use_log else sum_counts_2 222 | genes_similarities.loc[n1, n2] = common_to_d2 223 | genes_correlations.loc[n1, n2] = pearsonr(sum_counts_1, sum_counts_2)[0] 224 | ax[i, j].scatter(sum_counts_1, sum_counts_2, s=5, c="blue", edgecolor="none") 225 | ax[i, j].set_xlabel(n1) 226 | ax[i, j].set_ylabel(n2) 227 | 228 | fig.savefig(os.path.join(outdir, "gene_correlations.png"), format="png", dpi=180) 229 | genes_similarities.to_csv(os.path.join(outdir, "gene_similarities.tsv"), sep="\t") 230 | genes_correlations.to_csv(os.path.join(outdir, "gene_correlations.tsv"), sep="\t") 231 | 232 | return 0 233 | 234 | 235 | if __name__ == "__main__": 236 | main() 237 | -------------------------------------------------------------------------------- /stpipeline/scripts/st_pipeline_run.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | ST Pipeline is a tool to process Spatial Transcriptomics raw datasets (FASTQ). 4 | The raw data is filtered, aligned to a genome, annotated to a reference, 5 | demultiplexed by array coordinates and then aggregated by counts 6 | that are not duplicates using the Unique Molecular Indentifiers (UMI). 7 | The output contains the counts matrix (TSV), a stats file, a log file 8 | and a BED file with all the transcripts. 9 | 10 | The ST Pipeline requires two FASTQ files, an IDs files (BARCODE, X, Y), 11 | the path to a STAR genome index, the path to a annotation file in GTF format 12 | an a dataset name. 
13 | 14 | The ST Pipeline has many parameters and options, you can see a description of them 15 | by typing : st_pipeline_run --help 16 | 17 | @Author Jose Fernandez Navarro 18 | """ 19 | 20 | import argparse 21 | import sys 22 | 23 | from stpipeline.core.pipeline import Pipeline 24 | 25 | 26 | def main() -> int: 27 | # Create pipeline object 28 | pipeline = Pipeline() 29 | 30 | # Create a parser 31 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 32 | 33 | # Parse parameters, sanity check and run the pipeline 34 | try: 35 | parser = pipeline.createParameters(parser) 36 | 37 | # Parse arguments 38 | options = parser.parse_args() 39 | pipeline.load_parameters(options) 40 | print("ST Pipeline, parameters loaded") 41 | 42 | # Create logger 43 | pipeline.createLogger() 44 | print("ST Pipeline, logger created") 45 | 46 | # Sanity check 47 | pipeline.sanityCheck() 48 | print("ST Pipeline, sanity check passed. Starting the run...") 49 | 50 | # Run the pipeline 51 | pipeline.run() 52 | print("ST Pipeline, run completed!") 53 | except Exception as e: 54 | print("Error running the pipeline") 55 | print(str(e)) 56 | return 1 57 | finally: 58 | pipeline.clean_filenames() 59 | 60 | return 0 61 | 62 | 63 | if __name__ == "__main__": 64 | sys.exit(main()) 65 | -------------------------------------------------------------------------------- /stpipeline/scripts/st_qa.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Script that performs a basic Quality Control analysis 4 | of a Spatial Transcriptomics dataset (matrix in TSV) format. 5 | 6 | The script writes stats and generates some plots in the folder 7 | where it is run. 8 | 9 | @Author Jose Fernandez Navarro 10 | """ 11 | 12 | import argparse 13 | import os.path 14 | import sys 15 | from typing import List 16 | 17 | import matplotlib.pyplot as plt 18 | import numpy as np 19 | import numpy.typing as npt 20 | import pandas as pd 21 | import seaborn as sns # type: ignore 22 | 23 | 24 | def scatter_plot( 25 | x_points: List[float], 26 | y_points: List[float], 27 | output: str, 28 | colors: npt.NDArray[np.int32], 29 | title: str = "Scatter", 30 | xlabel: str = "X", 31 | ylabel: str = "Y", 32 | ) -> None: 33 | """ 34 | Creates a scatter plot of a set of points (x, y) with corresponding color values 35 | and saves it as a PDF. 36 | 37 | Args: 38 | x_points: A 1D array of x coordinates. 39 | y_points: A 1D array of y coordinates. 40 | output: The file path where the plot will be saved. 41 | colors: A 1D array of color values for each point. 42 | title: The title of the plot. Defaults to "Scatter". 43 | xlabel: The label for the x-axis. Defaults to "X". 44 | ylabel: The label for the y-axis. Defaults to "Y". 45 | 46 | Returns: 47 | None 48 | 49 | Raises: 50 | RuntimeError: If an error occurs during plot creation or saving. 
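Example (a minimal sketch with made-up spot coordinates and counts):

    import numpy as np

    scatter_plot(
        x_points=[1.0, 2.0, 3.0],
        y_points=[1.0, 1.0, 2.0],
        output="heatmap_counts.pdf",
        colors=np.array([10, 250, 75], dtype=np.int32),
        title="Heatmap expression",
        xlabel="X",
        ylabel="Y",
    )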
51 | """ 52 | try: 53 | fig = plt.figure() 54 | plt.scatter(x_points, y_points, c=colors, cmap=plt.get_cmap("YlOrBr"), edgecolor="none", s=50) 55 | plt.xlabel(xlabel) 56 | plt.ylabel(ylabel) 57 | plt.gca().invert_yaxis() 58 | plt.title(title) 59 | plt.colorbar() 60 | plt.subplots_adjust(left=0.15) 61 | fig.savefig(output, format="pdf", dpi=300) 62 | except Exception as e: 63 | raise RuntimeError("Failed to create scatter plot") from e 64 | 65 | 66 | def histogram( 67 | x_points: npt.NDArray[np.int32], 68 | output: str, 69 | title: str = "Histogram", 70 | xlabel: str = "X", 71 | ylabel: str = "Y", 72 | nbins: int = 50, 73 | color: str = "blue", 74 | ) -> None: 75 | """ 76 | Generates a histogram with the given data points and saves it as a PDF. 77 | 78 | Args: 79 | x_points: A list of x coordinates to be plotted. 80 | output: The file path where the plot will be saved. 81 | title: The title of the plot. Defaults to "Histogram". 82 | xlabel: The label for the x-axis. Defaults to "X". 83 | ylabel: The label for the y-axis. Defaults to "Y". 84 | nbins: The number of bins for the histogram. Defaults to 50. 85 | color: The color of the histogram. Defaults to "blue". 86 | 87 | Returns: 88 | None 89 | 90 | Raises: 91 | RuntimeError: If an error occurs during plot creation or saving. 92 | """ 93 | try: 94 | fig = plt.figure() 95 | plt.hist(x_points, bins=nbins, facecolor=color) 96 | plt.xlabel(xlabel) 97 | plt.ylabel(ylabel) 98 | plt.title(title) 99 | plt.subplots_adjust(left=0.15) 100 | fig.savefig(output, format="pdf", dpi=300) 101 | except Exception as e: 102 | raise RuntimeError("Failed to create histogram") from e 103 | 104 | 105 | def main() -> None: 106 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 107 | parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format") 108 | parser.add_argument("--outdir", default=None, help="Path to the output directory") 109 | args = parser.parse_args() 110 | sys.exit(run(args.counts_matrix, args.outdir)) 111 | 112 | 113 | def run(input_data: str, outdir: str) -> int: 114 | if not os.path.isfile(input_data): 115 | raise RuntimeError("Error, input file not present or invalid format") 116 | 117 | if outdir is None or not os.path.isdir(outdir): 118 | outdir = os.getcwd() 119 | outdir = os.path.abspath(outdir) 120 | 121 | print(f"Output directory {outdir}") 122 | print(f"Input dataset {input_data}") 123 | 124 | # Parse the data 125 | counts_table = pd.read_table(input_data, sep="\t", header=0, index_col=0) 126 | 127 | # Get the basename 128 | input_name = os.path.basename(input_data).split(".")[0] 129 | 130 | # Compute some statistics 131 | total_barcodes = len(counts_table.index) 132 | total_transcripts = np.sum(counts_table.values, dtype=np.int32) 133 | number_genes = len(counts_table.columns) 134 | max_count = counts_table.max().max() 135 | min_count = counts_table.min().min() 136 | aggregated_spot_counts = counts_table.sum(axis=1).to_numpy() 137 | aggregated_gene_counts = (counts_table > 0).sum(axis=1).to_numpy() 138 | aggregated_gene_counts_1 = (counts_table > 1).sum(axis=1).to_numpy() 139 | aggregated_gene_counts_2 = (counts_table > 2).sum(axis=1).to_numpy() 140 | aggregated_gene_gene_counts = (counts_table > 0).sum(axis=0).to_numpy() 141 | aggregated_gene_gene_counts_1 = (counts_table > 1).sum(axis=0).to_numpy() 142 | aggregated_gene_gene_counts_2 = (counts_table > 2).sum(axis=0).to_numpy() 143 | max_genes_feature = aggregated_gene_counts.max() 144 | 
min_genes_feature = aggregated_gene_counts.min() 145 | max_reads_feature = aggregated_spot_counts.max() 146 | min_reads_feature = aggregated_spot_counts.min() 147 | average_reads_feature = np.mean(aggregated_spot_counts) 148 | average_genes_feature = np.mean(aggregated_gene_counts) 149 | std_reads_feature = np.std(aggregated_spot_counts) 150 | std_genes_feature = np.std(aggregated_gene_counts) 151 | 152 | # Generate heatmap plots 153 | histogram( 154 | aggregated_spot_counts, 155 | nbins=20, 156 | xlabel="#Reads", 157 | ylabel="#Spots", 158 | output=os.path.join(outdir, input_name + "_hist_reads_spot.pdf"), 159 | title="Reads per spot", 160 | ) 161 | histogram( 162 | aggregated_gene_counts, 163 | nbins=20, 164 | xlabel="#Genes", 165 | ylabel="#Spots", 166 | output=os.path.join(outdir, input_name + "_hist_genes_spot.pdf"), 167 | title="Genes per spot (>0)", 168 | ) 169 | histogram( 170 | aggregated_gene_counts_1, 171 | nbins=20, 172 | xlabel="#Genes", 173 | ylabel="#Spots", 174 | output=os.path.join(outdir, input_name + "_hist_genes_spots_1.pdf"), 175 | title="Genes per spot (>1)", 176 | ) 177 | histogram( 178 | aggregated_gene_counts_2, 179 | nbins=20, 180 | xlabel="#Genes", 181 | ylabel="#Spots", 182 | output=os.path.join(outdir, input_name + "_hist_genes_spots_2.pdf"), 183 | title="Genes per spot (>2)", 184 | ) 185 | histogram( 186 | aggregated_gene_gene_counts, 187 | nbins=20, 188 | xlabel="#Spots", 189 | ylabel="#Genes", 190 | output=os.path.join(outdir, input_name + "_hist_spots_gene.pdf"), 191 | title="Spots per gene (>0)", 192 | ) 193 | histogram( 194 | aggregated_gene_gene_counts_1, 195 | nbins=20, 196 | xlabel="#Spots", 197 | ylabel="#Genes", 198 | output=os.path.join(outdir, input_name + "_hist_spots_gene_1.pdf"), 199 | title="Spots per gene (>1)", 200 | ) 201 | histogram( 202 | aggregated_gene_gene_counts_2, 203 | nbins=20, 204 | xlabel="#Spots", 205 | ylabel="#Genes", 206 | output=os.path.join(outdir, input_name + "_hist_spots_gene_2.pdf"), 207 | title="Spots per gene (>2)", 208 | ) 209 | plt.clf() 210 | 211 | # Generate density plots 212 | sns.displot(aggregated_gene_counts, kind="kde", label="Counts > 0") 213 | sns.displot(aggregated_gene_counts_1, kind="kde", label="Counts > 1") 214 | sns_plot = sns.displot(aggregated_gene_counts_2, kind="kde", label="Counts > 2") 215 | fig = sns_plot._figure 216 | fig.savefig(os.path.join(outdir, input_name + "_density_genes_by_spot.pdf")) 217 | plt.clf() 218 | 219 | sns.displot(aggregated_gene_gene_counts, kind="kde", label="Counts > 0") 220 | sns.displot(aggregated_gene_gene_counts_1, kind="kde", label="Counts > 1") 221 | sns_plot = sns.displot(aggregated_gene_gene_counts_2, kind="kde", label="Counts > 2") 222 | fig = sns_plot._figure 223 | fig.savefig(os.path.join(outdir, input_name + "_density_spots_by_gene.pdf")) 224 | plt.clf() 225 | 226 | sns.scatterplot(x=aggregated_spot_counts, y=aggregated_gene_counts, label="Gene counts >0") 227 | sns.scatterplot(x=aggregated_spot_counts, y=aggregated_gene_counts_1, label="Gene counts >1") 228 | sns_plot = sns.scatterplot(x=aggregated_spot_counts, y=aggregated_gene_counts_2, label="Gene counts >2") 229 | fig = sns_plot.figure 230 | fig.savefig(os.path.join(outdir, input_name + "_scatter_reads_vs_genes.pdf")) 231 | plt.clf() 232 | 233 | # sns_plot = sns.jointplot(x=aggregated_spot_counts, 234 | # y=aggregated_gene_counts, kind='kde', color="skyblue") 235 | # fig = sns_plot.get_figure() 236 | # fig.savefig(input_name + "_join_density_reads_vs_genes.pdf") 237 | # plt.clf() 238 | 239 | qa_stats = [ 
240 | ("Number of spots: {}".format(total_barcodes) + "\n"), 241 | ("Number of reads present: {}".format(total_transcripts) + "\n"), 242 | ("Number of unique genes present: {}".format(number_genes) + "\n"), 243 | ("Max number of genes over all spots: {}".format(max_genes_feature) + "\n"), 244 | ("Min number of genes over all spots: {}".format(min_genes_feature) + "\n"), 245 | ("Max number of reads over all spots: {}".format(max_reads_feature) + "\n"), 246 | ("Min number of reads over all spots: {}".format(min_reads_feature) + "\n"), 247 | ("Average number genes per spots: {}".format(average_genes_feature) + "\n"), 248 | ("Average number reads per spot: {}".format(average_reads_feature) + "\n"), 249 | ("Std number genes per spot: {}".format(std_genes_feature) + "\n"), 250 | ("Std number reads per spot: {}".format(std_reads_feature) + "\n"), 251 | ("Max number of reads over all spots/genes: {}".format(max_count) + "\n"), 252 | ("Min number of reads over all spots/genes: {}".format(min_count) + "\n"), 253 | ] 254 | # Print stats to stdout and a file 255 | print("".join(qa_stats)) 256 | with open(os.path.join(outdir, f"{input_name}_qa_stats.txt"), "a") as outfile: 257 | outfile.write("".join(qa_stats)) 258 | 259 | # Generate scatter plots 260 | # Get the spot coordinates 261 | x_points = [] 262 | y_points = [] 263 | for spot in counts_table.index: 264 | tokens = spot.split("x") 265 | assert len(tokens) == 2 266 | y_points.append(float(tokens[1])) 267 | x_points.append(float(tokens[0])) 268 | scatter_plot( 269 | x_points, 270 | y_points, 271 | colors=aggregated_spot_counts, 272 | xlabel="X", 273 | ylabel="Y", 274 | output=os.path.join(outdir, input_name + "_heatmap_counts.pdf"), 275 | title="Heatmap expression", 276 | ) 277 | scatter_plot( 278 | x_points, 279 | y_points, 280 | colors=aggregated_gene_counts, 281 | xlabel="X", 282 | ylabel="Y", 283 | output=os.path.join(outdir, input_name + "_heatmap_genes.pdf"), 284 | title="Heatmap genes", 285 | ) 286 | 287 | return 0 288 | 289 | 290 | if __name__ == "__main__": 291 | main() 292 | -------------------------------------------------------------------------------- /stpipeline/version.py: -------------------------------------------------------------------------------- 1 | version_number = "2.0.0" 2 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | -------------------------------------------------------------------------------- /tests/annotation_test.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Unit-test the package annotation 4 | """ 5 | 6 | import os 7 | 8 | import HTSeq 9 | import pysam 10 | import pytest 11 | 12 | from stpipeline.core.annotation import annotateReads, ReadCounter, invert_strand 13 | 14 | 15 | @pytest.fixture 16 | def mock_gff_file(tmp_path): 17 | gff_content = ( 18 | "chr1\tsource\texon\t1\t1000\t.\t+\t.\tgene_id=gene1;\n" 19 | "chr1\tsource\texon\t1001\t2000\t.\t+\t.\tgene_id=gene2;\n" 20 | "chr2\tsource\texon\t1\t1500\t.\t-\t.\tgene_id=gene3;\n" 21 | ) 22 | gff_file = tmp_path / "mock.gff" 23 | with open(gff_file, "w") as f: 24 | f.write(gff_content) 25 | return str(gff_file) 26 | 27 | 28 | @pytest.fixture 29 | def mock_bam_file(tmp_path): 30 | bam_file = tmp_path / "mock.bam" 31 | header = { 32 | "HD": {"VN": "1.0"}, 33 | "SQ": [ 34 | {"LN": 2000, "SN": "chr1"}, 35 | {"LN": 1500, "SN": "chr2"}, 36 | ], 37 | } 38 | with pysam.AlignmentFile(bam_file, "wb", header=header) as f: 39 | for i in range(5): 40 | segment = pysam.AlignedSegment() 41 | segment.query_name = f"read{i}" 42 | segment.query_sequence = "ACTG" * 25 43 | segment.query_qualities = pysam.qualitystring_to_array("IIII" * 25) 44 | segment.flag = 0 45 | segment.reference_id = 0 46 | segment.reference_start = i * 100 47 | segment.set_tag("B1", i) 48 | segment.set_tag("B2", i * 2) 49 | segment.set_tag("XF", "gene1") 50 | segment.cigar = [(0, len(segment.query_sequence))] # 0: MATCH 51 | f.write(segment) 52 | return str(bam_file) 53 | 54 | 55 | def test_invert_strand(): 56 | iv = HTSeq.GenomicInterval("chr1", 0, 1000, "+") 57 | inverted = invert_strand(iv) 58 | assert inverted.strand == "-" 59 | 60 | iv = HTSeq.GenomicInterval("chr1", 0, 1000, "-") 61 | inverted = invert_strand(iv) 62 | assert inverted.strand == "+" 63 | 64 | with pytest.raises(ValueError): 65 | iv = HTSeq.GenomicInterval("chr1", 0, 1000, ".") 66 | invert_strand(iv) 67 | 68 | 69 | def test_count_reads_in_features(mock_bam_file, mock_gff_file, tmp_path): 70 | output_file = tmp_path / "output.bam" 71 | discarded_file = tmp_path / "discarded.bam" 72 | 73 | annotated_count = ReadCounter( 74 | sam_filename=mock_bam_file, 75 | gff_filename=mock_gff_file, 76 | samtype="bam", 77 | stranded="yes", 78 | overlap_mode="union", 79 | feature_type=["exon"], 80 | id_attribute="gene_id", 81 | minaqual=0, 82 | samout=str(output_file), 83 | include_non_annotated=True, 84 | htseq_no_ambiguous=False, 85 | output_discarded=str(discarded_file), 86 | ).count_reads() 87 | 88 | assert annotated_count > 0 89 | assert os.path.exists(output_file) 90 | assert os.path.exists(discarded_file) 91 | 92 | 93 | def test_annotate_reads(mock_bam_file, mock_gff_file, tmp_path): 94 | output_file = tmp_path / "output.bam" 95 | discarded_file = tmp_path / "discarded.bam" 96 | 97 | annotateReads( 98 | mappedReads=mock_bam_file, 99 | gtfFile=mock_gff_file, 100 | outputFile=str(output_file), 101 | outputDiscarded=str(discarded_file), 102 | mode="union", 103 | strandness="yes", 104 | htseq_no_ambiguous=False, 105 | include_non_annotated=True, 106 | feature_types=["exon"], 107 | ) 108 | 109 | assert os.path.exists(output_file) 110 | assert os.path.exists(discarded_file) 111 | -------------------------------------------------------------------------------- /tests/clustering_test.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Unit-test the package clustering 4 | """ 5 | 6 | from collections import Counter 7 | 8 | from stpipeline.common.clustering import ( 9 | _breadth_first_search, 10 | _get_adj_list_adjacency, 11 | _get_adj_list_directional_adjacency, 12 | _get_best_adjacency, 13 | _get_connected_components_adjacency, 14 | _reduce_clusters_adjacency, 15 | _reduce_clusters_directional_adjacency, 16 | _remove_umis, 17 | dedup_adj, 18 | dedup_dir_adj, 19 | dedup_hierarchical, 20 | ) 21 | 22 | 23 | def test_breadth_first_search(): 24 | adj_list = { 25 | "A": ["B", "C"], 26 | "B": ["A", "D"], 27 | "C": ["A"], 28 | "D": ["B"], 29 | } 30 | result = _breadth_first_search("A", adj_list) 31 | assert result == {"A", "B", "C", "D"} 32 | 33 | 34 | def test_remove_umis(): 35 | adj_list = { 36 | "A": ["B"], 37 | "B": ["A", "C"], 38 | "C": ["B"], 39 | } 40 | cluster = ["A", "B", "C"] 41 | nodes = ["C"] 42 | result = _remove_umis(adj_list, cluster, nodes) 43 | assert result == {"A"} 44 | 45 | 46 | def test_get_connected_components_adjacency(): 47 | adj_list = { 48 | "A": ["B"], 49 | "B": ["A", "C"], 50 | "C": ["B"], 51 | "D": [], 52 | } 53 | counts = Counter({"A": 3, "B": 2, "C": 1, "D": 4}) 54 | result = _get_connected_components_adjacency(adj_list, counts) 55 | result = [sorted(x) for x in result] 56 | assert len(result) == 2 57 | assert ["A", "B", "C"] in result 58 | assert ["D"] in result 59 | 60 | 61 | def test_get_adj_list_adjacency(): 62 | umis = ["AAAA", "AAAT", "AATT", "TTTT"] 63 | allowed_mismatches = 1 64 | result = _get_adj_list_adjacency(umis, allowed_mismatches) 65 | assert "AAAA" in result and "AAAT" in result["AAAA"] 66 | assert "AATT" not in result["AAAA"] 67 | 68 | 69 | def test_get_best_adjacency(): 70 | adj_list = { 71 | "A": ["B"], 72 | "B": ["A", "C"], 73 | "C": ["B"], 74 | } 75 | cluster = ["A", "B", "C"] 76 | counts = Counter({"A": 3, "B": 2, "C": 1}) 77 | result = _get_best_adjacency(cluster, adj_list, counts) 78 | assert result == ["A", "B"] 79 | 80 | 81 | def test_reduce_clusters_adjacency(): 82 | adj_list = { 83 | "A": ["B"], 84 | "B": ["A", "C"], 85 | "C": ["B"], 86 | } 87 | clusters = [{"A", "B", "C"}] 88 | counts = Counter({"A": 3, "B": 2, "C": 1}) 89 | result = _reduce_clusters_adjacency(adj_list, clusters, counts) 90 | assert result == ["A", "B"] 91 | 92 | 93 | def test_get_adj_list_directional_adjacency(): 94 | umis = ["AAAA", "AAAT", "AATT", "TTTT"] 95 | counts = Counter({"AAAA": 6, "AAAT": 3, "AATT": 2, "TTTT": 1}) 96 | allowed_mismatches = 1 97 | result = _get_adj_list_directional_adjacency(umis, counts, allowed_mismatches) 98 | assert "AAAA" in result and "AAAT" in result["AAAA"] 99 | assert "AATT" not in result["AAAA"] 100 | 101 | 102 | def test_reduce_clusters_directional_adjacency(): 103 | clusters = [["A", "B", "C"]] 104 | result = _reduce_clusters_directional_adjacency(clusters) 105 | assert result == ["C"] 106 | 107 | 108 | def test_dedup_hierarchical(): 109 | umis = ["AAAA", "AAAT", "AATT", "TTTT"] 110 | allowed_mismatches = 1 111 | result = dedup_hierarchical(umis, allowed_mismatches) 112 | assert len(result) <= len(umis) 113 | 114 | 115 | def test_dedup_adj(): 116 | umis = ["AAAA", "AAAT", "AATT", "TTTT"] 117 | allowed_mismatches = 1 118 | result = dedup_adj(umis, allowed_mismatches) 119 | assert len(result) <= len(umis) 120 | 121 | 122 | def test_dedup_dir_adj(): 123 | umis = ["AAAA", "AAAT", "AATT", "TTTT"] 124 | allowed_mismatches = 1 125 | result = dedup_dir_adj(umis, allowed_mismatches) 126 | assert len(result) <= len(umis) 127 | 
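# Illustrative sketch (not part of the original test suite): the three public
# helpers imported above share the same call shape, so a caller can pick the
# clustering strategy by name. The UMIs below are made up and only the upper
# bound on the result size is checked, since the exact grouping depends on the
# strategy.
def example_pick_dedup_strategy():
    strategies = {
        "adjacent": dedup_adj,
        "directional": dedup_dir_adj,
        "hierarchical": dedup_hierarchical,
    }
    umis = ["AAAA", "AAAT", "AATT", "TTTT"]
    for _, dedup_func in strategies.items():
        kept = dedup_func(umis, 1)  # 1 allowed mismatch when grouping UMIs
        assert len(kept) <= len(umis)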
-------------------------------------------------------------------------------- /tests/config/Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/tests/config/Homo_sapiens.GRCh38.dna.chromosome.19.fa.gz -------------------------------------------------------------------------------- /tests/dataset_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package dataset 4 | """ 5 | 6 | from collections import Counter 7 | from typing import List 8 | from unittest.mock import Mock 9 | 10 | import pysam 11 | import pytest 12 | 13 | from stpipeline.common.dataset import Transcript, compute_unique_umis, createDataset 14 | 15 | 16 | @pytest.fixture 17 | def mock_gff_file(tmp_path): 18 | gff_content = ( 19 | "chr1\tsource\tgene\t1\t1000\t.\t+\t.\tgene_id=gene1;\n" 20 | "chr1\tsource\tgene\t1001\t2000\t.\t+\t.\tgene_id=gene2;\n" 21 | "chr2\tsource\tgene\t1\t1500\t.\t-\t.\tgene_id=gene3;\n" 22 | ) 23 | gff_file = tmp_path / "mock.gff" 24 | with open(gff_file, "w") as f: 25 | f.write(gff_content) 26 | return str(gff_file) 27 | 28 | 29 | @pytest.fixture 30 | def mock_bam_file(tmp_path): 31 | bam_file = tmp_path / "mock.bam" 32 | with pysam.AlignmentFile(bam_file, "wb", header={"HD": {"VN": "1.0"}, "SQ": [{"LN": 2000, "SN": "chr1"}]}) as f: 33 | for i in range(5): 34 | segment = pysam.AlignedSegment() 35 | segment.query_name = f"read{i}" 36 | segment.query_sequence = "ACTG" * 25 37 | segment.query_qualities = pysam.qualitystring_to_array("IIII" * 25) 38 | segment.flag = 0 39 | segment.reference_id = 0 40 | segment.reference_start = i * 100 41 | segment.cigar = [(0, len(segment.query_sequence))] # 0: MATCH 42 | segment.set_tag("B1", i) 43 | segment.set_tag("B2", i * 2) 44 | segment.set_tag("XF", "gene1") 45 | segment.set_tag("B3", "UMI1") 46 | f.write(segment) 47 | return str(bam_file) 48 | 49 | 50 | # Test for Transcript Dataclass 51 | def test_transcript_dataclass(): 52 | transcript = Transcript( 53 | chrom="chr1", start=100, end=200, clear_name="test_transcript", mapping_quality=60, strand="+", umi="ATGC" 54 | ) 55 | 56 | assert transcript.chrom == "chr1" 57 | assert transcript.start == 100 58 | assert transcript.end == 200 59 | assert transcript.clear_name == "test_transcript" 60 | assert transcript.mapping_quality == 60 61 | assert transcript.strand == "+" 62 | assert transcript.umi == "ATGC" 63 | 64 | 65 | # Test for compute_unique_umis 66 | def mock_group_umi_func(umis: List[str], mismatches: int) -> List[str]: 67 | return umis[:1] # Simplified mock implementation for testing 68 | 69 | 70 | def test_compute_unique_umis(): 71 | transcripts = [ 72 | Transcript("chr1", 100, 200, "t1", 60, "+", "UMI1"), 73 | Transcript("chr1", 105, 205, "t2", 60, "+", "UMI2"), 74 | Transcript("chr1", 110, 210, "t3", 60, "+", "UMI3"), 75 | ] 76 | 77 | unique_transcripts = compute_unique_umis( 78 | transcripts, umi_counting_offset=10, umi_allowed_mismatches=1, group_umi_func=mock_group_umi_func 79 | ) 80 | 81 | assert len(unique_transcripts) == 1 82 | assert unique_transcripts[0].umi == "UMI1" 83 | 84 | 85 | # Test for createDataset with mocked dependencies 86 | def test_create_dataset(tmp_path, monkeypatch, mock_bam_file, mock_gff_file): 87 | # Mock inputs 88 | output_folder = tmp_path 89 | umi_cluster_algorithm = "hierarchical" 90 | 91 | t1 = Transcript("chr1", 
100, 200, "t1", 60, "+", "UMI1") 92 | t2 = Transcript("chr2", 300, 400, "t2", 60, "-", "UMI2") 93 | # Mock parse_unique_events 94 | mock_parse_unique_events = Mock(return_value=[("gene1", {(10, 10): [t1, t2]}), ("gene2", {(20, 20): [t1, t2]})]) 95 | monkeypatch.setattr("stpipeline.common.dataset.parse_unique_events", mock_parse_unique_events) 96 | 97 | # Mock dedup_hierarchical 98 | mock_dedup_compute_unique_umis = Mock(return_value=[t1]) 99 | monkeypatch.setattr("stpipeline.common.dataset.compute_unique_umis", mock_dedup_compute_unique_umis) 100 | 101 | stats = createDataset( 102 | input_file=mock_bam_file, 103 | output_folder=str(output_folder), 104 | gff_filename=mock_gff_file, 105 | umi_cluster_algorithm=umi_cluster_algorithm, 106 | umi_allowed_mismatches=1, 107 | umi_counting_offset=10, 108 | disable_umi=False, 109 | output_template="output", 110 | verbose=False, 111 | ) 112 | 113 | assert stats["genes_found"] == 2 114 | assert stats["reads_after_duplicates_removal"] == 2 115 | -------------------------------------------------------------------------------- /tests/fastq_utils_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package fastq_utils 4 | """ 5 | 6 | import pytest 7 | 8 | from stpipeline.common.fastq_utils import ( 9 | check_umi_template, 10 | has_sufficient_content, 11 | quality_trim_index, 12 | remove_adaptor, 13 | trim_quality, 14 | ) 15 | 16 | 17 | # Test for remove_adaptor 18 | def test_remove_adaptor(): 19 | sequence = "AGCTTAGCTTAGCTA" 20 | quality = "FFFFFFFFFFFFFFF" 21 | adaptor = "TAGCTT" 22 | trimmed_seq, trimmed_qual = remove_adaptor(sequence, quality, adaptor, missmatches=0) 23 | assert trimmed_seq == "AGCT" 24 | assert trimmed_qual == "FFFF" 25 | 26 | # Test no adaptor found 27 | trimmed_seq, trimmed_qual = remove_adaptor(sequence, quality, "GATTACA") 28 | assert trimmed_seq == sequence 29 | assert trimmed_qual == quality 30 | 31 | # Test with mismatches 32 | trimmed_seq, trimmed_qual = remove_adaptor(sequence, quality, "TAGCTT", missmatches=1) 33 | assert trimmed_seq == "AGCT" 34 | assert trimmed_qual == "FFFF" 35 | 36 | 37 | def test_quality_trim_index_basic(): 38 | sequence = "AGCTTAGCTTAGCTA" 39 | quality = "FFFFFFFFFFFFFFF" # ASCII 'F' -> Phred score 40 40 | cutoff = 20 41 | result = quality_trim_index(sequence, quality, cutoff) 42 | assert result == len(sequence) # No trimming, all bases are high quality 43 | 44 | 45 | def test_quality_trim_index_trimming(): 46 | sequence = "AGCTTAGCTTAGCTA" 47 | quality = "FFFFFF!!!!!!!!!" # Phred scores: 'F' (40), '!' (0) 48 | cutoff = 20 49 | result = quality_trim_index(sequence, quality, cutoff) 50 | assert result == 6 # Trims after the first 6 high-quality bases 51 | 52 | 53 | def test_quality_trim_index_low_quality_g(): 54 | sequence = "AGCTTAGCTTGGA" 55 | quality = "FFFFFF!!!!!!" # Phred scores: 'F' (40), '!' (0) 56 | cutoff = 20 57 | result = quality_trim_index(sequence, quality, cutoff) 58 | assert result == 6 # Trims after the first 6 high-quality bases 59 | 60 | 61 | def test_trim_quality_basic(): 62 | sequence = "AGCTTAGCTTAGCTA" 63 | quality = "FFFFFFFFFFFFFFF" # All high quality 64 | min_qual = 20 65 | min_length = 10 66 | trimmed_seq, trimmed_qual = trim_quality(sequence, quality, min_qual, min_length) 67 | assert trimmed_seq == "AGCTTAGCTTAGCTA" 68 | assert trimmed_qual == "FFFFFFFFFFFFFFF" 69 | 70 | 71 | def test_trim_quality_trimming(): 72 | sequence = "AGCTTAGCTTAGCTA" 73 | quality = "FFFFFF!!!!!!!!!" 
# Low-quality bases at the end 74 | min_qual = 20 75 | min_length = 5 76 | trimmed_seq, trimmed_qual = trim_quality(sequence, quality, min_qual, min_length) 77 | assert trimmed_seq == "AGCTTA" 78 | assert trimmed_qual == "FFFFFF" 79 | 80 | 81 | def test_trim_quality_below_min_length(): 82 | sequence = "AGCTTAGCTTAGCTA" 83 | quality = "FFFFFF!!!!!!!!!" # Low-quality bases at the end 84 | min_qual = 20 85 | min_length = 10 86 | trimmed_seq, trimmed_qual = trim_quality(sequence, quality, min_qual, min_length) 87 | assert trimmed_seq is None 88 | assert trimmed_qual is None 89 | 90 | 91 | def test_trim_quality_low_quality_g(): 92 | sequence = "AGCTTAGCTTGGA" 93 | quality = "FFFFFF!!!!!!" # Phred scores: 'F' (40), '!' (0) 94 | min_qual = 20 95 | min_length = 5 96 | trimmed_seq, trimmed_qual = trim_quality(sequence, quality, min_qual, min_length) 97 | assert trimmed_seq == "AGCTTA" 98 | assert trimmed_qual == "FFFFFF" 99 | 100 | 101 | def test_trim_quality_short(): 102 | min_qual = 20 103 | min_length = 10 104 | 105 | # Test with sequence shorter than min_length 106 | trimmed_seq, trimmed_qual = trim_quality("AGCTT", "FFFFF", min_qual, min_length) 107 | assert trimmed_seq is None 108 | assert trimmed_qual is None 109 | 110 | 111 | # Test for check_umi_template 112 | def test_check_umi_template(): 113 | umi = "ACGT1234" 114 | template = r"[ACGT]{4}\d{4}" 115 | assert check_umi_template(umi, template) is True 116 | 117 | umi = "ACGT12" 118 | assert check_umi_template(umi, template) is False 119 | 120 | 121 | # Test for has_sufficient_content 122 | def test_has_sufficient_content(): 123 | sequence = "ATATGGCCATAT" 124 | chars_to_count = "AT" 125 | threshold = 50.0 126 | assert has_sufficient_content(sequence, chars_to_count, threshold) is True 127 | 128 | threshold = 90.0 129 | assert has_sufficient_content(sequence, chars_to_count, threshold) is False 130 | 131 | # Test empty sequence 132 | with pytest.raises(ValueError): 133 | has_sufficient_content("", chars_to_count, threshold) 134 | 135 | # Test empty chars_to_count 136 | with pytest.raises(ValueError): 137 | has_sufficient_content(sequence, "", threshold) 138 | -------------------------------------------------------------------------------- /tests/filter_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package filter 4 | """ 5 | 6 | from unittest.mock import Mock, patch 7 | 8 | import dnaio 9 | import pytest 10 | 11 | from stpipeline.common.filter import bam_header, filter_input_data 12 | 13 | 14 | def generate_test_fastq(filepath, records): 15 | """ 16 | Generates a mock FASTQ file for testing. 17 | 18 | Args: 19 | filepath (str): Path to the file to create. 20 | records (list): List of tuples (header, sequence, quality) for the FASTQ records. 
21 | """ 22 | with open(filepath, "w") as f: 23 | for header, sequence, quality in records: 24 | f.write(f"@{header}\n{sequence}\n+\n{quality}\n") 25 | 26 | 27 | @pytest.fixture 28 | def setup_fastq_files(tmp_path): 29 | fw_records = [ 30 | ("read1", "ACTGACTGACTGACTGACTGACTG", "IIIIIIIIIIIIIIIIIIIIIIII"), 31 | ("read2", "TTTTTTTTTTTTTTTTTTTTTTTT", "IIIIIIIIIIIIIIIIIIIIIIII"), 32 | ("read3", "GGGGGGGGGGGGGGGGGGGGGGGG", "IIIIIIIIIIIIIIIIIIIIIIII"), 33 | ("read4", "CCCCCCCCCCCCCCCCCCCCCCCC", "IIIIIIIIIIIIIIIIIIIIIIII"), 34 | ("read5", "ACTGACTGACTGACTGACTGACTG", "!!!!IIIIIIIIIIIIIIIIIIII"), # Low-quality UMI 35 | ("read6", "ACTGACTGACTGACTGACTGACTG", "!!!!!!!!!!!!!!IIIIIIIIII"), # Too short after trimming 36 | ] 37 | rv_records = [ 38 | ("read1", "ACTGACTGACTGACTGACTGACTG", "IIIIIIIIIIIIIIIIIIIIIIII"), 39 | ("read2", "TTTTTTTTTTTTTTTTTTTTTTTT", "IIIIIIIIIIIIIIIIIIIIIIII"), 40 | ("read3", "GGGGGGGGGGGGGGGGGGGGGGGG", "IIIIIIIIIIIIIIIIIIIIIIII"), 41 | ("read4", "CCCCCCCCCCCCCCCCCCCCCCCC", "IIIIIIIIIIIIIIIIIIIIIIII"), 42 | ("read5", "ACTGACTGACTGACTGACTGACTG", "!!!!IIIIIIIIIIIIIIIIIIII"), # Low-quality UMI 43 | ("read6", "ACTGACTGACTGACTGACTGACTG", "!!!!!!!!!!!!!!IIIIIIIIII"), # Too short after trimming 44 | ] 45 | fw_file = tmp_path / "fw.fastq" 46 | rv_file = tmp_path / "rv.fastq" 47 | 48 | generate_test_fastq(fw_file, fw_records) 49 | generate_test_fastq(rv_file, rv_records) 50 | 51 | return str(fw_file), str(rv_file) 52 | 53 | 54 | @patch("stpipeline.common.filter.pysam.AlignmentFile") 55 | def test_filter_input_data(mock_alignment_file, setup_fastq_files, tmp_path): 56 | fw_file, rv_file = setup_fastq_files 57 | out_file = tmp_path / "output.bam" 58 | out_file_discarded = tmp_path / "discarded.fastq" 59 | 60 | mock_alignment_file.return_value.__enter__.return_value = Mock() 61 | 62 | total_reads, remaining_reads = filter_input_data( 63 | fw_file=fw_file, 64 | rv_file=rv_file, 65 | out_file=str(out_file), 66 | out_file_discarded=str(out_file_discarded), 67 | barcode_length=10, 68 | start_position=0, 69 | filter_AT_content=50.0, 70 | filter_GC_content=50.0, 71 | umi_start=0, 72 | umi_end=4, 73 | min_qual=20, 74 | min_length=20, 75 | polyA_min_distance=5, 76 | polyT_min_distance=5, 77 | polyG_min_distance=5, 78 | polyC_min_distance=5, 79 | polyN_min_distance=5, 80 | qual64=False, 81 | umi_filter=True, 82 | umi_filter_template=r"[ACGT]{4}", 83 | umi_quality_bases=1, 84 | adaptor_missmatches=2, 85 | overhang=2, 86 | disable_umi=False, 87 | disable_barcode=False, 88 | disable_trimming=False, 89 | ) 90 | 91 | assert total_reads == 6 92 | assert remaining_reads < total_reads 93 | mock_alignment_file.assert_called_once_with(str(out_file), "wb", header=bam_header) -------------------------------------------------------------------------------- /tests/gff_reader_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package gff_reader 4 | """ 5 | 6 | import gzip 7 | 8 | import pytest 9 | 10 | from stpipeline.common.gff_reader import gff_lines, gff_parse 11 | 12 | 13 | # Mock GTF/GFF data for testing 14 | def generate_mock_gff_file(file_path: str, content: str): 15 | """ 16 | Generates a mock GFF file for testing. 17 | 18 | Args: 19 | file_path (str): Path to the file to create. 20 | content (str): Content to write into the file.
21 | """ 22 | with open(file_path, "w") as f: 23 | f.write(content) 24 | 25 | 26 | @pytest.fixture 27 | def mock_gff_file(tmp_path): 28 | content = ( 29 | "##gff-version 3\n" 30 | "chr1\tsource\tfeature\t100\t200\t.\t+\t.\tID=gene1;Name=Gene1\n" 31 | "chr1\tsource\tfeature\t300\t400\t.\t-\t.\tID=gene2;Name=Gene2\n" 32 | "chr2\tsource\tfeature\t500\t600\t.\t+\t.\tID=gene3;Name=Gene3\n" 33 | ) 34 | file_path = tmp_path / "test.gff" 35 | generate_mock_gff_file(file_path, content) 36 | return str(file_path) 37 | 38 | 39 | @pytest.fixture 40 | def mock_gzipped_gff_file(tmp_path): 41 | content = ( 42 | "##gff-version 3\n" 43 | "chr1\tsource\tfeature\t100\t200\t.\t+\t.\tID=gene1;Name=Gene1\n" 44 | "chr1\tsource\tfeature\t300\t400\t.\t-\t.\tID=gene2;Name=Gene2\n" 45 | "chr2\tsource\tfeature\t500\t600\t.\t+\t.\tID=gene3;Name=Gene3\n" 46 | ) 47 | file_path = tmp_path / "test.gff.gz" 48 | with gzip.open(file_path, "wt") as f: 49 | f.write(content) 50 | return str(file_path) 51 | 52 | 53 | def test_gff_lines_plain(mock_gff_file): 54 | parsed_lines = list(gff_lines(mock_gff_file)) 55 | assert len(parsed_lines) == 3 56 | assert parsed_lines[0]["seqname"] == "chr1" 57 | assert parsed_lines[0]["start"] == "100" 58 | assert parsed_lines[0]["end"] == "200" 59 | assert parsed_lines[0]["ID"] == "gene1" 60 | 61 | 62 | def test_gff_lines_gzipped(mock_gzipped_gff_file): 63 | parsed_lines = list(gff_lines(mock_gzipped_gff_file)) 64 | assert len(parsed_lines) == 3 65 | assert parsed_lines[0]["seqname"] == "chr1" 66 | assert parsed_lines[0]["start"] == "100" 67 | assert parsed_lines[0]["end"] == "200" 68 | assert parsed_lines[0]["ID"] == "gene1" 69 | 70 | 71 | def test_gff_parse(): 72 | line = "chr1\tsource\tfeature\t100\t200\t.\t+\t.\tID=gene1;Name=Gene1" 73 | parsed = gff_parse(line) 74 | assert parsed["seqname"] == "chr1" 75 | assert parsed["start"] == "100" 76 | assert parsed["end"] == "200" 77 | assert parsed["ID"] == "gene1" 78 | assert parsed["Name"] == "Gene1" 79 | -------------------------------------------------------------------------------- /tests/input/arrayjet_1002/testdata_R1.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/tests/input/arrayjet_1002/testdata_R1.fastq.gz -------------------------------------------------------------------------------- /tests/input/arrayjet_1002/testdata_R2.fastq.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/tests/input/arrayjet_1002/testdata_R2.fastq.gz -------------------------------------------------------------------------------- /tests/mapping_test.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Unit-test the package mapping 4 | """ 5 | 6 | import subprocess 7 | from unittest.mock import MagicMock, mock_open, patch 8 | 9 | from stpipeline.core.mapping import alignReads, barcodeDemultiplexing 10 | 11 | 12 | def test_alignReads(): 13 | mock_log_content = """\ 14 | Number of input reads 100000 15 | Average input read length 150 16 | Uniquely mapped reads number 90000 17 | Uniquely mapped reads 90 18 | Number of reads mapped to multiple loci 5000 19 | % of reads mapped to multiple loci 5 20 | % of reads unmapped: too short 5 21 | """ 22 | 23 | with ( 24 | patch("subprocess.Popen") as mock_popen, 25 | patch("stpipeline.core.mapping.file_ok", return_value=True), 26 | patch("stpipeline.core.mapping.shutil.move") as mock_shutil_move, 27 | patch("stpipeline.core.mapping.open", mock_open(read_data=mock_log_content)) as mock_open_file, 28 | ): 29 | # Mock the subprocess to simulate STAR execution 30 | mock_process = MagicMock() 31 | mock_process.communicate.return_value = (b"", b"") 32 | mock_process.returncode = 0 33 | mock_popen.return_value = mock_process 34 | 35 | # Mock the log file content 36 | mock_open_file.return_value.__enter__.return_value.read.return_value = mock_log_content 37 | 38 | # Call the function 39 | total_reads = alignReads( 40 | reverse_reads="test.bam", 41 | ref_map="ref", 42 | outputFile="out.bam", 43 | annotation=None, 44 | outputFolder="output", 45 | trimReverse=10, 46 | invTrimReverse=10, 47 | cores=4, 48 | min_intron_size=20, 49 | max_intron_size=1000, 50 | disable_multimap=False, 51 | diable_softclipping=False, 52 | twopassMode=True, 53 | min_length=50, 54 | include_non_mapped=True, 55 | star_genome_loading="NoSharedMemory", 56 | star_sort_mem_limit=64000000, 57 | ) 58 | 59 | # Assertions for subprocess call 60 | mock_popen.assert_called_once() 61 | mock_shutil_move.assert_called_once_with("output/Aligned.sortedByCoord.out.bam", "out.bam") 62 | 63 | expected_args = [ 64 | "STAR", 65 | "--genomeDir", 66 | "ref", 67 | "--readFilesIn", 68 | "test.bam", 69 | "--outFileNamePrefix", 70 | "output/", 71 | "--clip3pNbases", 72 | "10", 73 | "--clip5pNbases", 74 | "10", 75 | "--runThreadN", 76 | "4", 77 | "--outFilterType", 78 | "Normal", 79 | "--outSAMtype", 80 | "BAM", 81 | "SortedByCoordinate", 82 | "--alignEndsType", 83 | "Local", 84 | "--outSAMorder", 85 | "Paired", 86 | "--outSAMprimaryFlag", 87 | "OneBestScore", 88 | "--outFilterMultimapNmax", 89 | "20", 90 | "--alignIntronMin", 91 | "20", 92 | "--alignIntronMax", 93 | "1000", 94 | "--outFilterMatchNmin", 95 | "50", 96 | "--genomeLoad", 97 | "NoSharedMemory", 98 | "--limitBAMsortRAM", 99 | "64000000", 100 | "--readFilesType", 101 | "SAM", 102 | "SE", 103 | "--readFilesCommand", 104 | "samtools", 105 | "view", 106 | "-h", 107 | "--twopassMode", 108 | "Basic", 109 | "--outSAMunmapped", 110 | "Within", 111 | ] 112 | 113 | mock_popen.assert_called_once_with( 114 | expected_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, shell=False 115 | ) 116 | 117 | # Ensure the log file was read 118 | mock_open_file.assert_called_once_with("output/Log.final.out", "r") 119 | 120 | # Log file parsing validation 121 | assert total_reads == 95000 122 | 123 | 124 | def test_barcodeDemultiplexing(tmpdir): 125 | with ( 126 | patch("subprocess.Popen") as mock_popen, 127 | patch("stpipeline.core.mapping.open", mock_open()) as mock_open_file, 128 | patch("os.path.isfile", return_value=True), 129 | patch("stpipeline.core.mapping.file_ok", return_value=True), 130 | ): 131 | mock_process = 
MagicMock() 132 | mock_process.communicate.return_value = (b"Total reads: 100\nTotal reads written: 80", b"") 133 | mock_process.returncode = 0 134 | mock_popen.return_value = mock_process 135 | 136 | total_reads = barcodeDemultiplexing( 137 | reads="reads.bam", 138 | idFile="barcodes.tsv", 139 | mismatches=1, 140 | kmer=8, 141 | over_hang=2, 142 | taggd_metric="Levenshtein", 143 | taggd_multiple_hits_keep_one=True, 144 | taggd_trim_sequences=[1, 2, 3], 145 | cores=4, 146 | outputFilePrefix=str(tmpdir), 147 | keep_discarded_files=False, 148 | taggd_chunk_size=100, 149 | ) 150 | 151 | expected_args = [ 152 | "taggd_demultiplex", 153 | "--max-edit-distance", 154 | "1", 155 | "--k", 156 | "8", 157 | "--barcode-tag", 158 | "B0", 159 | "--homopolymer-filter", 160 | "0", 161 | "--subprocesses", 162 | "4", 163 | "--metric", 164 | "Levenshtein", 165 | "--chunk-size", 166 | "100", 167 | "--overhang", 168 | "2", 169 | "--trim-sequences", 170 | "1", 171 | "2", 172 | "3", 173 | "--multiple-hits-keep-one", 174 | "--no-unmatched-output", 175 | "--no-ambiguous-output", 176 | "--no-results-output", 177 | "barcodes.tsv", 178 | "reads.bam", 179 | str(tmpdir), 180 | ] 181 | 182 | mock_popen.assert_called_once_with( 183 | expected_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, shell=False 184 | ) 185 | assert str(tmpdir) + "_log.txt" in mock_open_file.call_args[0][0] 186 | assert total_reads == 80 187 | -------------------------------------------------------------------------------- /tests/sam_utils_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package sam_utils 4 | """ 5 | 6 | import os 7 | from unittest.mock import Mock, patch 8 | 9 | import pysam 10 | import pytest 11 | 12 | from stpipeline.common.sam_utils import convert_to_AlignedSegment, merge_bam, split_bam 13 | 14 | 15 | @pytest.fixture 16 | def mock_bam_files(tmp_path): 17 | """Creates mock BAM files for testing.""" 18 | bam_files = [] 19 | header = {"HD": {"VN": "1.0"}, "SQ": [{"LN": 2000, "SN": "chr1"}]} 20 | 21 | for i in range(3): 22 | bam_file = tmp_path / f"mock{i}.bam" 23 | bam_files.append(str(bam_file)) 24 | with pysam.AlignmentFile(bam_file, "wb", header=header) as f: 25 | for j in range(5): 26 | segment = pysam.AlignedSegment() 27 | segment.query_name = f"read{i}_{j}" 28 | segment.query_sequence = "ACTG" * 25 29 | segment.query_qualities = pysam.qualitystring_to_array("IIII" * 25) 30 | segment.flag = 0 31 | segment.reference_id = 0 32 | segment.reference_start = j * 100 33 | segment.cigar = [(0, len(segment.query_sequence))] # Match 34 | f.write(segment) 35 | 36 | return bam_files 37 | 38 | 39 | @pytest.fixture 40 | def mock_bam_file_for_split(tmp_path): 41 | """Creates a mock BAM file for testing split_bam.""" 42 | bam_file = tmp_path / "mock.bam" 43 | header = {"HD": {"VN": "1.0"}, "SQ": [{"LN": 2000, "SN": "chr1"}]} 44 | 45 | with pysam.AlignmentFile(bam_file, "wb", header=header) as f: 46 | for i in range(20): 47 | segment = pysam.AlignedSegment() 48 | segment.query_name = f"read{i}" 49 | segment.query_sequence = "ACTG" * 25 50 | segment.query_qualities = pysam.qualitystring_to_array("IIII" * 25) 51 | segment.flag = 0 52 | segment.reference_id = 0 53 | segment.reference_start = i * 100 54 | segment.cigar = [(0, len(segment.query_sequence))] # Match 55 | f.write(segment) 56 | 57 | return str(bam_file) 58 | 59 | 60 | def test_split_bam(mock_bam_file_for_split, tmp_path): 61 | """Test the split_bam function with a mocked 
BAM file.""" 62 | threads = 4 63 | temp_dir = tmp_path / "split_bam_output" 64 | temp_dir.mkdir() 65 | 66 | # Call the split_bam function 67 | split_files = split_bam(mock_bam_file_for_split, str(temp_dir), threads) 68 | 69 | # Verify the number of split files 70 | assert len(split_files) == threads 71 | 72 | # Verify the contents of each split file 73 | total_records = 0 74 | for part, split_file in enumerate(split_files): 75 | with pysam.AlignmentFile(split_file, "rb") as f: 76 | records = list(f.fetch(until_eof=True)) 77 | total_records += len(records) 78 | 79 | # Assert each file has roughly equal reads 80 | if part < threads - 1: 81 | assert len(records) == 5 # 20 reads / 4 threads = 5 reads per part 82 | 83 | # Verify the total records match the original file 84 | assert total_records == 20 85 | 86 | 87 | @patch("stpipeline.common.sam_utils.pysam.AlignmentFile") 88 | def test_convert_to_aligned_segment(mock_alignment_file): 89 | header = "read1" 90 | sequence = "ACTGACTGACTG" 91 | quality = "IIIIIIIIIIII" 92 | barcode_sequence = "ACGT" 93 | umi_sequence = "TGCA" 94 | 95 | aligned_segment = convert_to_AlignedSegment(header, sequence, quality, barcode_sequence, umi_sequence) 96 | 97 | assert aligned_segment.query_name == "read1" 98 | assert aligned_segment.query_sequence == sequence 99 | assert aligned_segment.query_qualities.tolist() == [40] * len(sequence) 100 | assert aligned_segment.get_tag("B0") == barcode_sequence 101 | assert aligned_segment.get_tag("B3") == umi_sequence 102 | 103 | 104 | def test_merge_bam(mock_bam_files, tmp_path): 105 | merged_file = tmp_path / "merged.bam" 106 | 107 | # Call the merge_bam function 108 | total_records = merge_bam(str(merged_file), mock_bam_files) 109 | 110 | # Assert the total record count 111 | assert total_records == 15 # 3 files x 5 records each 112 | 113 | # Verify the contents of the merged BAM file 114 | with pysam.AlignmentFile(merged_file, "rb") as f: 115 | records = list(f.fetch(until_eof=True)) 116 | assert len(records) == 15 117 | for i, record in enumerate(records): 118 | assert record.query_name == f"read{i // 5}_{i % 5}" 119 | assert record.query_sequence == "ACTG" * 25 120 | assert record.flag == 0 121 | assert record.reference_start == (i % 5) * 100 122 | -------------------------------------------------------------------------------- /tests/saturation_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package saturation 4 | """ 5 | 6 | import os 7 | from unittest.mock import patch 8 | 9 | import pysam 10 | import pytest 11 | 12 | from stpipeline.common.saturation import ( 13 | _cleanup_files, 14 | _compute_saturation_metrics, 15 | _determine_saturation_points, 16 | _generate_subsamples, 17 | _write_subsamples_to_files, 18 | ) 19 | 20 | 21 | @pytest.fixture 22 | def mock_bam_file(tmp_path): 23 | """ 24 | Creates a mock BAM file for testing. 25 | 26 | Args: 27 | tmp_path: pytest fixture for temporary directory. 28 | 29 | Returns: 30 | Path to the mock BAM file and number of reads. 
31 | """ 32 | bam_path = tmp_path / "mock.bam" 33 | with pysam.AlignmentFile( 34 | bam_path, "wb", header={"HD": {"VN": "1.0"}, "SQ": [{"LN": 1000, "SN": "chr1"}]} 35 | ) as bam_file: 36 | for i in range(100): 37 | segment = pysam.AlignedSegment() 38 | segment.query_name = f"read{i}" 39 | segment.query_sequence = "ACTG" * 25 40 | segment.query_qualities = pysam.qualitystring_to_array("IIII" * 25) 41 | segment.flag = 0 42 | segment.reference_id = 0 43 | segment.reference_start = i * 10 44 | segment.cigar = [(0, len(segment.query_sequence))] # 0: MATCH 45 | segment.set_tag("B1", i) 46 | segment.set_tag("B2", i * 2) 47 | segment.set_tag("XF", "gene1") 48 | segment.set_tag("B3", "UMI1") 49 | bam_file.write(segment) 50 | return str(bam_path), 100 51 | 52 | 53 | def test_determine_saturation_points(): 54 | nreads = 10000 55 | saturation_points = [100, 500, 1000, 20000] 56 | points = _determine_saturation_points(nreads, saturation_points) 57 | assert points == [100, 500, 1000] 58 | 59 | # Test with None 60 | points = _determine_saturation_points(nreads, None) 61 | assert len(points) > 0 62 | assert all(p < nreads for p in points) 63 | 64 | 65 | def test_generate_subsamples(mock_bam_file, tmp_path): 66 | bam_file, nreads = mock_bam_file 67 | saturation_points = [10, 50, 100] 68 | temp_folder = tmp_path 69 | 70 | files, file_names, subsampling = _generate_subsamples(nreads, bam_file, saturation_points, temp_folder) 71 | 72 | assert len(files) == len(saturation_points) 73 | assert len(file_names) == len(saturation_points) 74 | for spoint in saturation_points: 75 | assert spoint in subsampling 76 | assert len(subsampling[spoint]) == spoint 77 | 78 | # Cleanup 79 | for file in files.values(): 80 | file.close() 81 | 82 | 83 | def test_write_subsamples_to_files(mock_bam_file, tmp_path): 84 | bam_file, nreads = mock_bam_file 85 | saturation_points = [10, 50, 100] 86 | temp_folder = tmp_path 87 | 88 | files, file_names, subsampling = _generate_subsamples(nreads, bam_file, saturation_points, temp_folder) 89 | _write_subsamples_to_files(files, subsampling, bam_file, saturation_points) 90 | 91 | for spoint, file_name in file_names.items(): 92 | with pysam.AlignmentFile(file_name, "rb") as f: 93 | count = sum(1 for _ in f.fetch(until_eof=True)) 94 | assert count == spoint 95 | 96 | # Cleanup 97 | for file_name in file_names.values(): 98 | os.remove(file_name) 99 | 100 | 101 | def test_compute_saturation_metrics(mock_bam_file, tmp_path): 102 | bam_file, nreads = mock_bam_file 103 | saturation_points = [10, 50, 100] 104 | temp_folder = tmp_path 105 | gff_filename = tmp_path / "mock.gff" 106 | gff_filename.write_text("chr1\tsource\tfeature\t1\t1000\t.\t+\t.\tgene_id=gene1\n") 107 | 108 | files, file_names, subsampling = _generate_subsamples(nreads, bam_file, saturation_points, temp_folder) 109 | _write_subsamples_to_files(files, subsampling, bam_file, saturation_points) 110 | 111 | with patch("stpipeline.common.dataset.createDataset") as mock_createDataset: 112 | mock_createDataset.return_value = { 113 | "reads_after_duplicates_removal": 10, 114 | "genes_found": 5, 115 | "average_gene_feature": 2.5, 116 | "average_reads_feature": 1.0, 117 | } 118 | 119 | results = _compute_saturation_metrics( 120 | file_names, 121 | saturation_points, 122 | str(gff_filename), 123 | "AdjacentBi", 124 | True, 125 | 10, 126 | False, 127 | str(temp_folder), 128 | "test_exp", 129 | ) 130 | 131 | assert len(results["reads"]) == len(saturation_points) 132 | assert len(results["genes"]) == len(saturation_points) 133 | 134 | # Cleanup 
135 | for file_name in file_names.values(): 136 | os.remove(file_name) 137 | 138 | 139 | def test_cleanup_files(tmp_path): 140 | temp_files = [tmp_path / f"temp_file_{i}.bam" for i in range(5)] 141 | for file in temp_files: 142 | file.touch() 143 | _cleanup_files({i: str(file) for i, file in enumerate(temp_files)}) 144 | for file in temp_files: 145 | assert not os.path.exists(file) 146 | -------------------------------------------------------------------------------- /tests/stats_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package stats 4 | """ 5 | 6 | import json 7 | import os 8 | 9 | import pytest 10 | 11 | from stpipeline.common.stats import Stats 12 | 13 | 14 | @pytest.fixture 15 | def sample_stats(): 16 | return Stats( 17 | input_reads_forward=1000, 18 | input_reads_reverse=900, 19 | reads_after_trimming_forward=800, 20 | reads_after_trimming_reverse=850, 21 | reads_after_rRNA_trimming=700, 22 | reads_after_mapping=600, 23 | reads_after_annotation=550, 24 | reads_after_demultiplexing=500, 25 | reads_after_duplicates_removal=450, 26 | genes_found=100, 27 | duplicates_found=50, 28 | pipeline_version="1.0", 29 | mapper_tool="bwa", 30 | annotation_tool="gffread", 31 | demultiplex_tool="umi_tools", 32 | input_parameters=["--trim", "--map"], 33 | max_genes_feature=50, 34 | min_genes_feature=5, 35 | max_reads_feature=200, 36 | min_reads_feature=10, 37 | average_gene_feature=25.0, 38 | average_reads_feature=100.0, 39 | ) 40 | 41 | 42 | def test_stats_str(sample_stats): 43 | stats_str = str(sample_stats) 44 | assert "input_reads_forward: 1000" in stats_str 45 | assert "pipeline_version: 1.0" in stats_str 46 | assert "average_reads_feature: 100.0" in stats_str 47 | 48 | 49 | def test_write_json(sample_stats, tmp_path): 50 | json_file = tmp_path / "stats.json" 51 | sample_stats.write_json(str(json_file)) 52 | 53 | assert os.path.exists(json_file) 54 | 55 | with open(json_file, "r") as file: 56 | data = json.load(file) 57 | assert data["input_reads_forward"] == 1000 58 | assert data["pipeline_version"] == "1.0" 59 | assert data["average_reads_feature"] == 100.0 60 | 61 | 62 | def test_from_json(tmp_path): 63 | json_file = tmp_path / "stats.json" 64 | sample_data = { 65 | "input_reads_forward": 1000, 66 | "input_reads_reverse": 900, 67 | "reads_after_trimming_forward": 800, 68 | "reads_after_trimming_reverse": 850, 69 | "reads_after_rRNA_trimming": 700, 70 | "reads_after_mapping": 600, 71 | "reads_after_annotation": 550, 72 | "reads_after_demultiplexing": 500, 73 | "reads_after_duplicates_removal": 450, 74 | "genes_found": 100, 75 | "duplicates_found": 50, 76 | "pipeline_version": "1.0", 77 | "mapper_tool": "bwa", 78 | "annotation_tool": "gffread", 79 | "demultiplex_tool": "umi_tools", 80 | "input_parameters": ["--trim", "--map"], 81 | "max_genes_feature": 50, 82 | "min_genes_feature": 5, 83 | "max_reads_feature": 200, 84 | "min_reads_feature": 10, 85 | "average_gene_feature": 25.0, 86 | "average_reads_feature": 100.0, 87 | } 88 | 89 | with open(json_file, "w") as file: 90 | json.dump(sample_data, file) 91 | 92 | stats = Stats.from_json(str(json_file)) 93 | assert stats.input_reads_forward == 1000 94 | assert stats.pipeline_version == "1.0" 95 | assert stats.average_reads_feature == 100.0 96 | -------------------------------------------------------------------------------- /tests/unique_events_parser_test.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | """ 3 | Unit-test the package unique_events_parser 4 | """ 5 | 6 | import pysam 7 | import pytest 8 | 9 | from stpipeline.common.unique_events_parser import GeneBuffer, Transcript, parse_unique_events 10 | 11 | 12 | @pytest.fixture 13 | def mock_gff_file(tmp_path): 14 | gff_content = ( 15 | "chr1\tsource\tgene\t1\t1000\t.\t+\t.\tgene_id=gene1;\n" 16 | "chr1\tsource\tgene\t1001\t2000\t.\t+\t.\tgene_id=gene2;\n" 17 | "chr2\tsource\tgene\t1\t1500\t.\t-\t.\tgene_id=gene3;\n" 18 | ) 19 | gff_file = tmp_path / "mock.gff" 20 | with open(gff_file, "w") as f: 21 | f.write(gff_content) 22 | return str(gff_file) 23 | 24 | 25 | @pytest.fixture 26 | def mock_bam_file(tmp_path): 27 | bam_file = tmp_path / "mock.bam" 28 | with pysam.AlignmentFile(bam_file, "wb", header={"HD": {"VN": "1.0"}, "SQ": [{"LN": 2000, "SN": "chr1"}]}) as f: 29 | for i in range(5): 30 | segment = pysam.AlignedSegment() 31 | segment.query_name = f"read{i}" 32 | segment.query_sequence = "ACTG" * 25 33 | segment.query_qualities = pysam.qualitystring_to_array("IIII" * 25) 34 | segment.flag = 0 35 | segment.reference_id = 0 36 | segment.reference_start = i * 100 37 | segment.cigar = [(0, len(segment.query_sequence))] # 0: MATCH 38 | segment.set_tag("B1", i) 39 | segment.set_tag("B2", i * 2) 40 | segment.set_tag("XF", "gene1") 41 | segment.set_tag("B3", "UMI1") 42 | f.write(segment) 43 | return str(bam_file) 44 | 45 | 46 | def test_compute_gene_end_coordinates(mock_gff_file): 47 | buffer = GeneBuffer(mock_gff_file) 48 | assert buffer.gene_end_coordinates["gene1"] == ("chr1", 1000) 49 | assert buffer.gene_end_coordinates["gene2"] == ("chr1", 2000) 50 | assert buffer.gene_end_coordinates["gene3"] == ("chr2", 1500) 51 | assert buffer.gene_end_coordinates["__no_feature"] == (None, -1) 52 | 53 | 54 | def test_add_transcript(mock_gff_file): 55 | buffer = GeneBuffer(mock_gff_file) 56 | transcript = Transcript("chr1", 100, 200, "read1", 60, "+", "UMI1") 57 | buffer.add_transcript("gene1", (1, 2), transcript, 100) 58 | 59 | assert "gene1" in buffer.buffer 60 | assert (1, 2) in buffer.buffer["gene1"] 61 | assert buffer.buffer["gene1"][(1, 2)][0] == transcript 62 | 63 | 64 | def test_check_and_clear_buffer(mock_gff_file): 65 | buffer = GeneBuffer(mock_gff_file) 66 | transcript = Transcript("chr1", 100, 200, "read1", 60, "+", "UMI1") 67 | buffer.add_transcript("gene1", (1, 2), transcript, 100) 68 | buffer.last_position = 300 69 | 70 | cleared_genes = list(buffer.check_and_clear_buffer(empty=True)) 71 | assert len(cleared_genes) == 1 72 | assert cleared_genes[0][0] == "gene1" 73 | assert buffer.buffer == {} 74 | 75 | 76 | def test_check_and_clear_buffer_no_feature(mock_gff_file): 77 | buffer = GeneBuffer(mock_gff_file) 78 | transcript = Transcript("chr1", 100, 200, "read1", 60, "+", "UMI1") 79 | buffer.add_transcript("__no_feature", (1, 2), transcript, 100) 80 | buffer.last_position = 300 81 | 82 | cleared_genes = list(buffer.check_and_clear_buffer(empty=True)) 83 | assert len(cleared_genes) == 1 84 | assert cleared_genes[0][0] == "__no_feature" 85 | assert buffer.buffer == {} 86 | 87 | 88 | def test_get_gene_end_position_ambiguous(mock_gff_file): 89 | buffer = GeneBuffer(mock_gff_file) 90 | ambiguous_gene = "__ambiguous[gene1+gene2]" 91 | chrom, end_position = buffer.get_gene_end_position(ambiguous_gene) 92 | assert chrom == "chr1" 93 | assert end_position == 2000 94 | 95 | 96 | def test_parse_unique_events(mock_bam_file, mock_gff_file): 97 | unique_events = list(parse_unique_events(mock_bam_file, mock_gff_file)) 98 | 99 | 
assert len(unique_events) == 1 100 | gene, spots = unique_events[0] 101 | assert gene == "gene1" 102 | assert len(spots) == 5 103 | assert spots[(0, 0)][0].clear_name == "read0" 104 | 105 | 106 | def test_parse_unique_events_no_annotation(mock_bam_file): 107 | unique_events = list(parse_unique_events(mock_bam_file)) 108 | 109 | assert len(unique_events) == 1 110 | gene, spots = unique_events[0] 111 | assert gene == "gene1" 112 | assert len(spots) == 5 113 | assert spots[(0, 0)][0].clear_name == "read0" 114 | -------------------------------------------------------------------------------- /tests/utils_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Unit-test the package utils 4 | """ 5 | 6 | import os 7 | from unittest.mock import patch 8 | 9 | import pytest 10 | 11 | from stpipeline.common.utils import ( 12 | TimeStamper, 13 | file_ok, 14 | get_htseq_count_version, 15 | get_star_version, 16 | get_taggd_count_version, 17 | safe_open_file, 18 | safe_remove, 19 | which_program, 20 | ) 21 | 22 | 23 | @pytest.fixture 24 | def temp_file(tmp_path): 25 | temp = tmp_path / "temp_file.txt" 26 | temp.write_text("Temporary file content.") 27 | return str(temp) 28 | 29 | 30 | def test_which_program(): 31 | program = "python" 32 | result = which_program(program) 33 | assert result is True 34 | 35 | 36 | def test_which_program_not_found(): 37 | program = "nonexistent_program" 38 | result = which_program(program) 39 | assert result is False 40 | 41 | 42 | def test_timestamper(): 43 | stamper = TimeStamper() 44 | ts1 = stamper.get_timestamp() 45 | ts2 = stamper.get_timestamp() 46 | assert ts1 != ts2 47 | 48 | 49 | def test_safe_remove(temp_file): 50 | assert os.path.exists(temp_file) 51 | safe_remove(temp_file) 52 | assert not os.path.exists(temp_file) 53 | 54 | 55 | def test_safe_remove_nonexistent(): 56 | non_existent_file = "non_existent_file.txt" 57 | safe_remove(non_existent_file) 58 | assert not os.path.exists(non_existent_file) 59 | 60 | 61 | def test_safe_open_file_read(temp_file): 62 | with safe_open_file(temp_file, "r") as f: 63 | content = f.read() 64 | assert content == "Temporary file content." 65 | 66 | 67 | def test_safe_open_file_write(tmp_path): 68 | file_path = tmp_path / "test_file.txt" 69 | with safe_open_file(str(file_path), "w") as f: 70 | f.write("Test content.") 71 | assert file_path.exists() 72 | with open(file_path, "r") as f: 73 | assert f.read() == "Test content." 
74 | 75 | 76 | def test_safe_open_file_invalid_mode(): 77 | with pytest.raises(IOError): 78 | safe_open_file("invalid_file.txt", "x") 79 | 80 | 81 | def test_file_ok(temp_file): 82 | assert file_ok(temp_file) 83 | 84 | 85 | def test_file_ok_empty_file(tmp_path): 86 | empty_file = tmp_path / "empty.txt" 87 | empty_file.touch() 88 | assert not file_ok(str(empty_file)) 89 | 90 | 91 | def test_get_star_version(): 92 | with patch("subprocess.Popen") as mock_popen: 93 | mock_popen.return_value.communicate.return_value = (b"STAR_2.7.9a", b"") 94 | mock_popen.return_value.returncode = 0 95 | version = get_star_version() 96 | assert version == "STAR_2.7.9a" 97 | 98 | 99 | def test_get_star_version_not_found(): 100 | with patch("subprocess.Popen", side_effect=FileNotFoundError): 101 | version = get_star_version() 102 | assert version == "Not available" 103 | 104 | 105 | def test_get_taggd_count_version(): 106 | with patch("subprocess.Popen") as mock_popen: 107 | mock_popen.return_value.communicate.return_value = (b"Name: taggd\nVersion: 1.0.0", b"") 108 | mock_popen.return_value.returncode = 0 109 | version = get_taggd_count_version() 110 | assert version == "1.0.0" 111 | 112 | 113 | def test_get_taggd_count_version_not_found(): 114 | with patch("subprocess.Popen", side_effect=FileNotFoundError): 115 | version = get_taggd_count_version() 116 | assert version == "Not available" 117 | 118 | 119 | def test_get_htseq_count_version(): 120 | with patch("subprocess.Popen") as mock_popen: 121 | mock_popen.return_value.communicate.return_value = (b"Name: htseq\nVersion: 0.11.3", b"") 122 | mock_popen.return_value.returncode = 0 123 | version = get_htseq_count_version() 124 | assert version == "0.11.3" 125 | 126 | 127 | def test_get_htseq_count_version_not_found(): 128 | with patch("subprocess.Popen", side_effect=FileNotFoundError): 129 | version = get_htseq_count_version() 130 | assert version == "Not available" 131 | -------------------------------------------------------------------------------- /workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/workflow.pdf -------------------------------------------------------------------------------- /workflow_extended.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jfnavarro/st_pipeline/77f08b2f166bdcb0f5282900304011a84cb450e9/workflow_extended.pdf --------------------------------------------------------------------------------